In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn. ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import FunctionTransformer
from scipy.io import arff
import scipy.stats as stats
import matplotlib.pyplot as plt
# Read data

In [2]:
df = pd.read_csv("waterquality_output.csv")

## Features Extraction

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 534 entries, 0 to 533
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           534 non-null    int64  
 1   TEMP                 534 non-null    float64
 2   DO                   534 non-null    float64
 3   pH                   534 non-null    float64
 4   CONDUCTIVITY         534 non-null    float64
 5   BOD                  534 non-null    float64
 6   NITRATE_N_NITRITE_N  534 non-null    float64
 7   FECAL_COLIFORM       534 non-null    float64
 8   TOTAL_COLIFORM       534 non-null    float64
 9   potability           534 non-null    int64  
dtypes: float64(8), int64(2)
memory usage: 41.8 KB


In [4]:
df.isnull().sum()

Unnamed: 0             0
TEMP                   0
DO                     0
pH                     0
CONDUCTIVITY           0
BOD                    0
NITRATE_N_NITRITE_N    0
FECAL_COLIFORM         0
TOTAL_COLIFORM         0
potability             0
dtype: int64

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,TEMP,DO,pH,CONDUCTIVITY,BOD,NITRATE_N_NITRITE_N,FECAL_COLIFORM,TOTAL_COLIFORM,potability
0,0,0.802575,0.392638,0.214286,0.028972,0.04244,0.043956,1e-05,3e-06,1
1,1,0.600858,0.368098,0.202381,0.009616,0.038462,0.043956,0.000232,8e-06,0
2,2,0.656652,0.337423,0.178571,0.013154,0.05305,0.197802,0.00019,6e-06,1
3,3,0.613734,0.337423,0.178571,0.01382,0.071618,0.078022,0.00029,1.2e-05,1
4,4,0.652361,0.349693,0.190476,0.010615,0.039788,0.059121,0.000145,6e-06,1


In [6]:
df = df.drop(['Unnamed: 0'], axis=1)
df.head()

Unnamed: 0,TEMP,DO,pH,CONDUCTIVITY,BOD,NITRATE_N_NITRITE_N,FECAL_COLIFORM,TOTAL_COLIFORM,potability
0,0.802575,0.392638,0.214286,0.028972,0.04244,0.043956,1e-05,3e-06,1
1,0.600858,0.368098,0.202381,0.009616,0.038462,0.043956,0.000232,8e-06,0
2,0.656652,0.337423,0.178571,0.013154,0.05305,0.197802,0.00019,6e-06,1
3,0.613734,0.337423,0.178571,0.01382,0.071618,0.078022,0.00029,1.2e-05,1
4,0.652361,0.349693,0.190476,0.010615,0.039788,0.059121,0.000145,6e-06,1


In [7]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
df.values[:,:-1],
df.values[:,-1:],
test_size=0.4,
random_state=42)
y_train = y_train.ravel()
y_test = y_test.ravel()
print('Training dataset shape:', X_train.shape, y_train.shape)
print('Testing dataset shape:', X_test.shape, y_test.shape)

Training dataset shape: (320, 8) (320,)
Testing dataset shape: (214, 8) (214,)


## Random Forest

In [8]:
# Build RF classifier to use in feature selection
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

In [9]:
# Build step forward feature selection
sfs1 = sfs(clf,
k_features=4, 
forward=True, 
floating=False, 
verbose=2,
scoring='accuracy',
cv=0)


In [10]:
# Perform SFFS
sfs1 = sfs1.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    6.6s finished

[2023-05-30 12:39:37] Features: 1/4 -- score: 0.934375[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    1.5s finished

[2023-05-30 12:39:39] Features: 2/4 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    1.2s finished

[2023-05-30 12:39:40] Features: 3/4 -- score: 1.0[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:

In [11]:
# Which features?
feat_cols = list(sfs1.k_feature_idx_)
print(feat_cols)

[0, 1, 2, 3]


In [12]:
# Build full model with selected features
clf = RandomForestClassifier(n_estimators=1000, random_state=150, max_depth=7)
model = clf.fit(X_train[:, feat_cols], y_train)

In [13]:
y_train_pred = clf.predict(X_train[:, feat_cols])
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
y_test_pred = clf.predict(X_test[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on selected features: 1.000
Testing accuracy on selected features: 0.977


In [14]:
# Build full model on ALL features, for comparison
clf = RandomForestClassifier(n_estimators=1000, random_state=150, max_depth=7)
clf.fit(X_train, y_train)
y_train_pred = clf.predict(X_train)
print('Training accuracy on all features: %.3f' % acc(y_train, y_train_pred))
y_test_pred = clf.predict(X_test)
print('Testing accuracy on all features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on all features: 1.000
Testing accuracy on all features: 0.977


### Boosting - Ada Boost

In [15]:

adb1 = AdaBoostClassifier(RandomForestClassifier(),n_estimators = 5, learning_rate = 1) #implement AdaBoost on 5 decision tree 
adb1.fit(X_train,y_train) #fit into train dataset

In [16]:
#Boosting accuracy for test dataset
adb1.score(X_test,y_test)

0.9719626168224299

### Bagging

In [17]:
bg1 = BaggingClassifier(RandomForestClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators = 20) #each bag contains 50% samples of training data, all features used and 20 decision tree
bg1.fit(X_train,y_train) #fit into train dataset

In [18]:
#Bagging accuracy for test dataset
bg1.score(X_test,y_test)

0.9766355140186916

## Decision Tree

In [19]:
#Decision Tree
dt = DecisionTreeClassifier() #import module
dt.fit(X_train,y_train) #fit into train dataset

In [20]:
#Decision Tree accuracy of test dataset
dt.score(X_test,y_test)

0.9532710280373832

In [21]:
sfs2 = sfs(dt,
k_features=4,
forward=True,
floating=False,
verbose=2,
scoring='accuracy',
cv=5)


In [22]:
# Perform SFFS
sfs2 = sfs2.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.0s finished

[2023-05-30 12:39:55] Features: 1/4 -- score: 0.88125[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished

[2023-05-30 12:39:55] Features: 2/4 -- score: 0.95625[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished

[2023-05-30 12:39:55] Features: 3/4 -- score: 0.959375[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s re

In [23]:
# Build full model with selected features
dt = DecisionTreeClassifier(random_state=150, max_depth=7)
dt.fit(X_train[:, feat_cols], y_train)

In [24]:
# Which features?
feat_cols = list(sfs2.k_feature_idx_)
print(feat_cols)

[0, 1, 4, 6]


In [25]:
y_train_pred = dt.predict(X_train[:, feat_cols])
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
y_test_pred = dt.predict(X_test[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on selected features: 0.972
Testing accuracy on selected features: 0.963


### Boosting - Ada Boost

In [26]:
adb = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators = 5, learning_rate = 1) #implement AdaBoost on 5 decision tree 
adb.fit(X_train,y_train) #fit into train dataset

In [27]:
#Boosting accuracy for test dataset
adb.score(X_test,y_test)

0.9532710280373832

### Bagging

In [28]:
bg = BaggingClassifier(DecisionTreeClassifier(), max_samples= 0.5, max_features = 1.0, n_estimators = 20) #each bag contains 50% samples of training data, all features used and 20 decision tree
bg.fit(X_train,y_train) #fit into train dataset

In [29]:
#Bagging accuracy for test dataset
bg.score(X_test,y_test)

0.9766355140186916

## SVM

In [30]:
from sklearn.svm import SVC

In [31]:
svc=SVC(C=100.0)
model = svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print('Model accuracy score with default hyperparameters: {0:0.4f}'. format(acc(y_test, y_pred)))

Model accuracy score with default hyperparameters: 0.9813


In [32]:
sfs3 = sfs(svc,
k_features=5,
forward=True,
floating=False,
verbose=2,
scoring='accuracy',
cv=5)

In [33]:
sfs3 = sfs3.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.2s finished

[2023-05-30 12:39:55] Features: 1/5 -- score: 0.915625[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished

[2023-05-30 12:39:55] Features: 2/5 -- score: 0.965625[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s finished

[2023-05-30 12:39:56] Features: 3/5 -- score: 0.984375[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s 

In [34]:
y_train_pred = dt.predict(X_train[:, feat_cols])
print('Training accuracy on selected features: %.3f' % acc(y_train, y_train_pred))
y_test_pred = dt.predict(X_test[:, feat_cols])
print('Testing accuracy on selected features: %.3f' % acc(y_test, y_test_pred))

Training accuracy on selected features: 0.972
Testing accuracy on selected features: 0.963


## Voting Classifier

In [35]:
# Voting Classifier - Multiple Model Ensemble 

dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
svm = SVC(kernel = 'poly', degree = 2 )

In [36]:
evc = VotingClassifier( estimators= [('dt',dt), ('rf', rf),('svm',svm)], voting = 'hard') #4 classifier for Voting Classifier and hard voting (votes on labels not probabilities)

In [37]:
evc.fit(X_train,y_train) #fit model

In [38]:
evc.score(X_test, y_test)

0.9766355140186916

# GUI

In [39]:
df.head()

Unnamed: 0,TEMP,DO,pH,CONDUCTIVITY,BOD,NITRATE_N_NITRITE_N,FECAL_COLIFORM,TOTAL_COLIFORM,potability
0,0.802575,0.392638,0.214286,0.028972,0.04244,0.043956,1e-05,3e-06,1
1,0.600858,0.368098,0.202381,0.009616,0.038462,0.043956,0.000232,8e-06,0
2,0.656652,0.337423,0.178571,0.013154,0.05305,0.197802,0.00019,6e-06,1
3,0.613734,0.337423,0.178571,0.01382,0.071618,0.078022,0.00029,1.2e-05,1
4,0.652361,0.349693,0.190476,0.010615,0.039788,0.059121,0.000145,6e-06,1


In [40]:
import pickle
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [41]:
pickle.dump(model, open("model.pkl", "wb"))

In [42]:
def predict_rainfall(TEMP, DO, pH, CONDUCTIVITY, BOD, NITRATE_N_NITRITE_N, FECAL_COLIFORM, TOTAL_COLIFORM):
    TEMP = (TEMP-10.5)/(33.8-10.5)
    DO = (DO-0)/(16.3-0)
    pH = (pH-6.3)/(14.7-6.3)
    CONDUCTIVITY = (CONDUCTIVITY-39)/(24062-39)
    BOD = (BOD-0.2)/(75.6-0.2)
    NITRATE_N_NITRITE_N = (NITRATE_N_NITRITE_N-0)/(45.5-0)
    FECAL_COLIFORM = (FECAL_COLIFORM-0)/(310417-0)
    TOTAL_COLIFORM = (TOTAL_COLIFORM-1)/(23816667-1)
    
    
    # Load the model
    model = pickle.load(open("model.pkl", "rb"))
    
    # Predict the water quality
    result = model.predict([[TEMP, DO, pH, CONDUCTIVITY, BOD, NITRATE_N_NITRITE_N, FECAL_COLIFORM, TOTAL_COLIFORM]])
    
    # Return the water quality
    if result == 0 :
        return "dangerous"
    else : 
        return "safe"
   


In [43]:
app = gr.Interface(
    predict_rainfall,
    inputs=[
        gr.Number(value=0, label="Temperature"),
        gr.Number(value=0, label="Dissolved Oxygen"),
        gr.Number(value=0, label="pH"),
        gr.Number(value=0, label="CONDUCTIVITY"),
        gr.Number(value=0, label="BOD"),
        gr.Number(value=0, label="NITRATE_N_NITRITE_N"),
        gr.Number(value=0, label="FECAL_COLIFORM"),
        gr.Number(value=0, label="TOTAL_COLIFORM"),
    ],
    outputs=[gr.Text(label="Water Quality")],
)

app.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


