In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data = pd.read_csv("/content/drive/MyDrive/Autumn 2024/Machine Learning/Week6/water_potability.csv")
data

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
3271,4.668102,193.681735,47580.991603,7.166639,359.948574,526.424171,13.894419,66.687695,4.435821,1
3272,7.808856,193.553212,17329.802160,8.061362,,392.449580,19.903225,,2.798243,1
3273,9.419510,175.762646,33155.578218,7.350233,,432.044783,11.039070,69.845400,3.298875,1
3274,5.126763,230.603758,11983.869376,6.303357,,402.883113,11.168946,77.488213,4.708658,1


In [6]:
data.isnull().sum()

Unnamed: 0,0
ph,491
Hardness,0
Solids,0
Chloramines,0
Sulfate,781
Conductivity,0
Organic_carbon,0
Trihalomethanes,162
Turbidity,0
Potability,0


In [7]:
data = data.dropna()

In [27]:
data['Potability'].value_counts()

Unnamed: 0_level_0,count
Potability,Unnamed: 1_level_1
0.0,1200
1.0,811


In [28]:
zero  = data[data['Potability']==0]   #zero values in Potability column
one = data[data['Potability']==1]  # one values in Potability column


from sklearn.utils import resample
# minority class that  is 1, we need to upsample/increase that class so that there is no bias
# n_samples = 1998 means we want 1998 sample of class 1, since there are 1998 samples of class 0
data_minority_upsampled = resample(one, replace = True, n_samples = 1998)
#concatenate
data = pd.concat([zero, data_minority_upsampled])


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler


scaler = StandardScaler()
 #scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [8]:
X = data[['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']]
y = data['Potability']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
#Random forest
rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

mismatch_indices_rf = np.where(y_pred != y_test)[0]
print(mismatch_indices_rf)

Accuracy: 0.67
[  1   3   4   7  10  13  17  19  22  25  30  31  33  34  37  42  53  55
  59  62  66  68  71  72  73  74  77  78  79  80  83  85  87  90  92  96
  97  99 100 103 106 111 112 115 121 123 131 137 140 143 147 148 150 153
 162 167 171 173 174 176 183 187 189 191 193 194 195 197 201 204 207 212
 213 215 223 228 231 232 235 240 243 247 249 250 252 255 257 261 262 265
 271 272 273 275 285 288 299 300 302 303 304 306 309 314 316 319 326 327
 329 330 331 334 335 341 342 345 352 355 356 361 364 365 367 368 370 376
 377 383 387 391 393 396 398 401]


In [12]:
#svm
svm_model = SVC()
svm_model.fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

mismatch_indices_svm = np.where(y_pred != y_test)[0]
print(mismatch_indices_svm)

Accuracy: 0.57
[  4   7  11  15  19  22  24  25  30  31  34  39  41  43  47  53  55  59
  62  63  64  66  67  72  74  76  79  80  83  84  88  90  92  93  95  96
  97  99 100 104 105 106 108 109 110 111 115 121 122 123 129 131 132 135
 137 140 143 147 153 159 165 166 167 171 173 174 176 177 181 183 184 190
 191 192 193 194 197 199 201 203 204 206 210 212 213 214 216 221 222 223
 224 230 231 232 235 237 239 240 247 249 250 252 254 257 258 259 261 265
 271 272 273 275 278 283 284 288 290 293 297 299 301 302 303 306 307 308
 309 310 319 320 326 327 329 330 331 334 335 341 342 343 345 347 348 351
 352 354 355 356 357 361 364 367 368 370 371 373 376 378 379 380 381 382
 383 384 387 390 391 396 397 398 399 400]


In [24]:
# Feature Selection - Forward Selection
from sklearn.model_selection import cross_val_score


selected_features = []
model = SVC()


num_features_to_select = 5


while len(selected_features) < num_features_to_select:
   best_score = -1
   best_feature = None


   for feature_idx in range(X.shape[1]):
       if feature_idx in selected_features:
           continue


       # Try adding the feature to the selected set
       candidate_features = selected_features + [feature_idx]


       # Evaluate the model's performance using cross-validation
       scores = cross_val_score(model, X.iloc[:, candidate_features], y, cv=5, scoring='accuracy')
       mean_score = np.mean(scores)


       # Keep track of the best-performing feature
       if mean_score > best_score:
           best_score = mean_score
           best_feature = feature_idx


   if best_feature is not None:
       selected_features.append(best_feature)
       print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")











Selected Feature 1: 4, Mean Accuracy: 0.6007
Selected Feature 2: 5, Mean Accuracy: 0.5997
Selected Feature 3: 0, Mean Accuracy: 0.5972
Selected Feature 4: 3, Mean Accuracy: 0.5972
Selected Feature 5: 1, Mean Accuracy: 0.5972


In [23]:
# Feature Selection - Forward Selection
from sklearn.model_selection import cross_val_score


selected_features = []
model = RandomForestClassifier()


num_features_to_select = 5


while len(selected_features) < num_features_to_select:
   best_score = -1
   best_feature = None


   for feature_idx in range(X.shape[1]):
       if feature_idx in selected_features:
           continue


       # Try adding the feature to the selected set
       candidate_features = selected_features + [feature_idx]


       # Evaluate the model's performance using cross-validation
       scores = cross_val_score(model, X.iloc[:, candidate_features], y, cv=5, scoring='accuracy')
       mean_score = np.mean(scores)


       # Keep track of the best-performing feature
       if mean_score > best_score:
           best_score = mean_score
           best_feature = feature_idx


   if best_feature is not None:
       selected_features.append(best_feature)
       print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")


Selected Feature 1: 1, Mean Accuracy: 0.5291
Selected Feature 2: 4, Mean Accuracy: 0.5694
Selected Feature 3: 0, Mean Accuracy: 0.6335
Selected Feature 4: 3, Mean Accuracy: 0.6370
Selected Feature 5: 2, Mean Accuracy: 0.6450


In [18]:
# Imputing into null values
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy='mean')
data = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
