In [None]:
import numpy as np
import xgboost as xgb
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.svm import SVC


In [None]:
import pandas as pd
from google.colab import drive
# Mount the user's Google Drive under /content/drive.
# NOTE(review): `google.colab` is only importable inside a Colab runtime, so
# this cell fails when the notebook is run elsewhere.
drive.mount('/content/drive')

In [None]:
# Load the raw dataset into `data`.
# NOTE(review): the path is relative to the current working directory and does
# not point into the Drive mount at /content/drive -- confirm where the CSV lives.
data = pd.read_csv("water_potability.csv")

In [None]:


# Replace every missing value with the mean of its column.
# The imputer is fitted on the whole frame, so the label column is passed
# through it as well.
imputer = SimpleImputer(strategy='mean')
imputer.fit(data)
data = pd.DataFrame(imputer.transform(data), columns=data.columns)

# Class balance of the target after imputation.
data["Potability"].value_counts()

0.0    1998
1.0    1278
Name: Potability, dtype: int64

In [None]:
from sklearn.utils import resample

# Split the rows by class label. Class 0 is the majority, class 1 the minority.
zero = data[data['Potability'] == 0]
one = data[data['Potability'] == 1]

# Upsample the minority class (sampling with replacement) until it has as many
# rows as the majority class. The target size is derived from the data instead
# of a hard-coded count, and the draw is seeded so the cell is reproducible.
data_minority_upsampled = resample(
    one,
    replace=True,
    n_samples=len(zero),
    random_state=42,
)

# Concatenate the majority rows with the upsampled minority rows.
data = pd.concat([zero, data_minority_upsampled])

# NOTE(review): this oversampling happens before any train/test split, so
# duplicated minority rows can end up on both sides of a later split and
# inflate test accuracy. Prefer splitting first and resampling the training
# portion only.

# Reduced feature subset and matching labels taken from the balanced frame.
X_raw = data[['ph', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Turbidity']]
y_raw = data['Potability']

# Alternative that was tried and left disabled:
#   X, y = ADASYN().fit_resample(X_raw, y_raw)

In [None]:
# Sanity check: number of missing values remaining in each column of `data`.
data.isnull().sum()

ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64

In [None]:
# Preprocessing 01
#
# Select the nine feature columns and the label, hold out 20% of the rows,
# then standardise the features using statistics from the training rows only.

X = data.loc[:, ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']]
y = data.loc[:, 'Potability']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training split, then apply the same transform to both splits.
# (MinMaxScaler was tried here as an alternative.)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Preprocessing 02

def imputing(df):
    """Fill missing values in each column of ``df`` with that column's median.

    Only columns that contain at least one null are touched. The frame is
    updated in place and the same object is returned.

    Args:
        df: pandas DataFrame whose columns support ``median()``.

    Returns:
        The same DataFrame object, with nulls replaced by column medians.
    """
    for col in df.columns:
        if df[col].isnull().any():
            # Assign the filled column back rather than calling
            # ``df[col].fillna(..., inplace=True)``: that chained in-place call
            # acts on a temporary and does not update ``df`` under pandas
            # Copy-on-Write.
            df[col] = df[col].fillna(df[col].median())
    return df

df = imputing(data)

# Define X and y
X = df.drop('Potability', axis=1)
y = df['Potability']

# Apply log transformation only to features
X_log_transformed = np.log1p(X)

# Split FIRST, so that neither the oversampler nor the scaler ever sees the
# held-out rows. Resampling or scaling before the split leaks test information
# into training and inflates the reported scores. `stratify=y` keeps the class
# ratio the same in both splits.
X_train_log, X_test_log, y_train_raw, y_test = train_test_split(
    X_log_transformed, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class with SMOTE on the training rows only.
# SMOTE synthesises new feature rows together with their labels.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_log, y_train_raw)

# Scale the features: fit on the (resampled) training rows, reuse on test.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X_resampled)

# Final train and test sets
X_train, y_train = X_scaled, y_resampled
X_test = scaler.transform(X_test_log)

In [None]:
# Scaled copy of the full feature matrix, used by the feature-selection cell.
# Note: this refits the shared `scaler` object on all rows of X.
Xs = scaler.fit(X).transform(X)

In [None]:
# Feature Selection - Forward Selection
#
# Greedy forward selection: in each round, try adding every not-yet-selected
# column to the current set, score the candidate set with 5-fold
# cross-validated accuracy, and keep the column that scores best.

selected_features = []
# Seeded so that repeated runs select the same features with the same scores.
rf_model = RandomForestClassifier(random_state=42)

# Never ask for more features than exist; otherwise the loop could not finish.
num_features_to_select = min(5, Xs.shape[1])

while len(selected_features) < num_features_to_select:
    best_score = -1
    best_feature = None

    for feature_idx in range(Xs.shape[1]):
        if feature_idx in selected_features:
            continue

        # Try adding the feature to the selected set
        candidate_features = selected_features + [feature_idx]

        # Evaluate the model's performance using cross-validation
        scores = cross_val_score(rf_model, Xs[:, candidate_features], y, cv=5, scoring='accuracy')
        mean_score = np.mean(scores)

        # Keep track of the best-performing feature
        if mean_score > best_score:
            best_score = mean_score
            best_feature = feature_idx

    if best_feature is None:
        # No candidate beat the sentinel score (e.g. every score was NaN).
        # Stop instead of spinning forever with an unchanged selection.
        break

    selected_features.append(best_feature)
    print(f"Selected Feature {len(selected_features)}: {best_feature}, Mean Accuracy: {best_score:.4f}")

Selected Feature 1: 1, Mean Accuracy: 0.7322
Selected Feature 2: 2, Mean Accuracy: 0.7487
Selected Feature 3: 0, Mean Accuracy: 0.7690
Selected Feature 4: 4, Mean Accuracy: 0.7848
Selected Feature 5: 3, Mean Accuracy: 0.7975


In [None]:
# Keep the first nine feature columns of each split.
# NOTE(review): `selected_features` from the forward-selection cell is not used here.
X_train, X_test = X_train[:, :9], X_test[:, :9]

In [None]:
# Random forest
# Seeded so the reported accuracy and the mismatch indices below are
# reproducible between runs (consistent with the seeded models defined later).
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")

# Positions within the test split where the prediction differs from the label.
mismatch_indices_rf = np.where(y_pred != y_test)[0]
print(mismatch_indices_rf)

Accuracy: 0.82
[ 11  13  14  25  30  36  38  39  40  43  44  50  61  87  88  93  94 104
 117 124 129 134 137 138 152 154 161 181 183 188 191 199 202 207 208 220
 225 228 232 250 261 265 277 287 294 299 300 303 307 308 310 311 316 325
 326 330 332 337 340 341 361 365 367 371 376 380 384 387 394 397 406 408
 415 417 432 434 448 451 479 488 489 494 500 506 514 520 523 528 529 532
 533 536 543 545 548 549 562 564 572 576 582 583 586 591 594 600 606 608
 612 615 620 625 627 634 646 655 657 660 664 668 676 678 691 701 704 707
 712 716 722 725 729 738 741 746 748 757 758 764 774 790 799]


In [None]:
# Support vector classifier with default settings
svm_model = SVC().fit(X_train, y_train)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")

# Positions within the test split that the SVM got wrong.
svm_is_wrong = y_pred != y_test
mismatch_indices_svm = np.where(svm_is_wrong)[0]
print(mismatch_indices_svm)

Accuracy: 0.64
[  0   4   6   8  10  11  12  18  20  21  24  29  30  33  36  38  41  43
  46  47  50  52  54  55  60  65  66  69  71  72  74  79  81  82  84  88
  91  94  95  96  98 104 105 107 109 110 115 119 121 124 125 128 131 133
 137 140 142 146 148 149 150 158 164 165 168 172 174 184 186 188 189 192
 193 195 202 205 206 209 211 212 214 219 220 227 228 230 236 238 240 248
 249 253 259 263 267 268 273 281 282 283 286 287 294 297 310 318 321 322
 327 329 333 334 338 341 342 343 347 348 351 359 360 366 369 370 375 377
 380 383 384 388 389 392 393 394 395 397 399 401 402 404 405 406 412 413
 416 417 421 424 425 426 428 430 432 433 434 435 438 439 444 445 453 455
 457 458 460 466 469 470 471 476 478 480 482 488 492 497 498 499 502 505
 509 511]


In [None]:
# Neural Network 01

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Carve a validation set out of the TRAINING data. Early stopping (with
# restore_best_weights) selects the model by validation loss; monitoring the
# test split for that would tune the model on the very data later used to
# report accuracy.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Build the Sequential model: two hidden ReLU layers with dropout, and a
# single sigmoid unit for binary classification.
model_nn = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Optional dropout layer for regularization
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model_nn.compile(optimizer=Adam(learning_rate=0.001),  # Adjust learning rate as needed
              loss='binary_crossentropy',  # Binary crossentropy for binary classification
              metrics=['accuracy'])

# Stop when validation loss has not improved for 10 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model_nn.fit(X_fit, y_fit,
                    epochs=200,  # Adjust epochs as needed
                    batch_size=32,  # Adjust batch size as needed
                    validation_data=(X_val, y_val),
                    callbacks=[early_stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [None]:
# Evaluate the model
# Predict on `X_test`, the split that matches the `X_train` the network was
# fitted on (`X_test_scaled` is not defined anywhere in this notebook).
y_pred = model_nn.predict(X_test)

# Threshold the sigmoid outputs at 0.5: below -> class 0, otherwise class 1.
# ravel() flattens the (n, 1) prediction array to one value per test row.
y_pred_bin = np.where(y_pred.ravel() < 0.5, 0, 1)

accuracy = accuracy_score(y_test, y_pred_bin)
print(f"Accuracy: {accuracy:.2f}")

# Positions within the test split that the network got wrong.
mismatch_indices_nn = np.where(y_pred_bin != y_test)[0]
print(mismatch_indices_nn)

Accuracy: 0.68
[  1   4   7   8  11  13  20  21  27  45  50  51  53  61  62  63  64  66
  67  68  71  77  80  81  84  87  90  92  98  99 100 102 107 108 115 116
 127 130 133 134 137 139 142 144 146 147 166 173 177 180 181 184 186 188
 189 191 193 194 199 200 201 206 212 217 219 221 224 227 232 235 237 242
 244 246 247 250 251 252 253 254 255 256 259 261 264 267 272 279 282 285
 287 290 291 292 293 294 296 298 303 305 307 308 310 315 316 317 322 325
 327 329 334 337 342 343 348 354 356 359 369 371 373 378 379 381 382 383
 384 389 398 407 408 411 412 413 417 418 419 421 422 424 427 430 433 434
 438 439 440 444 449 458 460 461 463 466 467 470 471 472 477 479 480 481
 489 490 491 492 494 496 501 504 510 511 516 517 520 522 523 524 526 531
 535 543 552 553 554 559 562 564 566 572 573 575 576 579 582 583 586 588
 593 595 598 600 602 609 610 612 615 624 631 636 638 640 659 662 663 664
 673 679 681 682 683 685 686 688 691 692 693 695 700 704 705 706 711 722
 723 725 730 731 732 736 739 741 745

In [None]:
# Indices with common errors

# Test-set positions misclassified by all three models (SVM, neural network
# and random forest), sorted so the printed list is stable between runs.
common_mismatch = sorted(
    set(np.asarray(mismatch_indices_svm).tolist())
    & set(np.asarray(mismatch_indices_nn).tolist())
    & set(np.asarray(mismatch_indices_rf).tolist())
)
print(common_mismatch)

feature_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# X_test is a NumPy array once a scaler has been applied (no `.iloc`), so index
# through a plain array view; this also works if X_test is still a DataFrame.
# The values shown are in whatever space X_test currently holds.
X_test_values = np.asarray(X_test)

df_dic = {
    'Actual': y_test.iloc[common_mismatch].values
}

# A separate name is used for the column labels so the global feature matrix
# `X` is not overwritten by this cell.
for i, feature_name in enumerate(feature_names):
    df_dic[feature_name] = X_test_values[common_mismatch, i].tolist()

# Display mismatched values
mismatch_df = pd.DataFrame(df_dic)

print("Common Mismatched Predictions:")
mismatch_df.head(50)

[1, 4, 522, 11, 523, 13, 524, 526, 531, 20, 535, 27, 543, 553, 554, 45, 559, 50, 51, 562, 53, 572, 61, 62, 573, 64, 66, 67, 68, 579, 582, 71, 583, 586, 77, 81, 84, 598, 87, 90, 602, 92, 609, 610, 99, 100, 102, 614, 615, 107, 108, 624, 115, 116, 628, 631, 636, 638, 127, 641, 130, 134, 137, 139, 652, 142, 144, 147, 663, 668, 166, 681, 682, 683, 173, 685, 686, 177, 691, 180, 181, 182, 692, 693, 695, 186, 188, 189, 704, 193, 194, 705, 706, 711, 200, 201, 717, 722, 213, 725, 217, 730, 731, 221, 736, 227, 739, 741, 745, 235, 751, 242, 754, 244, 755, 246, 247, 251, 763, 765, 254, 769, 259, 261, 264, 267, 780, 781, 783, 784, 785, 787, 789, 795, 285, 801, 291, 292, 293, 803, 804, 296, 807, 298, 809, 816, 818, 307, 308, 310, 825, 315, 316, 829, 830, 322, 835, 325, 327, 839, 329, 840, 843, 334, 337, 342, 343, 854, 858, 862, 865, 354, 866, 869, 359, 874, 876, 367, 369, 881, 371, 883, 373, 884, 888, 378, 379, 381, 382, 383, 894, 895, 896, 389, 398, 917, 407, 408, 411, 412, 413, 926, 927, 417, 419, 

Unnamed: 0,Actual,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,1.0,6.643159,188.913541,32873.820022,6.791509,333.848842,336.561501,14.70681,67.844849,4.562198
1,1.0,6.61535,179.240661,26392.863612,9.30916,333.775777,496.363562,12.786595,78.262369,4.453443
2,1.0,5.628407,226.830043,28334.491937,8.012497,293.924273,351.161213,15.019241,35.178662,5.050683
3,1.0,7.714313,191.69264,19789.63608,7.856557,340.326314,377.926368,18.917027,58.692204,4.425134
4,1.0,7.830608,213.249258,18821.389612,6.475113,316.572415,405.278738,16.872395,55.90577,3.551568
5,1.0,6.35029,190.383738,14905.393852,5.53783,333.775777,446.840605,13.983567,67.817096,4.265233
6,1.0,6.597292,191.787442,25039.354696,7.294577,395.739529,501.532653,14.695391,80.050031,4.318305
7,1.0,6.903074,206.922504,17947.988114,7.048017,333.775777,601.985223,11.77511,58.176255,4.473887
8,1.0,7.736313,225.063103,19496.848592,7.158343,289.945985,433.974022,15.153817,74.765101,3.700917
9,0.0,6.380717,266.01541,21250.935634,4.854335,357.241027,358.185473,27.006707,59.937785,4.53202


In [None]:
# Gaussian Naive Bayes with default settings
gnb_model = GaussianNB().fit(X_train, y_train)
y_pred = gnb_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy:.2f}")

# Positions within the test split that Naive Bayes got wrong.
gnb_is_wrong = y_pred != y_test
mismatch_indices = np.where(gnb_is_wrong)[0]
print(mismatch_indices)

Accuracy: 0.53
[  1   3   7   8  11  12  15  16  19  21  22  23  25  26  28  29  30  33
  35  38  40  41  43  45  46  48  52  54  55  57  58  59  60  63  64  66
  67  69  72  75  78  80  81  82  83  86  87  91  93  94  96 100 104 105
 106 112 117 120 122 123 126 128 130 133 134 137 140 142 146 147 148 150
 151 157 158 159 162 165 166 168 170 172 173 175 177 178 182 184 185 186
 189 192 197 198 202 204 207 208 211 215 218 219 223 230 233 234 235 239
 240 242 244 245 246 252 255 256 258 259 260 261 262 266 267 270 271 275
 277 278 279 280 282 285 286 287 291 292 294 295 299 302 303 304 305 307
 310 311 314 318 319 322 324 326 327 329 335 336 340 342 344 346 347 348
 349 355 356 357 358 359 360 362 363 364 366 371 373 376 379 380 384 385
 386 389 392 394 397 399 401 403 405 408 411 413 414 416 418 419 423 424
 425 430 432 433 437 439 441 442 444 447 449 452 458 460 465 469 470 471
 475 477 480 481 483 484 487 488 490 491 492 494 497 499 503 505 509 512
 513 514 516 517 520 521 522 523 524

In [None]:
# Find indices where predictions do not match actual labels
# (`y_pred` is whatever the most recently run model cell produced).
mismatch_indices = np.where(y_pred != y_test)[0]

feature_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# X_test is a NumPy array once a scaler has been applied (no `.iloc`), so index
# through a plain array view; this also works if X_test is still a DataFrame.
X_test_values = np.asarray(X_test)

df_dic = {
    'Actual': y_test.iloc[mismatch_indices].values,
    'Predicted': y_pred[mismatch_indices]
}

# A separate name is used for the column labels so the global feature matrix
# `X` is not overwritten by this cell.
for i, feature_name in enumerate(feature_names):
    df_dic[feature_name] = X_test_values[mismatch_indices, i].tolist()

# Display mismatched values
mismatch_df = pd.DataFrame(df_dic)

print("Mismatched Predictions:")
mismatch_df

Mismatched Predictions:


Unnamed: 0,Actual,Predicted,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,1.0,0.0,6.643159,188.913541,32873.820022,6.791509,333.848842,336.561501,14.706810,67.844849,4.562198
1,1.0,0.0,6.615350,179.240661,26392.863612,9.309160,333.775777,496.363562,12.786595,78.262369,4.453443
2,0.0,1.0,11.235426,178.596496,33773.107061,9.063042,327.650960,425.868039,17.986255,58.986652,5.147055
3,1.0,0.0,7.714313,191.692640,19789.636080,7.856557,340.326314,377.926368,18.917027,58.692204,4.425134
4,1.0,0.0,6.350290,190.383738,14905.393852,5.537830,333.775777,446.840605,13.983567,67.817096,4.265233
...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,0.0,6.984406,181.707343,18194.545310,8.263804,371.146458,201.619737,11.267398,68.192658,3.406210
196,1.0,0.0,6.702547,207.321086,17246.920347,7.708117,304.510230,329.266002,16.217303,28.878601,3.442983
197,1.0,0.0,7.164478,213.002441,32751.928963,6.292148,333.775777,490.933121,12.683767,58.252613,4.998203
198,1.0,0.0,5.433466,177.828302,31421.731633,4.584134,347.097354,490.284674,16.066439,58.416699,2.871196


In [None]:
# Find indices where predictions match actual labels
# (`y_pred` is whatever the most recently run model cell produced).
match_indices = np.where(y_pred == y_test)[0]

feature_names = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']

# X_test is a NumPy array once a scaler has been applied (no `.iloc`), so index
# through a plain array view; this also works if X_test is still a DataFrame.
X_test_values = np.asarray(X_test)

df_dic = {
    'Actual': y_test.iloc[match_indices].values,
    'Predicted': y_pred[match_indices]
}

# A separate name is used for the column labels so the global feature matrix
# `X` is not overwritten by this cell.
for i, feature_name in enumerate(feature_names):
    df_dic[feature_name] = X_test_values[match_indices, i].tolist()

# Display matched values
match_df = pd.DataFrame(df_dic)

print("Matched Predictions:")
match_df.head(50)

Matched Predictions:


Unnamed: 0,Actual,Predicted,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
0,0.0,0.0,7.080795,183.521107,20461.25271,7.333212,333.119476,356.369022,20.179029,67.019903,4.886634
1,0.0,0.0,7.846058,224.058877,23264.109968,5.922367,300.40262,387.971336,13.406737,43.075186,2.487969
2,0.0,0.0,7.160467,183.08931,6743.346066,3.803036,277.599099,428.036344,9.799625,90.035374,3.884891
3,1.0,1.0,13.175402,47.432,19237.949676,8.90702,375.147315,500.245952,12.083896,66.396293,4.106924
4,0.0,0.0,5.499489,230.308775,13902.968646,9.619575,352.084333,442.167006,14.740787,66.396293,5.846827
5,0.0,0.0,7.080795,139.331152,2912.211247,10.338234,343.318021,532.885196,11.078341,42.172824,4.093098
6,0.0,0.0,7.080795,233.846621,32496.640216,6.637384,347.986448,424.649773,13.028,55.983973,4.296189
7,0.0,0.0,8.394397,187.643411,10603.098021,7.840261,352.83564,376.241146,13.374831,58.950002,2.833901
8,0.0,0.0,7.417824,243.304691,320.942611,4.59867,336.097981,361.101769,20.421472,87.052576,3.470812
9,1.0,1.0,9.15966,195.423316,16679.335164,10.110462,301.746411,404.659103,5.196717,55.466759,5.452362


In [None]:
# Hyperparameter tuning and evaluation
#
# NOTE(review): this cell reads `models`, which is first assigned in a LATER
# cell, and `X_train_scaled` / `X_test_scaled`, which are not assigned anywhere
# in the visible notebook. It therefore fails on a fresh kernel run top to
# bottom. The loop body is the same as the loop in the next cell; this cell
# looks like a stale copy and is a candidate for removal.
#
# For every entry in `models`: XGBoost is tuned with an exhaustive grid search
# (5-fold CV, accuracy); every other model is fitted as-is. Each resulting
# model is then scored on the test split.
for name, model in models.items():
    if name == 'XGBoost':
        # 3*3*3*3*2*2*3 = 972 parameter combinations, each fitted 5 times.
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_child_weight': [1, 3, 5],
            'gamma': [0, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'n_estimators': [100, 200, 300]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train_scaled, y_train)
        best_model = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train_scaled, y_train)
        best_model = model

    y_pred = best_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print()

Random Forest Accuracy: 0.68
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.70      0.86      0.77       412
         1.0       0.61      0.38      0.47       244

    accuracy                           0.68       656
   macro avg       0.65      0.62      0.62       656
weighted avg       0.67      0.68      0.66       656

Confusion Matrix for Random Forest:
[[353  59]
 [152  92]]

Logistic Regression Accuracy: 0.63
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       412
         1.0       0.00      0.00      0.00       244

    accuracy                           0.63       656
   macro avg       0.31      0.50      0.39       656
weighted avg       0.39      0.63      0.48       656

Confusion Matrix for Logistic Regression:
[[412   0]
 [244   0]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [None]:
# Define models (all seeded for reproducibility)
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

# Hyperparameter tuning and evaluation
#
# XGBoost is tuned with an exhaustive grid search (5-fold CV, accuracy); the
# other models are fitted with their defaults. Every model is fitted on
# `X_train` and scored on `X_test`, the splits produced by the preprocessing
# cells (`X_train_scaled` / `X_test_scaled` are not defined in this notebook).
for name, model in models.items():
    if name == 'XGBoost':
        # 3*3*3*3*2*2*3 = 972 parameter combinations, each fitted 5 times:
        # expect this branch to be slow.
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_child_weight': [1, 3, 5],
            'gamma': [0, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'n_estimators': [100, 200, 300]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train, y_train)
        best_model = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_model = model

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print()

# Evaluate and compare results


In [None]:
# Define models (all seeded for reproducibility)
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'SVM': SVC(random_state=42, probability=True)  # Enable probability for VotingClassifier
}

# Best estimator found for each model name; fed into the ensemble below so the
# grid-search result for XGBoost is actually used.
tuned_models = {}

# Hyperparameter tuning and evaluation
#
# XGBoost is tuned with an exhaustive grid search (5-fold CV, accuracy); the
# other models are fitted with their defaults. Every model is fitted on
# `X_train` and scored on `X_test`, the splits produced by the preprocessing
# cells (`X_train_scaled` / `X_test_scaled` are not defined in this notebook).
for name, model in models.items():
    if name == 'XGBoost':
        # 3*3*3*3*2*2*3 = 972 parameter combinations, each fitted 5 times:
        # expect this branch to be slow.
        param_grid_xgb = {
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'min_child_weight': [1, 3, 5],
            'gamma': [0, 0.1, 0.2],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'n_estimators': [100, 200, 300]
        }
        grid_search_xgb = GridSearchCV(model, param_grid_xgb, cv=5, scoring='accuracy')
        grid_search_xgb.fit(X_train, y_train)
        best_model = grid_search_xgb.best_estimator_
    else:
        model.fit(X_train, y_train)
        best_model = model

    tuned_models[name] = best_model

    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} Accuracy: {accuracy:.2f}")
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print()

# Ensemble model (VotingClassifier example)
# Soft voting averages the members' predicted class probabilities. The members
# come from `tuned_models`, so XGBoost enters with its tuned hyperparameters
# rather than the untouched default instance stored in `models`.
ensemble_model = VotingClassifier(estimators=[('RF', tuned_models['Random Forest']),
                                              ('LR', tuned_models['Logistic Regression']),
                                              ('XGB', tuned_models['XGBoost']),
                                              ('SVM', tuned_models['SVM'])],
                                  voting='soft')  # Soft voting for probabilities
ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
ensemble_accuracy = accuracy_score(y_test, ensemble_pred)

print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")
print(f"Classification Report for Ensemble Model:")
print(classification_report(y_test, ensemble_pred))
print(f"Confusion Matrix for Ensemble Model:")
print(confusion_matrix(y_test, ensemble_pred))


Random Forest Accuracy: 0.68
Classification Report for Random Forest:
              precision    recall  f1-score   support

         0.0       0.70      0.86      0.77       412
         1.0       0.61      0.38      0.47       244

    accuracy                           0.68       656
   macro avg       0.65      0.62      0.62       656
weighted avg       0.67      0.68      0.66       656

Confusion Matrix for Random Forest:
[[353  59]
 [152  92]]

Logistic Regression Accuracy: 0.63
Classification Report for Logistic Regression:
              precision    recall  f1-score   support

         0.0       0.63      1.00      0.77       412
         1.0       0.00      0.00      0.00       244

    accuracy                           0.63       656
   macro avg       0.31      0.50      0.39       656
weighted avg       0.39      0.63      0.48       656

Confusion Matrix for Logistic Regression:
[[412   0]
 [244   0]]



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 