In [1]:
# Cell 1: Import Libraries and Load Data
import os
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.stats as stats
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
import catboost as cb
from catboost import CatBoostClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import shap

# Create the output directories up front. Later cells write figures to
# 'plots/' and serialized models to 'models/'; creating both here keeps a
# fresh "Restart & Run All" from failing on a missing directory.
for output_dir in ('plots', 'models'):
    os.makedirs(output_dir, exist_ok=True)

# Load the dataset
df = pd.read_csv('water_potability.csv')

# Display basic information
print("Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
df.info()
print("\nDataset Description:")
print(df.describe())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ph               2785 non-null   float64
 1   Hardness         3276 non-null   float64
 2   Solids           3276 non-null   float64
 3   Chloramines      3276 non-null   float64
 4   Sulfate          2495 non-null   float64
 5   Conductivity     3276 non-null   float64
 6   Organic_carbon   3276 non-null   float64
 7   Trihalomethanes  3114 non-null   float64
 8   Turbidity        3276 non-null   float64
 9   Potability       3276 non-null   int64  
dtypes: float64(9), int64(1)
memory usage: 256.1 KB
None

Dataset Description:
                ph     Hardness        Solids  Chloramines      Sulfate  \
count  2785.000000  3276.000000   3276.000000  3276.000000  2495.000000   
mean      7.080795   196.369496  22014.092526     7.122277   333.775777   
std       1.594320    3

In [1]:
# Cell 2: Exploratory Data Analysis (EDA)
# Each static figure is written to 'plots/' and closed immediately so that
# figures do not accumulate in memory; interactive ones are exported as HTML.

# Missing Values Heatmap
# Row tick labels are switched off: passing the full index asks matplotlib to
# draw one label per row (thousands of overlapping labels), which is slow and
# unreadable. The heatmap itself only needs the column axis.
plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cmap='viridis', cbar=False, yticklabels=False)
plt.title('Missing Values Heatmap')
plt.savefig('plots/missing_values_heatmap.png')
plt.close()

# Class Distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Potability', data=df)
plt.title('Class Distribution')
plt.savefig('plots/class_distribution.png')
plt.close()

# Correlation Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.savefig('plots/correlation_heatmap.png')
plt.close()

# Pair Plot of Key Features
key_features = ['ph', 'Hardness', 'Chloramines', 'Sulfate', 'Potability']
sns.pairplot(df[key_features], hue='Potability', diag_kind='hist')
plt.savefig('plots/pair_plot_key_features.png')
plt.close()

# Feature Distributions (Interactive with Plotly)
features = ['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity', 'Organic_carbon', 'Trihalomethanes', 'Turbidity']
for feature in features:
    fig = px.histogram(df, x=feature, color='Potability', marginal='box', title=f'{feature} Distribution by Potability')
    fig.write_html(f'plots/{feature.lower()}_distribution.html')

NameError: name 'plt' is not defined

In [2]:
# Feature engineering
# Interaction terms are plain element-wise products, so a row with a missing
# factor yields a missing product.
df['ph_hardness'] = df['ph'] * df['Hardness']  # Fixed 'pH' to 'ph'
df['chloramines_sulfate'] = df['Chloramines'] * df['Sulfate']

# Define plot directory
# A relative path keeps the notebook portable and matches the 'plots/' prefix
# used by the other cells (the previous value was a machine-specific absolute path).
PLOT_DIR = 'plots'
os.makedirs(PLOT_DIR, exist_ok=True)


def _plot_interaction_histogram(data, column, axis_label, filename, graph_number):
    """Draw, save and show a Potability-coloured histogram of one engineered feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column`` and a 'Potability' column.
    column : str
        Name of the engineered feature to plot on the x axis.
    axis_label : str
        Human-readable feature label used in the title and x-axis title.
    filename : str
        HTML file name, written inside ``PLOT_DIR``.
    graph_number : int
        Number used in the "saved successfully" / error messages.
    """
    fig = px.histogram(data, x=column, color='Potability',
                       title=f'{axis_label} Distribution by Potability',
                       nbins=30, opacity=0.7)
    fig.update_layout(
        xaxis_title=axis_label,
        yaxis_title='Count',
        title_font_size=14,
        xaxis_title_font_size=12,
        yaxis_title_font_size=12,
        legend_title='Potability',
        legend_title_font_size=12,
        legend_font_size=10,
        xaxis=dict(showgrid=True, gridcolor='LightGray'),
        yaxis=dict(showgrid=True, gridcolor='LightGray'),
        plot_bgcolor='white'
    )
    fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
    # Saving is best-effort: a failed export is reported but must not prevent
    # the figure from being displayed inline.
    try:
        fig.write_html(os.path.join(PLOT_DIR, filename))
        print(f"Graph {graph_number} saved successfully.")
    except Exception as e:
        print(f"Error saving Graph {graph_number}: {e}")
    fig.show()


# Graph 14: ph*Hardness Distribution (Previously Graph 5 in the snippet)
print("\n### Graph 14: ph*Hardness Distribution ###")
print("This histogram visualizes the engineered feature 'ph*Hardness', colored by 'Potability'. The interaction between pH and Hardness may capture combined effects on water potability, potentially improving model performance. We observe how this new feature distributes across potable (1) and non-potable (0) water, which can indicate its predictive power.")
_plot_interaction_histogram(df, 'ph_hardness', 'ph*Hardness', 'ph_hardness_distribution.html', 14)

# Graph 15: Chloramines*Sulfate Distribution (Previously Graph 5 in the snippet)
print("\n### Graph 15: Chloramines*Sulfate Distribution ###")
print("This histogram shows the distribution of the engineered feature 'Chloramines*Sulfate', colored by 'Potability'. The interaction between these chemical properties might reveal patterns affecting water safety, enhancing the dataset's predictive power. Differences in distribution between potable and non-potable water suggest this feature may be useful for classification.")
_plot_interaction_histogram(df, 'chloramines_sulfate', 'Chloramines*Sulfate', 'chloramines_sulfate_distribution.html', 15)

# KNN imputation
# Only the measured feature columns are given to the imputer:
#   * 'Potability' is the prediction target; feeding it to the imputer lets the
#     label influence the imputed feature values (target leakage).
#   * The interaction columns are recomputed afterwards so that they stay equal
#     to the product of their (imputed) factors instead of being imputed on
#     their own.
# NOTE(review): the imputer is still fitted on all rows before the train/test
# split; fitting it on the training rows only would remove the remaining leak.
TARGET_COLUMN = 'Potability'
INTERACTION_COLUMNS = ['ph_hardness', 'chloramines_sulfate']
measured_columns = [
    col for col in df.columns
    if col != TARGET_COLUMN and col not in INTERACTION_COLUMNS
]

imputer_knn = KNNImputer(n_neighbors=5)
df_knn = df.copy()  # keep the raw frame (with its NaNs) intact for later cells
df_knn[measured_columns] = imputer_knn.fit_transform(df[measured_columns])
df_knn['ph_hardness'] = df_knn['ph'] * df_knn['Hardness']
df_knn['chloramines_sulfate'] = df_knn['Chloramines'] * df_knn['Sulfate']

# Skip GAN imputation (using KNN as fallback)
df_gan = df_knn  # Fallback to KNN imputation (same object, not a copy)

# Graph 16: Missing Values After Imputation (Previously Graph 6 in the snippet)
print("\n### Graph 16: Missing Values After Imputation ###")
print("These heatmaps compare missing values after KNN imputation and the fallback (also KNN in this case). Since all missing values are imputed, we expect no yellow (missing) regions in either heatmap, confirming that the imputation step was successful.")
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
sns.heatmap(df_knn.isnull(), cbar=False, cmap='viridis', ax=axes[0])
axes[0].set_title('Missing Values After KNN Imputation', fontsize=14)
sns.heatmap(df_gan.isnull(), cbar=False, cmap='viridis', ax=axes[1])
axes[1].set_title('Missing Values (KNN Fallback)', fontsize=14)
# Saving is best-effort; the figure is shown even if the export fails.
try:
    plt.savefig(os.path.join(PLOT_DIR, 'missing_values_after_imputation.png'))
    print("Graph 16 saved successfully.")
except Exception as e:
    print(f"Error saving Graph 16: {e}")
plt.show()

# Build the modelling matrices from the KNN-imputed frame:
# target vector first, then every remaining column as a feature.
y = df_knn['Potability']
X = df_knn.drop(columns='Potability')

# Hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Oversample the training portion only, so the test rows stay untouched
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Graph 17: Class Distribution After SMOTE (Previously Graph 7 in the snippet)
print("\n### Graph 17: Class Distribution After SMOTE ###")
print("This bar plot shows the class distribution after applying SMOTE. The classes are now balanced, ensuring that the model won’t be biased toward the majority class (non-potable). Equal counts of potable (1) and non-potable (0) samples confirm SMOTE’s effectiveness in addressing class imbalance.")

# The count plot is drawn on an explicit Axes; no legend argument is passed.
smote_fig, smote_ax = plt.subplots(figsize=(8, 6))
resampled_labels = pd.Series(y_train_smote)
sns.countplot(x=resampled_labels, palette='Set2', ax=smote_ax)
smote_ax.set_title('Class Distribution After SMOTE\n(0 = Non-Potable, 1 = Potable)', fontsize=14, pad=15)
smote_ax.set_xlabel('Potability', fontsize=12)
smote_ax.set_ylabel('Count', fontsize=12)
smote_ax.grid(True, axis='y', linestyle='--', alpha=0.7)
# Export is best-effort: report a failure but still display the figure.
try:
    plt.savefig(os.path.join(PLOT_DIR, 'class_distribution_smote.png'))
    print("Graph 17 saved successfully.")
except Exception as e:
    print(f"Error saving Graph 17: {e}")
plt.show()

# Scale features
# The scaler is fitted on the SMOTE-resampled training data and then applied,
# unchanged, to the held-out test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_smote)
X_test_scaled = scaler.transform(X_test)

# Check for NaNs in scaled data
print("NaN in X_train_scaled:", np.any(np.isnan(X_train_scaled)))
print("NaN in X_test_scaled:", np.any(np.isnan(X_test_scaled)))

# Graph 18: Scaled Feature Distribution (Previously Graph 8 in the snippet)
print("\n### Graph 18: Scaled Feature Distribution ###")
print("These histograms compare the scaled distributions of 'ph' and 'Chloramines*Sulfate' for the training and test sets. Scaling ensures features are on the same scale, which is crucial for models like neural networks and gradient boosting. Similar distributions between train and test sets indicate proper scaling and data consistency.")

# The scaled arrays have lost their column names, so each plotted feature is
# located by name in the training frame instead of by a hard-coded position
# (which would silently plot the wrong feature if the column order changed).
scaled_feature_names = list(X_train.columns)
scaled_panels = [
    ('ph', 'ph'),
    ('chloramines_sulfate', 'Chloramines*Sulfate'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, (column_name, display_name) in zip(axes, scaled_panels):
    column_idx = scaled_feature_names.index(column_name)
    sns.histplot(X_train_scaled[:, column_idx], kde=True, color='blue',
                 label=f'Scaled {display_name} (Train)', ax=ax)
    sns.histplot(X_test_scaled[:, column_idx], kde=True, color='orange',
                 label=f'Scaled {display_name} (Test)', ax=ax)
    ax.set_title(f'Scaled {display_name} Distribution', fontsize=14)
    ax.set_xlabel(f'Scaled {display_name}', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.legend()
    ax.grid(True, linestyle='--', alpha=0.7)
# Saving is best-effort; the figure is shown even if the export fails.
try:
    plt.savefig(os.path.join(PLOT_DIR, 'scaled_features_distribution.png'))
    print("Graph 18 saved successfully.")
except Exception as e:
    print(f"Error saving Graph 18: {e}")
plt.show()

# Define model directory
# Relative path, matching the 'models/...' locations that the training cells
# write to and that the inference cell reads 'models/scaler.pkl' from.
MODEL_DIR = 'models'
os.makedirs(MODEL_DIR, exist_ok=True)

# Save scaler
# Only genuine I/O or serialization failures are reported and tolerated here;
# programming errors (e.g. an undefined name) are allowed to surface instead of
# being swallowed, because a missing scaler file breaks the inference cell.
try:
    with open(os.path.join(MODEL_DIR, 'scaler.pkl'), 'wb') as f:
        pickle.dump(scaler, f)
    print("Scaler saved successfully.")
except (OSError, pickle.PicklingError) as e:
    print(f"Error saving scaler: {e}")


NameError: name 'df' is not defined

In [3]:
# Define XGBoost
# The estimator gets its own name: binding it to `xgb` would overwrite the
# `import xgboost as xgb` module alias for the rest of the session.
xgb_estimator = XGBClassifier(random_state=42)
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}
# 5-fold grid search over all 8 combinations, selecting on F1
grid_search_xgb = GridSearchCV(xgb_estimator, param_grid, cv=5, scoring='f1', n_jobs=-1)
grid_search_xgb.fit(X_train_scaled, y_train_smote)

# Best model (refitted on the full training data by GridSearchCV)
xgb_best = grid_search_xgb.best_estimator_
print("Best XGBoost Parameters:", grid_search_xgb.best_params_)

# Graph 9: XGBoost CV Scores
# One point per parameter combination, in the order GridSearchCV evaluated them
cv_results_xgb = pd.DataFrame(grid_search_xgb.cv_results_)
plt.figure(figsize=(10, 6))
plt.plot(cv_results_xgb['mean_test_score'], marker='o')
plt.title('XGBoost Cross-Validation F1-Scores During Grid Search')
plt.xlabel('Parameter Combination Index')
plt.ylabel('Mean F1-Score')
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/xgb_cv_scores.png')
plt.show()

# Save model
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
with open('models/xgb_model.pkl', 'wb') as f:
    pickle.dump(xgb_best, f)

NameError: name 'XGBClassifier' is not defined

In [4]:
# Define LightGBM
lgbm = LGBMClassifier(random_state=42)
param_grid_lgbm = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.01, 0.1]
}
# 5-fold grid search over all 8 combinations, selecting on F1
grid_search_lgbm = GridSearchCV(lgbm, param_grid_lgbm, cv=5, scoring='f1', n_jobs=-1)
grid_search_lgbm.fit(X_train_scaled, y_train_smote)

# Best model (refitted on the full training data by GridSearchCV)
lgbm_best = grid_search_lgbm.best_estimator_
print("Best LightGBM Parameters:", grid_search_lgbm.best_params_)

# Graph 10: LightGBM CV Scores
# One point per parameter combination, in the order GridSearchCV evaluated them
cv_results_lgbm = pd.DataFrame(grid_search_lgbm.cv_results_)
plt.figure(figsize=(10, 6))
plt.plot(cv_results_lgbm['mean_test_score'], marker='o')
plt.title('LightGBM Cross-Validation F1-Scores During Grid Search')
plt.xlabel('Parameter Combination Index')
plt.ylabel('Mean F1-Score')
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/lgbm_cv_scores.png')
plt.show()

# Save model
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
with open('models/lgbm_model.pkl', 'wb') as f:
    pickle.dump(lgbm_best, f)

NameError: name 'LGBMClassifier' is not defined

In [5]:
# Define CatBoost
# verbose=0 silences CatBoost's per-iteration training log.
catboost = CatBoostClassifier(random_state=42, verbose=0)
param_grid_catboost = {
    'iterations': [100, 200],
    'depth': [4, 6],
    'learning_rate': [0.01, 0.1]
}
# 5-fold grid search over all 8 combinations, selecting on F1
grid_search_catboost = GridSearchCV(catboost, param_grid_catboost, cv=5, scoring='f1', n_jobs=-1)
grid_search_catboost.fit(X_train_scaled, y_train_smote)

# Best model (refitted on the full training data by GridSearchCV)
catboost_best = grid_search_catboost.best_estimator_
print("Best CatBoost Parameters:", grid_search_catboost.best_params_)

# Graph 11: CatBoost CV Scores
# One point per parameter combination, in the order GridSearchCV evaluated them
cv_results_catboost = pd.DataFrame(grid_search_catboost.cv_results_)
plt.figure(figsize=(10, 6))
plt.plot(cv_results_catboost['mean_test_score'], marker='o')
plt.title('CatBoost Cross-Validation F1-Scores During Grid Search')
plt.xlabel('Parameter Combination Index')
plt.ylabel('Mean F1-Score')
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/catboost_cv_scores.png')
plt.show()

# Save model
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
with open('models/catboost_model.pkl', 'wb') as f:
    pickle.dump(catboost_best, f)

NameError: name 'CatBoostClassifier' is not defined

In [6]:
# Seed TensorFlow so weight initialisation, dropout and shuffling are repeatable
tf.random.set_seed(42)

# Define neural network
nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile model
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Carve out the validation set explicitly.
# Keras' `validation_split` takes the *last* fraction of the arrays before any
# shuffling, and SMOTE appends its synthetic minority samples after the
# original rows, so `validation_split=0.2` would validate almost exclusively on
# synthetic class-1 samples. A shuffled, stratified split keeps both classes
# (and both real and synthetic rows) in the training and validation parts.
X_nn_train, X_nn_val, y_nn_train, y_nn_val = train_test_split(
    X_train_scaled,
    np.asarray(y_train_smote),
    test_size=0.2,
    stratify=np.asarray(y_train_smote),
    random_state=42,
)

# Train model (reduced epochs for faster execution)
history = nn_model.fit(X_nn_train, y_nn_train, epochs=20, batch_size=32,
                       validation_data=(X_nn_val, y_nn_val), verbose=1)

os.makedirs('plots', exist_ok=True)

# Graph 12: Neural Network Loss Curves
plt.figure(figsize=(10, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Neural Network Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.savefig('plots/nn_loss_curves.png')
plt.show()

# Graph 13: Neural Network Accuracy Curves
plt.figure(figsize=(10, 6))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Neural Network Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.savefig('plots/nn_accuracy_curves.png')
plt.show()

# Save model (the target directory must exist before Keras writes the file)
os.makedirs('models', exist_ok=True)
nn_model.save('models/nn_model.h5')

NameError: name 'Sequential' is not defined

In [None]:
# Define base models
# Four tree ensembles, each with 100 trees/iterations and a fixed seed.
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
    ('lgbm', LGBMClassifier(n_estimators=100, random_state=42)),
    ('catboost', CatBoostClassifier(iterations=100, random_state=42, verbose=0))
]

# Define meta-learner
meta_learner = LogisticRegression()

# Define stacking classifier
# cv=5: the meta-learner is trained on 5-fold cross-validated predictions of
# the base models.
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_learner, cv=5)

# Train stacking model
stacking_clf.fit(X_train_scaled, y_train_smote)

# Graph 14: Feature Importance from Random Forest
# named_estimators_ holds the fitted base models, keyed by the names above.
rf_model = stacking_clf.named_estimators_['rf']
rf_importance = pd.Series(rf_model.feature_importances_, index=X.columns)
plt.figure(figsize=(10, 6))
rf_importance.sort_values().plot(kind='barh')
plt.title('Feature Importance from Random Forest (Stacking Base Model)')
plt.xlabel('Importance')
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/rf_feature_importance_stacking.png')
plt.show()

# Save model
# open(..., 'wb') does not create missing directories, so make sure it exists.
os.makedirs('models', exist_ok=True)
with open('models/stacking_model.pkl', 'wb') as f:
    pickle.dump(stacking_clf, f)

In [None]:
# Define TPOT classifier (reduced parameters for faster execution)
# NOTE(review): `TPOTClassifier` is not imported anywhere in the visible
# notebook; this cell needs `from tpot import TPOTClassifier` in the import cell.
tpot = TPOTClassifier(generations=3, population_size=10, cv=5, scoring='f1', random_state=42, verbosity=2, n_jobs=-1)

# Train TPOT
tpot.fit(X_train_scaled, y_train_smote)

# Graph 15: TPOT Best Pipeline Score
# The figure is labelled as an F1-score, so compute F1 explicitly: calling
# `.score()` on the fitted scikit-learn pipeline would report mean accuracy.
tpot_test_pred = tpot.fitted_pipeline_.predict(X_test_scaled)
best_pipeline_score = f1_score(y_test, tpot_test_pred)
plt.figure(figsize=(8, 6))
plt.bar(['Best TPOT Pipeline'], [best_pipeline_score], color='green')
plt.title('Best TPOT Pipeline Test Score (F1)')
plt.ylabel('F1-Score')
os.makedirs('plots', exist_ok=True)
plt.savefig('plots/tpot_best_score.png')
plt.show()

# Save model
# Both the generated pipeline source and the fitted pipeline object are
# stored; the directory must exist before either file is written.
os.makedirs('models', exist_ok=True)
tpot.export('models/tpot_pipeline.py')
with open('models/tpot_model.pkl', 'wb') as f:
    pickle.dump(tpot.fitted_pipeline_, f)

In [None]:
# Function to evaluate models
def evaluate_model(model, X_test, y_test, model_name):
    """Score a fitted binary classifier on held-out data and print a report.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. ``predict_proba`` or ``decision_function``
        is used for ROC-AUC when available.
    X_test : array-like
        Feature matrix passed to the model as-is.
    y_test : array-like
        True binary labels.
    model_name : str
        Label used in the printed report header.

    Returns
    -------
    tuple
        ``(y_pred, y_pred_proba, metrics)`` where ``y_pred`` are the hard
        predictions, ``y_pred_proba`` are the scores used for ROC-AUC, and
        ``metrics`` is a dict with keys 'accuracy', 'precision', 'recall',
        'f1' and 'roc_auc'.
    """
    y_pred = model.predict(X_test)

    # Pick the best available ranking score for ROC-AUC:
    # positive-class probability, then decision margin, and only as a last
    # resort the hard labels already computed above (no second predict call).
    if hasattr(model, 'predict_proba'):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_pred_proba = model.decision_function(X_test)
    else:
        y_pred_proba = y_pred

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    print(f"\n{model_name} Performance:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"ROC-AUC: {roc_auc:.4f}")

    return y_pred, y_pred_proba, {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'roc_auc': roc_auc}

# Evaluate models
xgb_pred, xgb_proba, xgb_metrics = evaluate_model(xgb_best, X_test_scaled, y_test, "XGBoost")
lgbm_pred, lgbm_proba, lgbm_metrics = evaluate_model(lgbm_best, X_test_scaled, y_test, "LightGBM")
catboost_pred, catboost_proba, catboost_metrics = evaluate_model(catboost_best, X_test_scaled, y_test, "CatBoost")

# The Keras model is handled separately because it has no predict_proba:
# its predict() returns the sigmoid output as an (n, 1) array. Run inference
# once and flatten to 1-D so the sklearn metrics receive the shape they expect.
nn_proba = nn_model.predict(X_test_scaled).ravel()
nn_pred = (nn_proba > 0.5).astype(int)  # 0.5 decision threshold
nn_metrics = {
    'accuracy': accuracy_score(y_test, nn_pred),
    'precision': precision_score(y_test, nn_pred),
    'recall': recall_score(y_test, nn_pred),
    'f1': f1_score(y_test, nn_pred),
    'roc_auc': roc_auc_score(y_test, nn_proba)
}
print("\nNeural Network Performance:")
for metric, value in nn_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")
stacking_pred, stacking_proba, stacking_metrics = evaluate_model(stacking_clf, X_test_scaled, y_test, "Stacking")
tpot_pred, tpot_proba, tpot_metrics = evaluate_model(tpot.fitted_pipeline_, X_test_scaled, y_test, "TPOT")

# Graph 16: Confusion Matrices
# One heatmap per model, laid out on a 2x3 grid filled row by row.
labelled_predictions = [
    (xgb_pred, 'XGBoost'),
    (lgbm_pred, 'LightGBM'),
    (catboost_pred, 'CatBoost'),
    (nn_pred, 'Neural Network'),
    (stacking_pred, 'Stacking'),
    (tpot_pred, 'TPOT'),
]
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for panel, (model_pred, model_label) in zip(axes.ravel(), labelled_predictions):
    matrix = confusion_matrix(y_test, model_pred)
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=panel)
    panel.set_title(f'{model_label} Confusion Matrix')
    panel.set_xlabel('Predicted')
    panel.set_ylabel('True')
plt.tight_layout()
plt.savefig('plots/confusion_matrices.png')
plt.show()

# Graph 17: ROC Curves
# All models share one axes; the dashed diagonal is the chance-level reference.
labelled_scores = [
    (xgb_proba, 'XGBoost'),
    (lgbm_proba, 'LightGBM'),
    (catboost_proba, 'CatBoost'),
    (nn_proba, 'Neural Network'),
    (stacking_proba, 'Stacking'),
    (tpot_proba, 'TPOT'),
]
plt.figure(figsize=(10, 6))
for model_scores, model_label in labelled_scores:
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, model_scores)
    auc_value = roc_auc_score(y_test, model_scores)
    plt.plot(false_pos_rate, true_pos_rate, label=f'{model_label} (AUC = {auc_value:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for All Models')
plt.legend()
plt.savefig('plots/roc_curves.png')
plt.show()

# Graph 18: Bar Plot of Model Performance
# Collect each model's metric dict once, then pivot into one column per metric.
metrics_by_model = [
    ('XGBoost', xgb_metrics),
    ('LightGBM', lgbm_metrics),
    ('CatBoost', catboost_metrics),
    ('Neural Network', nn_metrics),
    ('Stacking', stacking_metrics),
    ('TPOT', tpot_metrics),
]
metrics_df = pd.DataFrame({
    'Model': [label for label, _ in metrics_by_model],
    'Accuracy': [scores['accuracy'] for _, scores in metrics_by_model],
    'F1-Score': [scores['f1'] for _, scores in metrics_by_model],
    'ROC-AUC': [scores['roc_auc'] for _, scores in metrics_by_model],
})
plotted_metrics = ['Accuracy', 'F1-Score', 'ROC-AUC']
fig = px.bar(
    metrics_df,
    x='Model',
    y=plotted_metrics,
    barmode='group',
    title='Model Performance Comparison',
)
fig.update_layout(xaxis_title='Model', yaxis_title='Score')
fig.write_html('plots/model_performance_bar.html')
fig.show()

# Compare KNN vs. GAN imputation (KNN only, since GAN is skipped)
# The "GAN" entry is just an alias of the KNN-based XGBoost metrics, so both
# bar groups in the chart below are identical by construction.
xgb_gan_metrics = xgb_metrics  # Fallback

# Graph 19: Imputation Comparison
imputation_variants = [('KNN', xgb_metrics), ('KNN (Fallback)', xgb_gan_metrics)]
imputation_metrics = pd.DataFrame({
    'Imputation': [label for label, _ in imputation_variants],
    'Accuracy': [scores['accuracy'] for _, scores in imputation_variants],
    'F1-Score': [scores['f1'] for _, scores in imputation_variants],
    'ROC-AUC': [scores['roc_auc'] for _, scores in imputation_variants],
})
fig = px.bar(
    imputation_metrics,
    x='Imputation',
    y=plotted_metrics,
    barmode='group',
    title='KNN vs. KNN (Fallback) Imputation (XGBoost)',
)
fig.update_layout(xaxis_title='Imputation Method', yaxis_title='Score')
fig.write_html('plots/imputation_comparison.html')
fig.show()

# Save predictions
# Columns are laid out as: sample id, true label, every model's hard
# prediction, then every model's score. The neural-network arrays are
# flattened so each column is one-dimensional.
hard_prediction_columns = {
    'XGBoost_Pred': xgb_pred,
    'LightGBM_Pred': lgbm_pred,
    'CatBoost_Pred': catboost_pred,
    'NN_Pred': nn_pred.flatten(),
    'Stacking_Pred': stacking_pred,
    'TPOT_Pred': tpot_pred,
}
score_columns = {
    'XGBoost_Proba': xgb_proba,
    'LightGBM_Proba': lgbm_proba,
    'CatBoost_Proba': catboost_proba,
    'NN_Proba': nn_proba.flatten(),
    'Stacking_Proba': stacking_proba,
    'TPOT_Proba': tpot_proba,
}
predictions = pd.DataFrame({
    'Sample_ID': range(len(y_test)),
    'True_Potability': y_test,
    **hard_prediction_columns,
    **score_columns,
})
predictions.to_csv('predictions.csv', index=False)

In [None]:
# Models already saved
# This is a static checklist of the artifact names written by earlier cells;
# it does not verify that the files exist.
saved_artifacts = [
    'xgb_model.pkl',
    'lgbm_model.pkl',
    'catboost_model.pkl',
    'nn_model.h5',
    'stacking_model.pkl',
    'tpot_model.pkl',
    'scaler.pkl',
]
print("Models saved in 'models/' directory:")
for artifact_name in saved_artifacts:
    print(f"- {artifact_name}")
print("Predictions saved in 'predictions.csv'")
print("Visualizations saved in 'plots/' directory")

# Graph 20: Feature Importance Across Models
# One horizontal bar chart per tuned boosting model, features sorted ascending.
tuned_boosters = {
    'XGBoost': xgb_best,
    'LightGBM': lgbm_best,
    'CatBoost': catboost_best,
}
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for panel, (booster_label, booster) in zip(axes, tuned_boosters.items()):
    ranked_importance = pd.Series(booster.feature_importances_, index=X.columns).sort_values()
    ranked_importance.plot(kind='barh', ax=panel)
    panel.set_title(f'{booster_label} Feature Importance')
    panel.set_xlabel('Importance')
plt.tight_layout()
plt.savefig('plots/feature_importance_summary.png')
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline as ImbPipeline

# Perform 5-fold cross-validation
# Resampling and scaling are done *inside* each fold via an imbalanced-learn
# pipeline, starting from the un-resampled training split. Cross-validating on
# the already SMOTE-resampled, globally scaled matrix lets synthetic samples
# built from a validation-fold row end up in the training folds, which makes
# the F1 estimates optimistic. In the pipeline, SMOTE is applied only while
# fitting, so each validation fold contains real samples only.
# NOTE(review): KNN imputation still happens before this point on all rows.
models = {
    'XGBoost': xgb_best,
    'LightGBM': lgbm_best,
    'CatBoost': catboost_best,
    'Stacking': stacking_clf
}
cv_scores = {}
for name, model in models.items():
    # cross_val_score clones the pipeline (and the model inside it) for every
    # fold, so the fitted estimators referenced in `models` are not modified.
    cv_pipeline = ImbPipeline([
        ('smote', SMOTE(random_state=42)),
        ('scaler', StandardScaler()),
        ('model', model),
    ])
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='f1', n_jobs=-1)
    cv_scores[name] = scores
    print(f"{name} CV F1-Scores: {scores.mean():.4f} (± {scores.std():.4f})")

# Visualize CV Scores
# One box per model, built from that model's five fold scores.
cv_df = pd.DataFrame(cv_scores)
plt.figure(figsize=(10, 6))
sns.boxplot(data=cv_df, palette='Set2')
plt.title('5-Fold Cross-Validation F1-Scores for All Models', fontsize=14, pad=15)
plt.xlabel('Model', fontsize=12)
plt.ylabel('F1-Score', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)
plt.savefig('plots/cv_scores_all_models.png')
plt.show()

In [None]:
import shap

# Compute SHAP values for the stacking model
# KernelExplainer evaluates the model on (background rows x perturbations) for
# every explained row, so using the whole test set both as background and as
# the rows to explain is prohibitively slow. A small background sample and a
# bounded number of explained rows keep the cell runnable.
SHAP_BACKGROUND_SIZE = 100   # rows summarising the training distribution
SHAP_EXPLAIN_SIZE = 200      # test rows that actually get explained

shap_background = shap.sample(X_train_scaled, SHAP_BACKGROUND_SIZE, random_state=42)
X_shap = X_test_scaled[:SHAP_EXPLAIN_SIZE]
shap_feature_names = list(X.columns)

explainer = shap.KernelExplainer(stacking_clf.predict_proba, shap_background)
shap_values = explainer.shap_values(X_shap)

# Select the attributions for the positive class (Potability == 1).
# Depending on the SHAP version, a multi-output explanation is either a list
# with one (rows, features) array per class, or a single
# (rows, features, classes) array; plain `shap_values[1]` is only correct for
# the list form.
if isinstance(shap_values, list):
    shap_values_potable = shap_values[1]
else:
    shap_values_potable = np.asarray(shap_values)[..., 1]

# Summary Plot
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_potable, X_shap, feature_names=shap_feature_names, plot_type="bar", show=False)
plt.title('SHAP Feature Importance for Stacking Model', fontsize=14, pad=15)
plt.savefig('plots/shap_importance_stacking.png')
plt.show()

# Detailed SHAP Summary Plot
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_potable, X_shap, feature_names=shap_feature_names, show=False)
plt.title('SHAP Values Distribution for Stacking Model', fontsize=14, pad=15)
plt.savefig('plots/shap_values_stacking.png')
plt.show()

In [None]:
from dash import Dash, dcc, html, Input, Output
import plotly.express as px

# Initialize the Dash app
app = Dash(__name__)

# Features offered in the dropdown; label and value are the column name itself.
DASHBOARD_FEATURES = ['ph', 'Chloramines', 'ph_hardness', 'chloramines_sulfate']

# Layout
app.layout = html.Div([
    html.H1("Water Potability Prediction Dashboard"),
    html.H3("Explore Data and Model Predictions"),

    # Dropdown to select feature for distribution
    html.Label("Select Feature for Distribution:"),
    dcc.Dropdown(
        id='feature-dropdown',
        options=[{'label': feature, 'value': feature} for feature in DASHBOARD_FEATURES],
        value='ph'
    ),
    dcc.Graph(id='feature-distribution'),

    # Model Performance (static figure built once from metrics_df)
    dcc.Graph(id='model-performance', figure=px.bar(metrics_df, x='Model', y=['Accuracy', 'F1-Score', 'ROC-AUC'],
                                                    barmode='group', title='Model Performance Comparison'))
])

# Callback to update feature distribution
@app.callback(
    Output('feature-distribution', 'figure'),
    Input('feature-dropdown', 'value')
)
def update_distribution(selected_feature):
    """Return a Potability-coloured histogram of ``selected_feature``.

    Called by Dash whenever the dropdown value changes; reads the notebook's
    ``df`` frame and returns the Plotly figure shown in 'feature-distribution'.
    """
    fig = px.histogram(df, x=selected_feature, color='Potability', nbins=30, opacity=0.7,
                       title=f'{selected_feature} Distribution by Potability')
    fig.update_layout(
        xaxis_title=selected_feature,
        yaxis_title='Count',
        title_font_size=14,
        xaxis_title_font_size=12,
        yaxis_title_font_size=12,
        legend_title='Potability',
        xaxis=dict(showgrid=True),
        yaxis=dict(showgrid=True),
        plot_bgcolor='white'
    )
    return fig

# Run the app
# Prefer `app.run`: `app.run_server` is the legacy entry point and is no longer
# available in current Dash releases. The fallback keeps older installations,
# which only provide `run_server`, working.
if __name__ == '__main__':
    start_dashboard = getattr(app, 'run', None) or app.run_server
    start_dashboard(debug=True, port=8050)

In [None]:
from scipy.stats import ttest_ind

# Perform t-tests for each feature
# For every feature column, compare the potable rows against the non-potable
# rows of the imputed frame and keep the resulting p-value.
potable_mask = df_knn['Potability'] == 1
non_potable_mask = df_knn['Potability'] == 0

p_values = {}
for feature_name in X.columns:
    potable_values = df_knn.loc[potable_mask, feature_name]
    non_potable_values = df_knn.loc[non_potable_mask, feature_name]
    _, feature_p_value = ttest_ind(potable_values, non_potable_values, nan_policy='omit')
    p_values[feature_name] = feature_p_value

# Visualize p-values
# Features are ordered from smallest to largest p-value; the dashed red line
# marks the 0.05 threshold.
p_values_df = (
    pd.DataFrame.from_dict(p_values, orient='index', columns=['p-value'])
    .sort_values('p-value')
)
plt.figure(figsize=(10, 6))
sns.barplot(x='p-value', y=p_values_df.index, data=p_values_df, palette='coolwarm')
plt.axvline(0.05, color='red', linestyle='--', label='Significance Threshold (0.05)')
plt.title('p-Values of Features (Potable vs Non-Potable)', fontsize=14, pad=15)
plt.xlabel('p-Value', fontsize=12)
plt.ylabel('Feature', fontsize=12)
plt.legend()
plt.grid(True, axis='x', linestyle='--', alpha=0.7)
plt.savefig('plots/p_values_features.png')
plt.show()

In [None]:
# Load saved models and scaler
# SECURITY: pickle.load executes code embedded in the file. Only load model
# files that this notebook (or another trusted source) produced.
with open('models/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
with open('models/stacking_model.pkl', 'rb') as f:
    stacking_model = pickle.load(f)

# Sample new data (replace with actual new data if available)
# Only the measured quantities are entered by hand.
new_data = pd.DataFrame({
    'ph': [7.0],
    'Hardness': [200.0],
    'Solids': [10000.0],
    'Chloramines': [7.0],
    'Sulfate': [300.0],
    'Conductivity': [400.0],
    'Organic_carbon': [10.0],
    'Trihalomethanes': [50.0],
    'Turbidity': [3.0]
})

# The interaction features are derived from the columns above, exactly as in
# the feature-engineering step, so they cannot drift out of sync when the
# sample values are edited.
new_data['ph_hardness'] = new_data['ph'] * new_data['Hardness']
new_data['chloramines_sulfate'] = new_data['Chloramines'] * new_data['Sulfate']

# Preprocess new data
# If the scaler recorded the feature names it was fitted on, put the columns in
# that exact order before transforming.
if hasattr(scaler, 'feature_names_in_'):
    new_data = new_data[list(scaler.feature_names_in_)]
new_data_scaled = scaler.transform(new_data)

# Predict with stacking model
prediction = stacking_model.predict(new_data_scaled)
probability = stacking_model.predict_proba(new_data_scaled)[:, 1]  # P(class 1)

# Display results
print("New Sample Prediction:")
print(f"Predicted Potability: {'Potable' if prediction[0] == 1 else 'Non-Potable'}")
print(f"Probability of Potability: {probability[0]:.4f}")