In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_path = os.path.join(parent_dir, 'data')
out_path = os.path.join(parent_dir, 'outputs')     # For the output

# Read the files
train = pd.read_csv(os.path.join(data_path, 'feature_engineered_data_final_v3.csv'))
labels = pd.read_csv(os.path.join(data_path, 'train_labels.csv'))

# Drop status_group if it already exists in train
if 'status_group' in train.columns:
    train = train.drop(columns=['status_group'])
# Merge training labels
df = pd.merge(train, labels, on='id')
print('Files successfully loaded and merged!')


Files successfully loaded and merged!


In [3]:
# Drop status_group if it already exists in train
if 'status_group' in train.columns:
    train = train.drop(columns=['status_group'])
df = train.merge(labels, on='id')
df = df.drop(columns=['region','lga', 'extraction_type', 'management']) #for now
df['water_quality_numeric'] = df['water_quality_numeric'].fillna(0)
df['water_risk_score_log'] = df['water_risk_score_log'].fillna(0)


In [4]:
#all categorical features (excluding target)
categorical_cols = df.select_dtypes(include='object').columns.tolist()
categorical_cols = [col for col in categorical_cols if col != 'status_group']

# One-hot encode medium-cardinality features (3 to 10 unique values)
onehot_cols = [col for col in categorical_cols if 2 < df[col].nunique() <= 10]
df_encoded = pd.get_dummies(df, columns=onehot_cols, drop_first=True)

# Label encode low-cardinality features (≤ 2 unique values)
label_enc_cols = [col for col in categorical_cols if df[col].nunique() <= 2]

label_encoders = {}
for col in label_enc_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df[col].astype(str)) 
    label_encoders[col] = le

In [5]:
df.to_csv(os.path.join(data_path, 'final_data.csv'), index=False)
print("Updated dataset saved as final_data.csv")

Updated dataset saved as final_data.csv


In [6]:
# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(df['status_group']) 

leak_cols = [col for col in df_encoded.columns if 'status_group_' in col or 'status_group.' in col]
df_encoded = df_encoded.drop(columns=leak_cols)

# Define features and target
X = df_encoded.drop(columns=['status_group','id'])
y = y_encoded

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [8]:
# pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])

# Parameter grid
param_grid = {
    'logreg__C': [0.01, 0.1],
    'logreg__penalty': ['l2'],
    'logreg__solver': ['lbfgs', 'saga']
}

# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)  # this is essential

# Check best_estimator_
print(" Type:", type(grid_search))
print(" Best Estimator:", grid_search.best_estimator_)

# Save to file
joblib.dump(grid_search.best_estimator_, 'best_logistic_model.pkl')


 Type: <class 'sklearn.model_selection._search.GridSearchCV'>
 Best Estimator: Pipeline(steps=[('scaler', StandardScaler()),
                ('logreg',
                 LogisticRegression(C=0.01, max_iter=1000, solver='saga'))])


['best_logistic_model.pkl']

In [9]:
# Evaluate on the test set

y_test_pred = grid_search.best_estimator_.predict(X_test)

print(" Classification Report on Test Set:")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))

print(" Confusion Matrix:")
print(confusion_matrix(y_test, y_test_pred))

 Classification Report on Test Set:
                         precision    recall  f1-score   support

             functional       0.71      0.91      0.79      6278
functional needs repair       0.62      0.03      0.06       786
         non functional       0.78      0.60      0.68      4454

               accuracy                           0.73     11518
              macro avg       0.71      0.51      0.51     11518
           weighted avg       0.73      0.73      0.70     11518

 Confusion Matrix:
[[5686    5  587]
 [ 610   25  151]
 [1750   10 2694]]


In [10]:
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn import tree

mnnb = GaussianNB()
result_mnnb = mnnb.fit(X_train, y_train)
guess_mnnb = cross_val_score(mnnb, X, y, cv=2)

knn = neighbors.KNeighborsClassifier(n_neighbors=5)
result_knn = knn.fit(X_train, y_train)
guess_knn = cross_val_score(knn, X, y, cv=2)

tree = tree.DecisionTreeClassifier()
result_tree = tree.fit(X_train, y_train)
guess_tree = cross_val_score(tree, X, y, cv=2)

svc = svm.LinearSVC(loss='hinge',tol=0.001, max_iter=500)
clf = svm.SVC(decision_function_shape='ovr')
result_svm = svc.fit(X_train, y_train)
guess_svm = cross_val_score(clf, X, y, cv=2)




In [11]:
varlist = [guess_svm, guess_mnnb, guess_knn, guess_tree]
for i in varlist:
    print("%0.2f accuracy with a standard deviation of %0.2f" % (i.mean(), i.std()))

0.56 accuracy with a standard deviation of 0.01
0.53 accuracy with a standard deviation of 0.02
0.69 accuracy with a standard deviation of 0.00
0.74 accuracy with a standard deviation of 0.00


In [12]:
# predict
pred_svm = svc.predict(X_test)
pred_mnnb = mnnb.predict(X_test)
pred_knn = knn.predict(X_test)
pred_tree = tree.predict(X_test)
print(" Confusion Matrix:")
print(confusion_matrix(y_test, pred_svm))
print'(~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(confusion_matrix(y_test, pred_mnnb))
print'(~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(confusion_matrix(y_test, pred_knn))
print'(~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')
print(confusion_matrix(y_test, pred_tree))

SyntaxError: unmatched ')' (1566024704.py, line 8)