<a href="https://colab.research.google.com/github/jc890/python/blob/master/Assignment04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Diabetes Classification Case Study: EDA → Preprocessing → Models → Ensembles → Evaluation

# 0) Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier

# 1) Load data
# Replace DATA_PATH with the actual location of your dataset (e.g., '/content/diabetes (1).csv')
DATA_PATH = '/content/diabetes (1).csv'
df = pd.read_csv(DATA_PATH)

# 2) Basic EDA
display(df.head())
# DataFrame.info() prints its summary itself and returns None, so it is called
# bare: wrapping it in display() renders a stray "None" under the summary.
df.info()
display(df.describe())

# Outcome distribution: absolute counts first, then proportions
print("Class balance (Outcome):")
print(df['Outcome'].value_counts(dropna=False))
print(df['Outcome'].value_counts(normalize=True))








Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


None

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Class balance (Outcome):
Outcome
0    500
1    268
Name: count, dtype: int64
Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [3]:
# 3) A reading of zero is not physiologically possible for these fields,
#    so recode it as missing (NaN) before any imputation happens.
zero_invalid_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']  # Pregnancies is valid at 0
measurements = df[zero_invalid_cols]
df[zero_invalid_cols] = measurements.mask(measurements == 0)

# Missingness summary: NaN count for every column
missing_counts = df.isna().sum()
print("\nMissing values per column after zero→NaN conversion:")
print(missing_counts)



Missing values per column after zero→NaN conversion:
Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64


In [4]:
# 4) Split data
# 'Outcome' is the label; every other column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 25% of the rows for testing, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.25,
    random_state=42,
)


In [6]:
# 5) Build pipelines
# Two imputation strategies exist: median (robust) and KNN (optional).
# Median is used for all models to keep comparisons fair.
# Note: make_pipeline below constructs its own transformer instances rather than reusing these two.
imputer, scaler = SimpleImputer(strategy='median'), StandardScaler()

def make_pipeline(estimator, scale=True, impute='median', class_weight=None):
    """Wrap ``estimator`` in a Pipeline with optional imputation and scaling.

    Parameters
    ----------
    estimator : estimator object
        Final step of the pipeline, registered under the step name ``'clf'``.
    scale : bool, default True
        When truthy, a ``StandardScaler`` step named ``'scaler'`` is inserted
        before the estimator.
    impute : {'median', 'knn', None}, default 'median'
        ``'median'`` adds ``SimpleImputer(strategy='median')``; ``'knn'`` adds
        ``KNNImputer(n_neighbors=5, weights='distance')``; ``None`` (or
        ``False``) adds no imputation step.
    class_weight : optional
        When not None and the estimator exposes a ``class_weight`` attribute,
        it is applied with ``estimator.set_params``. This mutates the estimator
        object that was passed in.

    Returns
    -------
    Pipeline
        Steps in order: optional ``'imputer'``, optional ``'scaler'``, ``'clf'``.

    Raises
    ------
    ValueError
        If ``impute`` is not one of the recognised values.
    """
    # Validate up front: an unrecognised (e.g. misspelled) strategy used to fall
    # through silently and produce a pipeline with no imputer at all.
    if impute not in ('median', 'knn', None, False):
        raise ValueError(
            f"Unknown impute strategy {impute!r}; expected 'median', 'knn' or None."
        )

    steps = []
    if impute == 'median':
        steps.append(('imputer', SimpleImputer(strategy='median')))
    elif impute == 'knn':
        steps.append(('imputer', KNNImputer(n_neighbors=5, weights='distance')))
    if scale:
        steps.append(('scaler', StandardScaler()))
    # Set class_weight where available; estimators without the attribute are left untouched
    if class_weight is not None and hasattr(estimator, 'class_weight'):
        estimator.set_params(class_weight=class_weight)
    steps.append(('clf', estimator))
    return Pipeline(steps)

# Candidate models, keyed by display name. Scale-sensitive models (linear, kNN, SVM)
# get a StandardScaler step; tree-based models do not. Every estimator that accepts
# a random_state is seeded with 42 so that re-running the notebook reproduces the
# same scores.
models = {
    # Baselines
    'LogisticRegression': make_pipeline(
        LogisticRegression(max_iter=200, solver='liblinear', class_weight='balanced', random_state=42),
        scale=True, impute='median'
    ),
    'kNN': make_pipeline(
        KNeighborsClassifier(n_neighbors=7, weights='distance'), scale=True, impute='median'
    ),
    'SVM(RBF)': make_pipeline(
        # probability=True is needed for predict_proba (used for ROC AUC downstream)
        SVC(kernel='rbf', C=1.0, gamma='scale', probability=True, class_weight='balanced', random_state=42),
        scale=True, impute='median'
    ),
    'DecisionTree': make_pipeline(
        DecisionTreeClassifier(max_depth=None, min_samples_leaf=3, class_weight='balanced', random_state=42),
        scale=False, impute='median'
    ),

    # Ensembles
    'RandomForest': make_pipeline(
        RandomForestClassifier(n_estimators=300, max_depth=None, min_samples_leaf=2,
                               class_weight='balanced_subsample', random_state=42),
        scale=False, impute='median'
    ),
    'Bagging(Tree)': make_pipeline(
        BaggingClassifier(estimator=DecisionTreeClassifier(min_samples_leaf=3),
                          n_estimators=200, bootstrap=True, n_jobs=-1, random_state=42),
        scale=False, impute='median'
    ),
    'AdaBoost': make_pipeline(
        AdaBoostClassifier(n_estimators=300, learning_rate=0.05, random_state=42),
        scale=False, impute='median'
    ),
    'GradientBoosting': make_pipeline(
        GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42),
        scale=False, impute='median'
    ),
}

In [8]:
# 6) Train and evaluate
# Fit every pipeline on the training split and score it on the held-out test split.
# NOTE: the "best" model below is chosen using these same test-split scores, so its
# reported metrics are an optimistic estimate of performance on unseen data.
results = []   # one summary row (dict) per model
reports = {}   # model name -> formatted classification report text

for name, pipe in models.items():
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # ROC AUC needs a continuous score. Check explicitly which scoring method the
    # pipeline exposes instead of swallowing every exception, so that genuine
    # failures (bad input, shape mismatches, ...) surface rather than becoming NaN.
    if hasattr(pipe, 'predict_proba'):
        y_proba = pipe.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_proba)
    elif hasattr(pipe, 'decision_function'):
        auc = roc_auc_score(y_test, pipe.decision_function(X_test))
    else:
        auc = np.nan

    report_dict = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    weighted = report_dict['weighted avg']

    results.append({
        'model': name,
        'accuracy': report_dict['accuracy'],
        'precision_w': weighted['precision'],
        'recall_w': weighted['recall'],
        'f1_w': weighted['f1-score'],
        'roc_auc': auc
    })
    reports[name] = classification_report(y_test, y_pred, digits=4, zero_division=0)

# Rank by weighted F1, breaking ties by weighted recall and then accuracy
results_df = pd.DataFrame(results).sort_values(by=['f1_w', 'recall_w', 'accuracy'], ascending=False).reset_index(drop=True)
display(results_df)

best_model_name = results_df.loc[0, 'model']
print(f"\nBest model by weighted F1 on this split: {best_model_name}")

Unnamed: 0,model,accuracy,precision_w,recall_w,f1_w,roc_auc
0,LogisticRegression,0.75,0.758433,0.75,0.75291,0.823164
1,Bagging(Tree),0.75,0.743232,0.75,0.74343,0.819224
2,RandomForest,0.744792,0.74235,0.744792,0.743383,0.820179
3,SVM(RBF),0.729167,0.740624,0.729167,0.732928,0.808239
4,GradientBoosting,0.734375,0.729017,0.734375,0.730672,0.812776
5,kNN,0.729167,0.721938,0.729167,0.723454,0.781851
6,DecisionTree,0.713542,0.710734,0.713542,0.71196,0.722746
7,AdaBoost,0.729167,0.721205,0.729167,0.709323,0.821493



Best model by weighted F1 on this split: LogisticRegression


In [9]:

# 7) Show the full text report for the three highest-ranked models
print("\nDetailed classification reports (top-3 by F1):")
for name in results_df['model'].iloc[:3]:
    # Header line and report body are emitted in one call, separated by a newline
    print(f"\n=== {name} ===\n{reports[name]}")


Detailed classification reports (top-3 by F1):

=== LogisticRegression ===
              precision    recall  f1-score   support

           0     0.8291    0.7760    0.8017       125
           1     0.6267    0.7015    0.6620        67

    accuracy                         0.7500       192
   macro avg     0.7279    0.7387    0.7318       192
weighted avg     0.7584    0.7500    0.7529       192


=== Bagging(Tree) ===
              precision    recall  f1-score   support

           0     0.7810    0.8560    0.8168       125
           1     0.6727    0.5522    0.6066        67

    accuracy                         0.7500       192
   macro avg     0.7269    0.7041    0.7117       192
weighted avg     0.7432    0.7500    0.7434       192


=== RandomForest ===
              precision    recall  f1-score   support

           0     0.7969    0.8160    0.8063       125
           1     0.6406    0.6119    0.6260        67

    accuracy                         0.7448       192
   macr

In [11]:
# 8) Persist best model
# Look up the pipeline stored under the winning model's name.
# NOTE(review): this cell only takes a reference; nothing is written to disk here —
# confirm whether serialization happens elsewhere.
best_pipe = models[best_model_name]

