In [1]:
# =========================================================
# ML ASSIGNMENT 2 - 2025AA05099
# =========================================================

# 1. IMPORT LIBRARIES
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    precision_score,
    recall_score,
    f1_score,
    matthews_corrcoef
)

# =========================================================
# 2. UPLOAD & LOAD DATASET
# =========================================================
# Open Colab's upload widget, then read the CSV by its expected file name
# from the working directory.
uploaded = files.upload()
df = pd.read_csv("adult.csv")

print("Dataset Shape:", df.shape)
print("\nTarget Distribution:\n", df["income"].value_counts())

# =========================================================
# 3. HANDLE MISSING VALUES
# =========================================================
# Missing entries are encoded as a literal "?"; turn them into NaN first,
# then label missing values in text columns explicitly as "Unknown".
df = df.replace("?", np.nan)
for col in df.select_dtypes(include="object").columns:
    # Assign the result back instead of calling fillna(inplace=True) on
    # df[col]: an in-place call on a column selection acts on a temporary
    # object under pandas Copy-on-Write and would leave df unchanged.
    df[col] = df[col].fillna("Unknown")

# =========================================================
# 4. FEATURE / TARGET SPLIT
# =========================================================
X = df.drop("income", axis=1)
y = df["income"].map({"<=50K": 0, ">50K": 1})

# map() yields NaN for any label outside the mapping; fail here with a clear
# message rather than later inside the stratified split or model fitting.
unexpected_labels = sorted(df.loc[y.isna(), "income"].astype(str).unique())
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'income' column: {unexpected_labels}")

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# =========================================================
# 5. PREPROCESSOR
# =========================================================
# Standardize numeric columns and one-hot encode text columns.
# handle_unknown="ignore" keeps transform from failing on categories that
# were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# =========================================================
# 6. TRAIN–TEST SPLIT
# =========================================================
# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# =========================================================
# 7. EVALUATION FUNCTION
# =========================================================
def evaluate(model, X=None, y=None):
    """Score a fitted binary classifier and return its metrics as a list.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must provide ``predict`` and ``predict_proba``.
    X : optional
        Features to score on. Defaults to the notebook-level ``X_test``.
    y : optional
        True labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    list
        ``[accuracy, ROC AUC, precision, recall, F1, MCC]``, in that order.
    """
    # Fall back to the held-out split so existing evaluate(model) calls
    # behave exactly as before.
    if X is None:
        X = X_test
    if y is None:
        y = y_test

    y_pred = model.predict(X)
    # Column 1 of predict_proba is taken as the positive-class score for AUC.
    y_prob = model.predict_proba(X)[:, 1]
    return [
        accuracy_score(y, y_pred),
        roc_auc_score(y, y_prob),
        precision_score(y, y_pred),
        recall_score(y, y_pred),
        f1_score(y, y_pred),
        matthews_corrcoef(y, y_pred)
    ]

# =========================================================
# 8. MODELS
# =========================================================
# Display name -> unfitted estimator. The insertion order of this dict fixes
# the row order of the comparison table built below.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42)
}

results = []

# =========================================================
# 9. TRAIN & EVALUATE MODELS
# =========================================================
# GaussianNB gets its own preprocessing whose one-hot output is a dense array
# (sparse_output=False). It is built once here rather than inside the loop.
nb_preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ]
)

for name, model in models.items():
    # Choose the preprocessing by estimator type, not by display name, so
    # renaming a key in `models` cannot silently route GaussianNB to the
    # sparse one-hot output.
    if isinstance(model, GaussianNB):
        features = nb_preprocessor
    else:
        features = preprocessor

    pipeline = Pipeline([
        ("preprocessor", features),
        ("model", model)
    ])

    # Fit on the training split; evaluate() scores on the held-out split.
    pipeline.fit(X_train, y_train)
    results.append(evaluate(pipeline))

# =========================================================
# 10. COMPARISON TABLE
# =========================================================
# One row per model (same order as `models`), one column per metric in the
# order returned by evaluate().
results_df = pd.DataFrame(
    results,
    index=models.keys(),
    columns=["Accuracy", "AUC", "Precision", "Recall", "F1", "MCC"]
)

print("\nMODEL PERFORMANCE COMPARISON:\n")
display(results_df)

# =========================================================
# 11. OBSERVATIONS TABLE
# =========================================================
# NOTE: these remarks are hand-written text, not computed from results_df;
# revisit them whenever the data or the models change.
observations_df = pd.DataFrame({
    "Model": [
        "Logistic Regression",
        "Decision Tree",
        "KNN",
        "Naive Bayes",
        "Random Forest",
        "XGBoost"
    ],
    "Observation": [
        "Strong baseline with high AUC but moderate recall.",
        "Shows overfitting and lower generalization.",
        "Sensitive to scaling and K value.",
        "High recall but low precision due to independence assumption.",
        "Good bias–variance balance.",
        "Best overall performance across metrics."
    ]
})

print("\nMODEL-WISE OBSERVATIONS:\n")
display(observations_df)


Saving adult.csv to adult.csv
Dataset Shape: (32561, 15)

Target Distribution:
 income
<=50K    24720
>50K      7841
Name: count, dtype: int64

MODEL PERFORMANCE COMPARISON:



Unnamed: 0,Accuracy,AUC,Precision,Recall,F1,MCC
Logistic Regression,0.854291,0.904345,0.739366,0.609694,0.668298,0.580438
Decision Tree,0.813757,0.749305,0.610592,0.625,0.617712,0.494683
KNN,0.831107,0.852494,0.666193,0.598214,0.630376,0.522602
Naive Bayes,0.53754,0.735753,0.336355,0.946429,0.496321,0.324052
Random Forest,0.850146,0.898749,0.723565,0.610969,0.662517,0.570438
XGBoost,0.871181,0.923278,0.77468,0.655612,0.71019,0.631726



MODEL-WISE OBSERVATIONS:



Unnamed: 0,Model,Observation
0,Logistic Regression,Strong baseline with high AUC but moderate rec...
1,Decision Tree,Shows overfitting and lower generalization.
2,KNN,Sensitive to scaling and K value.
3,Naive Bayes,High recall but low precision due to independe...
4,Random Forest,Good bias–variance balance.
5,XGBoost,Best overall performance across metrics.
