In [4]:
# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

# 2. UPLOAD & LOAD DATASET
from google.colab import files
uploaded = files.upload()
# NOTE(review): the cell output shows "Saving test.csv to test (1).csv", i.e.
# an upload whose name is already taken is stored under a new name. The fixed
# paths below may therefore read an older copy -- confirm which file is loaded.


def _read_stripped_csv(path):
    """Read a CSV and trim surrounding whitespace from every string cell.

    Trimming all non-numeric columns (not only ``income``) lets the
    exact-match ``'?'`` replacement in step 3 also catch padded markers such
    as ``' ?'``. Cells that are not strings are returned unchanged.
    """
    df = pd.read_csv(path)
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)
    return df


train_df = _read_stripped_csv("adult.csv")
test_df = _read_stripped_csv("test.csv")

print(train_df.shape, test_df.shape)

# 3. HANDLE MISSING VALUES
# '?' is treated as the missing-value marker: convert it to NaN, then drop
# every row that has at least one missing field.
train_df = train_df.replace("?", np.nan).dropna()
test_df = test_df.replace("?", np.nan).dropna()


# 4. FEATURE / TARGET SPLIT

X = train_df.drop("income", axis=1)

# Encode the target as 0/1. Series.map turns any label that is missing from
# the mapping into NaN, so stop here and report the offending values instead
# of letting NaN labels flow into the split and fit steps.
income_codes = {"<=50K": 0, ">50K": 1}
y = train_df["income"].map(income_codes)
unmapped_labels = train_df.loc[y.isna(), "income"].unique().tolist()
if unmapped_labels:
    raise ValueError(
        f"Unexpected income labels {unmapped_labels!r}; "
        f"expected one of {list(income_codes)}"
    )

# Identify column types
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

# 5. PREPROCESSOR

from sklearn.preprocessing import FunctionTransformer


def _densify(matrix):
    """Return ``matrix`` as a dense array if it is sparse, unchanged otherwise."""
    return matrix.toarray() if hasattr(matrix, "toarray") else matrix


# The one-hot block can make the ColumnTransformer output sparse, and
# GaussianNB rejects sparse input, so the last step converts to dense.
# A named function is used instead of a lambda so the fitted pipeline can
# still be pickled.
to_dense = FunctionTransformer(_densify, accept_sparse=True)

preprocessor = Pipeline([
    ("col_transform", ColumnTransformer(
        transformers=[
            # Standardise numeric features.
            ("num", StandardScaler(), numerical_cols),
            # One-hot encode categoricals; categories unseen during fit are
            # encoded as all zeros rather than raising.
            ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ]
    )),
    ("to_dense", to_dense),
])


# 6. TRAIN–TEST SPLIT

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both parts; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# 7. EVALUATION FUNCTION

def evaluate_model(model, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        model: Fitted estimator (or pipeline) exposing ``predict`` and,
            optionally, ``predict_proba`` or ``decision_function``.
        X_test: Features to predict on.
        y_test: True binary labels for ``X_test``.

    Returns:
        dict with keys ``Accuracy``, ``AUC``, ``Precision``, ``Recall``,
        ``F1`` and ``MCC``. ``AUC`` is NaN when the model offers neither
        ``predict_proba`` nor ``decision_function``.
    """
    y_pred = model.predict(X_test)

    # AUC needs a continuous score. Prefer the positive-class probability
    # (column 1) and fall back to the raw decision value for models that
    # have no predict_proba.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_score) if y_score is not None else np.nan,
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "MCC": matthews_corrcoef(y_test, y_pred),
    }

# 8. MODELS

# Candidate classifiers, keyed by the display name used in the results table.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(random_state=42),
    # `use_label_encoder` is no longer passed: the installed XGBoost reports
    # it as an unused parameter in the cell output.
    "XGBoost": XGBClassifier(eval_metric="logloss"),
}

# 9. TRAIN & EVALUATE MODELS

# Fit each candidate behind the shared preprocessor and collect one metrics
# row per model, tagged with the model's display name.
results = []

for name, model in models.items():
    pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
    pipe.fit(X_train, y_train)

    metrics = evaluate_model(pipe, X_test, y_test)
    metrics["Model"] = name
    results += [metrics]


# 10. COMPARISON TABLE

# One row per model, indexed by the model's display name.
results_df = pd.DataFrame(results)
results_df = results_df.set_index("Model")

print("\nMODEL PERFORMANCE COMPARISON:\n")
display(results_df)

# 11. OBSERVATIONS TABLE

# Hand-written remarks, kept as (model, observation) pairs so each note sits
# next to the model it describes.
model_notes = [
    ("Logistic Regression", "Strong baseline with high AUC but moderate recall."),
    ("Decision Tree", "Shows overfitting and lower generalization."),
    ("KNN", "Sensitive to scaling and K value."),
    ("Naive Bayes", "High recall but low precision due to independence assumption."),
    ("Random Forest", "Good bias–variance balance."),
    ("XGBoost", "Best overall performance across metrics."),
]
observations_df = pd.DataFrame(model_notes, columns=["Model", "Observation"])

print("\nMODEL-WISE OBSERVATIONS:\n")
display(observations_df)

Saving test.csv to test (1).csv
(32561, 15) (30, 15)
Numerical columns: ['age', 'fnlwgt', 'education.num', 'capital.gain', 'capital.loss', 'hours.per.week']
Categorical columns: ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



MODEL PERFORMANCE COMPARISON:



Unnamed: 0_level_0,Accuracy,AUC,Precision,Recall,F1,MCC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.854301,0.913589,0.750201,0.621838,0.680015,0.591088
Decision Tree,0.815183,0.751002,0.630303,0.623169,0.626716,0.503925
KNN,0.834079,0.867278,0.683248,0.621838,0.651098,0.543609
Naive Bayes,0.601028,0.830016,0.379494,0.948735,0.542134,0.387561
Random Forest,0.85629,0.910527,0.74902,0.635819,0.687793,0.598636
XGBoost,0.872866,0.934065,0.789598,0.667111,0.723205,0.645282



MODEL-WISE OBSERVATIONS:



Unnamed: 0,Model,Observation
0,Logistic Regression,Strong baseline with high AUC but moderate rec...
1,Decision Tree,Shows overfitting and lower generalization.
2,KNN,Sensitive to scaling and K value.
3,Naive Bayes,High recall but low precision due to independe...
4,Random Forest,Good bias–variance balance.
5,XGBoost,Best overall performance across metrics.
