#### This a practice notebook to practice fundamentals

In [1]:
# import all required libraries 

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder 
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import(
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)

from xgboost import XGBClassifier 
import joblib

In [2]:
# data loading and EDA 
data_path = r"D:\Datasets\Adult_Census\adult.csv"
df = pd.read_csv(data_path)

# view top5
df.head()

# view basic information
df.info() 

# identify null columns 
df.isnull().sum() 

# get column statistics
df.describe(include="all")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
count,32561.0,32561,32561.0,32561,32561.0,32561,32561,32561,32561,32561,32561.0,32561.0,32561.0,32561,32561
unique,,9,,16,,7,15,6,5,2,,,,42,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22696,,10501,,14976,4140,13193,27816,21790,,,,29170,24720
mean,38.581647,,189778.4,,10.080679,,,,,,1077.648844,87.30383,40.437456,,
std,13.640433,,105550.0,,2.57272,,,,,,7385.292085,402.960219,12.347429,,
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,48.0,,237051.0,,12.0,,,,,,0.0,0.0,45.0,,


In [3]:
# data cleaning to handle ? fields 
df = df.replace("?", np.nan)

target_col = "income"
X = df.drop(columns=[target_col])
y=df[target_col]

# Encode target for XGBoost
label_encoder = LabelEncoder() 
y = label_encoder.fit_transform(y)

In [4]:
# implement preprocessing pipeline that is common to both 

categorical_cols = X.select_dtypes(include=["object"]).columns 
numerical_cols = X.select_dtypes(exclude=["object"]).columns

numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",StandardScaler())
])

categorical_pipeline = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="most_frequent")),
    ("encoder",OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numerical_cols),
        ("cat",categorical_pipeline, categorical_cols)
    ]
)

In [5]:
# Train/validation split 
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# building common metrics 

def compute_metrics(y_true, y_pred, y_proba=None):
    metrics = {}
    
    metrics["Accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Precision"] = precision_score(y_true, y_pred, average="weighted")
    metrics["Recall"] = recall_score(y_true, y_pred, average="weighted")
    metrics["F1"] = f1_score(y_true, y_pred, average="weighted")
    metrics["MCC"] = matthews_corrcoef(y_true, y_pred)
    
    if y_proba is not None:
        metrics["AUC"] = roc_auc_score(y_true, y_proba[:, 1])
    else:
        metrics["AUC"] = np.nan
        
    return metrics


In [8]:
# Implement logistic regression

log_reg = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", LogisticRegression(max_iter=1000))
])

log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_val)
y_proba = log_reg.predict_proba(X_val)

log_metrics = compute_metrics(y_val, y_pred, y_proba)
log_metrics

# dump in file 
joblib.dump(log_reg, "../model/logistic_regression.pkl")

['../model/logistic_regression.pkl']

In [10]:
# Decision Tree 

dt = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", DecisionTreeClassifier(random_state=42))
])

dt.fit(X_train, y_train)

y_pred = dt.predict(X_val)
dt_metrics = compute_metrics(y_val, y_pred)

joblib.dump(dt,"../model/decision_tree.pkl")
dt_metrics

{'Accuracy': 0.8152924919391985,
 'Precision': 0.8170644991076461,
 'Recall': 0.8152924919391985,
 'F1': 0.8161384018912099,
 'MCC': 0.4995194513735348,
 'AUC': nan}

In [11]:
# K-nearest neighbours 
knn = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", KNeighborsClassifier(n_neighbors=5))
])

knn.fit(X_train, y_train)

y_pred = knn.predict(X_val)
knn_metrics = compute_metrics(y_val, y_pred)

joblib.dump(knn, "../model/knn.pkl")
knn_metrics

{'Accuracy': 0.8314140948871488,
 'Precision': 0.8257732235793103,
 'Recall': 0.8314140948871488,
 'F1': 0.8278319801471522,
 'MCC': 0.5218358256712414,
 'AUC': nan}

In [13]:
# Naive Bayes classifier 
X_train_nb = preprocessor.fit_transform(X_train)
X_val_nb = preprocessor.transform(X_val)

X_train_nb = X_train_nb.toarray()
X_val_nb = X_val_nb.toarray()

nb = GaussianNB()
nb.fit(X_train_nb, y_train)

y_pred = nb.predict(X_val_nb)
nb_metrics = compute_metrics(y_val, y_pred)

joblib.dump(nb,"../model/naive_bayes.pkl")
nb_metrics

{'Accuracy': 0.5366190695532013,
 'Precision': 0.8100271040634657,
 'Recall': 0.5366190695532013,
 'F1': 0.553082332992982,
 'MCC': 0.3236651398839895,
 'AUC': nan}

In [14]:
# Random Forest 
rf = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestClassifier(n_estimators=100, random_state=42))
])

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rf_metrics = compute_metrics(y_val, y_pred)

joblib.dump(rf, "../model/random_forest.pkl")
rf_metrics

{'Accuracy': 0.8512206356517734,
 'Precision': 0.8453572114705743,
 'Recall': 0.8512206356517734,
 'F1': 0.84675615375043,
 'MCC': 0.5736312797910291,
 'AUC': nan}

In [15]:
# XGBoost
xgb = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("mode", XGBClassifier(
        objective="binary:logistic",
        eval_metrcis="mlogloss",
        use_label_encoder = False
    ))
])

xgb.fit(X_train, y_train)

y_pred = xgb.predict(X_val)
y_proba = xgb.predict_proba(X_val)

xgb_metrics = compute_metrics(y_val, y_pred, y_proba)

joblib.dump(xgb, "../model/xgboost.pkl")
xgb_metrics

Parameters: { "eval_metrcis", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


{'Accuracy': 0.8668816213726394,
 'Precision': 0.8621260795947587,
 'Recall': 0.8668816213726394,
 'F1': 0.8629610508009855,
 'MCC': 0.6193588530760175,
 'AUC': 0.9223297471162379}

In [16]:
# Comparision of metrics from all models 
metrics_df = pd.DataFrame([
    log_metrics,
    dt_metrics,
    knn_metrics,
    nb_metrics,
    rf_metrics,
    xgb_metrics
], index=[
    "Logistic Regression",
    "Decision Tree",
    "KNN",
    "Naive Bayes",
    "Random Forest",
    "XGBoost"
])

metrics_df.style.background_gradient(cmap="viridis")

Unnamed: 0,Accuracy,Precision,Recall,F1,MCC,AUC
Logistic Regression,0.853063,0.846737,0.853063,0.847648,0.575799,0.902445
Decision Tree,0.815292,0.817064,0.815292,0.816138,0.499519,
KNN,0.831414,0.825773,0.831414,0.827832,0.521836,
Naive Bayes,0.536619,0.810027,0.536619,0.553082,0.323665,
Random Forest,0.851221,0.845357,0.851221,0.846756,0.573631,
XGBoost,0.866882,0.862126,0.866882,0.862961,0.619359,0.92233
