# Practical 4

In [None]:
''' 
Aim:
    Use Naive bayes, K-nearest, and Decision tree classification algorithms to build classifiers on
    any two datasets. Pre-process the datasets using techniques specified in Q2. Compare the
    Accuracy, Precision, Recall and F1 measure reported for each dataset using the above mentioned
    classifiers under the following situations:
        i. Using Holdout method (Random sampling):
            a) Training set = 80% Test set = 20%
            b) Training set = 66.6% (2/3rd of total), Test set = 33.3%
        ii. Using Cross-Validation:
            a) 10-fold
            b) 5-fold
'''

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## Dataset 1: Iris Dataset

In [23]:
from sklearn.datasets import load_iris

iris = load_iris()
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
y_iris = iris.target

scaler = StandardScaler()
X_iris_scaled = scaler.fit_transform(X_iris)


## Dataset 2: Titanic (Clean Data)

In [24]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

df_titan = pd.read_csv("../dataset/titanic.csv")

# Identify categorical and numeric columns
cat_cols = X_titan.select_dtypes(include=["object"]).columns
num_cols = X_titan.select_dtypes(include=["int64","float64"]).columns

# Numeric pipeline
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encode", OneHotEncoder(handle_unknown="ignore"))
])

# Full preprocessing pipeline
ct = ColumnTransformer([
    ("num", num_pipeline, num_cols),
    ("cat", cat_pipeline, cat_cols)
])

X_titan_pre = ct.fit_transform(X_titan)


In [25]:
models = {
    "Naive Bayes": GaussianNB(),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}


In [26]:
# for evaluate model 
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds, average="macro"),
        "Recall": recall_score(y_test, preds, average="macro"),
        "F1": f1_score(y_test, preds, average="macro")
    }


### 1. HOLDOUT METHOD

In [27]:
def run_holdout(X, y, test_size, label):
    print(f"\n--- Holdout ({label}) ---")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )
    for name, model in models.items():
        results = evaluate_model(model, X_train, X_test, y_train, y_test)
        print(f"{name} → {results}")


In [28]:
# 80% Train — 20% Test
# for iris dataset
run_holdout(X_iris_scaled, y_iris, 0.2, "Iris 80/20")

# for titanic dataset
run_holdout(X_titan_pre, y_titan, 0.2, "Titanic 80/20")


--- Holdout (Iris 80/20) ---
Naive Bayes → {'Accuracy': 0.9666666666666667, 'Precision': 0.9696969696969697, 'Recall': 0.9666666666666667, 'F1': 0.9665831244778612}
KNN (k=5) → {'Accuracy': 0.9333333333333333, 'Precision': 0.9444444444444445, 'Recall': 0.9333333333333332, 'F1': 0.9326599326599326}
Decision Tree → {'Accuracy': 0.9333333333333333, 'Precision': 0.9333333333333332, 'Recall': 0.9333333333333332, 'F1': 0.9333333333333332}

--- Holdout (Titanic 80/20) ---
Naive Bayes → {'Accuracy': 0.8053435114503816, 'Precision': 0.7471085586663475, 'Recall': 0.7348392965433597, 'F1': 0.7404580152671756}
KNN (k=5) → {'Accuracy': 0.8587786259541985, 'Precision': 0.8305226174791392, 'Recall': 0.7852486355366889, 'F1': 0.8034588325933134}
Decision Tree → {'Accuracy': 0.8358778625954199, 'Precision': 0.7868733256792958, 'Recall': 0.784111582777441, 'F1': 0.7854708178615634}


In [29]:
# 66.6% Train — 33.3% Test
run_holdout(X_iris_scaled, y_iris, 0.333, "Iris 66/33")
run_holdout(X_titan_pre, y_titan, 0.333, "Titanic 66/33")


--- Holdout (Iris 66/33) ---
Naive Bayes → {'Accuracy': 0.92, 'Precision': 0.9251461988304094, 'Recall': 0.9215686274509803, 'F1': 0.9212962962962963}
KNN (k=5) → {'Accuracy': 0.92, 'Precision': 0.9365079365079364, 'Recall': 0.9215686274509803, 'F1': 0.92046783625731}
Decision Tree → {'Accuracy': 0.94, 'Precision': 0.9500000000000001, 'Recall': 0.9411764705882352, 'F1': 0.9407149084568439}

--- Holdout (Titanic 66/33) ---
Naive Bayes → {'Accuracy': 0.8004587155963303, 'Precision': 0.7417781614156236, 'Recall': 0.7289146779993463, 'F1': 0.7347542427993035}
KNN (k=5) → {'Accuracy': 0.8440366972477065, 'Precision': 0.8151495016611296, 'Recall': 0.7584177835894084, 'F1': 0.7794047619047619}
Decision Tree → {'Accuracy': 0.8555045871559633, 'Precision': 0.8209243773041992, 'Recall': 0.791680287675711, 'F1': 0.8043129795464745}


### 2. CROSS-VALIDATION

In [30]:
def run_cv(X, y, folds, label):
    print(f"\n--- {folds}-Fold Cross Validation ({label}) ---")
    for name, model in models.items():
        cv = cross_validate(model, X, y, cv=folds,
                            scoring=["accuracy","precision_macro","recall_macro","f1_macro"])
        print(f"{name} → "
              f"Acc: {cv['test_accuracy'].mean():.3f}, "
              f"Prec: {cv['test_precision_macro'].mean():.3f}, "
              f"Rec: {cv['test_recall_macro'].mean():.3f}, "
              f"F1: {cv['test_f1_macro'].mean():.3f}")


In [31]:
# 10-Fold Cross Validation
run_cv(X_iris_scaled, y_iris, 10, "Iris")
run_cv(X_titan_pre, y_titan, 10, "Titanic")


--- 10-Fold Cross Validation (Iris) ---
Naive Bayes → Acc: 0.953, Prec: 0.963, Rec: 0.953, F1: 0.952
KNN (k=5) → Acc: 0.953, Prec: 0.960, Rec: 0.953, F1: 0.953
Decision Tree → Acc: 0.953, Prec: 0.959, Rec: 0.953, F1: 0.953

--- 10-Fold Cross Validation (Titanic) ---
Naive Bayes → Acc: 0.819, Prec: 0.783, Rec: 0.743, F1: 0.747
KNN (k=5) → Acc: 0.802, Prec: 0.771, Rec: 0.735, F1: 0.736
Decision Tree → Acc: 0.614, Prec: 0.671, Rec: 0.599, F1: 0.561


In [32]:
# 5-Fold Cross Validation
run_cv(X_iris_scaled, y_iris, 5, "Iris")
run_cv(X_titan_pre, y_titan, 5, "Titanic")


--- 5-Fold Cross Validation (Iris) ---
Naive Bayes → Acc: 0.953, Prec: 0.958, Rec: 0.953, F1: 0.953
KNN (k=5) → Acc: 0.960, Prec: 0.963, Rec: 0.960, F1: 0.960
Decision Tree → Acc: 0.953, Prec: 0.955, Rec: 0.953, F1: 0.953

--- 5-Fold Cross Validation (Titanic) ---
Naive Bayes → Acc: 0.801, Prec: 0.774, Rec: 0.728, F1: 0.725
KNN (k=5) → Acc: 0.776, Prec: 0.753, Rec: 0.727, F1: 0.716
Decision Tree → Acc: 0.434, Prec: 0.559, Rec: 0.432, F1: 0.350
