# Practical 4

In [None]:
''' 
Aim:
    Use the Naive Bayes, K-nearest neighbours, and Decision tree classification algorithms to build
    classifiers on any two datasets. Pre-process the datasets using the techniques specified in Q2.
    Compare the Accuracy, Precision, Recall and F1 measure reported for each dataset using the
    above-mentioned classifiers under the following situations:
        i. Using Holdout method (Random sampling):
            a) Training set = 80%, Test set = 20%
            b) Training set = 66.6% (2/3rd of total), Test set = 33.3%
        ii. Using Cross-Validation:
            a) 10-fold
            b) 5-fold
'''

In [None]:
# Implement the following classifiers:
# - KNN (k-nearest neighbours)
# - NB (Naive Bayes)
# - Decision tree

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_wine
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

# helper to run train/test
def evaluate_holdout(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for the test features.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use the weighted average, with
        ``zero_division=0``.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)

    # Shared settings for the three averaged metrics.
    weighted = {'average': 'weighted', 'zero_division': 0}

    report = {'accuracy': accuracy_score(y_test, predicted)}
    averaged_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for label, metric in averaged_metrics:
        report[label] = metric(y_test, predicted, **weighted)
    return report

def evaluate_cv(clf, X, y, cv):
    """Cross-validate ``clf`` and return the mean of each test metric.

    Args:
        clf: Estimator handed to ``cross_validate``.
        X: Feature matrix.
        y: Class labels.
        cv: Cross-validation splitter (or fold count) forwarded as ``cv``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
        each the mean of the per-fold test score. Precision, recall and F1 use
        the weighted average, with ``zero_division=0``.
    """
    weighted = {'average': 'weighted', 'zero_division': 0}

    scoring = {'accuracy': make_scorer(accuracy_score)}
    averaged_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, metric_fn in averaged_metrics:
        scoring[metric_name] = make_scorer(metric_fn, **weighted)

    cv_output = cross_validate(
        clf, X, y, cv=cv, scoring=scoring, return_train_score=False
    )

    # cross_validate stores each test metric under the key 'test_<name>'.
    averaged = {}
    for metric_name in scoring:
        averaged[metric_name] = np.mean(cv_output['test_' + metric_name])
    return averaged

def run_experiments(X, y, dataset_name):
    """Evaluate Naive Bayes, KNN and a decision tree on one dataset.

    Each classifier is scored with two stratified hold-out splits (80/20 and
    2/3-1/3) and two stratified, shuffled cross-validations (10-fold and
    5-fold), all seeded with ``random_state=42``.

    Standardisation is part of every model pipeline, so the scaler is fitted
    on training rows only (per split and per fold). Fitting it on the full
    dataset before splitting would let test-set statistics leak into training
    and bias the reported scores.

    Args:
        X: Feature matrix.
        y: Class labels; also used to stratify the splits.
        dataset_name: Label stored in the ``dataset`` column of every row.

    Returns:
        pandas.DataFrame with one row per (method, split, classifier) and the
        columns ``accuracy``, ``precision``, ``recall``, ``f1``, ``dataset``,
        ``method``, ``split`` and ``classifier``.
    """
    # Imported locally so the function stays self-contained; scikit-learn is
    # already a dependency of this file.
    from sklearn.pipeline import make_pipeline

    # Pre-process inside the pipeline: scaler statistics come from training
    # data only, both in the hold-out runs and inside each CV fold.
    classifiers = {
        'NaiveBayes': make_pipeline(StandardScaler(), GaussianNB()),
        'KNN': make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5)),
        'DecisionTree': make_pipeline(
            StandardScaler(), DecisionTreeClassifier(random_state=42)
        ),
    }

    holdout_splits = (('80-20', 0.2), ('66.6-33.3', 1 / 3))
    cv_splits = (('10-fold', 10), ('5-fold', 5))

    results = []

    # Hold-out (random sampling) evaluations.
    for split_label, test_size in holdout_splits:
        Xtr, Xte, ytr, yte = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )
        for name, clf in classifiers.items():
            r = evaluate_holdout(clf, Xtr, ytr, Xte, yte)
            r.update({'dataset': dataset_name, 'method': 'holdout',
                      'split': split_label, 'classifier': name})
            results.append(r)

    # Stratified k-fold cross-validation evaluations.
    for split_label, n_splits in cv_splits:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        for name, clf in classifiers.items():
            r = evaluate_cv(clf, X, y, skf)
            r.update({'dataset': dataset_name, 'method': 'cv',
                      'split': split_label, 'classifier': name})
            results.append(r)

    return pd.DataFrame(results)

# Run the full comparison on the two bundled scikit-learn datasets.
data1 = load_breast_cancer()
df1 = run_experiments(X=data1.data, y=data1.target, dataset_name='breast_cancer')

data2 = load_wine()
df2 = run_experiments(X=data2.data, y=data2.target, dataset_name='wine')

# Stack both result tables under one fresh 0..n-1 index.
results = pd.concat([df1, df2], ignore_index=True)
print(results)

# Persist the combined table without the index column.
results.to_csv('classification_results.csv', index=False)
