<a href="https://colab.research.google.com/github/ikabrain/UML501-Machine-Learning-Lab/blob/main/ML_assign6/ML_assign6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Lab Assignment 6
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Q1: Gaussian Naïve Bayes Classifier
---

Implement a Gaussian Naïve Bayes classifier on the Iris dataset from `sklearn.datasets`.

In [2]:
from sklearn.datasets import load_iris

# Fetch Iris as a pandas-backed bunch; its frame holds the features plus `target`.
iris = load_iris(as_frame=True)
df = iris["frame"]

# Preview the first five rows.
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
from sklearn.model_selection import train_test_split

# Separate the class label from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### (i) Step-by-step implementation

In [4]:
class GausianNaiveBayesClassifier():
    """Gaussian Naive Bayes classifier implemented from scratch with NumPy.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class with the largest log-posterior
    ``log P(c) + sum_j log N(x_j | mean_cj, std_cj)``.

    Parameters
    ----------
    std_epsilon : float, default=1e-6
        Small constant added to every per-class standard deviation so that a
        constant feature (std == 0) never causes a division by zero.

    Attributes
    ----------
    X_train, y_train : np.ndarray or None
        Training data as passed to ``fit`` (converted to arrays).
    y_classes : np.ndarray or None
        Sorted unique class labels.
    prior : np.ndarray or None
        Class prior probabilities, aligned with ``y_classes``.
    means, stds : np.ndarray or None
        Per-class, per-feature statistics of shape (n_classes, n_features).
    fitted : bool
        True once ``fit`` has completed.
    """

    def __init__(self, std_epsilon=1e-6):
        self.std_epsilon = std_epsilon

        self.X_train = None
        self.y_train = None
        self.y_classes = None

        # Prior probabilities
        self.prior = None

        # Stats per feature per class
        self.means = None
        self.stds = None

        self.fitted = False

    def _prior(self, X, y):
        """Store the training data and compute the class priors.

        Accepts pandas objects or array-likes; everything is converted to
        NumPy arrays so later indexing does not depend on a pandas index.
        """
        self.X_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        self.y_classes = np.unique(self.y_train)
        self.prior = np.array([np.mean(self.y_train == c) for c in self.y_classes])
        return self

    def fit(self, X, y):
        """Fit the Gaussian Naive Bayes model to training data.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_samples, n_features)
        y : Series or array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X_arr = np.asarray(X, dtype=float)
        y_arr = np.asarray(y)
        if X_arr.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features).")
        if X_arr.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set.")
        if X_arr.shape[0] != y_arr.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        self._prior(X_arr, y_arr)

        class_rows = [self.X_train[self.y_train == c] for c in self.y_classes]
        self.means = np.array([rows.mean(axis=0) for rows in class_rows])
        # NumPy's std is the population (ddof=0) estimate, so a class with a
        # single sample yields 0 rather than NaN; the epsilon then keeps the
        # density well defined.
        self.stds = np.array([rows.std(axis=0) for rows in class_rows]) + self.std_epsilon
        self.fitted = True
        return self

    def _pdf(self, x, mean, std):
        """Probability Density Function for Gaussian Distribution"""
        return (1 / (np.sqrt(2 * np.pi) * std)) * np.exp(-((x - mean) ** 2) / (2 * std ** 2))

    def _log_pdf(self, x, mean, std):
        """Log of the Gaussian density, computed directly in log space.

        Taking ``log(_pdf(...))`` underflows to ``log(0)`` for points far from
        the mean; the closed form below stays finite.
        """
        return -0.5 * np.log(2 * np.pi) - np.log(std) - ((x - mean) ** 2) / (2 * std ** 2)

    def _class_likelihood(self, X, class_idx):
        """Compute the log likelihood of X for a given class.

        The per-feature log densities are summed over the last axis, so a
        single sample (1-D) gives a scalar and a 2-D batch gives one value
        per row.
        """
        mean = self.means[class_idx]
        std = self.stds[class_idx]
        return np.sum(self._log_pdf(X, mean, std), axis=-1)

    def predict(self, X):
        """Predict class labels for given test data.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_samples, n_features)
            A single 1-D sample is treated as one row.

        Returns
        -------
        np.ndarray of shape (n_samples,)
            Predicted labels drawn from ``y_classes``.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        ValueError
            If the number of features differs from the training data.
        """
        # Check state first so an unfitted model reports the real problem.
        if not self.fitted:
            raise RuntimeError("Model not fitted yet. Call fit() first.")

        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        if X.shape[1] != self.means.shape[1]:
            raise ValueError("X has a different number of features than the training data.")

        # One column of log-posteriors per class, for all rows at once.
        log_posteriors = np.column_stack([
            np.log(self.prior[i]) + self._class_likelihood(X, i)
            for i in range(len(self.y_classes))
        ])
        return self.y_classes[np.argmax(log_posteriors, axis=1)]


# Correctly spelled alias; the original (misspelled) name is kept for existing callers.
GaussianNaiveBayesClassifier = GausianNaiveBayesClassifier

In [5]:
from sklearn.metrics import accuracy_score

# Train the from-scratch classifier; fit() hands back the fitted instance.
gnb = GausianNaiveBayesClassifier().fit(X_train, y_train)

# Predict on both splits to compare training and testing accuracy.
y_train_pred, y_test_pred = gnb.predict(X_train), gnb.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.9428571428571428
Testing Accuracy: 0.9777777777777777


### (ii) In-built function

In [6]:
from sklearn.naive_bayes import GaussianNB

# Same experiment with scikit-learn's implementation, on the same split.
gnb = GaussianNB().fit(X_train, y_train)

y_train_pred, y_test_pred = gnb.predict(X_train), gnb.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Testing Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.9428571428571428
Testing Accuracy: 0.9777777777777777


## Q2: GridSearchCV
---

Explore the GridSearchCV tool in scikit-learn, which is often used for tuning the hyperparameters of machine learning models.

Use this tool to find the best value of K for a K-NN classifier using any dataset.

In [7]:
from sklearn.datasets import load_breast_cancer

# Fetch the breast-cancer data as a pandas-backed bunch and keep its frame.
bc = load_breast_cancer(as_frame=True)
df = bc["frame"]

# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [8]:
from sklearn.model_selection import train_test_split

# Separate the class label from the feature columns.
y = df['target']
X = df.drop(columns='target')

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Feature scaling
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only,
# then apply that same scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Feature extraction through PCA, as K-NN is a localised non-parametric approach:
# the scaled features are projected onto 2 components before the neighbour search.
from sklearn.decomposition import PCA

pca = PCA(n_components=2, random_state=42)

# Fit the projection on the training split, then reuse it unchanged on the test split.
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [11]:
# Exhaustively search K-NN hyperparameters through cross-validation
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

# Candidate K values: every odd number from 1 to 25.
param_grid = {'n_neighbors': list(range(1, 26, 2))}

# NOTE(review): the scaler and PCA were fitted on all of X_train before this
# search, so each validation fold has already influenced the preprocessing.
# Wrapping those steps with the classifier in a Pipeline would avoid that.
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid_search.fit(X_train_pca, y_train)

best_knn = grid_search.best_estimator_

In [12]:
# Report the K chosen by the grid search and the score stored alongside it.
print("Best K (n_neighbors):", grid_search.best_params_['n_neighbors'])
print("Best Cross-validation score:", grid_search.best_score_)

Best K (n_neighbors): 11
Best Cross-validation score: 0.9371835443037975


In [13]:
# Evaluate the selected model on the PCA-projected test split.
y_test_pred_best = best_knn.predict(X_test_pca)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred_best)
print("Final Test Accuracy with best K:", test_accuracy)

Final Test Accuracy with best K: 0.9298245614035088
