<a href="https://colab.research.google.com/github/jesse-venson/Machine-learning/blob/main/Assignment_6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# LAB ASSIGNMENT 6: Gaussian Naïve Bayes & GridSearchCV

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Task 1 banner.
print("=" * 70, "TASK 1: GAUSSIAN NAÏVE BAYES CLASSIFIER", "=" * 70, sep="\n")

# Fetch the Iris data and hold out 30% of it for testing; the fixed seed
# keeps the split reproducible between runs.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Quick overview of what was loaded and how it was split.
print(f"\nDataset Shape: {X.shape}")
print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")
print(f"Classes: {iris.target_names}\n")

# ----------------------------------------------------------------------------
# Part (i): Gaussian Naive Bayes written by hand
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "(i) STEP-BY-STEP IMPLEMENTATION", "=" * 70, sep="\n")

class GaussianNaiveBayes:
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Each feature is modelled as an independent normal distribution per class.
    After ``fit`` the instance exposes:

    * ``classes``   - sorted array of the distinct labels seen in training
    * ``priors``    - dict mapping label -> P(class)
    * ``means``     - dict mapping label -> per-feature mean vector
    * ``variances`` - dict mapping label -> per-feature variance vector
      (population variance plus a 1e-9 floor)
    """

    def fit(self, X, y):
        """Estimate class priors and per-feature means/variances.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Prints a short confirmation together with the learned priors.
        """
        # Accept plain Python lists as well as ndarrays; boolean-mask
        # indexing below needs real arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        n_samples = len(y)

        # Calculate prior probabilities: P(class)
        self.priors = {}
        for c in self.classes:
            self.priors[c] = np.sum(y == c) / n_samples

        # Calculate mean and variance for each feature in each class
        self.means = {}
        self.variances = {}

        for c in self.classes:
            X_c = X[y == c]  # Get samples of class c
            self.means[c] = np.mean(X_c, axis=0)
            # The small floor keeps constant features from producing a
            # zero variance (division by zero / log(0) later on).
            self.variances[c] = np.var(X_c, axis=0) + 1e-9

        print("\n✓ Model trained successfully!")
        print(f"  Prior probabilities: {self.priors}")

    def gaussian_probability(self, x, mean, var):
        """Return the Gaussian density P(x | class) for each feature.

        Kept as a public helper. ``predict`` no longer uses it, because the
        density underflows to 0.0 for points far from the mean and taking
        its logarithm then yields ``-inf``.
        """
        coefficient = 1 / np.sqrt(2 * np.pi * var)
        exponent = np.exp(-((x - mean) ** 2) / (2 * var))
        return coefficient * exponent

    @staticmethod
    def _log_likelihood(X, mean, var):
        """Return sum over features of log N(x | mean, var) for every row of X."""
        # log N(x | m, v) = -0.5 * (log(2*pi*v) + (x - m)^2 / v), evaluated
        # directly in log space so it never passes through exp().
        return -0.5 * np.sum(np.log(2.0 * np.pi * var) + (X - mean) ** 2 / var, axis=1)

    def predict(self, X):
        """Predict the most probable class for each sample.

        Args:
            X: 2-D array-like of shape (n_samples, n_features). A single
                1-D sample is treated as one row.

        Returns:
            1-D array with one label per sample (an empty array when X has
            no samples).
        """
        X = np.asarray(X, dtype=float)
        if X.size == 0:
            return np.array([])
        X = np.atleast_2d(X)

        # One column per class: log P(class) + log P(features | class).
        log_posteriors = np.column_stack([
            np.log(self.priors[c])
            + self._log_likelihood(X, self.means[c], self.variances[c])
            for c in self.classes
        ])

        # Choose class with highest posterior
        return self.classes[np.argmax(log_posteriors, axis=1)]

# Fit the hand-written classifier on the training split.
print("\nTraining Manual Gaussian NB...")
manual_gnb = GaussianNaiveBayes()
manual_gnb.fit(X_train, y_train)

# Score it on the held-out samples.
y_pred_manual = manual_gnb.predict(X_test)
manual_accuracy = accuracy_score(y_test, y_pred_manual)

print(
    f"\n✓ Manual Implementation Accuracy: {manual_accuracy:.4f} ({manual_accuracy*100:.2f}%)"
)

# Per-class error breakdown: confusion matrix first ...
cm_manual = confusion_matrix(y_test, y_pred_manual)
print("\nConfusion Matrix:", cm_manual, sep="\n")

# ... then precision / recall / F1 for every species.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_manual, target_names=iris.target_names),
    sep="\n",
)

# ----------------------------------------------------------------------------
# Part (ii): the same model using scikit-learn's GaussianNB
# ----------------------------------------------------------------------------
print("\n" + "=" * 70, "(ii) BUILT-IN FUNCTION IMPLEMENTATION", "=" * 70, sep="\n")

# Fit the library estimator (fit returns the estimator itself) and score it
# on the same held-out split used for the manual model.
sklearn_gnb = GaussianNB().fit(X_train, y_train)
y_pred_sklearn = sklearn_gnb.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn)

print(
    f"\n✓ Sklearn Implementation Accuracy: {sklearn_accuracy:.4f} ({sklearn_accuracy*100:.2f}%)"
)

# Per-class error breakdown.
cm_sklearn = confusion_matrix(y_test, y_pred_sklearn)
print("\nConfusion Matrix:", cm_sklearn, sep="\n")

# Precision / recall / F1 for every species.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred_sklearn, target_names=iris.target_names),
    sep="\n",
)

# ----------------------------------------------------------------------------
# Task 2: tune K for K-NN with GridSearchCV
# ----------------------------------------------------------------------------
print(
    "\n\n" + "=" * 70,
    "TASK 2: GRIDSEARCHCV FOR K-NN HYPERPARAMETER TUNING",
    "=" * 70,
    sep="\n",
)

# Candidate neighbourhood sizes: every K from 1 through 30.
param_grid = {'n_neighbors': list(range(1, 31))}

print(f"\nParameter Grid: K values from {param_grid['n_neighbors'][0]} to {param_grid['n_neighbors'][-1]}")

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
# and run on all available cores.
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    knn,
    param_grid,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
)

print("\nPerforming Grid Search with 5-Fold Cross-Validation...", "-" * 70, sep="\n")

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning K and its cross-validated score.
print("\n" + "=" * 70, "GRID SEARCH RESULTS", "=" * 70, sep="\n")
print(f"\n✓ Best K value: {grid_search.best_params_['n_neighbors']}")
print(f"✓ Best Cross-Validation Accuracy: {grid_search.best_score_:.4f} ({grid_search.best_score_*100:.2f}%)")

# Evaluate the refitted best estimator on the held-out test split.
best_knn = grid_search.best_estimator_
y_pred_knn = best_knn.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred_knn)
print(f"✓ Test Set Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")

# Tabulate the full cross-validation results.
results_df = pd.DataFrame(grid_search.cv_results_)

# Show the five K values with the highest mean CV accuracy.
print("\n" + "-" * 70, "TOP 5 K VALUES:", "-" * 70, sep="\n")
top_5 = results_df.nlargest(5, 'mean_test_score')[
    ['param_n_neighbors', 'mean_test_score', 'std_test_score']
].rename(
    columns={
        'param_n_neighbors': 'K',
        'mean_test_score': 'Mean CV Accuracy',
        'std_test_score': 'Std Dev',
    }
)
print(top_5.to_string(index=False))

# Precision / recall / F1 of the tuned model on the test split.
print(
    "\n" + "=" * 70,
    f"CLASSIFICATION REPORT (K-NN with K={grid_search.best_params_['n_neighbors']})",
    "=" * 70,
    sep="\n",
)
print(classification_report(y_test, y_pred_knn, target_names=iris.target_names))

# Confusion matrix of the tuned model.
cm_knn = confusion_matrix(y_test, y_pred_knn)
print("\nConfusion Matrix:", cm_knn, sep="\n")

# ----------------------------------------------------------------------------
# Final recap of every model's accuracy
# ----------------------------------------------------------------------------
# Assemble the report line by line and emit it in one go.
print(
    "\n".join(
        [
            "\n" + "=" * 70,
            "SUMMARY",
            "=" * 70,
            "\n1. Gaussian Naïve Bayes:",
            f"   - Manual Implementation:  {manual_accuracy*100:.2f}%",
            f"   - Sklearn Implementation: {sklearn_accuracy*100:.2f}%",
            "\n2. K-NN with GridSearchCV:",
            f"   - Best K value: {grid_search.best_params_['n_neighbors']}",
            f"   - CV Accuracy: {grid_search.best_score_*100:.2f}%",
            f"   - Test Accuracy: {test_accuracy*100:.2f}%",
            "\n" + "=" * 70,
        ]
    )
)

TASK 1: GAUSSIAN NAÏVE BAYES CLASSIFIER

Dataset Shape: (150, 4)
Training samples: 105, Test samples: 45
Classes: ['setosa' 'versicolor' 'virginica']


(i) STEP-BY-STEP IMPLEMENTATION

Training Manual Gaussian NB...

✓ Model trained successfully!
  Prior probabilities: {np.int64(0): np.float64(0.29523809523809524), np.int64(1): np.float64(0.3523809523809524), np.int64(2): np.float64(0.3523809523809524)}

✓ Manual Implementation Accuracy: 0.9778 (97.78%)

Confusion Matrix:
[[19  0  0]
 [ 0 12  1]
 [ 0  0 13]]

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      0.92      0.96        13
   virginica       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45


(ii) BUILT-IN FUNCTION IMPLEMENTATION

✓ Sklearn Implementation Accuracy: 0.9778