<a href="https://colab.research.google.com/github/kanishika-c15/ml_assignments/blob/main/ass6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch the Iris bundle once; `iris` is kept around because later cells
# read `iris.target_names` for their reports.
iris = load_iris()
X, y = iris.data, iris.target   # X: feature matrix, y: integer class labels (0, 1, 2)

# Hold out 20% of the samples for testing. `stratify=y` asks
# train_test_split to keep the class proportions of `y` in both parts, and
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (120, 4)
Test shape: (30, 4)


In [2]:
class MyGaussianNB:
    """Gaussian Naive Bayes classifier written with NumPy only.

    Each feature is modelled, per class, as an independent normal
    distribution. ``fit`` estimates the per-class mean, variance and prior
    from the training data; ``predict`` returns, for every sample, the class
    with the largest log posterior ``log P(y=c) + sum_j log P(x_j | y=c)``.

    Attributes (all ``None`` until ``fit`` is called):
        classes_:       sorted unique class labels, shape (n_classes,)
        class_priors_:  relative class frequencies P(y=c), shape (n_classes,)
        means_:         per-class feature means, shape (n_classes, n_features)
        vars_:          per-class feature variances (+ ``eps``),
                        shape (n_classes, n_features)
        eps:            constant added to every variance so that the log and
                        the division in the Gaussian pdf never see a zero
    """

    def __init__(self):
        self.classes_ = None      # unique class labels
        self.class_priors_ = None # P(y=c)
        self.means_ = None        # mean per class per feature
        self.vars_ = None         # variance per class per feature
        self.eps = 1e-9           # small value to avoid division by zero

    def fit(self, X, y):
        """Estimate class priors and per-class Gaussian parameters.

        Parameters:
            X: array-like of shape (n_samples, n_features); anything
               ``np.asarray`` can turn into a 2-D float array (ndarray,
               nested list, DataFrame, ...).
            y: array-like of shape (n_samples,) holding the class labels.

        Returns:
            self, so calls can be chained (``MyGaussianNB().fit(X, y)``).

        Raises:
            ValueError: if ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        # Coerce up front: the code below needs `.shape` and boolean-mask
        # row selection, which plain lists do not support.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features); got %d-D input" % X.ndim
            )
        if y.ndim != 1 or y.shape[0] != X.shape[0]:
            raise ValueError(
                "y must be 1-D with one label per row of X; got y shape %s "
                "for X shape %s" % (y.shape, X.shape)
            )
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty dataset")

        n_samples, n_features = X.shape
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)

        # Initialize containers
        self.means_ = np.zeros((n_classes, n_features))
        self.vars_ = np.zeros((n_classes, n_features))
        self.class_priors_ = np.zeros(n_classes)

        # Compute stats for each class
        for idx, c in enumerate(self.classes_):
            X_c = X[y == c]
            self.means_[idx, :] = X_c.mean(axis=0)
            # Population variance (NumPy's default ddof=0) plus eps, so a
            # constant feature within a class does not give a zero variance.
            self.vars_[idx, :] = X_c.var(axis=0) + self.eps
            self.class_priors_[idx] = X_c.shape[0] / float(n_samples)

        return self

    def _check_is_fitted(self):
        """Raise a clear error when the model is used before ``fit``."""
        if self.classes_ is None or self.means_ is None or self.vars_ is None:
            raise RuntimeError(
                "This MyGaussianNB instance is not fitted yet; call fit(X, y) first."
            )

    def _gaussian_log_likelihood(self, class_idx, x):
        """Return ``log P(x | y = class_idx)`` under the fitted Gaussians.

        Parameters:
            class_idx: row index into ``means_`` / ``vars_`` (a position in
                ``classes_``, not the class label itself).
            x: a single sample of shape (n_features,) or a batch of shape
                (n_samples, n_features).

        Returns:
            A scalar for a single sample, or an array of shape (n_samples,)
            for a batch: the per-feature log densities summed over the
            feature axis (features are treated as independent).
        """
        mean = self.means_[class_idx]
        var = self.vars_[class_idx]

        # Gaussian log pdf per feature
        log_prob = -0.5 * np.log(2.0 * np.pi * var) \
                   - ((x - mean) ** 2) / (2.0 * var)

        # Summing over the last axis handles both the 1-D and the 2-D case.
        return log_prob.sum(axis=-1)

    def predict(self, X):
        """Predict the most probable class for every row of ``X``.

        Parameters:
            X: array-like of shape (n_samples, n_features) with the same
               number of features that was seen in ``fit``.

        Returns:
            ndarray of shape (n_samples,) with the predicted class labels.

        Raises:
            RuntimeError: if called before ``fit``.
            ValueError: if ``X`` is not 2-D or its feature count does not
                match the training data.
        """
        self._check_is_fitted()

        X = np.asarray(X, dtype=float)
        n_features = self.means_.shape[1]
        # Reject 1-D input explicitly: iterating it would yield scalars that
        # broadcast against the per-feature means and give bogus scores.
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(
                "X must have shape (n_samples, %d); got %s" % (n_features, X.shape)
            )

        log_priors = np.log(self.class_priors_)

        # One column per class: log posterior ∝ log prior + log likelihood,
        # evaluated for all samples at once.
        log_posteriors = np.column_stack([
            log_priors[idx] + self._gaussian_log_likelihood(idx, X)
            for idx in range(len(self.classes_))
        ])

        # choose class with max posterior (first class wins on exact ties)
        return self.classes_[np.argmax(log_posteriors, axis=1)]


In [3]:
# Build the hand-written classifier and train it on the training split.
# `fit` returns the estimator itself, so construction and training chain.
my_gnb = MyGaussianNB().fit(X_train, y_train)

# Label the held-out samples
y_pred_custom = my_gnb.predict(X_test)

# Report overall accuracy, per-class precision/recall/F1 and the confusion matrix
print("Custom GaussianNB Accuracy:", accuracy_score(y_test, y_pred_custom))
print("\nClassification Report (Custom):")
print(
    classification_report(
        y_test,
        y_pred_custom,
        target_names=iris.target_names,
    )
)

print("Confusion Matrix (Custom):")
print(confusion_matrix(y_test, y_pred_custom))


Custom GaussianNB Accuracy: 0.9666666666666667

Classification Report (Custom):
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

Confusion Matrix (Custom):
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]


In [4]:
from sklearn.naive_bayes import GaussianNB

# Reference implementation: scikit-learn's GaussianNB trained on the same
# training split, so its numbers can be compared with the custom model.
# (`fit` returns the estimator, so it can be chained onto the constructor.)
gnb = GaussianNB().fit(X_train, y_train)

# Label the held-out samples
y_pred_sklearn = gnb.predict(X_test)

# Same three summaries as for the custom model
print("Sklearn GaussianNB Accuracy:", accuracy_score(y_test, y_pred_sklearn))
print("\nClassification Report (Sklearn):")
print(
    classification_report(
        y_test,
        y_pred_sklearn,
        target_names=iris.target_names,
    )
)

print("Confusion Matrix (Sklearn):")
print(confusion_matrix(y_test, y_pred_sklearn))


Sklearn GaussianNB Accuracy: 0.9666666666666667

Classification Report (Sklearn):
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        10
  versicolor       1.00      0.90      0.95        10
   virginica       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30

Confusion Matrix (Sklearn):
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]


In [5]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Reload Iris so this section does not depend on arrays from earlier cells
iris = load_iris()
X, y = iris.data, iris.target

# NOTE(review): this rebinds X_train / X_test / y_train / y_test from the
# earlier cells, and unlike the first split it passes no `stratify`
# argument. Re-running an earlier evaluation cell after this one will
# therefore use a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter candidates: every K from 1 to 20, with and without
# distance weighting of the neighbours' votes
param_grid = {
    'n_neighbors': range(1, 21),
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated exhaustive search over the grid, run on the
# training split only
knn = KNeighborsClassifier()
grid = GridSearchCV(knn, param_grid, cv=5)
grid.fit(X_train, y_train)

print("Best parameters found:", grid.best_params_)

# With GridSearchCV's default refit=True the winning configuration has
# already been refitted on the whole training split; just pick it up here.
best_knn = grid.best_estimator_

# Score the selected model on the held-out test split
y_pred = best_knn.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Best parameters found: {'n_neighbors': 3, 'weights': 'uniform'}
Accuracy: 1.0
