In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Load the iris feature matrix and labels, then hold out 20% as a test set.
# `stratify=y` keeps the class proportions the same in both partitions and
# `random_state=42` makes the split reproducible.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [3]:
# --- Fit: per-class prior π_k, feature means μ_kj and variances σ^2_kj ---
def _fit_gaussian_nb(features, labels, eps=1e-9):
    """Estimate Gaussian naive Bayes parameters from labelled data.

    Returns a tuple ``(classes, priors, means, vars_)``: the sorted unique
    labels, followed by three dicts keyed by label holding the class
    frequency, the per-feature mean and the per-feature variance.
    ``eps`` is added to every variance (var-smoothing).
    """
    labels_seen = np.unique(labels)
    class_prior, class_mean, class_var = {}, {}, {}
    for label in labels_seen:
        member = labels == label            # boolean mask of rows in this class
        rows = features[member]
        class_prior[label] = np.mean(member)
        class_mean[label] = rows.mean(axis=0)
        class_var[label] = rows.var(axis=0) + eps
    return labels_seen, class_prior, class_mean, class_var


classes, priors, means, vars_ = _fit_gaussian_nb(X_train, y_train)


In [5]:

# --- Predict using log-likelihood (Gaussian) + log prior ---
def log_gauss(x, mu, var):
    """Log-density of independent (diagonal) Gaussians, summed over features.

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_features) or (n_features,)
        One sample per row, or a single sample.
    mu, var : ndarray of shape (n_features,)
        Per-feature mean and variance.

    Returns
    -------
    ndarray of shape (n_samples,), or a scalar for a single 1-D sample.
    """
    # Features live on the last axis; summing over axis=-1 (instead of a
    # hard-coded axis=1) gives the same result for 2-D input and also
    # accepts a single 1-D sample.
    return -0.5 * np.sum(np.log(2 * np.pi * var) + ((x - mu) ** 2) / var, axis=-1)

def predict(X):
    """Predict a class label for every row of X.

    Scores each class with the Gaussian log-likelihood plus the log prior,
    using the module-level ``classes``, ``priors``, ``means`` and ``vars_``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features) or (n_features,)
        Samples to classify; a 1-D input is treated as a single sample.

    Returns
    -------
    ndarray of shape (n_samples,)
        Predicted labels drawn from ``classes``.
    """
    X = np.atleast_2d(X)
    # One column of joint log-scores per class, in the order of `classes`.
    scores = np.column_stack(
        [log_gauss(X, means[k], vars_[k]) + np.log(priors[k]) for k in classes]
    )                                      # shape: [n_samples, n_classes]
    # argmax gives a column position, not a label. Map it back through
    # `classes` so the result is also correct when labels are not 0..K-1.
    return np.asarray(classes)[np.argmax(scores, axis=1)]

# Score the from-scratch classifier on the held-out test split.
y_pred = predict(X_test)
scratch_accuracy = accuracy_score(y_test, y_pred)
print("From-scratch GNB accuracy:", scratch_accuracy)


From-scratch GNB accuracy: 0.9666666666666667


Q-1 (ii)

In [6]:
from sklearn.naive_bayes import GaussianNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Reload iris and redo the stratified 80/20 split with random_state=42.
X, y = load_iris(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# fit() returns the estimator itself, so construction and training chain.
gnb = GaussianNB().fit(X_tr, y_tr)
y_hat = gnb.predict(X_te)
print("sklearn GaussianNB accuracy:", accuracy_score(y_te, y_hat))


sklearn GaussianNB accuracy: 0.9666666666666667


Q-2

In [7]:
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Iris data with a plain (non-stratified) 80/20 hold-out split.
# NOTE(review): this rebinds X_train / X_test / y_train / y_test, which the
# earlier from-scratch cells also use; re-running those cells after this one
# would silently pick up this split instead.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator; n_neighbors is supplied by the search below.
knn = KNeighborsClassifier()

# Candidate neighbourhood sizes: K = 1 .. 20.
param_grid = {"n_neighbors": range(1, 21)}

# Exhaustive search over K with 5-fold cross-validation on the training split.
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)

# Report the selected K and its cross-validated score.
print("Best K:", grid.best_params_["n_neighbors"])
print("Best Accuracy:", grid.best_score_)


Best K: 3
Best Accuracy: 0.9583333333333334
