# In [None]:
# ==============================================================
# ASSIGNMENT #02: Implementation and Comparison of Classification Algorithms
# Course: CS-471 Machine Learning (BEE-14)
# Student: Irfa Farooq
# ==============================================================

# Importing important libraries
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Part 1: Dataset Selection and Preprocessing

# Load Iris and keep a DataFrame copy with a readable species-name column.
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['species'] = iris.target_names[iris.target]

# Quick look at the first 5 rows of the dataset
print("Dataset sample:\n", df.head())

# Binary task: Setosa (label 0) vs Versicolor (label 1),
# described by the two petal measurements only.
class_labels = {'setosa': 0, 'versicolor': 1}
petal_features = ['petal length (cm)', 'petal width (cm)']
df_binary = df[df['species'].isin(list(class_labels))]
X = df_binary[petal_features].to_numpy()
y = df_binary['species'].map(class_labels).to_numpy()

# Report the preprocessing choices made above.
print("\nDataset only contains 'setosa' and 'versicolor' classes.")
print("Using only 'petal length' and 'petal width' features for clear separation.")
# The petal features were chosen because they separate these two classes
# almost perfectly with a straight line.

# 80/20 train-test split; stratify=y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using statistics learned from the training split only, then
# apply the same transform to the test split (helps the gradient-based
# Logistic Regression and SVM implementations below).
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Part 2: From-Scratch Implementations

# ---------------- Logistic Regression ----------------
def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), evaluated stably.

    Args:
        z: Scalar or array-like of real values.

    Returns:
        A NumPy float scalar for scalar input, otherwise an array with the
        same shape as ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) is always in (0, 1], so it can never overflow; the naive
    # exp(-z) overflows (with a RuntimeWarning) for large negative z.
    decay = np.exp(-np.abs(z))
    out = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # n-d arrays untouched.
    return out[()]

def logistic_train(X, y, lr=0.01, epochs=1000):
    """Fit logistic regression with full-batch gradient descent.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: Target values, one per row of ``X``.
        lr: Learning rate applied to each gradient step.
        epochs: Number of full passes (gradient steps) to run.

    Returns:
        Tuple ``(w, b)`` of the learned weight vector and bias.
    """
    n_samples, n_features = X.shape
    weights = np.zeros(n_features)  # start from the zero vector
    bias = 0
    for _ in range(epochs):
        # Prediction error for every sample under the current parameters.
        residual = sigmoid(np.dot(X, weights) + bias) - y
        grad_w = (1 / n_samples) * np.dot(X.T, residual)
        grad_b = (1 / n_samples) * np.sum(residual)
        weights -= lr * grad_w
        bias -= lr * grad_b
    return weights, bias

def logistic_predict(X, w, b):
    """Predict 0/1 labels from a trained logistic-regression model.

    Args:
        X: 2-D array of samples by features.
        w: Weight vector.
        b: Bias term.

    Returns:
        Integer array with 1 where the predicted probability is at
        least 0.5 and 0 elsewhere.
    """
    probabilities = sigmoid(np.dot(X, w) + b)
    is_positive = probabilities >= 0.5
    return is_positive.astype(int)

# ---------------- Support Vector Machine ----------------
def svm_train(X, y, lr=0.01, epochs=1000, C=1.0):
    """Train a linear SVM by per-sample sub-gradient descent on hinge loss.

    Args:
        X: 2-D array of samples by features.
        y: Labels in {0, 1}; 0 is mapped to -1 and everything else to +1.
        lr: Learning rate.
        epochs: Number of passes over the data; its reciprocal is also
            used as the regularization strength.
        C: Weight of the hinge-loss term.

    Returns:
        Tuple ``(w, b)`` of the learned weight vector and bias.
    """
    targets = np.where(y == 0, -1, 1)  # labels as -1 / +1
    _, n_features = X.shape
    w = np.zeros(n_features)
    b = 0
    for _ in range(epochs):
        for sample, target in zip(X, targets):
            reg_grad = 2 * (1 / epochs) * w
            if target * (np.dot(sample, w) + b) >= 1:
                # Margin satisfied: only the regularization term shrinks w.
                w -= lr * reg_grad
                continue
            # Margin violated: regularization plus the hinge-loss gradient.
            w -= lr * (reg_grad - sample * target * C)
            b += lr * target * C
    return w, b

def svm_predict(X, w, b):
    """Predict -1/+1 labels from a trained linear SVM.

    Args:
        X: 2-D array of samples by features.
        w: Weight vector.
        b: Bias term.

    Returns:
        Float array containing only -1.0 and +1.0.
    """
    scores = np.dot(X, w) + b
    # np.sign would give 0 for a point lying exactly on the hyperplane,
    # which is not a valid label. Such points go to the negative class,
    # matching the caller that maps every non-(+1) prediction to class 0.
    return np.where(scores > 0, 1.0, -1.0)

# ---------------- Gaussian Naive Bayes ----------------
class GaussianNB:
    """Gaussian Naive Bayes classifier implemented from scratch.

    Each feature is modelled per class as an independent normal
    distribution; prediction picks the class with the largest log-posterior.

    Args:
        var_smoothing: Fraction of the largest overall feature variance
            that is added to every per-class variance. This keeps the
            variances strictly positive when a feature is constant inside
            a class, which would otherwise produce log(0) and 0/0.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Args:
            X: 2-D array of samples by features.
            y: Class label for each row of ``X``.

        Returns:
            The fitted estimator (``self``).
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.priors = {}
        # Small variance floor, scaled to the data.
        epsilon = self.var_smoothing * X.var(axis=0).max()
        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = X_c.mean(axis=0)             # Mean per class
            self.var[c] = X_c.var(axis=0) + epsilon     # Smoothed variance per class
            self.priors[c] = X_c.shape[0] / X.shape[0]  # Prior probability
        return self

    def predict(self, X):
        """Return the predicted class label for every row of ``X``."""
        return np.array([self._predict(x) for x in X])

    def _predict(self, x):
        """Return the class with the highest log-posterior for one sample."""
        posteriors = []
        for c in self.classes:
            prior = np.log(self.priors[c])
            # Log of the product of per-feature normal densities.
            likelihood = -0.5 * np.sum(np.log(2 * np.pi * self.var[c]))
            likelihood -= 0.5 * np.sum(((x - self.mean[c]) ** 2) / self.var[c])
            posteriors.append(prior + likelihood)
        return self.classes[np.argmax(posteriors)]

# Part 3: Model Training

# Train from-scratch models
w_log, b_log = logistic_train(X_train, y_train)
y_pred_log = logistic_predict(X_test, w_log, b_log)

w_svm, b_svm = svm_train(X_train, y_train)
# svm_predict works with -1/+1 labels; map them back to 0/1 for the metrics.
y_pred_svm = np.where(svm_predict(X_test, w_svm, b_svm) == 1, 1, 0)

nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Train built-in classifiers.
# These estimators use randomness internally, so they are seeded with the
# same random_state as the train/test split to make reruns reproducible.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
ada = AdaBoostClassifier(random_state=42).fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_pred_rf = rf.predict(X_test)
y_pred_ada = ada.predict(X_test)

# Part 4: Evaluation

# Test-set predictions of every model, keyed by display name.
models = {
    'Logistic Regression': y_pred_log,
    'SVM': y_pred_svm,
    'Naive Bayes': y_pred_nb,
    'Decision Tree': y_pred_dt,
    'Random Forest': y_pred_rf,
    'AdaBoost': y_pred_ada
}

# One row per model: [name, accuracy, precision, recall, F1].
results = []
for name, y_pred in models.items():
    acc, prec, rec, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    results.append([name, acc, prec, rec, f1])
    # Report the metrics for this model
    print(f"{name}: Acc={acc:.2f}, Prec={prec:.2f}, Rec={rec:.2f}, F1={f1:.2f}")

results_df = pd.DataFrame(
    results,
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
)
print("\nModel Comparison:\n", results_df)

# Note on the evaluation:
# With only 'petal length' and 'petal width', the two classes are expected to
# be linearly separable, so every model is expected to reach perfect scores
# (Accuracy = Precision = Recall = F1 = 1.0), i.e. no false positives or
# false negatives. Confirm against the table printed above.

# Part 5: Visualization

# --- Confusion Matrices ---
# One small heatmap figure per model, shown one after another.
for name, y_pred in models.items():
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'{name} - Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

# --- Accuracy Comparison Bar Plot ---
# One bar per model, read from the comparison table built during evaluation.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=results_df, x='Model', y='Accuracy', ax=ax)
ax.set_title('Model Accuracy Comparison')
plt.xticks(rotation=45)  # tilt the model names
plt.show()

# --- 2D Decision Boundary Visualizations ---
# The grid is built in original (unscaled) feature units so the axes read in
# centimetres, then passed through the fitted scaler because the models were
# trained on standardized features.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
                     np.linspace(y_min, y_max, 300))
grid = np.c_[xx.ravel(), yy.ravel()]
grid_scaled = scaler.transform(grid)

# Predict on grid for visualization
zz_log = (sigmoid(np.dot(grid_scaled, w_log) + b_log) >= 0.5).astype(int).reshape(xx.shape)
# Keep the raw SVM scores: their -1 / 0 / +1 level sets are the two margins
# and the decision boundary drawn below.
svm_scores = (np.dot(grid_scaled, w_svm) + b_svm).reshape(xx.shape)
zz_svm = np.sign(svm_scores)
zz_nb = nb.predict(grid_scaled).reshape(xx.shape)

fig, axs = plt.subplots(1, 3, figsize=(18, 5))

# Explicit legend handles: the scatter points are coloured through a colormap,
# so the class-name legend entries are built by hand.
handles = [plt.Line2D([], [], marker='o', color='w', markerfacecolor='blue', markersize=8),
           plt.Line2D([], [], marker='o', color='w', markerfacecolor='red', markersize=8)]

# All three panels share the same layout; only the title and the predicted
# grid differ.
panels = [
    ('Logistic Regression', zz_log),
    ('Support Vector Machine', zz_svm),
    ('Naive Bayes', zz_nb),
]
for ax, (title, zz) in zip(axs, panels):
    ax.contourf(xx, yy, zz, alpha=0.3, cmap='coolwarm')
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', edgecolors='k')
    ax.set_title(title)
    ax.set_xlabel('Petal Length (cm)')
    ax.set_ylabel('Petal Width (cm)')
    ax.legend(handles, ['Setosa', 'Versicolor'], loc='upper right')

# --- SVM decision boundary + margins ---
# Solid line: decision boundary (score = 0). Dashed lines: margins (score = -1, +1).
axs[1].contour(xx, yy, svm_scores, levels=[-1, 0, 1],
               colors='k', linestyles=['--', '-', '--'], linewidths=1)

plt.tight_layout()
plt.show()


# (end of notebook cell)