In [10]:
# Research AI Task 9

# - Perform PCA on the given dataset
# - Implement a Support Vector Machine (SVM) from scratch
# - Implement Logistic Regression from scratch
# - Compare SVM and Logistic Regression, with and without PCA

# ASSUMPTIONS
# -----------
# - The dataset is a CSV file.
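# - All columns except the LAST one are candidate features; known identifier/text
#   columns (PassengerId, Name, Sex, Ticket, Cabin, Embarked) are dropped and the
#   remaining columns must be numeric.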
# - The LAST column is the target label (can be string or numeric).
# - Works for binary or multi-class classification (one-vs-rest).

# HOW TO RUN
# ----------
# python task9_pca_svm_lr.py


import numpy as np
import pandas as pd

# ==============================
# 1. Load Dataset
# ==============================

# Change this to your CSV file name if needed
DATA_PATH = "/content/titanic (3).csv"

data = pd.read_csv(DATA_PATH)

# Labels: last column of the CSV.
# NOTE(review): DATA_PATH points at a Titanic file; confirm that its last
# column really is the intended target before trusting the results.
y_raw = data.iloc[:, -1].values

# Features: all columns except the last one
features_df = data.iloc[:, :-1]

# Identifier / free-text columns (common in the Titanic dataset) removed up front
columns_to_drop_from_features = ['PassengerId', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

# errors='ignore' keeps this working when some of the listed columns are absent
candidate_features_df = features_df.drop(
    columns=columns_to_drop_from_features, errors='ignore'
)

# Keep only numeric/boolean columns so the float conversion below cannot fail
# on a text column that is missing from the hard-coded list above.
numeric_features_df = candidate_features_df.select_dtypes(include=['number', 'bool'])
skipped_columns = [
    col for col in candidate_features_df.columns
    if col not in numeric_features_df.columns
]
if skipped_columns:
    print("Skipping non-numeric feature columns:", skipped_columns)
if numeric_features_df.shape[1] == 0:
    raise ValueError("No numeric feature columns found in " + DATA_PATH)

# Missing values stay as NaN here; they are expected to be imputed later.
X = numeric_features_df.to_numpy(dtype=float, na_value=np.nan)

print("Dataset shape:", X.shape)

# Ensure y_raw is consistently string type; missing labels are kept as their
# own 'NaN_label' class rather than being dropped.
y_raw = pd.Series(y_raw).fillna('NaN_label').astype(str).values
print("Unique labels (original):", np.unique(y_raw))

# Encode labels to integers 0..K-1 (classes[i] is the original label of code i)
classes, y_int = np.unique(y_raw, return_inverse=True)
y = y_int               # 0..K-1
num_classes = len(classes)

print("Number of classes:", num_classes)

# ==============================
# Imputation (from scratch) - Added to handle NaNs
# ==============================
class SimpleImputer:
    """Replace NaN entries of a 2-D numeric array column by column.

    Parameters
    ----------
    strategy : {'mean', 'median', 'constant'}
        'mean' / 'median' use the NaN-ignoring statistic of each column seen
        in ``fit``; 'constant' fills with 0.0.

    Attributes
    ----------
    fill_value : ndarray of shape (n_features,) or None
        Per-column fill values learned by ``fit`` (None before fitting).
        An entry is NaN when the fitted column contained only NaNs; such
        columns are filled with 0.0 by ``transform``.
    """

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.fill_value = None

    def fit(self, X):
        """Learn the per-column fill values from X and return self.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported names.
        """
        import warnings

        if self.strategy == 'constant':
            # Zero is the only constant supported.
            self.fill_value = np.zeros(X.shape[1])
            return self

        if self.strategy == 'mean':
            statistic = np.nanmean
        elif self.strategy == 'median':
            statistic = np.nanmedian
        else:
            raise ValueError(
                "Unknown strategy %r; expected 'mean', 'median' or 'constant'"
                % (self.strategy,)
            )

        # An all-NaN column makes NumPy emit a RuntimeWarning and return NaN;
        # that case is handled explicitly in transform(), so silence the noise.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            self.fill_value = statistic(X, axis=0)
        return self

    def transform(self, X):
        """Return a copy of X with NaNs replaced by the learned fill values.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If X does not have the same number of columns as the fitted data.
        """
        if self.fill_value is None:
            raise RuntimeError("SimpleImputer.transform called before fit")

        X_imputed = np.array(X)  # np.array copies, so the caller's data is untouched
        if X_imputed.shape[1] != self.fill_value.shape[0]:
            raise ValueError(
                "X has %d columns but the imputer was fitted on %d"
                % (X_imputed.shape[1], self.fill_value.shape[0])
            )

        # Fall back to 0.0 for columns whose statistic is itself NaN.
        fill = np.where(np.isnan(self.fill_value), 0.0, self.fill_value)
        missing = np.isnan(X_imputed)
        X_imputed[missing] = np.broadcast_to(fill, X_imputed.shape)[missing]
        return X_imputed

    def fit_transform(self, X):
        """Fit on X, then return the imputed copy of X."""
        self.fit(X)
        return self.transform(X)

# Fill missing feature values with per-column means.
# NOTE: the means are learned from the full X, i.e. before any train/test split.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X = imputer.transform(X)


# ==============================
# 2. Train / Test Split
# ==============================

def train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42):
    """Split X and y row-wise into train and test parts.

    The first ``int(n_samples * test_size)`` positions of the (optionally
    shuffled) row order form the test set; the remaining rows form the
    training set.  Shuffling uses ``np.random.RandomState(random_state)``,
    so the split is reproducible for a fixed seed.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    row_order = np.arange(X.shape[0])
    if shuffle:
        np.random.RandomState(random_state).shuffle(row_order)

    n_test = int(X.shape[0] * test_size)
    test_rows, train_rows = row_order[:n_test], row_order[n_test:]

    return X[train_rows], X[test_rows], y[train_rows], y[test_rows]

# Hold out 20% of the rows for testing (default shuffle and seed).
_split_parts = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = _split_parts

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# ==============================
# 3. Standardization
# ==============================

class StandardScaler:
    """Column-wise standardisation: subtract the mean, divide by the std.

    ``fit`` stores the per-column mean in ``mean_`` and the per-column
    (population) standard deviation in ``std_``.  Columns with zero standard
    deviation get a divisor of 1.0 so that ``transform`` never divides by zero.
    """

    def fit(self, X):
        """Learn ``mean_`` and ``std_`` from X and return self."""
        column_std = X.std(axis=0)
        # Constant columns: use 1.0 so they map to 0 instead of NaN/inf.
        column_std[column_std == 0] = 1.0
        self.mean_ = X.mean(axis=0)
        self.std_ = column_std
        return self

    def transform(self, X):
        """Return X standardised with the statistics learned in ``fit``."""
        centered = X - self.mean_
        return centered / self.std_

    def fit_transform(self, X):
        """Fit on X and return the standardised X."""
        return self.fit(X).transform(X)

# Learn the scaling statistics from the training rows only, then apply the
# same statistics to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ==============================
# 4. PCA (from scratch)
# ==============================

class PCA:
    """Principal component analysis via eigen-decomposition of the covariance.

    Parameters
    ----------
    n_components : int or None
        Fixed number of components to keep.
    var_threshold : float or None
        Used only when ``n_components`` is None: keep the smallest number of
        leading components whose cumulative explained-variance ratio reaches
        this fraction.
    If both are None, all components are kept.

    Attributes (set by ``fit``)
    ---------------------------
    n_components : int
        Number of components actually kept (overwrites the constructor value,
        so callers can read the selected count from it).
    n_components_ : int
        Same value as ``n_components``.
    mean_ : ndarray of shape (d,)
        Per-feature mean of the fitted data; subtracted in ``transform``.
    components_ : ndarray of shape (k, d)
        Principal axes, one per row, sorted by decreasing eigenvalue.
    explained_variance_ratio_ : ndarray of shape (k,)
        Fraction of total variance carried by each kept component.
    """

    def __init__(self, n_components=None, var_threshold=None):
        self.n_components = n_components
        self.var_threshold = var_threshold
        # Value written into n_components by an automatic selection (None if
        # the count was never chosen automatically).  Lets fit() recognise its
        # own earlier choice and re-select on a refit.
        self._auto_selected = None

    def fit(self, X):
        """Compute the principal axes of X (n_samples, d) and return self."""
        X = np.asarray(X, dtype=float)
        n_features = X.shape[1]
        self.mean_ = X.mean(axis=0)

        # np.cov centres the data itself; it returns a 0-d array for a single
        # feature, hence atleast_2d before the eigen-decomposition.
        cov_matrix = np.atleast_2d(np.cov(X.T))  # (d, d)

        # Eigen decomposition of the symmetric covariance matrix
        eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)

        # Sort eigenvalues/vectors in descending order
        order = np.argsort(eigenvalues)[::-1]
        # Round-off can make a zero eigenvalue slightly negative.
        eigenvalues = np.clip(eigenvalues[order], 0.0, None)
        eigenvectors = eigenvectors[:, order]

        total_variance = eigenvalues.sum()
        if total_variance > 0:
            explained_variance_ratio = eigenvalues / total_variance
        else:
            # Constant data: no variance to explain, avoid 0/0.
            explained_variance_ratio = np.zeros_like(eigenvalues)

        # The count is chosen automatically when the caller never fixed it,
        # including when n_components merely holds a previous fit's choice.
        previous_auto = getattr(self, "_auto_selected", None)
        auto = self.n_components is None or (
            previous_auto is not None and self.n_components == previous_auto
        )

        if not auto:
            k = int(self.n_components)
        elif self.var_threshold is not None:
            cumulative = np.cumsum(explained_variance_ratio)
            k = int(np.searchsorted(cumulative, self.var_threshold)) + 1
        else:
            k = n_features
        # The cumulative sum can fall just short of the threshold through
        # rounding; never report more components than there are features.
        k = min(k, n_features)

        self._auto_selected = k if auto else None
        self.n_components = k
        self.n_components_ = k
        self.components_ = eigenvectors[:, :k].T  # (k, d)
        self.explained_variance_ratio_ = explained_variance_ratio[:k]
        return self

    def transform(self, X):
        """Project X onto the kept components after subtracting ``mean_``."""
        centered = np.asarray(X, dtype=float) - self.mean_
        return np.dot(centered, self.components_.T)

    def fit_transform(self, X):
        """Fit on X and return its projection."""
        self.fit(X)
        return self.transform(X)

# Keep the smallest number of leading components whose cumulative explained
# variance reaches the 0.95 threshold; fit on the training rows only.
pca = PCA(var_threshold=0.95)
pca.fit(X_train_scaled)
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print("\nPCA chose components:", pca.n_components)
print("Explained variance ratio of chosen components:")
print(pca.explained_variance_ratio_)

# ==============================
# 5. Logistic Regression (from scratch)
# ==============================

class LogisticRegressionBinary:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size of each gradient update.
    n_iters : int
        Number of gradient steps performed by ``fit``.

    Attributes
    ----------
    w : ndarray of shape (n_features,) or None
        Weight vector (None before ``fit``).
    b : float or None
        Bias term (None before ``fit``).
    """

    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def _sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z)), evaluated without overflow."""
        z = np.asarray(z, dtype=float)
        # exp is only ever taken of a non-positive number, so it cannot
        # overflow; the two branches are algebraically the same function.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        """Fit weights on X (n_samples, n_features) and labels y in {0, 1}.

        Weights and bias start from zero on every call.  Returns self.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape
        self.w = np.zeros(n_features)
        self.b = 0.0

        for _ in range(self.n_iters):
            y_pred = self._sigmoid(np.dot(X, self.w) + self.b)
            residual = y_pred - y

            # Gradient of the mean log-loss w.r.t. weights and bias.
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            self.w -= self.lr * dw
            self.b -= self.lr * db
        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        linear = np.dot(X, self.w) + self.b
        return self._sigmoid(linear)

    def predict(self, X):
        """Return 0/1 predictions using a 0.5 probability threshold."""
        proba = self.predict_proba(X)
        return (proba >= 0.5).astype(int)


class OneVsRestLogistic:
    """
    Multi-class logistic regression using one-vs-rest.

    One LogisticRegressionBinary is trained per distinct label; prediction
    picks the label whose classifier reports the highest probability.

    Parameters
    ----------
    lr, n_iters
        Passed unchanged to every per-class LogisticRegressionBinary.

    Attributes
    ----------
    classes_ : ndarray or None
        Sorted distinct labels seen in ``fit`` (None before fitting).
    classifiers : list
        Fitted binary classifiers, in the same order as ``classes_``.
    """

    def __init__(self, lr=0.01, n_iters=1000):
        self.lr = lr
        self.n_iters = n_iters
        self.classifiers = []
        self.classes_ = None

    def fit(self, X, y):
        """Train one binary classifier per class in y and return self."""
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.classifiers = []

        for c in self.classes_:
            y_binary = (y == c).astype(int)  # 1 for class c, 0 otherwise
            clf = LogisticRegressionBinary(lr=self.lr, n_iters=self.n_iters)
            clf.fit(X, y_binary)
            self.classifiers.append(clf)
        return self

    def predict(self, X):
        """Return, for each row of X, the class with the highest probability.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self.classifiers:
            raise ValueError("OneVsRestLogistic.predict called before fit")

        # Column j holds the probability reported by the classifier of classes_[j].
        all_probs = np.column_stack(
            [clf.predict_proba(X) for clf in self.classifiers]
        )  # (n_samples, n_classes)
        class_indices = np.argmax(all_probs, axis=1)
        return self.classes_[class_indices]

# ==============================
# 6. Linear SVM (from scratch)
# ==============================

class LinearSVMBinary:
    """Binary linear SVM trained by per-sample sub-gradient descent.

    Minimises the hinge loss plus an L2 penalty ``lambda_param * ||w||^2``,
    updating the parameters after every individual sample.

    Parameters
    ----------
    lr : float
        Step size of each update.
    lambda_param : float
        Strength of the L2 penalty on the weights.
    n_iters : int
        Number of full passes over the training data.

    Attributes
    ----------
    w : ndarray of shape (n_features,) or None
        Weight vector (None before ``fit``).
    b : float or None
        Bias term (None before ``fit``).
    """

    def __init__(self, lr=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = lr
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.w = None
        self.b = None

    def fit(self, X, y):
        """
        Fit on X (n_samples, n_features); y should be in {-1, +1}.

        Samples are visited in their given order on every pass.  Returns self.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        self.w = np.zeros(X.shape[1])
        self.b = 0.0

        for _ in range(self.n_iters):
            for x_i, y_i in zip(X, y):
                # The penalty gradient applies to every sample.
                dw = 2 * self.lambda_param * self.w
                db = 0.0
                if y_i * (np.dot(x_i, self.w) + self.b) < 1:
                    # Margin violated: add the hinge-loss sub-gradient.
                    dw = dw - y_i * x_i
                    db = -y_i

                self.w -= self.lr * dw
                self.b -= self.lr * db
        return self

    def decision_function(self, X):
        """Return the signed score w.x + b for each row of X."""
        return np.dot(X, self.w) + self.b

    def predict(self, X):
        """Return labels in {-1.0, +1.0}; a score of exactly 0 maps to +1."""
        scores = self.decision_function(X)
        # np.sign would yield 0 on the boundary, which is not a valid label.
        return np.where(scores >= 0, 1.0, -1.0)


class OneVsRestSVM:
    """
    Multi-class linear SVM using one-vs-rest strategy.

    One LinearSVMBinary is trained per distinct label (+1 for that label,
    -1 for all others); prediction picks the label whose classifier gives
    the highest decision score.

    Parameters
    ----------
    lr, lambda_param, n_iters
        Passed unchanged to every per-class LinearSVMBinary.

    Attributes
    ----------
    classes_ : ndarray or None
        Sorted distinct labels seen in ``fit`` (None before fitting).
    classifiers : list
        Fitted binary classifiers, in the same order as ``classes_``.
    """

    def __init__(self, lr=0.001, lambda_param=0.01, n_iters=1000):
        self.lr = lr
        self.lambda_param = lambda_param
        self.n_iters = n_iters
        self.classifiers = []
        self.classes_ = None

    def fit(self, X, y):
        """Train one binary SVM per class in y and return self."""
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.classifiers = []

        for c in self.classes_:
            y_binary = np.where(y == c, 1, -1)
            clf = LinearSVMBinary(
                lr=self.lr, lambda_param=self.lambda_param, n_iters=self.n_iters
            )
            clf.fit(X, y_binary)
            self.classifiers.append(clf)
        return self

    def predict(self, X):
        """Return, for each row of X, the class with the highest score.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if not self.classifiers:
            raise ValueError("OneVsRestSVM.predict called before fit")

        # Column j holds the decision score of the classifier of classes_[j].
        all_scores = np.column_stack(
            [clf.decision_function(X) for clf in self.classifiers]
        )  # (n_samples, n_classes)
        class_indices = np.argmax(all_scores, axis=1)
        return self.classes_[class_indices]

# ==============================
# 7. Accuracy Metric
# ==============================

def accuracy(y_true, y_pred):
    """Return the fraction of positions where y_pred equals y_true.

    Both inputs may be lists or arrays; they are flattened first so that a
    column vector and a flat vector compare element by element instead of
    broadcasting against each other.

    Returns NaN for empty input.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred differ in length: %d vs %d"
            % (y_true.size, y_pred.size)
        )
    if y_true.size == 0:
        return float("nan")
    return float(np.mean(y_true == y_pred))

# ==============================
# 8. Training and Comparison
# ==============================

def _fit_and_score(model, X_fit, X_eval):
    """Fit `model` on (X_fit, y_train); return (test predictions, test accuracy)."""
    model.fit(X_fit, y_train)
    predictions = model.predict(X_eval)
    return predictions, accuracy(y_test, predictions)


print("\n==========================")
print("WITHOUT PCA")
print("=========================")

# Logistic Regression on the standardized features
log_reg = OneVsRestLogistic(lr=0.01, n_iters=2000)
y_pred_lr, acc_lr = _fit_and_score(log_reg, X_train_scaled, X_test_scaled)
print("Logistic Regression accuracy:", acc_lr)

# Linear SVM on the standardized features
svm = OneVsRestSVM(lr=0.001, lambda_param=0.01, n_iters=1000)
y_pred_svm, acc_svm = _fit_and_score(svm, X_train_scaled, X_test_scaled)
print("Linear SVM accuracy:", acc_svm)

print("\n==========================")
print("WITH PCA")
print("==========================")

# Logistic Regression on the PCA-projected features
log_reg_pca = OneVsRestLogistic(lr=0.01, n_iters=2000)
y_pred_lr_pca, acc_lr_pca = _fit_and_score(log_reg_pca, X_train_pca, X_test_pca)
print("Logistic Regression (PCA) accuracy:", acc_lr_pca)

# Linear SVM on the PCA-projected features
svm_pca = OneVsRestSVM(lr=0.001, lambda_param=0.01, n_iters=1000)
y_pred_svm_pca, acc_svm_pca = _fit_and_score(svm_pca, X_train_pca, X_test_pca)
print("Linear SVM (PCA) accuracy:", acc_svm_pca)

# Side-by-side recap of the four test accuracies, plus the label encoding.
print("\n=========== SUMMARY ==========")
print(f"Logistic Regression  (no PCA): {acc_lr:.4f}")
print(f"Logistic Regression  (PCA)   : {acc_lr_pca:.4f}")
print(f"SVM (no PCA)                 : {acc_svm:.4f}")
print(f"SVM (PCA)                    : {acc_svm_pca:.4f}")
print("Original classes mapping:", dict(enumerate(classes)))
print("===============================")


Dataset shape: (891, 6)
Unique labels (original): ['C' 'NaN_label' 'Q' 'S']
Number of classes: 4
Train shape: (713, 6)
Test shape: (178, 6)

PCA chose components: 6
Explained variance ratio of chosen components:
[0.30657111 0.27796526 0.16027894 0.10473784 0.0894279  0.06101895]

WITHOUT PCA
Logistic Regression accuracy: 0.6685393258426966
Linear SVM accuracy: 0.6629213483146067

WITH PCA
Logistic Regression (PCA) accuracy: 0.6685393258426966
Linear SVM (PCA) accuracy: 0.6629213483146067

Logistic Regression  (no PCA): 0.6685
Logistic Regression  (PCA)   : 0.6685
SVM (no PCA)                 : 0.6629
SVM (PCA)                    : 0.6629
Original classes mapping: {0: 'C', 1: 'NaN_label', 2: 'Q', 3: 'S'}
