In [40]:
import numpy as np
import pandas as pd

In [41]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        probabilities = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            probability = np.mean(k_nearest_labels)  # Proportion of positive class
            probabilities.append(probability)
        return np.array(probabilities)
    

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric: choose 'euclidean' or 'manhattan'.")

In [42]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    
    ## Handle categorical variables
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # Align columns of test data with training data
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # Split features and labels
    X = train_data.drop(['Exited', 'CustomerId', 'Surname'], axis=1).values
    y = train_data['Exited'].values
    X_test = test_data.drop(['CustomerId', 'Surname'], axis=1).values

    # Scale features (standardization)
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std
    X_test = (X_test - mean) / std

    return X, y, X_test


In [43]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores

    # TODO: Implement cross-validation
    # Compute ROC AUC scores

    fold_size = len(X) // n_splits
    auc_scores = []
    
    for i in range(n_splits):
        val_indices = range(i * fold_size, (i + 1) * fold_size)
        train_indices = list(set(range(len(X))) - set(val_indices))
        
        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]
        
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_val)

        # Compute AUC score using a simple method (you can use sklearn for a more precise calculation)
        # Here we can calculate a simple metric based on confusion matrix values
        tp = np.sum((predictions == 1) & (y_val == 1))
        fp = np.sum((predictions == 1) & (y_val == 0))
        tn = np.sum((predictions == 0) & (y_val == 0))
        fn = np.sum((predictions == 0) & (y_val == 1))
        
        # Calculate AUC-like score (simplified)
        if tp + fp > 0 and tp + fn > 0:
            sensitivity = tp / (tp + fn)
            specificity = tn / (tn + fp)
            auc_score = (sensitivity + specificity) / 2
            auc_scores.append(auc_score)

    return auc_scores

In [44]:
# Load and preprocess data
X, y, X_test = preprocess_data('~/github-repos/CS506/beaudion-assignment-5/train.csv', '~/github-repos/CS506/beaudion-assignment-5/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
# Hyperparameter tuning
best_k = 3
best_auc = 0
for k in range(1, 21):  # Testing values from 1 to 20
    knn = KNN(k=k)
    cv_scores = cross_validate(X, y, knn)
    mean_auc = np.mean(cv_scores)
    if mean_auc > best_auc:
        best_auc = mean_auc
        best_k = k

print("Best K:", best_k)
print("Best Cross-validation AUC:", best_auc)


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('~/github-repos/CS506/beaudion-assignment-5/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.7908235100156623, 0.7924181754874652, 0.7963511343103794, 0.8318957743607437, 0.7791763132540516]
Best K: 8
Best Cross-validation AUC: 0.7992436097323738
