In [1]:
import numpy as np
import pandas as pd

In [5]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        probabilities = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            probability = np.mean(k_nearest_labels)  # Proportion of positive class
            probabilities.append(probability)
        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric: choose 'euclidean' or 'manhattan'.")

In [12]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    
    # handling categorical variables
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True)

    # aligning columns of test data with training data
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # splitting features and labels
    X = train_data.drop(['Exited', 'CustomerId', 'Surname'], axis=1).values
    y = train_data['Exited'].values
    X_test = test_data.drop(['CustomerId', 'Surname'], axis=1).values

    # scaling features by standardization
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std
    X_test = (X_test - mean) / std

    return X, y, X_test


In [13]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores

    indices = np.arange(len(X))
    np.random.shuffle(indices)
    fold_sizes = len(X) // n_splits
    scores = []
    
    for fold in range(n_splits):
        test_indices = indices[fold * fold_sizes:(fold + 1) * fold_sizes]
        train_indices = np.concatenate([indices[:fold * fold_sizes], indices[(fold + 1) * fold_sizes:]])

        X_train, X_val = X[train_indices], X[test_indices]
        y_train, y_val = y[train_indices], y[test_indices]

        knn.fit(X_train, y_train)
        val_predictions = knn.predict(X_val)
        
        # calculating AUC
        thresholds = np.sort(np.unique(val_predictions))
        tpr = []
        fpr = []

        for threshold in thresholds:
            tp = np.sum((val_predictions >= threshold) & (y_val == 1))
            fp = np.sum((val_predictions >= threshold) & (y_val == 0))
            fn = np.sum((val_predictions < threshold) & (y_val == 1))
            tn = np.sum((val_predictions < threshold) & (y_val == 0))

            # true positive rate
            tpr.append(tp / (tp + fn) if (tp + fn) > 0 else 0)
            # false positive rate
            fpr.append(fp / (fp + tn) if (fp + tn) > 0 else 0)

        # Calculate AUC using the trapezoidal rule
        scores.append(np.trapz(tpr, fpr))
    
    return scores

In [14]:
# Load and preprocess data
X, y, X_test = preprocess_data('~/github-repos/CS506/beaudion-assignment-5/train.csv', '~/github-repos/CS506/beaudion-assignment-5/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
# Hyperparameter tuning

# hyperparameter tuning function
def tune_hyperparameters(X, y, k_values, n_splits=5):
    best_k = None
    best_score = 0

    for k in k_values:
        knn = KNN(k=k, distance_metric='euclidean')
        score = cross_validate(X, y, knn, n_splits)
        print(f"Mean AUC score for k={k}: {score}")

        if score > best_score:
            best_score = score
            best_k = k

    return best_k, best_score

# testing k from 1 to 20
k_values = range(1, 21)
# performing hyperparameter tuning
best_k, best_score = tune_hyperparameters(X, y, k_values)
print(f"Best k: {best_k} with AUC score: {best_score}")


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('~/github-repos/CS506/beaudion-assignment-5/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.7416870042708968, 0.7541248979406695, 0.7621252760467611, 0.7705018850401884, 0.7543799160007822]
Best K: 5
Best Cross-validation AUC: 0.7565637958598594
