In [140]:
import numpy as np
import pandas as pd

In [141]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for i in range(len(X)):
            distances = self.compute_distance(self.X_train, X.iloc[i])  # Use .iloc for row access
            k_neighbors_idx = np.argsort(distances)[:self.k]
            k_neighbor_labels = self.y_train.iloc[k_neighbors_idx].values.astype(int)  # Convert labels to int
            majority_vote = np.bincount(k_neighbor_labels).argmax()
            predictions.append(majority_vote)
        return np.array(predictions)
    
    def predict_proba(self, X):
        probabilities = []
        for i in range(len(X)):
            distances = self.compute_distance(self.X_train, X.iloc[i])

            k_neighbors_idx = np.argsort(distances)[:self.k]
            k_neighbor_labels = self.y_train.iloc[k_neighbors_idx].values.astype(int)

            prob_of_class_1 = np.mean(k_neighbor_labels == 1)

            probabilities.append([1 - prob_of_class_1, prob_of_class_1])

        return np.array(probabilities)
    
    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [142]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load the train and test data files
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values for numeric columns
    numeric_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 
                       'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
    train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].median())
    test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].median())

    # One-hot encode categorical features (Geography, Gender)
    categorical_features = ['Geography', 'Gender']
    train_data = pd.get_dummies(train_data, columns=categorical_features)
    test_data = pd.get_dummies(test_data, columns=categorical_features)

    # Convert only the boolean one-hot encoded columns to integers
    boolean_columns = train_data.select_dtypes(include=['bool']).columns
    train_data[boolean_columns] = train_data[boolean_columns].astype(int)
    test_data[boolean_columns] = test_data[boolean_columns].astype(int)

    # Make sure the test and train datasets have the same dummy columns
    for col in set(train_data.columns) - set(test_data.columns):
        if col.startswith('Geography_') or col.startswith('Gender_'):
            test_data[col] = 0  # Add missing columns to test set
    
    for col in set(test_data.columns) - set(train_data.columns):
        if col.startswith('Geography_') or col.startswith('Gender_'):
            train_data[col] = 0  # Add missing columns to train set

    # Manual Z-score Standardization
    for col in numeric_columns:
        mean = train_data[col].mean()
        std = train_data[col].std()
        # Apply Z-score standardization
        train_data[col] = (train_data[col] - mean) / std
        test_data[col] = (test_data[col] - mean) / std  # Apply the same mean and std from training data to test data

    # Reset index after preprocessing to ensure indices align correctly
    X_train = train_data.drop(columns=['Exited', 'CustomerId', 'Surname', 'id']).reset_index(drop=True)
    y_train = train_data['Exited'].reset_index(drop=True)
    X_test = test_data.drop(columns=['CustomerId', 'Surname', 'id']).reset_index(drop=True)

    return X_train, y_train, X_test

In [143]:
def calculate_auc(y_true, y_scores):
    # Sort by predicted probabilities
    sorted_indices = np.argsort(y_scores)[::-1]
    y_true_sorted = np.array(y_true)[sorted_indices]
    
    # Calculate the true positive and false positive rates
    tpr = []
    fpr = []
    positives = sum(y_true)
    negatives = len(y_true) - positives
    
    tp = 0  # true positives
    fp = 0  # false positives
    
    for i in range(len(y_true_sorted)):
        if y_true_sorted[i] == 1:
            tp += 1
        else:
            fp += 1
        tpr.append(tp / positives)
        fpr.append(fp / negatives)
    
    # Calculate AUC using the trapezoidal rule (approximation)
    auc = 0
    for i in range(1, len(tpr)):
        auc += (fpr[i] - fpr[i - 1]) * (tpr[i] + tpr[i - 1]) / 2
    
    return auc

# Define cross-validation function
def cross_validate(X, y, model, n_splits=5):
    n_samples = len(X)
    indices = np.arange(n_samples)
    np.random.shuffle(indices)

    fold_size = n_samples // n_splits
    auc_scores = []

    for fold in range(n_splits):
        val_indices = indices[fold * fold_size:(fold + 1) * fold_size]
        train_indices = np.concatenate([indices[:fold * fold_size], indices[(fold + 1) * fold_size:]])

        X_train, X_val = X.iloc[train_indices], X.iloc[val_indices]
        y_train, y_val = y.iloc[train_indices], y.iloc[val_indices]
        
        model.fit(X_train, y_train)
        y_pred_proba = model.predict_proba(X_val)[:, 1]

        # Calculate AUC manually
        auc = calculate_auc(y_val, y_pred_proba)
        auc_scores.append(auc)
    
    return np.mean(auc_scores)

In [144]:
# Preprocess the data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Ensure the indices are properly reset
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Hyperparameter tuning
best_k = None
best_metric = None
best_auc = 0

for k in [3, 5, 7]:
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        avg_auc = np.mean(scores)
        print(f"K: {k}, Metric: {metric}, AUC: {avg_auc}")
        if avg_auc > best_auc:
            best_auc = avg_auc
            best_k = k
            best_metric = metric

print(f"Best K: {best_k}, Best Metric: {best_metric}, Best AUC: {best_auc}")

# Train on full dataset with optimal hyperparameters and make predictions on test set
# Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)

# Predict probabilities for the positive class (Exited = 1)
test_probabilities = knn.predict_proba(X_test)[:, 1]  # [:, 1] extracts the probability of class 1 (Exited = 1)


# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_probabilities}).to_csv('submissions.csv', index=False)

K: 3, Metric: euclidean, AUC: 0.850567991842915
K: 3, Metric: manhattan, AUC: 0.8473855597739212
K: 5, Metric: euclidean, AUC: 0.8752849164943468
K: 5, Metric: manhattan, AUC: 0.8726639874286265
K: 7, Metric: euclidean, AUC: 0.8867646236275336
K: 7, Metric: manhattan, AUC: 0.8874669237709177
Best K: 7, Best Metric: manhattan, Best AUC: 0.8874669237709177
