In [1]:
import numpy as np
import pandas as pd

In [2]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = np.array(y) 

    def predict(self, X):
        predictions = []
        for X_test_sample in X:
            distances = self.compute_distance(self.X_train, X_test_sample.reshape(1, -1))
            k_indices = distances.argsort(axis=0)[:self.k].flatten()
            k_labels = self.y_train[k_indices]
            k_distances = distances[k_indices]
        
            weights = 1 / (k_distances + 1e-5)  # Avoid division by zero
            weighted_avg = np.sum(weights * k_labels) / np.sum(weights)
            predictions.append(weighted_avg)
        return np.array(predictions)


    def compute_distance(self, X1, X2):
        X1 = np.array(X1, dtype=np.float64)
        X2 = np.array(X2, dtype=np.float64)
        # Compute distances between each row in X1 and X2
        if self.distance_metric == 'euclidean':
            # Using broadcasting to compute Euclidean distance
            distances = np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Using broadcasting to compute Manhattan distance
            distances = np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")
        return distances

In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop irrelevant features
    train_data = train_data.drop(['id', 'CustomerId', 'Surname'], axis=1)

    # Separate features (X) and target (y) from training data
    X_train = train_data.drop(['Exited'], axis=1)
    y_train = train_data['Exited']
    X_test = test_data.drop(['id'], axis=1)
    
    # Identify numerical and categorical columns
    numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Scale the numerical columns
    for col in numerical_cols:
        mean = X_train[col].mean()
        std = X_train[col].std()

        if std == 0:
            X_train[col] = 0
            X_test[col] = 0
        else:
            X_train[col] = (X_train[col] - mean) / std
            X_test[col] = (X_test[col] - mean) / std 

    # One-Hot Encode categorical columns
    X_train_cat = pd.get_dummies(X_train[categorical_cols], drop_first=True)
    X_test_cat = pd.get_dummies(X_test[categorical_cols], drop_first=True)

    # Align the one-hot encoded categorical columns between train and test data
    X_train_cat, X_test_cat = X_train_cat.align(X_test_cat, join='left', axis=1, fill_value=0)

    # Combine the scaled numerical and one-hot encoded categorical columns
    X_train_final = np.hstack([X_train[numerical_cols].values, X_train_cat.values])
    X_test_final = np.hstack([X_test[numerical_cols].values, X_test_cat.values])
    
    return X_train_final, y_train, X_test_final

In [4]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    fold_size = len(X) // n_splits
    auc_scores = []

    for fold in range(n_splits):
        # Split the data into training and validation sets
        start = fold * fold_size
        end = start + fold_size

        X_val = X[start:end]
        y_val = y[start:end]

        X_train = np.concatenate([X[:start], X[end:]], axis=0)
        y_train = np.concatenate([y[:start], y[end:]], axis=0)

        # Train the model on the training set
        knn.fit(X_train, y_train)

        # Predict on the validation set
        y_val_pred = knn.predict(X_val)

        # AUC Calculation
        sorted_indices = np.argsort(y_val_pred)
        y_val_sorted = np.array(y_val)[sorted_indices]

        # Count positives and negatives
        P = np.sum(y_val_sorted)
        N = len(y_val_sorted) - P

        # Calculate True Positive Rate (TPR) and False Positive Rate (FPR)
        TPR = np.cumsum(y_val_sorted) / P
        FPR = np.cumsum(1 - y_val_sorted) / N

        # Calculate the AUC (using the trapezoidal rule)
        auc = np.trapz(TPR, FPR)
        auc_scores.append(auc)

    # Return the mean AUC score from all splits
    return np.mean(auc_scores)


In [5]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

best_auc = 0
best_k = None
best_metric = None

for k in [3,5, 7]:
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        cv_scores = cross_validate(X, y, knn)
        mean_auc = np.mean(cv_scores)
        print(f'k={k}, metric={metric}, CV AUC={mean_auc:.4f}')
        if mean_auc > best_auc:
            best_auc = mean_auc
            best_k = k
            best_metric = metric

print(f'Best hyperparameters: k={best_k}, metric={best_metric}, AUC={best_auc:.4f}')

knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

k=3, metric=euclidean, CV AUC=0.1466
k=3, metric=manhattan, CV AUC=0.1482
k=5, metric=euclidean, CV AUC=0.1200
k=5, metric=manhattan, CV AUC=0.1217
k=7, metric=euclidean, CV AUC=0.1099
k=7, metric=manhattan, CV AUC=0.1101
Best hyperparameters: k=3, metric=manhattan, AUC=0.1482
