In [None]:
import numpy as np
import pandas as pd

In [None]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours model that scores by averaging labels.

    For every query row, ``predict`` finds the ``k`` training rows with the
    smallest distance and returns the mean of their labels. With 0/1 labels
    that mean is the fraction of positive neighbours, which is usable as a
    ranking score.

    Parameters
    ----------
    k : int
        Number of neighbours to average over (must be >= 1).
    distance_metric : str
        ``'euclidean'`` or ``'manhattan'``.
    batch_size : int
        Number of query rows whose distances are computed at once; bounds the
        size of the temporary distance matrix.
    """

    def __init__(self, k=3, distance_metric='euclidean', batch_size=500):
        self.k = k
        self.distance_metric = distance_metric
        self.batch_size = batch_size

    def fit(self, X, y):
        """Store the training data.

        Inputs are coerced to NumPy arrays so that lists and pandas objects
        work with the fancy indexing done in ``predict``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return the mean label of the k nearest training rows per query row.

        Raises
        ------
        ValueError
            If ``k`` or ``batch_size`` is smaller than 1, or the distance
            metric is unknown.
        """
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")
        if self.batch_size < 1:
            raise ValueError(f"batch_size must be at least 1, got {self.batch_size}")

        X = np.asarray(X)
        total_samples = X.shape[0]
        predictions = []

        for start_idx in range(0, total_samples, self.batch_size):
            # Slicing past the end is safe, so no explicit clamp is needed.
            X_batch = X[start_idx:start_idx + self.batch_size]

            distances = self.compute_distance(X_batch, self.X_train)
            k_indices = np.argsort(distances, axis=1)[:, :self.k]
            k_labels = self.y_train[k_indices]

            predictions.extend(np.mean(k_labels, axis=1))

            # Release the large temporaries before the next batch allocates its own.
            del distances, k_indices, k_labels, X_batch

        return np.array(predictions)

    def compute_distance(self, X1, X2):
        """Return the pairwise distance matrix between rows of X1 and rows of X2.

        The result has one row per row of ``X1`` and one column per row of
        ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        if self.distance_metric == 'euclidean':
            # ||a - b||^2 = ||a||^2 - 2 a.b + ||b||^2, evaluated with one matmul.
            X1_squared = np.sum(X1 ** 2, axis=1).reshape(-1, 1)
            X2_squared = np.sum(X2 ** 2, axis=1).reshape(1, -1)
            cross_term = np.dot(X1, X2.T)
            # Clamp at zero: rounding can make the expansion slightly negative.
            distances_squared = np.maximum(X1_squared - 2 * cross_term + X2_squared, 0)
            distances = distances_squared ** 0.5
        elif self.distance_metric == 'manhattan':
            # Broadcasts to (len(X1), len(X2), n_features) before reducing.
            distances = np.sum(np.abs(X1[:, np.newaxis] - X2), axis=2)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")
        return distances

In [None]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into scaled feature matrices.

    Steps:
      1. Drop the target (``Exited``) and identifier columns from the features.
      2. One-hot encode the categorical columns on train and test together so
         both end up with the same dummy columns.
      3. Min-max scale every column to [0, 1] using the combined train+test
         range; columns with no spread are set to 0.

    Parameters
    ----------
    train_path : str or path-like
        CSV containing the features plus ``Exited``, ``CustomerId`` and
        ``Surname`` columns.
    test_path : str or path-like
        CSV with the same feature columns but no ``Exited`` column.

    Returns
    -------
    X_train_processed : numpy.ndarray
    y_train : numpy.ndarray
        Values of the ``Exited`` column.
    X_test_processed : numpy.ndarray
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    X_train = train_data.drop(columns=['Exited', 'CustomerId', 'Surname'])
    y_train = train_data['Exited'].values
    X_test = test_data.drop(columns=['CustomerId', 'Surname'])

    # 'id' is the row key written to the submission file, not a predictor.
    # Leaving it in would make it one of the scaled features the distance
    # computation uses. It is dropped only if present, so files without it
    # still load.
    X_train = X_train.drop(columns=['id'], errors='ignore')
    X_test = X_test.drop(columns=['id'], errors='ignore')

    combined = pd.concat([X_train, X_test], axis=0, sort=False).reset_index(drop=True)

    categorical_features = ['Geography', 'Gender', 'HasCrCard', 'IsActiveMember']
    combined = pd.get_dummies(combined, columns=categorical_features, dtype=float)
    combined = combined.astype(float)

    # NOTE(review): the scaling range is taken over train and test together,
    # so test-set statistics influence the training features.
    for feature in combined.columns.tolist():
        min_value = combined[feature].min()
        max_value = combined[feature].max()
        if max_value - min_value > 0:
            combined[feature] = (combined[feature] - min_value) / (max_value - min_value)
        else:
            # Constant column: avoid 0/0 and emit a flat zero feature instead.
            combined[feature] = 0

    n_train = len(X_train)
    X_train_processed = combined.iloc[:n_train].values
    X_test_processed = combined.iloc[n_train:].values

    return X_train_processed, y_train, X_test_processed

In [None]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate ROC AUC with stratified k-fold cross-validation.

    Samples of each class are shuffled and dealt into ``n_splits`` folds so
    every fold has roughly the same class proportions. For each fold the model
    is fitted on the remaining samples and scored on the held-out fold with
    ``compute_roc_auc``.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Labels aligned with the rows of ``X``.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int
        Number of folds.
    random_state : int or None
        Seed for the fold assignment. ``None`` (the default) draws from
        NumPy's global random state, as before; an integer makes the folds
        reproducible and identical across calls.

    Returns
    -------
    float
        Mean AUC over the folds.
    """
    # np.random (the module) and RandomState both expose .shuffle, so the
    # default path keeps using the global state callers may have seeded.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    n_samples = X.shape[0]
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    stratified_folds = [[] for _ in range(n_splits)]
    for cls in np.unique(y):
        cls_indices = indices[y[indices] == cls]
        rng.shuffle(cls_indices)
        # Deal this class's samples out evenly across the folds.
        for fold_idx, chunk in enumerate(np.array_split(cls_indices, n_splits)):
            stratified_folds[fold_idx].extend(chunk)

    auc_scores = []

    for fold in range(n_splits):
        val_indices = np.array(stratified_folds[fold])
        train_indices = np.setdiff1d(indices, val_indices)

        X_train_cv = X[train_indices]
        y_train_cv = y[train_indices]
        X_val_cv = X[val_indices]
        y_val_cv = y[val_indices]

        knn.fit(X_train_cv, y_train_cv)
        y_val_pred = knn.predict(X_val_cv)

        auc_scores.append(compute_roc_auc(y_val_cv, y_val_pred))

    return np.mean(auc_scores)

def get_tpr_fpr(y_true, y_scores, thresholds):
    """Compute true- and false-positive rates at each threshold.

    A sample is predicted positive when its score is greater than or equal to
    the threshold. Labels equal to 1 count as positives and labels equal to 0
    as negatives. If a class is absent, its rate is reported as 0.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        TPR values and FPR values, in the same order as ``thresholds``.
    """
    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = np.sum(is_positive)
    n_negative = np.sum(is_negative)

    tpr_values = []
    fpr_values = []
    for cutoff in thresholds:
        predicted_positive = y_scores >= cutoff
        true_positives = np.sum(is_positive & predicted_positive)
        false_positives = np.sum(is_negative & predicted_positive)

        if n_positive > 0:
            tpr_values.append(true_positives / n_positive)
        else:
            tpr_values.append(0)

        if n_negative > 0:
            fpr_values.append(false_positives / n_negative)
        else:
            fpr_values.append(0)

    return np.array(tpr_values), np.array(fpr_values)

def compute_roc_auc(y_true, y_scores):
    """Return the area under the ROC curve for the given scores.

    Every distinct score is used as a threshold (highest first), the resulting
    (FPR, TPR) points are padded with (0, 0) and (1, 1), and the area is
    integrated with the trapezoidal rule.

    Parameters
    ----------
    y_true : numpy.ndarray
        Labels; 1 marks positives and 0 marks negatives.
    y_scores : numpy.ndarray
        Scores where larger means "more likely positive".

    Returns
    -------
    float
        Area under the ROC curve.
    """
    # np.unique already returns sorted ascending values; reverse so the curve
    # is traced from the strictest threshold to the loosest.
    thresholds = np.unique(y_scores)[::-1]
    tpr_list, fpr_list = get_tpr_fpr(y_true, y_scores, thresholds)

    tpr_list = np.concatenate(([0], tpr_list, [1]))
    fpr_list = np.concatenate(([0], fpr_list, [1]))

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; prefer
    # the new name when available and fall back on older NumPy versions.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    auc = integrate(tpr_list, fpr_list)
    return auc

In [None]:
# Load and preprocess data
# Paths and seed are named once so they are easy to change.
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
SEED = 42

X, y, X_test = preprocess_data(TRAIN_PATH, TEST_PATH)

best_k = None
best_metric = None
# Start below any attainable score so the first valid result is always kept.
best_cv_score = -np.inf

for distance_metric in ['euclidean', 'manhattan']:
    for k in [3, 5, 7, 9]:
        # cross_validate shuffles with NumPy's global RNG. Re-seeding before
        # each call scores every configuration on the same folds and makes
        # the whole search reproducible.
        np.random.seed(SEED)
        knn = KNN(k=k, distance_metric=distance_metric)
        cv_score = cross_validate(X, y, knn)
        print(f"k={k}, distance_metric={distance_metric}, cv_score={cv_score}")
        if cv_score > best_cv_score:
            best_cv_score = cv_score
            best_k = k
            best_metric = distance_metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}, cv_score={best_cv_score}")

# Refit on all training data with the winning configuration and score the test set.
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Only the id column is needed for the submission, so avoid re-parsing the rest.
test_ids = pd.read_csv(TEST_PATH, usecols=['id'])['id']
pd.DataFrame({'id': test_ids, 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

k=3, distance_metric=euclidean, cv_score=0.8335298930882491
k=5, distance_metric=euclidean, cv_score=0.8608676048246082
k=7, distance_metric=euclidean, cv_score=0.8731121324091454
k=9, distance_metric=euclidean, cv_score=0.8771936322830977
k=3, distance_metric=manhattan, cv_score=0.8332209193992399
k=5, distance_metric=manhattan, cv_score=0.8617254070117717
k=7, distance_metric=manhattan, cv_score=0.8753953506554165
k=9, distance_metric=manhattan, cv_score=0.8806790950260792
Best hyperparameters: k=9, distance_metric=manhattan, cv_score=0.8806790950260792
