In [5]:
import numpy as np
import pandas as pd

In [6]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        neighbors = []
        distances = self.compute_distance(X, self.X_train)
        neighbors_idx = np.argsort(distances)[:, :self.k]
        k_nearest_labels = self.y_train[neighbors_idx]

        return np.array([self.most_common(labels) for labels in k_nearest_labels])

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
          return np.sqrt(np.sum((X1[:, np.newaxis, :] - X2[np.newaxis, :, :]) ** 2, axis=2))

    def most_common(self, lst):
      '''Returns the most common element in a list'''
      unique, counts = np.unique(lst, return_counts=True)
      return unique[np.argmax(counts)]

In [7]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    X = train_data.drop('Exited', axis=1)
    y = train_data["Exited"]
    X_test = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    categorical_cols = ['Geography', 'Gender']
    numerical_cols = ['CreditScore', 'Age', 'Tenure', 'NumOfProducts', 'Balance', 'EstimatedSalary']

    for col in numerical_cols:
        X.fillna({col: X[col].mean()}, inplace=True)
        X_test.fillna({col: X_test[col].mean()}, inplace=True)

    for col in categorical_cols:
        X.fillna({col: X[col].mode()[0]}, inplace=True)
        X_test.fillna({col: X_test[col].mode()[0]}, inplace=True)

        X = pd.get_dummies(X, columns=[col], drop_first=True, dtype=float)
        X_test = pd.get_dummies(X_test, columns=[col], drop_first=True, dtype=float)

    X[numerical_cols] = (X[numerical_cols] - X[numerical_cols].mean()) / X[numerical_cols].std()
    X_test[numerical_cols] = (X_test[numerical_cols] - X_test[numerical_cols].mean()) / X_test[numerical_cols].std()

    X = X.to_numpy()
    y = y.to_numpy()
    X_test = X_test.to_numpy()

    return X, y, X_test

In [8]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    fold_size = len(X) // n_splits
    folds = [indices[i * fold_size:(i + 1) * fold_size] for i in range(n_splits)]

    auc_scores = []

    for i in range(n_splits):
        val_indices = folds[i]
        train_indices = np.concatenate([folds[j] for j in range(n_splits) if j != i])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        knn.fit(X_train, y_train)

        distances = knn.compute_distance(X_val, X_train)

        knn_indices = np.argsort(distances, axis=1)[:, :knn.k]

        k_nearest_labels = y_train[knn_indices]
        predictions = np.array([knn.most_common(labels) for labels in k_nearest_labels])

        min_distances = np.min(distances[np.arange(distances.shape[0])[:, np.newaxis], knn_indices], axis=1)

        scores = 1 / (min_distances + 1e-8)

        auc = calculate_roc_auc(y_val, scores)
        auc_scores.append(auc)

    return np.mean(auc_scores)

def calculate_roc_auc(y_true, y_scores):

    sorted_indices = np.argsort(y_scores)
    y_true_sorted = y_true[sorted_indices]

    tps = np.cumsum(y_true_sorted)
    fps = np.arange(1, len(y_true_sorted) + 1) - tps

    tpr = tps / tps[-1]
    fpr = fps / fps[-1]

    return np.trapz(tpr, fpr)

In [9]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')
# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')
# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# # TODO: hyperparamters tuning
best_k = None
best_metric = None
best_score = -1

for k in range(3, 22, 2):
    knn = KNN(k=k, distance_metric='euclidean')
    score = cross_validate(X, y, knn)
    print(f"k={k}, metric={'euclidean'}, score={score}")
    if score > best_score:
        best_score = score
        best_k = k

print(f"Best parameters: k={best_k}, metric={best_metric}, score={best_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=3, distance_metric='euclidean')
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

  return np.trapz(tpr, fpr)


Cross-validation scores: 0.6770810851348463
k=3, metric=euclidean, score=0.6765776679551261
k=5, metric=euclidean, score=0.6763062337944871
k=7, metric=euclidean, score=0.6741538869244181
k=9, metric=euclidean, score=0.6741857403989874
k=11, metric=euclidean, score=0.6759381731300621
k=13, metric=euclidean, score=0.6762844418817431
k=15, metric=euclidean, score=0.6748492192292586
k=17, metric=euclidean, score=0.677665394726102
k=19, metric=euclidean, score=0.6762215590472953
k=21, metric=euclidean, score=0.679091439174158
Best parameters: k=21, metric=None, score=0.679091439174158
