In [11]:
import numpy as np
import pandas as pd

In [12]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        helper = self.predictHelper
        return np.apply_along_axis(helper, 1, X.values)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        sub = (X2- X1)
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(sub), axis=1)
        return np.linalg.norm(sub, axis=1)

    def predictHelper(self, x):
        kInd = np.argpartition(self.compute_distance(x, self.X_train.values), self.k)[:self.k]
        return np.mean(self.y_train.iloc[kInd])

In [13]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    trainTest = pd.get_dummies(pd.concat([train_data, test_data], axis=0, ignore_index=True), columns=['Geography', 'Gender'], drop_first=True)
    colf = ['IsActiveMember', 'Balance', 'CreditScore', 'HasCrCard', 'Tenure', 'Age', 'EstimatedSalary', 'NumOfProducts']
    extender = [col for col in trainTest.columns if col.startswith('Geography_') or col.startswith('Gender_')]
    colf.extend(extender)
    trainTest[colf] = (trainTest[colf] - trainTest[colf].mean()) / trainTest[colf].std()
    lenVar = len(train_data)
    return trainTest.loc[:lenVar-1, colf], train_data['Exited'], trainTest.loc[lenVar:, colf]

In [14]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    scores = []
    for i in range(n_splits):
        tIndice = np.setdiff1d(indices, indices[i * fold_size:(i + 1) * fold_size])
        X_train, X_val = X.iloc[tIndice], X.iloc[indices[i * fold_size:(i + 1) * fold_size]]
        y_train, y_val = y.iloc[tIndice], y.iloc[indices[i * fold_size:(i + 1) * fold_size]]
        knn.fit(X_train, y_train)
        y_est_prob = knn.predict(X_val)
        if len(np.where(y_val == 0)[0]) == 0 or len(np.where(y_val == 1)[0]) == 0:
            score = 0.5
        else:
            score = np.sum(y_est_prob[np.where(y_val == 1)[0]][:, None] > y_est_prob[np.where(y_val == 0)[0]]) / (len(np.where(y_val == 1)[0]) * len(np.where(y_val == 0)[0]))
        scores.append(score)
    return scores

In [15]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')
# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')
# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# # TODO: hyperparamters tuning
print("Mean score  ", np.mean(cv_scores))
k_values = list(range(3, 18, 2))
arr = ['manhattan', 'euclidean']
best = 0
best2 = {}
for k in k_values:
    for x in arr:
        knn = KNN(k=k, distance_metric=x)
        scores = cross_validate(X, y, knn)
        mean_score = np.mean(scores)
        if mean_score > best:
            best = mean_score
            best2 = {'k': k, 'distance_metric': x}
print("Best score:", best)
print("Best parameters:", best2)
knn = KNN(**best2)
knn.fit(X, y)
test_predictions = knn.predict(X_test)
rounded_predictions = np.round(test_predictions, 2)
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': rounded_predictions.ravel()}).to_csv('submissions.csv', index=False)

Cross-validation scores: [np.float64(0.8215987163339602), np.float64(0.8220658774863581), np.float64(0.8282043116592361), np.float64(0.8375417175417176), np.float64(0.8406714949250985)]
Mean score   0.8300164235892741
Best score: 0.8893814596273846
Best parameters: {'k': 15, 'distance_metric': 'manhattan'}
