In [11]:
import numpy as np
import pandas as pd

In [12]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        return np.apply_along_axis(self.predict_proba, 1, X)

    def predict_proba(self, X):
        # TODO: Implement the predict method
        distances = self.compute_distance(X, self.X_train)
        k_nearest_indices = np.argpartition(distances, self.k)[:self.k]
        k_nearest_labels = self.y_train[k_nearest_indices]
        predictions = np.mean(k_nearest_labels)
        return predictions

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X1 - X2, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [13]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.

    # Combine train and test for preprocessing
    all_data = pd.concat([train_data, test_data], axis=0)

    all_data = pd.get_dummies(all_data, columns=['Geography', 'Gender'])

    features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary'] + \
               [col for col in all_data.columns if col.startswith('Geography_') or col.startswith('Gender_')]

    all_data[features] = (all_data[features] - all_data[features].mean()) / all_data[features].std()

    X_train = all_data[features].iloc[:len(train_data)].values
    y_train = train_data['Exited'].values
    X_test = all_data[features].iloc[len(train_data):].values

    return X_train, y_train, X_test

In [14]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores

    np.random.seed(42)
    n = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    scores = []

    for i in range(n_splits):
        val_indices = n[i * fold_size:(i + 1) * fold_size]
        train_indices = np.concatenate([n[:i * fold_size], n[(i + 1) * fold_size:]])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        knn.fit(X_train, y_train)
        y_pred_proba = knn.predict(X_val)
        score = roc_auc_score(y_val, y_pred_proba)
        scores.append(score)

    return scores

def roc_auc_score(y_true, y_score):
    positive_indices = np.where(y_true == 1)[0]
    negative_indices = np.where(y_true == 0)[0]

    if len(positive_indices) == 0 or len(negative_indices) == 0:
        return 0.5

    n_positive = y_score[positive_indices]
    n_negative = y_score[negative_indices]

    positive_ranks = np.sum(n_positive[:, None] > n_negative)
    auc = positive_ranks / (len(positive_indices) * len(negative_indices))

    return auc

In [15]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
k_values = [3, 5, 7, 9, 11, 13, 15, 17]
distance_metrics = ['euclidean', 'manhattan']
best_k = 0
best_metric = ''
best_score = 0

for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        mean_score = np.mean(scores)
        if mean_score > best_score:
            best_score = mean_score
            best_k = k
            best_metric = metric

print(f"Best hyperparameters: k={best_k}, distance_metric={best_metric}")
print(f"Best mean ROC AUC score: {best_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.7837026455140117, 0.8193873678503659, 0.8125610574347233, 0.7861145617667357, 0.8205360286978428]
Best hyperparameters: k=17, distance_metric=manhattan
Best mean ROC AUC score: 0.8788329671681072
