In [34]:
import numpy as np
import pandas as pd

In [33]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        probabilities = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices].astype(int)
            weights = 1 / (distances[k_indices] + 1e-5)
            weighted_sum = np.dot(weights, k_nearest_labels)
            prob = weighted_sum / np.sum(weights)
            probabilities.append(np.clip(prob, 0, 1))
        return np.array(probabilities)


    def compute_distance(self, X_train, x):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
       X_train = np.array(X_train, dtype=float)
       x = np.array(x, dtype=float).reshape(1, -1)

       if self.distance_metric == 'euclidean':
        # Compute Euclidean distance!
         return np.sqrt(np.sum((X_train - x) ** 2, axis=1))
       elif self.distance_metric == 'manhattan':
        # Compute Manhattan distance!
         return np.sum(np.abs(X_train - x), axis=1)
        # Compute Minkowski distance!
       elif self.distance_metric == 'minkowski':
        return np.sum(np.abs(X_train - x) ** 3, axis=1) ** (1/3)

In [32]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing



    y = train_data['Exited'].values
    train_data.drop(['Exited', 'CustomerId', 'Surname'], axis=1, inplace=True)


    train_data['Gender'] = train_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
    test_data['Gender'] = test_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)


    geography_train = pd.get_dummies(train_data['Geography'], prefix='Geography', drop_first=True)
    geography_test = pd.get_dummies(test_data['Geography'], prefix='Geography', drop_first=True)
    train_data = pd.concat([train_data.drop('Geography', axis=1), geography_train], axis=1)
    test_data = pd.concat([test_data.drop('Geography', axis=1), geography_test], axis=1)


    X = train_data.values
    X_test = test_data.reindex(columns=train_data.columns, fill_value=0).values

    X_min = X.min(axis=0)
    X_max = X.max(axis=0)
    X = (X - X_min) / (X_max - X_min)
    X_test = (X_test - X_min) / (X_max - X_min)

    return X, y, X_test


In [31]:
def roc_auc_score_custom(y_true, y_pred):
    pos_label = 1
    neg_label = 0
    sorted_indices = np.argsort(-y_pred)
    sorted_true = y_true[sorted_indices]

    tpr = np.cumsum(sorted_true == pos_label) / np.sum(sorted_true == pos_label)
    fpr = np.cumsum(sorted_true == neg_label) / np.sum(sorted_true == neg_label)

    auc = np.trapz(tpr, fpr)
    return auc

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    fold_size = len(X) // n_splits
    scores = []

    for i in range(n_splits):
        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]), axis=0)
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]), axis=0)
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        auc_score = roc_auc_score_custom(y_val, y_pred)
        scores.append(auc_score)

    return np.mean(scores)


In [30]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

selected_distance = 'minkowski'

# Create and evaluate model
#knn = KNN(k=5, distance_metric=selected_distance)

# Perform cross-validation
#cv_scores = cross_validate(X, y, knn)

#print("Cross-validation scores:", cv_scores)
#print("Mean cross-validation accuracy:", cv_scores.mean())
# TODO: hyperparamters tuning

best_k = 0
best_score = 0
'''
for k in range(1, 30):
    knn = KNN(k=k, distance_metric=selected_distance)
    print("Current K: ", k)
    scores = cross_validate(X, y, knn)
    print("Scores: ", scores)
    mean_score = scores.mean()
    print("Mean Score: ", mean_score)
    if mean_score > best_score:
        best_k = k
        best_score = mean_score


print(f"Best k: {best_k}, Best ROC AUC: {best_score}")
'''

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn =  KNN(k=24, distance_metric=selected_distance)
scores = cross_validate(X, y, knn)
print("Best K:", best_k)
print(selected_distance)
print("Scores: ", scores)

knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Best K: 0
minkowski
Scores:  0.8800430135009304
