In [None]:
import numpy as np
import pandas as pd

In [None]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = []
        for i in range(X.shape[0]):
            distances = [self.compute_distance(X[i], x_train) for x_train in self.X_train]
            k_nearest_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = [self.y_train[idx] for idx in k_nearest_indices]
            prob_class_1 = sum(k_nearest_labels) / self.k
            predictions.append(prob_class_1)
        return np.array(predictions)

    def compute_distance(self, X1, X2):
         if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2))

         elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2))

         elif self.distance_metric == 'cosine':
            dot_product = np.dot(X1, X2)
            norm_X1 = np.linalg.norm(X1)
            norm_X2 = np.linalg.norm(X2)
            return 1 - dot_product / (norm_X1 * norm_X2)

         elif self.distance_metric == 'chebyshev':
            return np.max(np.abs(X1 - X2))

In [None]:
def preprocess_data(train_path, test_path):
    # Load the train and test datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle missing values by filling them with the mean of the column
    # Only for numeric features
    numeric_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    for feature in numeric_features:
        train_data[feature] = train_data[feature].fillna(train_data[feature].mean())
        test_data[feature] = test_data[feature].fillna(test_data[feature].mean())

    # Encode the 'Gender' column manually (0 for Female, 1 for Male)
    train_data['Gender'] = train_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)
    test_data['Gender'] = test_data['Gender'].apply(lambda x: 1 if x == 'Male' else 0)

    # One-hot encode the 'Geography' column manually
    for country in ['France', 'Germany']:
        train_data[f'Geography_{country}'] = (train_data['Geography'] == country).astype(int)
        test_data[f'Geography_{country}'] = (test_data['Geography'] == country).astype(int)

    # Scale the numerical features manually
    for feature in numeric_features:
        mean = train_data[feature].mean()
        std = train_data[feature].std()

        # Standardize by mean and std deviation for both train and test sets
        train_data[feature] = (train_data[feature] - mean) / std
        test_data[feature] = (test_data[feature] - mean) / std  # Use training mean and std

    # Separate the features and target variable for the training set
    X_train = train_data.drop(['CustomerId', 'Surname', 'Exited', 'Geography'], axis=1)
    y_train = train_data['Exited']

    # Prepare the test set features
    X_test = test_data.drop(['CustomerId', 'Surname', 'Geography'], axis=1)

    return X_train.values, y_train.values, X_test.values

In [None]:
import numpy as np
import pandas as pd

def cross_validate(X, y, knn, n_splits=5):
    """Performs cross-validation and computes ROC AUC scores without sklearn."""

    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    # Determine fold size and initialize the list for storing AUC scores
    fold_size = X.shape[0] // n_splits
    roc_auc_scores = []

    for i in range(n_splits):
        # Define the start and end of the validation fold
        start, end = i * fold_size, (i + 1) * fold_size
        X_val, y_val = X[start:end], y[start:end]
        X_train = np.concatenate((X[:start], X[end:]), axis=0)
        y_train = np.concatenate((y[:start], y[end:]), axis=0)

        # Fit the KNN model on the training data
        knn.fit(X_train, y_train)

        # Get probability scores for the validation set
        y_val_pred = []
        for x in X_val:
            # Calculate distances and find the k nearest neighbors
            distances = [knn.compute_distance(x, x_train) for x_train in knn.X_train]
            k_nearest_indices = np.argsort(distances)[:knn.k]
            k_nearest_labels = [knn.y_train[idx] for idx in k_nearest_indices]

            # Calculate the probability of class 1 (label == 1) based on neighbor votes
            prob_class_1 = sum(k_nearest_labels) / knn.k
            y_val_pred.append(prob_class_1)

        # Convert to numpy array for easier handling
        y_val_pred = np.array(y_val_pred)

        # Sort the predictions and corresponding true labels in descending order
        desc_sorted_indices = np.argsort(-y_val_pred)
        y_val_sorted = y_val[desc_sorted_indices]
        y_scores_sorted = y_val_pred[desc_sorted_indices]

        # Total positives and negatives
        P = np.sum(y_val == 1)
        N = np.sum(y_val == 0)

        # Initialize TPR and FPR lists
        TPR = []
        FPR = []

        TP = 0
        FP = 0

        # Iterate through sorted scores
        for j in range(len(y_scores_sorted)):
            if y_val_sorted[j] == 1:
                TP += 1
            else:
                FP += 1

            tpr = TP / P if P > 0 else 0
            fpr = FP / N if N > 0 else 0

            TPR.append(tpr)
            FPR.append(fpr)

        #append (0,0) at the beginning and (1,1) at the end to complete the ROC curve
        TPR = [0] + TPR + [1]
        FPR = [0] + FPR + [1]

        #calculate AUC using the trapezoidal rule
        auc_score = 0.0
        for j in range(1, len(FPR)):
            auc_score += (FPR[j] - FPR[j-1]) * (TPR[j] + TPR[j-1]) / 2.0
        roc_auc_scores.append(auc_score)

    #return the mean ROC AUC score across all folds
    return np.mean(roc_auc_scores)


In [None]:
# Load and preprocess data
X, y, X_test = preprocess_data('/content/train.csv', '/content/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# hyperparamters tuning
best_k = None
best_distance_metric = None
best_auc_score = -1
for k in [9, 15]:  #range between 9 and 15 because i tested 3, 5, 7, 9, 11, 13, 15 before and 15 performed the best and 9 was just a random number
    for metric in ['euclidean', 'manhattan', 'cosine', 'chebyshev']:
        knn = KNN(k=k, distance_metric=metric)
        auc_score = cross_validate(X, y, knn)
        print(f"k={k}, metric={metric}, AUC score={auc_score}")

        # Update best parameters based on AUC score
        if auc_score > best_auc_score:
            best_k = k
            best_distance_metric = metric
            best_auc_score = auc_score

print(f"Best parameters: k={best_k}, metric={best_distance_metric}, AUC score={best_auc_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/content/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.5371374182231293
k=9, metric=euclidean, AUC score=0.526014694263458
k=9, metric=manhattan, AUC score=0.6194072025732258
k=9, metric=cosine, AUC score=0.8936885572428022
k=9, metric=chebyshev, AUC score=0.5049145179451052
k=15, metric=euclidean, AUC score=0.5095510115020485
k=15, metric=manhattan, AUC score=0.597336589398805
k=15, metric=cosine, AUC score=0.9024629991158475
k=15, metric=chebyshev, AUC score=0.499951917237452
Best parameters: k=15, metric=cosine, AUC score=0.9024629991158475
