In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

In [23]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store training data for later use in rest of class
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_predicted = []
        for test_point in X:
          distances = self.compute_distance(test_point, self.X_train)

          k_nn_indicies = np.argsort(distances)[:self.k]
          k_nn_labels = [self.y_train[i] for i in k_nn_indicies] #Go into the training data, and grab the class of each of the k-nearest points

          #Which classification is most common
          dic = {}
          for label in k_nn_labels:
            if label in dic:
              dic[label] += 1
            else:
              dic[label] = 1

          #Probability of exited = votes for exiting (1)/total votes (k)
          pr_exit = dic.get(1,0)/self.k
          y_predicted.append(pr_exit)
        return np.array(y_predicted)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X2 - X1, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.abs(X2 - X1).sum(axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [24]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    #Remove irrelevant data & separate the result from the attributes we're looking at
    X = train_data.drop(['Exited', 'id', 'CustomerId', 'Surname'], axis=1)
    y = train_data['Exited']

    #Numerical values vs. Categorical values. Used to determine which is more important later
    num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    cat_features = ['Geography', 'Gender']

    #Process numerical values by mean, & categorical values by frquencies
    preprocessor = ColumnTransformer(
      transformers=[
          ('num', Pipeline(steps=[
              ('imputer', SimpleImputer(strategy='mean')),
              ('scaler', StandardScaler())
          ]), num_features),
          ('cat', Pipeline(steps=[
              ('imputer', SimpleImputer(strategy='most_frequent')),
              ('onehot', OneHotEncoder(handle_unknown='ignore'))
          ]), cat_features)
      ])

    X_processed = preprocessor.fit_transform(X)

    X_test = test_data.drop(['id', 'CustomerId', 'Surname'], axis=1)
    X_test_processed = preprocessor.transform(X_test)

    return X_processed, y, X_test_processed


In [26]:
#Scikit learn cv_scoring has been problematic using my custom knn class so cross validation will be done manually
def roc_auc_score_manual(y_true, y_pred):
    #Define exited (1) or not exited (0)
    pos_label = 1
    neg_label = 0

    # Sorting by predicted scores
    desc_sort_order = np.argsort(-y_pred)
    y_true_sorted = y_true.iloc[desc_sort_order]

    #Compare true positives & false positives
    true_positives = np.cumsum(y_true_sorted == pos_label) / np.sum(y_true == pos_label)
    false_positives = np.cumsum(y_true_sorted == neg_label) / np.sum(y_true == neg_label)

    #Area under the curve using the trapezoidal rule
    auc = np.trapz(true_positives, false_positives)

    return auc

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    n = len(y)
    indicies = np.arange(n)
    np.random.shuffle(indicies)
    fold_sizes = np.full(n_splits, n // n_splits, dtype=int)
    fold_sizes[:n % n_splits] += 1

    current = 0
    scores = []

    for fold_size in fold_sizes:
        val_indicies = indicies[current:current + fold_size]
        train_indicies = np.setdiff1d(indicies, val_indicies)

        X_train, X_val = X[train_indicies], X[val_indicies]
        y_train, y_val = y.iloc[train_indicies], y.iloc[val_indicies]

        knn.fit(X_train, y_train.to_numpy())
        y_pred = knn.predict(X_val)

        #Find roc auc score by comparing the data we found from the actual
        score = roc_auc_score_manual(y_val, y_pred)
        scores.append(score)

        current += fold_size

    return scores

In [29]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

mean = np.mean(cv_scores)
print("Mean cross-validation score:", mean)

# TODO: hyperparamters tuning. Figure out the optimal k value or distance metric
opt_k = 0
opt_distance_metric = 0
best_score = -np.inf

#Loop through different values of k and metrics. Essentially reapply the steps from above, but this time we are repeating these steps until we find the best score
for k in [5, 7, 9, 11, 13, 20]:
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        cv_score = cross_validate(X,y,knn)
        mean_score = np.mean(cv_score)
        print(k, metric, cv_score)
        print(k, metric, mean_score)
        if mean_score > best_score:
            best_score = mean_score
            opt_k = k
            opt_distance_metric = metric


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
print(opt_k, opt_distance_metric)
knn = KNN(k=opt_k, distance_metric= opt_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.8633854787967192, 0.880079245471765, 0.8780488131965544, 0.8694667078818714, 0.8737142773705169]
Mean cross-validation score: 0.8729389045434853
5 euclidean [0.8839734668223039, 0.8695079967470858, 0.8688471832758936, 0.8718126746509043, 0.8734194298883522]
5 euclidean 0.8735121502769079
5 manhattan [0.860120144417136, 0.8744483646094322, 0.8603962856074114, 0.8872720336174704, 0.869598475383918]
5 manhattan 0.8703670607270737
7 euclidean [0.8882950691142132, 0.8816058626775909, 0.884873159107039, 0.8722394368592942, 0.8997920997920998]
7 euclidean 0.8853611255100473
7 manhattan [0.8789226867268242, 0.8864121263157896, 0.8963763448685346, 0.8819200580606263, 0.8786869059144516]
7 manhattan 0.8844636243772455
9 euclidean [0.9065242317483162, 0.9001359490895933, 0.8840064074468897, 0.8807698857608683, 0.8978093532265249]
9 euclidean 0.8938491654544386
9 manhattan [0.8795399601175101, 0.8926210331531794, 0.9026229712224916, 0.8940373844918181, 0.886154220997564