In [16]:
import numpy as np
import pandas as pd

In [17]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # TODO: Implement the predict method
        helper = self.predictHelper
        return np.apply_along_axis(helper, 1, X.values)

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        sub = (X2- X1)
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(sub), axis=1)
        return np.linalg.norm(sub, axis=1)

    def predictHelper(self, x):
        kInd = np.argpartition(self.compute_distance(x, self.X_train.values), self.k)[:self.k]
        return np.mean(self.y_train.iloc[kInd])

In [18]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Combine datasets for preprocessing
    combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

    # Convert categorical variables to dummy/indicator variables
    combined_data = pd.get_dummies(combined_data, columns=['Geography', 'Gender'], drop_first=True)

    # Specify columns to scale
    scaling_columns = ['IsActiveMember', 'Balance', 'CreditScore', 'HasCrCard', 'Tenure', 'Age', 'EstimatedSalary', 'NumOfProducts']
    # Identify additional dummy columns created from categorical variables
    dummy_columns = [col for col in combined_data.columns if col.startswith('Geography_') or col.startswith('Gender_')]
    # Combine both sets of columns for scaling
    scaling_columns.extend(dummy_columns)

    # Standardize the specified columns (mean=0, std=1)
    combined_data[scaling_columns] = (combined_data[scaling_columns] - combined_data[scaling_columns].mean()) / combined_data[scaling_columns].std()

    # Determine the split index to separate training and testing datasets
    train_length = len(train_data)

    # Return preprocessed training features, training labels, and preprocessed test features
    return combined_data.loc[:train_length-1, scaling_columns], train_data['Exited'], combined_data.loc[train_length:, scaling_columns]

In [19]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Shuffle the dataset and create indices for the split
    indices = np.random.permutation(len(X))
    fold_size = len(X) // n_splits
    scores = []

    # Perform k-fold cross-validation
    for i in range(n_splits):
        # Determine training and validation indices
        train_indices = np.setdiff1d(indices, indices[i * fold_size:(i + 1) * fold_size])
        X_train, X_val = X.iloc[train_indices], X.iloc[indices[i * fold_size:(i + 1) * fold_size]]
        y_train, y_val = y.iloc[train_indices], y.iloc[indices[i * fold_size:(i + 1) * fold_size]]

        # Train the model on the training fold
        knn.fit(X_train, y_train)

        # Generate predictions on the validation fold
        y_est_prob = knn.predict(X_val)

        # Handle cases where only one class is present in the validation set
        if len(np.where(y_val == 0)[0]) == 0 or len(np.where(y_val == 1)[0]) == 0:
            score = 0.5  # Assign default AUC score for degenerate cases
        else:
            # Compute the ROC AUC score using ranks of predicted probabilities
            positive_indices = np.where(y_val == 1)[0]
            negative_indices = np.where(y_val == 0)[0]
            score = np.sum(y_est_prob[positive_indices][:, None] > y_est_prob[negative_indices]) / (
                len(positive_indices) * len(negative_indices)
            )
        
        # Append the fold's score to the list
        scores.append(score)

    # Return the list of AUC scores for each fold
    return scores

In [20]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')
# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')
# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# # TODO: hyperparamters tuning
print("Mean cross-validation score: ", np.mean(cv_scores))
k_values = list(range(3, 18, 2))
distance_metrics = ['manhattan', 'euclidean']

# Track the best score and parameters
highest_score = 0
optimal_params = {}

# Iterate through all combinations of k and distance metrics
for k in k_values:
    for metric in distance_metrics:
        knn = KNN(k=k, distance_metric=metric)
        scores = cross_validate(X, y, knn)
        avg_score = np.mean(scores)
        if avg_score > highest_score:
            highest_score = avg_score
            optimal_params = {'k': k, 'distance_metric': metric}

# Output the best score and parameters
print("Best cross-validation score:", highest_score)
print("Optimal hyperparameters:", optimal_params)

# Refit the model using the best hyperparameters
knn = KNN(**optimal_params)
knn.fit(X, y)

# Generate predictions on the test set
test_predictions = knn.predict(X_test)
rounded_predictions = np.round(test_predictions, 2)

# Save the predictions in the desired format
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': rounded_predictions.ravel()}).to_csv('submissions.csv', index=False)

Cross-validation scores: [np.float64(0.8042114924461216), np.float64(0.8074104430673906), np.float64(0.8029113259854539), np.float64(0.7930828122198905), np.float64(0.805876728710808)]
Mean cross-validation score:  0.8026985604859329
Best cross-validation score: 0.8777236555305119
Optimal hyperparameters: {'k': 17, 'distance_metric': 'manhattan'}
