In [1]:
# Necessary imports
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import numpy as np

In [3]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        X = np.array(X)
        results = []
        for x in X:
            distances = self.compute_distance(self.X_train, x)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            results.append(np.bincount(k_nearest_labels.astype(int)).argmax())
        return np.array(results)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(X1 - X2, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [4]:
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Drop less relevant columns such as 'id', 'CustomerId', and 'Surname'
    drop_columns = ['id', 'CustomerId', 'Surname']
    train_data.drop(drop_columns, axis=1, inplace=True)
    test_data.drop(drop_columns, axis=1, inplace=True)

    # Extract labels
    y_train = train_data['Exited']
    X_train = train_data.drop('Exited', axis=1)
    X_test = test_data

    # Handle categorical columns using OneHotEncoder
    categorical_columns = ['Geography', 'Gender']
    numerical_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']

    # Define transformers for preprocessing
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    numerical_transformer = StandardScaler()

    # Combine transformers into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_columns),
            ('cat', categorical_transformer, categorical_columns)])

    # Fit the preprocessor on the training data and transform both train and test sets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, y_train, X_test

In [5]:

def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    X = np.array(X)
    y = np.array(y)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)

        auc = roc_auc_score(y_test, y_pred)
        auc_scores.append(auc)

    return np.mean(auc_scores), auc_scores

In [18]:
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Sample a fraction of the dataset for testing (20% of the data)
X_sampled = pd.DataFrame(X).sample(frac=0.2, random_state=42)
y_sampled = y.loc[X_sampled.index]

# Perform cross-validation with the smaller subset
mean_auc, auc_scores = cross_validate(X_sampled, y_sampled, knn, n_splits=3)
print("Cross-validation AUC scores:", auc_scores)
print("Mean AUC score:", mean_auc)

# Hyperparameter tuning (exploring different values for k)
best_k = 5
best_auc = mean_auc

for k in range(1, 21):  # Test different values for k (1 to 20)
    knn = KNN(k=k, distance_metric='euclidean')
    mean_auc, _ = cross_validate(X_sampled, y_sampled, knn, n_splits=3)
    if mean_auc > best_auc:
        best_auc = mean_auc
        best_k = k

print(f"Optimal k found: {best_k} with AUC: {best_auc}")

# Train on the full dataset with optimal hyperparameters and make predictions on the test set
knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(np.array(X), np.array(y))
test_predictions = knn.predict(np.array(X_test))

# Save test predictions to match the submission format
test_ids = pd.read_csv('test.csv')['CustomerId']

# Check the lengths of test_predictions and test_ids
print(f"Number of test predictions: {len(test_predictions)}")
print(f"Number of test IDs: {len(test_ids)}")

# Ensure the number of rows matches 10,000
if len(test_predictions) != len(test_ids):
    print(f"Error: The number of predictions ({len(test_predictions)}) does not match the number of test IDs ({len(test_ids)}).")
else:
    submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})

    # Check for and remove duplicate IDs
    if submission['id'].duplicated().sum() > 0:
        print("Duplicates found in submission. Removing duplicates.")
        submission = submission.drop_duplicates(subset='id')

    # Save the cleaned submission file
    submission.to_csv('submissions.csv', index=False)

    print("Submission saved!")


Cross-validation AUC scores: [0.7364538586876018, 0.7207187316642371, 0.7366242979603901]
Mean AUC score: 0.7312656294374097
Optimal k found: 11 with AUC: 0.7320847119628038
Number of test predictions: 10000
Number of test IDs: 10000
Duplicates found in submission. Removing duplicates.
Submission saved!


In [17]:
from google.colab import files
files.download('submissions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>