In [4]:
import numpy as np
import pandas as pd

In [5]:
class KNN:
    """Brute-force k-nearest-neighbours classifier with optional feature weights.

    Every feature column is multiplied by a weight before distances are
    computed. Weights are looked up by column *name* in ``self.weights``.
    The training data is a plain array without column names, so the names
    must be passed to ``fit`` via ``feature_names``. When no names are given,
    every feature is weighted 1.0.
    """

    def __init__(self, k=3, distance_metric='euclidean', batch_size=500):
        """Store the hyperparameters and the per-feature weight table.

        Args:
            k: Number of neighbours that vote on each prediction.
            distance_metric: ``'euclidean'`` or ``'manhattan'``.
            batch_size: Number of query rows whose distances are computed at
                once; bounds the size of the intermediate distance array.
        """
        self.k = k
        self.distance_metric = distance_metric
        self.batch_size = batch_size

        # Author-assigned feature weights, keyed by column name. Names missing
        # from this table default to a weight of 1.0.
        self.weights = {
            'Age': 1.0,
            'NumOfProducts': 0.5,
            'Geography_Germany': 0.75,
            'Gender_Male': 0.5,
            'Balance': 0.6,
            'HasCrCard': 0.2,
            'IsActiveMember': 0.1,
            'Geography_France': 0.2,
            'Geography_Spain': 0.2,
            'Gender_Female': 0.5,
        }

    def fit(self, X, y, feature_names=None):
        """Memorise the training set.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples labels.
            feature_names: Optional sequence of column names, one per column
                of ``X``. When given, each column is weighted by
                ``self.weights[name]``; when omitted, all weights are 1.0.

        Raises:
            ValueError: If ``X`` is not 2-D, if ``X`` and ``y`` disagree in
                length, or if ``feature_names`` has the wrong length.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")

        if feature_names is not None:
            feature_names = list(feature_names)
            if len(feature_names) != X.shape[1]:
                raise ValueError("feature_names must have one entry per column of X")

        self.X_train = X
        self.y_train = y
        self.feature_names_ = feature_names

    def predict(self, X):
        """Predict a label for every row of ``X`` by majority vote.

        Ties between labels are resolved in favour of the smallest label.

        Args:
            X: 2-D array-like of shape (n_queries, n_features).

        Returns:
            1-D numpy array with one predicted label per query row.

        Raises:
            ValueError: If ``k`` is smaller than 1 or the distance metric is
                not supported.
        """
        if self.k < 1:
            raise ValueError("k must be at least 1")

        X = np.asarray(X, dtype=np.float64)
        predictions = []

        # Work through the queries in batches to bound peak memory use.
        for start in range(0, X.shape[0], self.batch_size):
            distances = self.compute_distances(X[start:start + self.batch_size])
            # A stable sort makes the choice among equidistant neighbours
            # deterministic (lowest training index first).
            nearest = np.argsort(distances, axis=1, kind='stable')[:, :self.k]

            for labels in self.y_train[nearest]:
                values, counts = np.unique(labels, return_counts=True)
                # np.unique returns sorted values and argmax returns the first
                # maximum, so a tied vote goes to the smallest label.
                predictions.append(values[np.argmax(counts)])

        return np.array(predictions)

    def compute_distances(self, X_batch):
        """Return the weighted distance from every query row to every training row.

        Args:
            X_batch: 2-D array-like of shape (n_queries, n_features).

        Returns:
            Array of shape (n_queries, n_train).

        Raises:
            ValueError: If ``self.distance_metric`` is not ``'euclidean'`` or
                ``'manhattan'``.
        """
        # Reject an unknown metric before allocating the large difference array.
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError("Invalid distance metric")

        weights = self._feature_weight_vector()
        weighted_train = self.X_train * weights
        weighted_batch = np.asarray(X_batch) * weights

        # Broadcast to (n_queries, n_train, n_features) pairwise differences.
        diff = weighted_batch[:, np.newaxis] - weighted_train

        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=2))
        return np.sum(np.abs(diff), axis=2)

    def _feature_weight_vector(self):
        """Build the per-column weight vector for the fitted training data."""
        n_features = self.X_train.shape[1]
        names = getattr(self, 'feature_names_', None)
        if names is None:
            # Without column names the weight table cannot be applied.
            return np.ones(n_features)
        return np.array([self.weights.get(name, 1.0) for name in names], dtype=np.float64)

In [6]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into scaled feature matrices.

    Steps: impute missing values using training-set statistics, one-hot encode
    the categorical columns with the category levels seen in training, drop
    the unused columns, and min-max scale every feature using the training
    minimum and maximum.

    Args:
        train_path: Path of the training CSV (must contain an ``Exited`` column).
        test_path: Path of the test CSV.

    Returns:
        Tuple ``(X, y, X_test)`` of numpy arrays: the scaled training features,
        the training labels, and the scaled test features with the same
        column order as ``X``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'Gender']

    # Impute numerical gaps with the training median. Assign the result back
    # rather than calling fillna(inplace=True) on a column selection, which
    # does not modify the frame under pandas Copy-on-Write.
    for feature in numerical_features:
        train_median = train_data[feature].median()
        train_data[feature] = train_data[feature].fillna(train_median)
        test_data[feature] = test_data[feature].fillna(train_median)

    for feature in categorical_features:
        # Impute categorical gaps with the most frequent training value.
        train_mode = train_data[feature].mode().iloc[0]
        train_data[feature] = train_data[feature].fillna(train_mode)
        test_data[feature] = test_data[feature].fillna(train_mode)

        # Pin both frames to the training category levels so that
        # drop_first removes the same reference level in train and test,
        # even when the test file does not contain every level. Test values
        # unseen in training become all-zero indicator rows.
        categories = train_data[feature].astype('category').cat.categories
        train_data[feature] = pd.Categorical(train_data[feature], categories=categories)
        test_data[feature] = pd.Categorical(test_data[feature], categories=categories)

    # One-hot encode categorical features for both train and test
    train_data = pd.get_dummies(train_data, columns=categorical_features, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_features, drop_first=True)

    # Align the test data to have the same columns as the train data
    test_data = test_data.reindex(columns=train_data.columns.drop('Exited'), fill_value=0)

    # Drop identifier columns and the features excluded from the model
    columns_to_drop = ['CustomerId', 'Surname', 'CreditScore', 'Tenure', 'EstimatedSalary', 'Exited', 'id']
    X = train_data.drop(columns=columns_to_drop, axis=1)
    test_data = test_data.drop(columns=['CustomerId', 'Surname', 'CreditScore', 'Tenure', 'EstimatedSalary', 'id'], axis=1)
    y = train_data['Exited']

    # Print the list of remaining columns
    print("Remaining columns after dropping from training data:", X.columns.tolist())

    # Change the data type to float64
    X = X.astype(np.float64)
    test_data = test_data.astype(np.float64)

    # Min-max scale with training statistics only.
    min_values = X.min()
    max_values = X.max()
    # A constant training column has zero range; divide by 1 instead so the
    # column scales to 0 rather than NaN.
    value_range = (max_values - min_values).replace(0, 1.0)

    X = (X - min_values) / value_range
    test_data = (test_data - min_values) / value_range

    return X.values, y.values, test_data.values

In [7]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Evaluate a classifier with k-fold cross-validation on contiguous folds.

    The samples are split, in their given order, into ``n_splits`` contiguous
    folds whose sizes differ by at most one, so every sample is used for
    validation exactly once. No shuffling is performed.

    Args:
        X: Feature matrix, one row per sample.
        y: Labels, one per sample.
        knn: Model exposing ``fit(X, y)`` and ``predict(X)``; it is refitted
            for every fold.
        n_splits: Number of folds (at least 2, at most the number of samples).

    Returns:
        List with the validation accuracy of each fold.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length or ``n_splits`` is
            outside ``[2, n_samples]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 2 <= n_splits <= n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    scores = []

    # array_split distributes the remainder over the leading folds instead of
    # silently leaving the trailing n % n_splits samples out of validation.
    for val_indices in np.array_split(np.arange(n_samples), n_splits):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_indices] = False

        knn.fit(X[train_mask], y[train_mask])
        predictions = knn.predict(X[val_indices])

        # Fraction of validation samples predicted correctly.
        scores.append(np.mean(predictions == y[val_indices]))

    return scores

In [8]:
# Build the scaled training matrix, the labels and the scaled test matrix
# from the raw CSV files.
X, y, X_test = preprocess_data(train_path='data/train.csv', test_path='data/test.csv')

# Baseline model: 5 neighbours with Euclidean distance.
knn = KNN(distance_metric='euclidean', k=5)

# Score the baseline with cross-validation and show the per-fold scores.
cv_scores = cross_validate(X, y, knn=knn)
print("Cross-validation scores:", cv_scores)

Remaining columns after dropping from training data: ['Age', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'Geography_Germany', 'Geography_Spain', 'Gender_Male']
Cross-validation scores: [0.8753333333333333, 0.8873333333333333, 0.875, 0.885, 0.8813333333333333]


In [16]:
# Hyperparameter tuning: exhaustive search over the distance metric and k.
distance_metrics = ['euclidean', 'manhattan']
k_values = list(range(2, 16))  # candidate neighbour counts 2..15

# Starting values; they are kept only if no combination scores above zero.
best_score = 0
best_k = 3
best_distance = 'euclidean'

# Visit every (metric, k) pair, metric-major, and keep the first pair that
# reaches the highest mean cross-validation score.
for metric, k in [(m, n_neighbors) for m in distance_metrics for n_neighbors in k_values]:
    knn = KNN(k=k, distance_metric=metric)
    cv_scores = cross_validate(X, y, knn)
    avg_score = np.mean(cv_scores)
    if avg_score > best_score:
        best_score, best_k, best_distance = avg_score, k, metric

print(f"Best k: {best_k}, Distance Metric: {best_distance}, Cross-validation score: {best_score}")

Best k: 11, Distance Metric: manhattan, Cross-validation score: 0.8845333333333334


In [17]:
# Refit on the complete training set with the selected hyperparameters and
# predict labels for the test set.
knn = KNN(distance_metric=best_distance, k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Write the submission file: the test ids next to their predicted labels.
(
    pd.read_csv('data/test.csv')[['id']]
    .assign(Exited=test_predictions)
    .to_csv('data/submissions.csv', index=False)
)