In [6]:
import numpy as np
import pandas as pd

In [1]:
class KNN:
    def __init__(self, k_neighbors=3, distance_metric='euclidean'):
        self.k_neighbors = k_neighbors
        self.distance_metric = distance_metric

    def fit(self, training_data, training_labels):
        self.training_data = training_data
        self.training_labels = training_labels

    def predict(self, test_data):
        predictions = []
        test_data = np.array(test_data)
        for test_point in test_data:
            distances = self.compute_distance(test_point, self.training_data)
            nearest_neighbors_indices = np.argsort(distances)[:self.k_neighbors]
            nearest_neighbors_labels = self.training_labels.iloc[nearest_neighbors_indices]
            weights = 1 / (distances[nearest_neighbors_indices] + 1e-5)  # Small epsilon to avoid division by zero

            class_1_probability = np.sum(weights * (nearest_neighbors_labels == 1)) / np.sum(weights)
            predictions.append(class_1_probability)

        return np.array(predictions)

    def compute_distance(self, test_point, training_data):
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(training_data - test_point, axis=1)
        elif self.distance_metric == 'manhattan':
            return np.abs(training_data - test_point).sum(axis=1)
        else:
            raise ValueError("Unsupported distance metric")


In [3]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    # Load data
    training_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Select features and target variable
    training_features = training_data.drop(columns=['Exited', 'id', 'Surname'])
    training_labels = training_data['Exited']
    test_features = test_data.drop(columns=['id', 'Surname'])

    # Handle categorical variables (One-hot encoding)
    training_features = pd.get_dummies(training_features, drop_first=True)
    test_features = pd.get_dummies(test_features, drop_first=True)

    # Align test set columns with training set
    test_features = test_features.reindex(columns=training_features.columns, fill_value=0)

    # Scale features
    training_mean = training_features.mean()
    training_std = training_features.std()
    training_features = (training_features - training_mean) / training_std
    test_features = (test_features - training_mean) / training_std

    return training_features.values, training_labels, test_features.values


In [10]:
# Define cross-validation function
def cross_validate(features, labels, knn_model, n_splits=5):
    indices = np.arange(len(labels))
    scores = []
    np.random.shuffle(indices)
    fold_size = len(labels) // n_splits

    for i in range(n_splits):
        validation_indices = indices[i * fold_size:(i + 1) * fold_size]
        training_indices = np.setdiff1d(indices, validation_indices)

        features_train, features_validation = features[training_indices], features[validation_indices]
        labels_train, labels_validation = labels.iloc[training_indices], labels.iloc[validation_indices]

        knn_model.fit(features_train, labels_train)
        validation_predictions_proba = knn_model.predict(features_validation)

        # Compute ROC AUC
        score = compute_roc_auc(labels_validation, validation_predictions_proba)
        scores.append(score)

    return scores

def compute_roc_auc(true_labels, predicted_probabilities):
    positive_label = 1
    sorted_score_indices = np.argsort(predicted_probabilities, kind="mergesort")[::-1]
    true_labels = np.array(true_labels)[sorted_score_indices]
    predicted_probabilities = np.array(predicted_probabilities)[sorted_score_indices]

    distinct_value_indices = np.where(np.diff(predicted_probabilities))[0]
    threshold_indices = np.r_[distinct_value_indices, true_labels.size - 1]

    true_positives = np.cumsum(true_labels == positive_label)[threshold_indices]
    false_positives = 1 + threshold_indices - true_positives

    true_positive_rate = true_positives / true_positives[-1]
    false_positive_rate = false_positives / false_positives[-1]

    return np.trapz(true_positive_rate, false_positive_rate)

# Function to find the best k
def find_best_k(features, labels, max_k=20):
    best_k_value = 1
    best_score = 0
    for k_value in range(1, max_k + 1):
        knn_model = KNN(k_neighbors=k_value)
        cross_val_scores = cross_validate(features, labels, knn_model)
        average_score = np.mean(cross_val_scores)
        if average_score > best_score:
            best_score = average_score
            best_k_value = k_value
    return best_k_value, best_score


In [12]:
# Load and preprocess data
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Find the optimal k
optimal_k, optimal_score = find_best_k(X, y)
print(f"Optimal k: {optimal_k}, Best CV Score: {optimal_score}")

# Train the model with the optimal k
knn = KNN()
knn.fit(X, y)

# Make probability predictions on the test set
test_probabilities = knn.predict(X_test)

# Save test predictions with probabilities to CSV
submission_df = pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_probabilities})
submission_df.to_csv('submissions.csv', index=False)

print("Predictions saved to submissions.csv")

Optimal k: 20, Best CV Score: 0.9037303647377491
Predictions saved to submissions.csv
