In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# KNN scorer: predicts the (optionally distance-weighted) mean of the k nearest labels
class KNN:
    """Brute-force k-nearest-neighbours scorer.

    ``predict`` returns, for every query row, the mean of the labels of the
    ``k`` closest training rows (optionally weighted by inverse distance).
    The result is a continuous score, not a hard class label.

    Parameters
    ----------
    k : int
        Number of neighbours to average over. Must be >= 1.
    distance_metric : {'euclidean', 'manhattan', 'minkowski'}
        Distance used to rank training rows.
    p : float
        Exponent of the Minkowski distance; only read when
        ``distance_metric == 'minkowski'``.
    weights : {'uniform', 'distance'}
        ``'uniform'`` takes a plain mean of the neighbour labels;
        ``'distance'`` weights each label by ``1 / (distance + 1e-5)``.
    """

    _METRICS = ('euclidean', 'manhattan', 'minkowski')
    _WEIGHTS = ('uniform', 'distance')

    def __init__(self, k=3, distance_metric='euclidean', p=2, weights='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.p = p
        self.weights = weights

    def fit(self, X, y):
        """Store copies of the training rows ``X`` and their labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not have the same number of rows.
        """
        features = np.array(X)
        labels = np.array(y)
        if len(features) != len(labels):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(features)} != {len(labels)}"
            )
        self.X_train = features
        self.y_train = labels

    def compute_distances(self, X):
        """Return the ``(n_queries, n_train)`` matrix of pairwise distances.

        Raises
        ------
        ValueError
            If ``distance_metric`` is not one of the supported names.
        """
        if self.distance_metric not in self._METRICS:
            raise ValueError("Unsupported distance metric")

        # Work in float so list/DataFrame input is accepted and unsigned
        # integer features cannot wrap around during the subtraction.
        queries = np.asarray(X, dtype=float)
        train = np.asarray(self.X_train, dtype=float)

        # Broadcast to (n_queries, n_train, n_features).
        diff = queries[:, np.newaxis] - train

        if self.distance_metric == 'euclidean':
            return np.sqrt((diff ** 2).sum(axis=2))
        if self.distance_metric == 'manhattan':
            return np.abs(diff).sum(axis=2)
        return np.sum(np.abs(diff) ** self.p, axis=2) ** (1 / self.p)

    def predict(self, X):
        """Return one score per row of ``X``: the (weighted) mean neighbour label.

        Raises
        ------
        ValueError
            If ``k < 1``, or if ``weights`` / ``distance_metric`` is not a
            supported option.
        """
        # Validate up front: previously an unknown ``weights`` value fell
        # through both branches and silently returned None.
        if self.weights not in self._WEIGHTS:
            raise ValueError("Unsupported weights option")
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        distances = self.compute_distances(X)
        # A stable sort makes tie-breaking deterministic: among equidistant
        # training rows the one with the lower index wins.
        nearest_indices = np.argsort(distances, axis=1, kind='stable')[:, :self.k]
        nearest_labels = self.y_train[nearest_indices]

        if self.weights == 'uniform':
            return np.mean(nearest_labels, axis=1)

        nearest_distances = np.take_along_axis(distances, nearest_indices, axis=1)
        # The small epsilon keeps the weight finite for exact matches.
        weights = 1 / (nearest_distances + 1e-5)
        return np.sum(weights * nearest_labels, axis=1) / np.sum(weights, axis=1)

# Preprocessing
def preprocess_data(train_path, test_path):
    """Read the train/test CSV files and turn them into scaled feature matrices.

    The identifier columns (``id``, ``CustomerId``, ``Surname``) are dropped,
    ``Geography`` and ``Gender`` are label-encoded, and every remaining column
    is standardized. Encoders and the scaler are fitted on the training
    frame only and then applied to the test frame.

    Parameters
    ----------
    train_path, test_path
        Locations of the CSV files, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X_train_scaled, y_train, X_test_scaled, test_ids)`` where the
        scaled matrices come from ``StandardScaler``, ``y_train`` is the
        ``Exited`` column of the training file and ``test_ids`` is the ``id``
        column of the test file.
    """
    raw_train = pd.read_csv(train_path)
    raw_test = pd.read_csv(test_path)

    identifier_cols = ['id', 'CustomerId', 'Surname']
    target = raw_train['Exited']
    train_features = raw_train.drop(columns=identifier_cols + ['Exited'])
    test_features = raw_test.drop(columns=identifier_cols)

    # One encoder per categorical column, fitted on the training values.
    for column in ('Geography', 'Gender'):
        encoder = LabelEncoder()
        train_features[column] = encoder.fit_transform(train_features[column])
        test_features[column] = encoder.transform(test_features[column])

    # KNN is distance based, so put all features on a comparable scale.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_features)
    test_scaled = scaler.transform(test_features)

    return train_scaled, target, test_scaled, raw_test['id']

# Cross-validation with Stratified K-Folds
def cross_validate(X, y, knn, n_splits=5):
    """Score ``knn`` with stratified k-fold cross-validation using ROC AUC.

    Parameters
    ----------
    X : array-like
        Feature rows.
    y : array-like
        Labels, one per row of ``X``.
    knn
        Model exposing ``fit(X, y)`` and ``predict(X)``; ``predict`` must
        return one score per row. It is re-fitted on every fold.
    n_splits : int
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    tuple
        ``(mean_auc, fold_aucs)``: the mean ROC AUC over the folds and the
        list of per-fold scores.
    """
    # The fold indices are row *positions*. Indexing a pandas Series with
    # ``y[idx]`` is label based (and a DataFrame with ``X[idx]`` selects
    # columns), so convert both to plain arrays before slicing.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits)
    roc_auc_scores = []

    for train_index, val_index in skf.split(X, y):
        knn.fit(X[train_index], y[train_index])
        y_prob = knn.predict(X[val_index])
        roc_auc_scores.append(roc_auc_score(y[val_index], y_prob))

    return np.mean(roc_auc_scores), roc_auc_scores

# Hyperparameter tuning with early stopping once mean CV AUC >= auc_threshold (default 0.9)
def tune_hyperparameters(X, y, X_test, test_ids, max_k=20, auc_threshold=0.9):
    """Grid-search KNN settings by 5-fold cross-validated AUC.

    Every combination of distance metric, weighting scheme and
    ``k in 1..max_k`` is evaluated in turn. As soon as one combination's mean
    AUC is at least ``auc_threshold``, that model is re-fitted on all of
    ``X``/``y``, its predictions for ``X_test`` are written to a CSV named
    after the combination, and the search stops. If no combination reaches
    the threshold, no file is written.

    NOTE(review): 'minkowski' is evaluated with KNN's default ``p``; with
    the default ``p=2`` it computes the same distances as 'euclidean'.

    Parameters
    ----------
    X, y
        Training features and labels.
    X_test
        Features to predict for when the threshold is reached.
    test_ids
        Values written to the ``id`` column of the submission file.
    max_k : int
        Largest neighbour count to try.
    auc_threshold : float
        Mean AUC at which the search stops and predictions are saved.

    Returns
    -------
    tuple
        ``(best_k, best_metric, best_weights, best_score)`` for the best
        combination seen before the search ended.
    """
    metrics = ['euclidean', 'manhattan', 'minkowski']
    weights = ['uniform', 'distance']

    # (k, metric, weights, score) of the best combination seen so far.
    best = (1, 'euclidean', 'uniform', 0)

    # Same visiting order as three nested loops: metric, then weight, then k.
    search_space = (
        (metric, weight, k)
        for metric in metrics
        for weight in weights
        for k in range(1, max_k + 1)
    )

    for metric, weight, k in search_space:
        knn = KNN(k=k, distance_metric=metric, weights=weight)
        mean_auc, _ = cross_validate(X, y, knn, n_splits=5)
        print(f"K={k}, Metric={metric}, Weights={weight}, AUC={mean_auc:.4f}")

        if mean_auc > best[3]:
            best = (k, metric, weight, mean_auc)

        # Threshold reached: save this model's predictions and stop.
        if mean_auc >= auc_threshold:
            print(f"AUC={mean_auc:.4f} exceeds threshold {auc_threshold}, stopping early and saving predictions...")
            knn.fit(X, y)
            submission = pd.DataFrame({'id': test_ids, 'Exited': knn.predict(X_test)})
            submission_path = f'submissions_k{k}_metric_{metric}_weights_{weight}.csv'
            submission.to_csv(submission_path, index=False)
            print(f"Predictions saved to 'submissions_k{k}_metric_{metric}_weights_{weight}.csv'")
            return best

    return best

# Load 'train.csv' and 'test.csv' (relative paths) and build the scaled
# feature matrices, the training labels and the test ids.
X, y, X_test, test_ids = preprocess_data('train.csv', 'test.csv')

# Search over KNN settings. tune_hyperparameters itself writes a submission
# CSV and returns early if some setting's mean CV AUC reaches its threshold
# (default 0.9); otherwise only the best setting found is reported below.
best_k, best_metric, best_weights, best_score = tune_hyperparameters(X, y, X_test, test_ids)
print(f"Best K: {best_k}, Best Metric: {best_metric}, Best Weights: {best_weights}, Best AUC: {best_score}")


K=1, Metric=euclidean, Weights=uniform, AUC=0.7545
K=2, Metric=euclidean, Weights=uniform, AUC=0.8180
K=3, Metric=euclidean, Weights=uniform, AUC=0.8463
K=4, Metric=euclidean, Weights=uniform, AUC=0.8648
K=5, Metric=euclidean, Weights=uniform, AUC=0.8746
