In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
from numba import njit, prange

train_data = pd.read_csv('/kaggle/input/506-data/train.csv')
test_data = pd.read_csv('/kaggle/input/506-data/test.csv')

features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 
            'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 
            'EstimatedSalary']

X = train_data[features]
y = train_data['Exited']

num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 
                'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
cat_features = ['Geography', 'Gender']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(), cat_features)
    ])

X_preprocessed = preprocessor.fit_transform(X)

X_preprocessed = np.array(X_preprocessed)
y = np.array(y)

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y)

@njit(parallel=True)
def compute_distances(X_train, X_test):
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    distances = np.empty((n_test, n_train))
    for i in prange(n_test):
        for j in range(n_train):
            distances[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j]) ** 2)) 
    return distances

@njit(parallel=True)
def knn_predict_proba(X_train, y_train, X_test, k):
    distances = compute_distances(X_train, X_test)
    n_test = X_test.shape[0]
    y_prob = np.empty(n_test, dtype=np.float64)

    for i in prange(n_test):
        neighbors = np.argsort(distances[i])[:k] 
        top_k_labels = y_train[neighbors]
        
        y_prob[i] = np.sum(top_k_labels) / k 

    return y_prob

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
k_values = [3, 5, 7, 9, 11] 

best_k = 0
best_auc = 0
for k in k_values:
    auc_scores = []
    for train_index, val_index in kf.split(X_resampled, y_resampled):
        X_train_fold, X_val_fold = X_resampled[train_index], X_resampled[val_index]
        y_train_fold, y_val_fold = y_resampled[train_index], y_resampled[val_index]

        y_val_prob_fold = knn_predict_proba(X_train_fold, y_train_fold, X_val_fold, k)

        auc_score_fold = roc_auc_score(y_val_fold, y_val_prob_fold)
        auc_scores.append(auc_score_fold)

    mean_auc_score = np.mean(auc_scores)
    print(f'Mean AUC score for k={k}: {mean_auc_score}')
    
    if mean_auc_score > best_auc:
        best_auc = mean_auc_score
        best_k = k

print(f'Best k value: {best_k} with AUC: {best_auc}')

X_test = preprocessor.transform(test_data[features])
X_test = np.array(X_test)
y_test_probabilities = knn_predict_proba(X_resampled, y_resampled, X_test, best_k)

submission = pd.DataFrame({
    'id': pd.read_csv('/kaggle/input/506-data/test.csv')['id'],
    'Exited': y_test_probabilities 
})
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Mean AUC score for k=3: 0.946858399201633
Mean AUC score for k=5: 0.9514259714598758
Mean AUC score for k=7: 0.9517133862694649
Mean AUC score for k=9: 0.9503997013621059
Mean AUC score for k=11: 0.9488785889373771
Best k value: 7 with AUC: 0.9517133862694649
Submission file saved as 'submission.csv'.
