In [1]:
import pandas as pd
import numpy as np
from collections import Counter # for counting elements
from numba import njit, prange # for increased speed precompiling code

train_data = pd.read_csv('/kaggle/input/506-data/train.csv')
test_data = pd.read_csv('/kaggle/input/506-data/test.csv')

features = ['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 
            'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 
            'EstimatedSalary']
X = train_data[features]
y = train_data['Exited']

num_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 
                'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
cat_features = ['Geography', 'Gender']

class StandardScalerCustom:
    def fit(self, X):
        self.mean_ = np.mean(X, axis=0)
        self.std_ = np.std(X, axis=0)
        
    def transform(self, X):
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

class OneHotEncoderCustom:
    def fit(self, X):
        self.categories_ = [np.unique(X[:, i]) for i in range(X.shape[1])]
    
    def transform(self, X):
        one_hot_encoded = []
        for i in range(X.shape[1]):
            one_hot_col = np.zeros((X.shape[0], len(self.categories_[i])))
            for j, category in enumerate(self.categories_[i]):
                one_hot_col[:, j] = (X[:, i] == category).astype(float)
            one_hot_encoded.append(one_hot_col)
        return np.hstack(one_hot_encoded)
    
    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

class ColumnTransformerCustom:
    def __init__(self, transformers):
        self.transformers = transformers
    
    def fit_transform(self, X):
        transformed_columns = []
        for name, transformer, columns in self.transformers:
            X_subset = X[columns].values
            transformed_columns.append(transformer.fit_transform(X_subset))
        return np.hstack(transformed_columns)
    
    def transform(self, X):
        transformed_columns = []
        for name, transformer, columns in self.transformers:
            X_subset = X[columns].values
            transformed_columns.append(transformer.transform(X_subset))
        return np.hstack(transformed_columns)

class StratifiedKFoldCustom:
    def __init__(self, n_splits, shuffle=False, random_state=None):
        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state
        
    def split(self, X, y):
        labels, counts = np.unique(y, return_counts=True)
        folds = [[] for _ in range(self.n_splits)]
        label_indices = {label: np.where(y == label)[0] for label in labels}
        
        if self.shuffle:
            np.random.seed(self.random_state)
            for label in labels:
                np.random.shuffle(label_indices[label])
        
        for label in labels:
            label_count = len(label_indices[label])
            for fold_idx in range(self.n_splits):
                start_idx = fold_idx * (label_count // self.n_splits)
                end_idx = (fold_idx + 1) * (label_count // self.n_splits)
                folds[fold_idx].extend(label_indices[label][start_idx:end_idx])
                
        for i in range(self.n_splits):
            train_idx = np.hstack([folds[j] for j in range(self.n_splits) if j != i])
            val_idx = np.array(folds[i])
            yield train_idx, val_idx

class SMOTECustom:
    def __init__(self, random_state=None):
        self.random_state = random_state
        if random_state is not None:
            np.random.seed(random_state)

    def fit_resample(self, X, y):
        counts = Counter(y)
        max_class = max(counts, key=counts.get)
        minority_class = min(counts, key=counts.get)

        minority_idx = np.where(y == minority_class)[0]
        majority_idx = np.where(y == max_class)[0]

        n_samples_to_generate = len(majority_idx) - len(minority_idx)
        synthetic_samples = []

        while len(synthetic_samples) < n_samples_to_generate:
            idx = np.random.choice(minority_idx)
            # Find k nearest neighbors of the selected minority sample
            distances = np.linalg.norm(X[minority_idx] - X[idx], axis=1)
            nearest_neighbors_idx = np.argsort(distances)[1:]  # Exclude the sample itself

            # Randomly select one of the nearest neighbors
            neighbor_idx = np.random.choice(minority_idx[nearest_neighbors_idx])
            diff = X[neighbor_idx] - X[idx]
            new_sample = X[idx] + np.random.rand() * diff
            synthetic_samples.append(new_sample)

        X_resampled = np.vstack((X, synthetic_samples))
        y_resampled = np.hstack((y, [minority_class] * len(synthetic_samples)))

        return X_resampled, y_resampled

def roc_auc_score_custom(y_true, y_score):
    pos_label = 1
    desc_score_indices = np.argsort(y_score)[::-1]
    y_true = np.array(y_true)[desc_score_indices]
    y_score = np.array(y_score)[desc_score_indices]
    
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    tps = np.cumsum(y_true == pos_label)[threshold_idxs]
    fps = 1 + threshold_idxs - tps
    
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    
    fpr = fps / fps[-1]
    tpr = tps / tps[-1]
    
    return np.trapz(tpr, fpr)

# KNN Algorithm
@njit(parallel=True)
def compute_distances(X_train, X_test):
    n_train, n_test = X_train.shape[0], X_test.shape[0]
    distances = np.empty((n_test, n_train))
    for i in prange(n_test):
        for j in range(n_train):
            distances[i, j] = np.sqrt(np.sum((X_test[i] - X_train[j]) ** 2)) 
    return distances

@njit(parallel=True)
def knn_predict_proba(X_train, y_train, X_test, k):
    distances = compute_distances(X_train, X_test)
    n_test = X_test.shape[0]
    y_prob = np.empty(n_test, dtype=np.float64)

    for i in prange(n_test):
        neighbors = np.argsort(distances[i])[:k] 
        top_k_labels = y_train[neighbors]
        
        y_prob[i] = np.sum(top_k_labels) / k 

    return y_prob

preprocessor = ColumnTransformerCustom(
    transformers=[
        ('num', StandardScalerCustom(), num_features),
        ('cat', OneHotEncoderCustom(), cat_features)
    ]
)

X_preprocessed = preprocessor.fit_transform(X)

smote = SMOTECustom(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_preprocessed, y)

kf = StratifiedKFoldCustom(n_splits=5, shuffle=True, random_state=42)
k_values = [3, 5, 7, 9, 11]

best_k = 0
best_auc = 0
for k in k_values:
    auc_scores = []
    for train_index, val_index in kf.split(X_resampled, y_resampled):
        X_train_fold, X_val_fold = X_resampled[train_index], X_resampled[val_index]
        y_train_fold, y_val_fold = y_resampled[train_index], y_resampled[val_index]

        y_val_prob_fold = knn_predict_proba(X_train_fold, y_train_fold, X_val_fold, k)
        
        auc_score_fold = roc_auc_score_custom(y_val_fold, y_val_prob_fold)
        auc_scores.append(auc_score_fold)
    
    mean_auc_score = np.mean(auc_scores)
    print(f'Mean AUC score for k={k}: {mean_auc_score}')
    
    if mean_auc_score > best_auc:
        best_auc = mean_auc_score
        best_k = k

print(f'Best k value: {best_k} with AUC: {best_auc}')

X_test = preprocessor.transform(test_data[features])
X_test = np.array(X_test)

y_test_probabilities = knn_predict_proba(X_resampled, y_resampled, X_test, best_k)

submission = pd.DataFrame({
    'id': test_data['id'],
    'Exited': y_test_probabilities 
})
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'.")


Mean AUC score for k=3: 0.942572945292973
Mean AUC score for k=5: 0.949471828003707
Mean AUC score for k=7: 0.9521628150359847
Mean AUC score for k=9: 0.9521915937782733
Mean AUC score for k=11: 0.9513777560928247
Best k value: 9 with AUC: 0.9521915937782733
Submission file saved as 'submission.csv'.
