In [11]:
import numpy as np
import pandas as pd

In [12]:
class KNN:
    """Distance-weighted k-nearest-neighbours classifier for binary (0/1) labels.

    Each neighbour votes with weight ``class_weight[label] / (distance + 1e-5)``
    and ``predict_proba`` returns the share of that weighted vote belonging to
    class ``1``.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query. When ``k`` is larger than
        the number of stored training samples, all samples are used.
    distance_metric : {'euclidean', 'manhattan'}
        Distance used to rank the training samples.
    class_weight : dict or None
        Mapping ``label -> weight``. ``None`` (or an empty mapping) falls back
        to ``{0: 1, 1: 1}``.
    """

    def __init__(self, k=3, distance_metric='euclidean', class_weight=None):
        self.k = k
        self.distance_metric = distance_metric
        self.class_weight = class_weight if class_weight else {0: 1, 1: 1}
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training samples (as float) and labels (as int)."""
        self.X_train = np.array(X, dtype=float)
        self.y_train = np.array(y, dtype=int)

    def predict_proba(self, X):
        """Return the weighted probability of class ``1`` for every row of ``X``.

        Raises
        ------
        ValueError
            If the model has not been fitted (or was fitted with no samples),
            if ``k`` is smaller than 1, or if ``distance_metric`` is not
            supported.
        """
        if self.X_train is None or self.y_train is None or len(self.X_train) == 0:
            raise ValueError("KNN is not fitted: call fit() with at least one sample first")
        if self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")
        if self.distance_metric not in ('euclidean', 'manhattan'):
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

        X = np.array(X, dtype=float)
        n_train = self.X_train.shape[0]
        # argpartition requires kth < n_train, so never ask for more
        # neighbours than there are training samples.
        n_neighbors = min(self.k, n_train)

        probabilities = []
        for x_test in X:
            diff = self.X_train - x_test
            if self.distance_metric == 'euclidean':
                distances = np.sqrt(np.sum(diff ** 2, axis=1))
            else:
                distances = np.sum(np.abs(diff), axis=1)

            if n_neighbors < n_train:
                k_indices = np.argpartition(distances, n_neighbors)[:n_neighbors]
            else:
                # Every training sample is a neighbour; no partitioning needed.
                k_indices = np.arange(n_train)
            k_nearest_labels = self.y_train[k_indices]
            k_nearest_distances = distances[k_indices]

            # The small epsilon keeps exact matches (distance 0) finite.
            weights = 1 / (k_nearest_distances + 1e-5)
            label_weights = np.array([self.class_weight[label] for label in k_nearest_labels])
            votes = weights * label_weights
            weighted_sum = np.sum(votes)
            prob_class_1 = np.sum(np.where(k_nearest_labels == 1, votes, 0)) / weighted_sum

            probabilities.append(prob_class_1)

        return np.array(probabilities)

In [13]:
def custom_roc_auc_score(y_true, y_pred):
    """Return the area under the ROC curve produced by ``roc_curve``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_pred : array-like of scores, higher meaning "more likely class 1"

    Returns
    -------
    The absolute value of the trapezoidal integral of TPR over FPR.
    """
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # fall back to the old name on older NumPy versions.
    integrate = np.trapezoid if hasattr(np, "trapezoid") else np.trapz
    auc = integrate(tpr, fpr)
    # roc_curve walks the thresholds in ascending order, so FPR runs from
    # high to low and the signed integral comes out negative.
    return abs(auc)

def roc_curve(y_true, y_pred):
    """Compute ROC curve points for binary labels.

    One point is produced per distinct score, classifying a sample as positive
    when ``y_pred >= threshold``. Thresholds are visited in ascending order, so
    the curve starts at (FPR, TPR) = (1, 1). A final point (0, 0) with
    threshold ``inf`` ("nothing is predicted positive") closes the curve;
    without it the area between the origin and the highest-score point is
    lost whenever that top tie group contains negatives.

    Parameters
    ----------
    y_true : array-like of 0/1 labels
    y_pred : array-like of scores

    Returns
    -------
    (fpr, tpr, thresholds) : three arrays of equal length.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    is_positive = y_true == 1
    is_negative = y_true == 0
    total_positives = np.sum(is_positive)
    total_negatives = np.sum(is_negative)

    thresholds = np.unique(y_pred)  # sorted ascending

    fpr = []
    tpr = []
    for threshold in thresholds:
        predicted_positive = y_pred >= threshold
        true_positives = np.sum(is_positive & predicted_positive)
        false_positives = np.sum(is_negative & predicted_positive)

        # A class with no samples contributes a flat rate of 0.
        tpr.append(true_positives / total_positives if total_positives > 0 else 0)
        fpr.append(false_positives / total_negatives if total_negatives > 0 else 0)

    # Close the curve at the origin.
    fpr.append(0.0)
    tpr.append(0.0)
    thresholds = np.append(thresholds, np.inf)

    return np.array(fpr), np.array(tpr), thresholds


In [14]:
def robust_scale(X):
    """Centre each column on its median and divide by its interquartile range.

    Columns whose IQR is exactly zero are returned unchanged so that the
    division never hits zero.

    Parameters
    ----------
    X : 2-D array-like of numeric values (rows = samples, columns = features)

    Returns
    -------
    A new float array with the same shape as ``X``; the input is not modified.
    """
    # Force a float copy: with an integer input the scaled values would be
    # silently truncated when written back into the array.
    X_scaled = np.array(X, dtype=float)
    medians = np.median(X_scaled, axis=0)
    q1 = np.percentile(X_scaled, 25, axis=0)
    q3 = np.percentile(X_scaled, 75, axis=0)
    iqr = q3 - q1

    non_zero_iqr = iqr != 0
    X_scaled[:, non_zero_iqr] = (X_scaled[:, non_zero_iqr] - medians[non_zero_iqr]) / iqr[non_zero_iqr]

    return X_scaled

def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and return scaled feature matrices plus labels.

    Steps:
      1. Drop the identifier columns 'CustomerId', 'Surname' and 'id'.
      2. One-hot encode 'Geography' and 'Gender' (first category dropped),
         using the category list seen in the training data for BOTH frames so
         the same baseline category is dropped and the columns line up.
      3. Robust-scale every feature (median / IQR). The statistics come from
         the training set only and are applied to both sets, so train and
         test live in the same feature space.

    Parameters
    ----------
    train_path : path of the training CSV; must contain an 'Exited' column.
    test_path : path of the test CSV.

    Returns
    -------
    (X_train_scaled, y_train, X_test_scaled)
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    columns_to_drop = ['CustomerId', 'Surname', 'id']
    train_data = train_data.drop(columns=columns_to_drop)
    test_data = test_data.drop(columns=columns_to_drop)

    categorical_cols = ['Geography', 'Gender']
    for col in categorical_cols:
        # Encoding each frame independently with drop_first=True would drop a
        # different baseline if a category were absent from one of them.
        shared_dtype = pd.CategoricalDtype(categories=sorted(train_data[col].dropna().unique()))
        train_data[col] = train_data[col].astype(shared_dtype)
        test_data[col] = test_data[col].astype(shared_dtype)

    train_data = pd.get_dummies(train_data, columns=categorical_cols, drop_first=True)
    test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

    # Same columns, same order as the training features; anything missing is 0.
    feature_columns = train_data.columns.drop('Exited')
    test_data = test_data.reindex(columns=feature_columns, fill_value=0)

    # Explicit float conversion: mixed bool/int/float frames would otherwise
    # come back as object arrays.
    X_train = train_data[feature_columns].to_numpy(dtype=float)
    X_test = test_data.to_numpy(dtype=float)
    y_train = train_data['Exited'].values

    medians = np.median(X_train, axis=0)
    iqr = np.percentile(X_train, 75, axis=0) - np.percentile(X_train, 25, axis=0)
    # Columns with zero IQR are left unscaled (centre 0, scale 1).
    has_spread = iqr != 0
    center = np.where(has_spread, medians, 0.0)
    scale = np.where(has_spread, iqr, 1.0)

    X_train_scaled = (X_train - center) / scale
    X_test_scaled = (X_test - center) / scale

    return X_train_scaled, y_train, X_test_scaled


def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate the mean ROC AUC of ``knn`` with shuffled k-fold cross-validation.

    Parameters
    ----------
    X : array-like, one row per sample
    y : array-like of labels, aligned with ``X``
    knn : object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is refitted
        on every fold, so it ends up fitted on the last training split.
    n_splits : int
        Number of folds. Fold sizes differ by at most one sample, so every
        sample is used for validation exactly once.
    random_state : int or None
        Seed for the shuffle. ``None`` (default) uses NumPy's global random
        state, exactly like an unseeded ``np.random.shuffle``.

    Returns
    -------
    The mean of the per-fold AUC scores.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 2 or larger than the number of samples.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.RandomState(random_state).shuffle(indices)

    # array_split spreads the remainder over the first folds instead of
    # silently leaving the trailing samples out of validation.
    folds = np.array_split(indices, n_splits)
    aucs = []

    for i, val_idx in enumerate(folds):
        train_idx = np.concatenate(folds[:i] + folds[i + 1:])

        knn.fit(X[train_idx], y[train_idx])
        y_pred_proba = knn.predict_proba(X[val_idx])

        aucs.append(custom_roc_auc_score(y[val_idx], y_pred_proba))

    return np.mean(aucs)

In [16]:
def hyperparameter_tuning(X, y, k_values=None, distance_metrics=None, class_weights_options=None):
    """Grid-search KNN settings by cross-validated AUC and return the best model.

    Every combination of ``k``, distance metric and class weights is scored
    with ``cross_validate``; one line per combination is printed.

    Parameters
    ----------
    X, y : training features and labels, passed straight to ``cross_validate``.
    k_values : iterable of int, default ``[151, 153]``
    distance_metrics : iterable of str, default ``['euclidean', 'manhattan']``
    class_weights_options : iterable of dict, default weights 1, 2, 3 and 5
        for class 1 against weight 1 for class 0.

    Returns
    -------
    The ``KNN`` instance with the highest mean AUC, or ``None`` if no
    combination produced a comparable score (e.g. an empty grid). The
    returned model was last fitted on a cross-validation split, so refit it
    on the full data before predicting.
    """
    if k_values is None:
        k_values = [151, 153]
    if distance_metrics is None:
        distance_metrics = ['euclidean', 'manhattan']
    if class_weights_options is None:
        class_weights_options = [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 5}]

    best_knn = None
    # Start below any attainable score so that a run where every AUC is 0
    # still selects a model instead of returning None.
    best_auc = -np.inf

    for k in k_values:
        for distance in distance_metrics:
            for class_weight in class_weights_options:
                knn = KNN(k=k, distance_metric=distance, class_weight=class_weight)
                auc_score = cross_validate(X, y, knn)
                print(f"K={k}, Metric={distance}, Class Weights={class_weight}, AUC={auc_score:.4f}")
                if auc_score > best_auc:
                    best_auc = auc_score
                    best_knn = knn

    return best_knn

# Input locations of the training and test CSV files.
train_path = '/content/train.csv'
test_path = '/content/test.csv'

# Build the scaled feature matrices and the training labels.
X, y, X_test = preprocess_data(train_path, test_path)

# Pick the best configuration by cross-validation, then refit it on all of
# the training data before scoring the test set.
best_knn = hyperparameter_tuning(X, y)
best_knn.fit(X, y)
test_probabilities = best_knn.predict_proba(X_test)

# Pair each test id (re-read from the CSV) with its predicted probability
# and write the result to disk.
submission = pd.DataFrame(
    {
        'id': pd.read_csv(test_path)['id'],
        'Exited_Probability': test_probabilities,
    }
)
submission.to_csv('submissions_probabilities.csv', index=False)


K=151, Metric=euclidean, Class Weights={0: 1, 1: 1}, AUC=0.9127
K=151, Metric=euclidean, Class Weights={0: 1, 1: 2}, AUC=0.9134
K=151, Metric=euclidean, Class Weights={0: 1, 1: 3}, AUC=0.9125
K=151, Metric=euclidean, Class Weights={0: 1, 1: 5}, AUC=0.9124
K=151, Metric=manhattan, Class Weights={0: 1, 1: 1}, AUC=0.9152
K=151, Metric=manhattan, Class Weights={0: 1, 1: 2}, AUC=0.9154
K=151, Metric=manhattan, Class Weights={0: 1, 1: 3}, AUC=0.9159
K=151, Metric=manhattan, Class Weights={0: 1, 1: 5}, AUC=0.9152
K=153, Metric=euclidean, Class Weights={0: 1, 1: 1}, AUC=0.9131
K=153, Metric=euclidean, Class Weights={0: 1, 1: 2}, AUC=0.9122
K=153, Metric=euclidean, Class Weights={0: 1, 1: 3}, AUC=0.9127
K=153, Metric=euclidean, Class Weights={0: 1, 1: 5}, AUC=0.9126
K=153, Metric=manhattan, Class Weights={0: 1, 1: 1}, AUC=0.9150
K=153, Metric=manhattan, Class Weights={0: 1, 1: 2}, AUC=0.9152
K=153, Metric=manhattan, Class Weights={0: 1, 1: 3}, AUC=0.9154
K=153, Metric=manhattan, Class Weights={