In [None]:
import numpy as np
import pandas as pd

# Locations of the training and test CSV files, relative to the working directory.
train_path = 'train.csv'
test_path = 'test.csv'

In [None]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Class labels must be non-negative integers because votes are tallied
    with ``np.bincount``; the predicted label is the index of the largest
    vote.

    Parameters
    ----------
    k : int or None
        Number of neighbours that vote. ``None`` lets every training point
        vote (it is used directly as a slice bound).
    distance_metric : {'euclidean', 'manhattan'}
        Distance used to rank the training points.
    weights : {'uniform', 'distance'}
        ``'uniform'`` gives each neighbour one vote; ``'distance'`` weights
        each vote by the inverse of the neighbour's distance.
    """

    def __init__(self, k=3, distance_metric='euclidean', weights='uniform'):
        self.k = k
        self.distance_metric = distance_metric
        self.weights = weights

    def fit(self, X, y):
        """Memorise the training set.

        Inputs are converted to NumPy arrays so that pandas objects with a
        non-default index are addressed by position, not by label.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` have different lengths.
        """
        self.x_train = np.asarray(X, dtype=float)
        self.y_train = np.asarray(y)
        if len(self.x_train) != len(self.y_train):
            raise ValueError(
                f"X and y have inconsistent lengths: "
                f"{len(self.x_train)} != {len(self.y_train)}"
            )

    def compute_distance(self, X1, X2):
        """Return the distance from ``X1`` to every row of ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not supported.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        raise ValueError(f"Unsupported distance metric: {self.distance_metric}")

    def _neighbor_weights(self, neighbor_distances):
        """Vote weights for the selected neighbours (``None`` means one vote each)."""
        if self.weights == 'uniform':
            return None
        exact_match = neighbor_distances == 0
        if exact_match.any():
            # 1 / 0 would yield inf weights and a RuntimeWarning; let the
            # exact matches decide the vote on their own instead.
            return exact_match.astype(float)
        return 1.0 / neighbor_distances

    def _vote_matrix(self, X):
        """Tally the (weighted) votes; one row per query, one column per label."""
        if self.weights not in ('uniform', 'distance'):
            raise ValueError(f"Unsupported weights: {self.weights}")
        if self.k is not None and self.k < 1:
            raise ValueError(f"k must be at least 1, got {self.k}")

        queries = np.asarray(X, dtype=float)
        # bincount indexes votes by label value, so labels 0..max need a column.
        n_classes = int(self.y_train.max()) + 1
        votes = np.zeros((len(queries), n_classes))
        for row, x in enumerate(queries):
            distances = self.compute_distance(x, self.x_train)
            k_indices = np.argsort(distances)[:self.k]
            votes[row] = np.bincount(
                self.y_train[k_indices],
                weights=self._neighbor_weights(distances[k_indices]),
                minlength=n_classes,
            )
        return votes

    def predict(self, X):
        """Return the winning label for every row of ``X``.

        Ties go to the smallest label (first maximum of the vote tally).

        Raises
        ------
        ValueError
            If ``weights``, ``distance_metric`` or ``k`` is invalid.
        """
        return np.argmax(self._vote_matrix(X), axis=1)

    def predict_proba(self, X):
        """Return each label's share of the neighbour votes.

        The result has shape ``(n_queries, max_training_label + 1)``;
        column ``j`` is the normalised vote for label ``j``.
        """
        votes = self._vote_matrix(X)
        return votes / votes.sum(axis=1, keepdims=True)


In [None]:

def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and turn them into scaled feature matrices.

    Steps:
      1. read both files and drop the ``id``, ``CustomerId`` and ``Surname``
         columns when present;
      2. split the ``Exited`` column off the training frame as the target;
      3. one-hot encode the categorical columns;
      4. standardise both sets with the training mean and standard deviation.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files; the training file must contain an ``Exited`` column.

    Returns
    -------
    tuple of np.ndarray
        ``(X_train_scaled, y_train, X_test_scaled)``; both feature matrices
        share the training columns, in the same order.
    """
    identifier_columns = ['id', 'CustomerId', 'Surname']
    train_data = pd.read_csv(train_path).drop(columns=identifier_columns, errors='ignore')
    test_data = pd.read_csv(test_path).drop(columns=identifier_columns, errors='ignore')

    X_train = train_data.drop('Exited', axis=1)
    y_train = train_data['Exited']

    X_train = pd.get_dummies(X_train, drop_first=True, dtype=float)
    # Encode every test category and let reindex select the training columns.
    # Using drop_first on the test frame as well could drop a *different*
    # category than in train (e.g. when the test set lacks train's first
    # category), after which reindex would silently encode a real category
    # as all zeros.
    X_test = pd.get_dummies(test_data, dtype=float)
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    X_train_mean = X_train.mean()
    # A constant column has std 0; dividing by it would fill the column with
    # NaN/inf. Dividing by 1 leaves the centred column at 0 instead.
    X_train_std = X_train.std().replace(0, 1.0)

    X_train_scaled = (X_train - X_train_mean) / X_train_std
    X_test_scaled = (X_test - X_train_mean) / X_train_std

    return X_train_scaled.to_numpy(), y_train.to_numpy(), X_test_scaled.to_numpy()


In [None]:
def roc_auc(y_true, y_pred_prob):
    """Area under the ROC curve for binary labels (1 = positive, 0 = negative).

    Samples that share a score are moved across the threshold together, so
    the result does not depend on the order of tied rows. This matters most
    when hard 0/1 predictions are passed as scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, 0 or 1.
    y_pred_prob : array-like
        Scores for the positive class; higher means more likely positive.

    Returns
    -------
    float
        The AUC, or ``nan`` when ``y_true`` lacks positives or negatives
        (the curve is undefined in that case).

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_prob, dtype=float).ravel()
    if len(labels) != len(scores):
        raise ValueError(
            f"y_true and y_pred_prob have inconsistent lengths: "
            f"{len(labels)} != {len(scores)}"
        )

    n_positive = np.sum(labels == 1)
    n_negative = np.sum(labels == 0)
    if n_positive == 0 or n_negative == 0:
        return float('nan')

    sorted_indices = np.argsort(-scores)
    scores_sorted = scores[sorted_indices]
    y_true_sorted = labels[sorted_indices]

    # One ROC point per distinct score: keep only the last row of each run of
    # equal scores, so tied samples cross the threshold at the same time.
    group_ends = np.r_[np.flatnonzero(np.diff(scores_sorted)), len(scores_sorted) - 1]
    true_positives = np.cumsum(y_true_sorted == 1)[group_ends]
    false_positives = np.cumsum(y_true_sorted == 0)[group_ends]

    # Start the curve at the origin (threshold above every score).
    tpr = np.r_[0.0, true_positives / n_positive]
    fpr = np.r_[0.0, false_positives / n_negative]

    # Trapezoidal rule written out by hand: np.trapz is deprecated in NumPy 2.
    return float(np.sum(np.diff(fpr) * (tpr[1:] + tpr[:-1]) / 2.0))

def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Estimate the ROC AUC of ``knn`` with shuffled k-fold cross-validation.

    Every sample is used for validation exactly once: when ``len(X)`` is not
    a multiple of ``n_splits`` the first folds get one extra sample.

    Parameters
    ----------
    X, y : array-like
        Features and binary labels; converted to NumPy arrays so rows are
        selected by position.
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``. If it also exposes
        ``predict_proba(X)``, the positive-class column of that output is
        scored instead of the hard labels.
    n_splits : int
        Number of folds, between 2 and ``len(X)``.
    random_state : int or None
        Seed for the shuffle. ``None`` draws from NumPy's global random
        state, so ``np.random.seed`` set by the caller still applies.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(f"X and y have inconsistent lengths: {n_samples} != {len(y)}")
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples ({n_samples}), got {n_splits}"
        )

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # array_split spreads the remainder over the first folds instead of
    # leaving the trailing samples permanently in the training set.
    folds = np.array_split(indices, n_splits)
    roc_auc_scores = []

    for fold, val_idx in enumerate(folds):
        train_idx = np.concatenate(folds[:fold] + folds[fold + 1:])

        knn.fit(X[train_idx], y[train_idx])
        y_pred_prob = _positive_class_scores(knn, X[val_idx])

        roc_auc_scores.append(roc_auc(y[val_idx], y_pred_prob))

    return np.mean(roc_auc_scores)


def _positive_class_scores(model, X_val):
    """Return one score per validation row, preferring probabilities over labels."""
    predict_proba = getattr(model, 'predict_proba', None)
    if not callable(predict_proba):
        return np.asarray(model.predict(X_val))

    proba = np.asarray(predict_proba(X_val), dtype=float)
    if proba.ndim == 1:
        return proba
    if proba.shape[1] < 2:
        # No column for label 1 (it was absent from the training fold).
        return np.zeros(len(proba))
    return proba[:, 1]


In [None]:
def find_bestk(X, y, max_k=20):
    """Pick the odd ``k`` in ``1..max_k`` with the best cross-validation score.

    Only odd values are tried. Each candidate's score is printed as it is
    computed.

    Parameters
    ----------
    X, y : array-like
        Features and labels passed through to ``cross_validate``.
    max_k : int
        Largest neighbour count to consider (inclusive).

    Returns
    -------
    tuple
        ``(best_k, best_score)``.

    Raises
    ------
    ValueError
        If ``max_k`` is smaller than 1.
    """
    if max_k < 1:
        raise ValueError(f"max_k must be at least 1, got {max_k}")

    best_k = None
    best_score = -np.inf
    # Replay the same global RNG state before every candidate so that, as long
    # as cross_validate shuffles with NumPy's global generator, each k is
    # scored on identical folds and score differences reflect k, not the split.
    rng_state = np.random.get_state()
    for k in range(1, max_k + 1, 2):
        np.random.set_state(rng_state)
        knn = KNN(k=k, distance_metric='euclidean')
        cv_score = cross_validate(X, y, knn)
        print(f'k={k}, Cross-validation score={cv_score}')
        if cv_score > best_score:
            best_score = cv_score
            best_k = k

    return best_k, best_score

# Seed NumPy's global generator so the shuffled cross-validation folds, and
# therefore the printed scores and the selected k, are the same on every run.
np.random.seed(42)

# Load and preprocess data (paths come from the configuration cell)
X, y, X_test = preprocess_data(train_path, test_path)

# Create and evaluate a baseline model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation (cross_validate returns the mean score over folds)
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# Hyperparameter tuning: pick the best k by cross-validation
best_k, best_score = find_bestk(X, y)


# Train on the full dataset with the selected k and predict the test set
knn = KNN(k=best_k)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions; 'id' was dropped during preprocessing, so it is re-read here
pd.DataFrame({'id': pd.read_csv(test_path)['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: 0.7634035693891222
k=1, Cross-validation score=0.7517532585475412
k=3, Cross-validation score=0.7589690946556742
k=5, Cross-validation score=0.7561874840792383
k=7, Cross-validation score=0.7666173213392347
k=9, Cross-validation score=0.7671586558179382
