In [2]:
import numpy as np
import pandas as pd

In [30]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        X = np.array(X)
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        if self.distance_metric == 'euclidean':
            distances = np.sqrt(np.sum((self.X_train - x) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            distances = np.sum(np.abs(self.X_train - x), axis=1)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = self.y_train[k_indices]
        most_common = pd.Series(k_nearest_labels).mode()[0]
        return most_common

In [12]:
# Define data preprocessing function

def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Handle categorical variables using one-hot encoding
    train_data = pd.get_dummies(train_data)
    test_data = pd.get_dummies(test_data)

    # Align the columns of the train and test datasets (to ensure they have the same columns)
    train_data, test_data = train_data.align(test_data, join='left', axis=1, fill_value=0)

    # Scale numerical features
    numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns

    for col in numerical_cols:
        if col != "Exited":
            mean = train_data[col].mean()
            std = train_data[col].std()
            train_data[col] = (train_data[col] - mean) / std
            test_data[col] = (test_data[col] - mean) / std

    X = train_data
    y = train_data

    return X, y, test_data



In [31]:
def cross_validate(X, y, knn, n_splits=5):
    X, y = np.array(X), np.array(y)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    fold_size = len(X) // n_splits
    auc_scores = []

    for i in range(n_splits):
        test_indices = indices[i * fold_size: (i + 1) * fold_size]
        train_indices = np.concatenate([indices[:i * fold_size], indices[(i + 1) * fold_size:]])
        
        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[test_indices], y[test_indices]

        knn.fit(X_train, y_train)
        y_prob = knn.predict(X_val)

        auc_score = compute_roc_auc(y_val, y_prob)
        auc_scores.append(auc_score)

        return np.mean(auc_scores)

def compute_roc_auc(y_true, y_pred):
        sorted_indices = np.argsort(y_pred)
        y_true_sorted = y_true[sorted_indices]
        y_pred_sorted = y_pred[sorted_indices]

        tprs = np.cumsum(y_true_sorted == 1) / np.sum(y_true_sorted == 1)
        fprs = np.cumsum(y_true_sorted == 0) / np.sum(y_true_sorted == 0)
        
        tprs = np.concatenate([[0], tprs, [1]])
        fprs = np.concatenate([[0], fprs, [1]])
        
        auc = np.trapz(tprs, fprs)
        return auc

In [32]:
# Load and preprocess data
X, y, X_test = preprocess_data("train.csv", "test.csv")

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)



TypeError: loop of ufunc does not support argument 0 of type float which has no callable sqrt method

In [None]:
print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
best_k = 0
best_distance_metric = ''
best_auc_score = 0

for k in [3, 5, 7]:
    for metric in ['euclidean', 'manhattan']:
        knn = KNN(k=k, distance_metric=metric)
        auc_score = cross_validate(X, y, knn)
        if auc_score > best_auc_score:
            best_auc_score = auc_score
            best_k = k
            best_distance_metric = metric

print(f"Best k: {best_k}, Best distance metric: {best_distance_metric}, Best AUC score: {best_auc_score}")

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_distance_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)