In [1]:
import numpy as np
import pandas as pd

In [11]:
import numpy as np

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    k : int
        Number of neighbours that vote on each prediction. If ``k`` exceeds
        the number of training samples, every training sample votes.
    distance_metric : {'euclidean', 'manhattan'}
        Metric used by :meth:`compute_distances`.
    batch_size : int
        Number of query rows processed per distance computation; bounds the
        size of the (batch, n_train) distance matrix.
    """

    def __init__(self, k=3, distance_metric='euclidean', batch_size=500):
        self.k = k
        self.distance_metric = distance_metric
        self.batch_size = batch_size

    def fit(self, X, y):
        """Store the training data.

        Inputs are converted to NumPy arrays so lists, DataFrames/Series and
        object-dtype matrices (e.g. mixed float/bool columns) all work.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Return the majority label among the k nearest training samples.

        Ties are resolved in favour of the smallest label (in sorted order).
        """
        classes, votes = self._neighbor_votes(X)
        return classes[np.argmax(votes, axis=1)]

    def predict_proba(self, X):
        """Return the fraction of neighbour votes per class.

        Columns follow the sorted unique training labels; the result has
        shape (n_samples, n_classes) and each row sums to 1.
        """
        _, votes = self._neighbor_votes(X)
        return votes / votes.sum(axis=1, keepdims=True)

    def _neighbor_votes(self, X):
        """Count, for every query row, how many of its k neighbours fall in each class."""
        if self.k < 1:
            raise ValueError("k must be at least 1")
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")

        # Encode labels as 0..n_classes-1 so the vote can be tallied with array ops.
        classes, codes = np.unique(np.asarray(self.y_train).ravel(), return_inverse=True)
        n_train = codes.shape[0]
        if n_train == 0:
            raise ValueError("the model has no training samples")
        k = min(self.k, n_train)
        class_ids = np.arange(classes.shape[0])

        votes = np.zeros((X.shape[0], classes.shape[0]), dtype=int)
        for start in range(0, X.shape[0], self.batch_size):
            stop = start + self.batch_size
            distances = self.compute_distances(X[start:stop])
            if k < n_train:
                # Only the identity of the k closest points matters for voting,
                # so a partial selection replaces the full sort.
                nearest = np.argpartition(distances, k - 1, axis=1)[:, :k]
            else:
                nearest = np.tile(np.arange(n_train), (distances.shape[0], 1))
            neighbor_codes = codes[nearest]
            votes[start:stop] = (neighbor_codes[:, :, np.newaxis] == class_ids).sum(axis=1)
        return classes, votes

    def compute_distances(self, X_batch):
        """Return the (n_batch, n_train) matrix of pairwise distances.

        Raises
        ------
        ValueError
            If the metric is unknown or the feature counts do not match.
        """
        X_batch = np.asarray(X_batch, dtype=float)
        X_train = np.asarray(self.X_train, dtype=float)
        if X_batch.ndim != 2 or X_batch.shape[1] != X_train.shape[1]:
            raise ValueError("X_batch must be 2-D with the same number of features as the training data")

        if self.distance_metric == 'euclidean':
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b  -- avoids materialising the
            # (n_batch, n_train, n_features) difference tensor.
            sq_batch = np.sum(X_batch ** 2, axis=1)[:, np.newaxis]
            sq_train = np.sum(X_train ** 2, axis=1)[np.newaxis, :]
            sq_dists = sq_batch + sq_train - 2.0 * (X_batch @ X_train.T)
            # Rounding can leave tiny negative values for (near-)identical points.
            np.maximum(sq_dists, 0.0, out=sq_dists)
            dists = np.sqrt(sq_dists)
        elif self.distance_metric == 'manhattan':
            # Accumulate one feature at a time so only (n_batch, n_train) is held in memory.
            dists = np.zeros((X_batch.shape[0], X_train.shape[0]))
            for feature in range(X_train.shape[1]):
                dists += np.abs(X_batch[:, feature, np.newaxis] - X_train[np.newaxis, :, feature])
        else:
            raise ValueError("Invalid distance metric")
        return dists


In [3]:
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and build aligned, scaled feature matrices.

    Steps: impute missing values, one-hot encode the categorical columns,
    drop identifier columns, and min-max scale the numerical columns. All
    statistics (medians, modes, min/max) come from the training data only and
    are re-used for the test data, so both sets are transformed identically.

    Parameters
    ----------
    train_path, test_path : path-like
        CSV files; the training file must contain the 'Exited' target column.

    Returns
    -------
    X : ndarray of float, shape (n_train, n_features)
    y : ndarray, shape (n_train,)
    X_test : ndarray of float, shape (n_test, n_features)
        Same columns, in the same order, as ``X``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
    categorical_features = ['Geography', 'Gender']

    # Imputation values are learned on the training set and applied to both sets.
    numeric_fill = train_data[numerical_features].median()
    categorical_fill = train_data[categorical_features].mode().iloc[0]

    train_data[numerical_features] = train_data[numerical_features].fillna(numeric_fill)
    test_data[numerical_features] = test_data[numerical_features].fillna(numeric_fill)
    train_data[categorical_features] = train_data[categorical_features].fillna(categorical_fill)
    test_data[categorical_features] = test_data[categorical_features].fillna(categorical_fill)

    # dtype=float keeps the dummies numeric (recent pandas defaults to bool,
    # which would turn the final matrix into an object array).
    train_data = pd.get_dummies(train_data, columns=categorical_features, dtype=float)
    test_data = pd.get_dummies(test_data, columns=categorical_features, dtype=float)

    y = train_data['Exited']
    X = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
    # NOTE(review): an 'id' column is assumed to be a row identifier (the
    # submission snippet in this notebook reads test.csv['id']); it is dropped
    # when present so it is not used as an unscaled feature -- confirm against
    # the data schema.
    X = X.drop(columns=['id'], errors='ignore')

    # Align the test features to exactly the training feature columns: dummy
    # columns missing from the test set are filled with 0, and the target /
    # identifier columns never enter X_test.
    X_test = test_data.reindex(columns=X.columns, fill_value=0)

    # Min-max scaling with training statistics; a constant column has zero
    # range, which is replaced by 1 so it maps to 0 instead of NaN.
    min_values = X[numerical_features].min()
    value_range = (X[numerical_features].max() - min_values).replace(0, 1)

    X[numerical_features] = (X[numerical_features] - min_values) / value_range
    X_test[numerical_features] = (X_test[numerical_features] - min_values) / value_range

    return X.to_numpy(dtype=float), y.to_numpy(), X_test.to_numpy(dtype=float)

In [4]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    """Evaluate a classifier with contiguous k-fold cross-validation.

    The samples are split, in their given order, into ``n_splits`` folds whose
    sizes differ by at most one, so every sample is used for validation
    exactly once. For each fold the model is re-fitted on the remaining
    samples and scored by accuracy on the held-out fold.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
    y : array-like, shape (n_samples,)
    knn : object
        Any model exposing ``fit(X, y)`` and ``predict(X)``.
    n_splits : int
        Number of folds; must satisfy ``2 <= n_splits <= n_samples``.

    Returns
    -------
    list of float
        Accuracy on each validation fold, in fold order.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``n_splits`` is out of range.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")
    if not 2 <= n_splits <= n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    scores = []
    # array_split spreads the remainder over the first folds instead of
    # silently leaving the trailing samples out of every validation set.
    for val_idx in np.array_split(np.arange(n_samples), n_splits):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        knn.fit(X[train_mask], y[train_mask])
        predictions = np.asarray(knn.predict(X[val_idx]))

        scores.append(float(np.mean(predictions == y[val_idx])))

    return scores

In [10]:
# Input files for this run.
# NOTE(review): these are absolute, machine-specific paths -- consider a
# configurable data directory so the notebook runs on other machines.
TRAIN_CSV = r'C:\Users\julis\projects\jmijares-assignment-5\data\train.csv'
TEST_CSV = r'C:\Users\julis\projects\jmijares-assignment-5\data\test.csv'

# Read both CSV files and turn them into feature matrices plus the label vector
X, y, X_test = preprocess_data(TRAIN_CSV, TEST_CSV)

# Baseline classifier: 5 neighbours, Euclidean distance
knn = KNN(k=5, distance_metric='euclidean')

# Score the baseline with cross-validation (default number of splits)
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: once the best k is known, train on the full dataset and predict the test set
#knn = KNN(k=best_k, distance_metric='euclidean')
#knn.fit(X, y)
#test_predictions = knn.predict(X_test)

# TODO: write the test predictions to the submission file
# pd.DataFrame({'id': pd.read_csv('/path/of/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

KeyboardInterrupt: 

In [None]:
# Hyperparameter tuning: choose the k with the highest mean cross-validation score.
# A candidate replaces the current best only on a strictly higher score, so the
# earliest k wins ties and the defaults below stand if no score exceeds 0.
k_values = [3, 5, 7, 9]
best_k, best_score = 3, 0

for k in k_values:
    knn = KNN(k=k, distance_metric='euclidean')
    cv_scores = cross_validate(X, y, knn)
    avg_score = np.mean(cv_scores)
    if not avg_score > best_score:
        continue
    best_k, best_score = k, avg_score

print(f"Best k: {best_k}, Cross-validation score: {best_score}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Hold-out evaluation of the selected k.
# X_val / y_val only ever existed as locals inside cross_validate, so this cell
# builds its own reproducible 80/20 split and fits a fresh model on the
# training part instead of relying on leftover kernel state.
holdout_rng = np.random.default_rng(42)
shuffled_idx = holdout_rng.permutation(len(X))
n_val = max(1, len(X) // 5)
val_idx, train_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

X_val, y_val = X[val_idx], y[val_idx]

knn = KNN(k=best_k, distance_metric='euclidean')
knn.fit(X[train_idx], y[train_idx])

y_pred = knn.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print("Precision:", precision_score(y_val, y_pred))
print("Recall:", recall_score(y_val, y_pred))
print("F1 Score:", f1_score(y_val, y_pred))
# NOTE: computed from hard class predictions, so this reflects a single
# operating point rather than a ranking over scores.
print("ROC AUC:", roc_auc_score(y_val, y_pred))