In [123]:
import numpy as np
import pandas as pd

In [124]:
# Define the KNN class
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # TODO: Implement the fit method
        self.X_train = np.array(X)
        self.y_train = np.array(y)

    def predict(self, X):
        # TODO: Implement the predict method
        predictions = []
        for x in X:
            distances = self.compute_distance(x, self.X_train)
            k_indices = np.argsort(distances)[:self.k]
            k_nearest_labels = self.y_train[k_indices]
            prob_class_1 = np.sum(k_nearest_labels == 1) / self.k
            predictions.append(prob_class_1)  # Returning the probability for class 1
        
        return np.array(predictions)  # Array of probabilities

   

    def compute_distance(self, X1, X2):
        # TODO: Implement distance computation based on self.distance_metric
        # Hint: Use numpy operations for efficient computation
        X1 = np.atleast_2d(X1).astype(np.float64)
        X2 = np.atleast_2d(X2).astype(np.float64)

        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X2 - X1) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X2 - X1), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [125]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # TODO: Implement data preprocessing
    # Handle categorical variables, scale features, etc.
    y_train = train_data['Exited']
    
    X_train = train_data.drop(columns=['Exited', 'id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'])

    # Handle missing values only for numeric columns
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns
    X_train[numeric_cols] = X_train[numeric_cols].fillna(X_train[numeric_cols].mean())
    test_data[numeric_cols] = test_data[numeric_cols].fillna(test_data[numeric_cols].mean())
    
    # Combine X_train and test_data for consistent preprocessing
    full_data = pd.concat([X_train, test_data], axis=0)
    
    # Handle categorical variables (One-hot encode 'Geography' and 'Gender')
    full_data = pd.get_dummies(full_data, columns=['Geography', 'Gender'], drop_first=True)
    
    # Ensure there are no remaining non-numeric columns
    non_numeric_columns = full_data.select_dtypes(include=[object]).columns
    assert non_numeric_columns.empty, f"There are still non-numeric columns: {non_numeric_columns}"
    
    # Manually scale numerical features using mean and standard deviation
    numeric_cols = full_data.select_dtypes(include=[np.number]).columns
    means = full_data[numeric_cols].mean()
    stds = full_data[numeric_cols].std()
    full_data[numeric_cols] = (full_data[numeric_cols] - means) / stds
    
    # Ensure no NaN values exist after preprocessing
    assert not full_data.isnull().values.any(), "There are NaN values in the dataset!"
    
    # Separate the preprocessed train and test data
    X_train = full_data.iloc[:len(train_data), :]
    X_test = full_data.iloc[len(train_data):, :]
    
    return X_train.values, y_train.values, X_test.values

In [126]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # TODO: Implement cross-validation
    # Compute ROC AUC scores
    X = np.array(X)
    y = np.array(y)

    # Shuffle the data before splitting into folds
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    
    # Determine the size of each fold
    fold_size = len(X) // n_splits
    
    # Initialize an array to store the AUC scores
    auc_scores = []
    
    # Loop through each fold
    for fold in range(n_splits):
        # Determine the start and end of the test indices for the current fold
        test_start = fold * fold_size
        test_end = test_start + fold_size if fold != n_splits - 1 else len(X)
        
        # Split the data into training and testing sets
        X_test = X[test_start:test_end]
        y_test = y[test_start:test_end]
        
        X_train = np.concatenate([X[:test_start], X[test_end:]], axis=0)
        y_train = np.concatenate([y[:test_start], y[test_end:]], axis=0)
        
        # Fit the KNN model on the training data
        knn.fit(X_train, y_train)
        
        # Predict the labels (for KNN we use class predictions directly)
        y_pred = knn.predict(X_test)
        
        # Compute ROC AUC score for the current fold manually
        auc = compute_roc_auc(y_test, y_pred)
        auc_scores.append(auc)
    
    # Return the AUC scores as a numpy array
    return np.array(auc_scores)


def compute_roc_auc(y_true, y_pred):
    """
    Compute the ROC AUC score manually using numpy.
    
    Parameters:
    y_true: numpy array of true labels (0 or 1)
    y_pred: numpy array of predicted scores or labels (0 or 1)
    
    Returns:
    roc_auc: float, the computed ROC AUC score
    """
    # Ensure inputs are numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Sort by predicted values (descending order)
    sorted_indices = np.argsort(y_pred)[::-1]
    y_true = y_true[sorted_indices]

    # Compute true positive rate (TPR) and false positive rate (FPR)
    n_pos = np.sum(y_true == 1)
    n_neg = np.sum(y_true == 0)
    
    if n_pos == 0 or n_neg == 0:
        return 0.5  # Undefined AUC when no positive or negative class exists
    
    tpr = np.cumsum(y_true) / n_pos  # True positive rate
    fpr = np.cumsum(1 - y_true) / n_neg  # False positive rate

    # Compute AUC as the area under the ROC curve
    roc_auc = np.trapz(tpr, fpr)
    return roc_auc

In [127]:
# Load and preprocess data
X, y, X_test = preprocess_data('CS506 Customer Churn KNN/train.csv', 'CS506 Customer Churn KNN/test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
def grid_search_knn(X_train, y_train, k_values, distance_metrics):
    best_k = None
    best_metric = None
    best_score = 0
    
    for k in k_values:
        for metric in distance_metrics:
            knn = KNN(k=k, distance_metric=metric)
            scores = cross_validate(X_train, y_train, knn)
            mean_score = np.mean(scores)
            
            if mean_score > best_score:
                best_score = mean_score
                best_k = k
                best_metric = metric
                
    print(f"Best k: {best_k}, Best distance metric: {best_metric}, Best score: {best_score}")
    return best_k, best_metric

k_values = [3, 5, 7, 9, 11]
distance_metrics = ['euclidean', 'manhattan']

# Perform grid search
best_k, best_metric = grid_search_knn(X, y, k_values, distance_metrics)


# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('CS506 Customer Churn KNN/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

Cross-validation scores: [0.8621671  0.86634166 0.88624942 0.88135714 0.89269228]
Best k: 11, Best distance metric: manhattan, Best score: 0.9003462062883862


# AdaBoost KNN #

In [128]:
class AdaBoostKNN:
    def __init__(self, base_learner, n_estimators=50):
        self.base_learner = base_learner
        self.n_estimators = n_estimators
        self.alphas = []
        self.learners = []

    def fit(self, X, y):
        # Initialize weights equally
        n_samples = len(y)
        weights = np.ones(n_samples) / n_samples

        for i in range(self.n_estimators):
            # Train a new weak learner (KNN) with the current weights
            learner = self.base_learner()
            learner.fit(X, y)
            y_pred = learner.predict(X)
            
            # Compute weighted error rate
            misclassified = (y_pred != y)
            error = np.sum(weights * misclassified) / np.sum(weights)

            # Compute alpha (classifier's contribution to the final model)
            alpha = 0.5 * np.log((1 - error) / (error + 1e-10))  # Add small constant to avoid division by zero

            # Update weights: increase for misclassified samples
            weights *= np.exp(-alpha * y * (2 * y_pred - 1))  # y * (-1 or 1) for misclassification
            
            # Normalize weights
            weights /= np.sum(weights)

            # Store this learner and its alpha
            self.learners.append(learner)
            self.alphas.append(alpha)

    def predict(self, X):
        # Initialize predictions to zero
        final_predictions = np.zeros(X.shape[0])

        for alpha, learner in zip(self.alphas, self.learners):
            y_pred = learner.predict(X)
            final_predictions += alpha * (2 * y_pred - 1)  # Use (2 * y_pred - 1) to map {0, 1} -> {-1, 1}

        # Return final predictions
        return np.where(final_predictions >= 0, 1, 0)

In [129]:
def grid_search_adaboost_knn(X_train, y_train, k_values, n_estimators_values, distance_metrics):
    best_k = None
    best_metric = None
    best_n_estimators = None
    best_score = 0
    
    # Iterate over all combinations of k, distance metrics, and n_estimators
    for k in k_values:
        for metric in distance_metrics:
            for n_estimators in n_estimators_values:
                # Initialize AdaBoost with KNN as the base learner
                ada_knn = AdaBoostKNN(base_learner=lambda: KNN(k=k, distance_metric=metric), n_estimators=n_estimators)
                
                # Perform cross-validation and compute the mean score
                scores = cross_validate(X_train, y_train, ada_knn)
                mean_score = np.mean(scores)
                
                # Update the best parameters if current score is better
                if mean_score > best_score:
                    best_score = mean_score
                    best_k = k
                    best_metric = metric
                    best_n_estimators = n_estimators
    
    # Print the best combination of parameters
    print(f"Best k: {best_k}, Best distance metric: {best_metric}, Best n_estimators: {best_n_estimators}, Best score: {best_score}")
    
    return best_k, best_metric, best_n_estimators

In [None]:
X, y, X_test = preprocess_data('CS506 Customer Churn KNN/train.csv', 'CS506 Customer Churn KNN/test.csv')

k_values = [3, 5, 7, 9, 11]
n_estimators_values = [10, 50, 100]
distance_metrics = ['euclidean', 'manhattan']

best_k, best_metric, best_n_estimators = grid_search_adaboost_knn(X, y, k_values, n_estimators_values, distance_metrics)

# Train the model on the full dataset with optimal hyperparameters
ada_knn = AdaBoostKNN(base_learner=lambda: KNN(k=best_k, distance_metric=best_metric), n_estimators=best_n_estimators)
ada_knn.fit(X, y)

# Make predictions on the test set
test_predictions = ada_knn.predict(X_test)

# Save the test predictions
pd.DataFrame({'id': pd.read_csv('CS506 Customer Churn KNN/test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)