In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
print("Current Working Directory:", os.getcwd())
print("Files in Current Directory:", os.listdir(os.getcwd()))


Current Working Directory: /Users/kelvinyeung/Documents/cs506/assign5
Files in Current Directory: ['test.csv', 'assignment5_starter.ipynb', 'train.csv', '.ipynb_checkpoints', 'sample_submission.csv']


In [3]:
test_data = pd.read_csv('test.csv')
train_data = pd.read_csv('train.csv')


# Print the column names
print("Columns in test.csv:", test_data.columns.tolist())
print("Columns in train.csv:", train_data.columns.tolist())


Columns in test.csv: ['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
Columns in train.csv: ['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']


In [4]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        # Initialize the class with the number of neighbors and the distance metric
        self.k = k
        self.metric = distance_metric

    def fit(self, X_train, y_train):
        # Store the training data
        self.train_data = np.asarray(X_train)
        self.train_labels = np.asarray(y_train)

    def predict(self, X_test):
        # Initialize a list to store probability predictions
        probabilities = []
        
        # Iterate over each test sample
        for sample in X_test:
            # Compute distances between the test sample and all training samples
            distances = self._get_distances(sample, self.train_data)
            
            # Find the indices of the k nearest neighbors
            nearest_indices = self._find_k_neighbors(distances)
            
            # Get the labels of the k nearest neighbors
            neighbors_labels = self.train_labels[nearest_indices]
            
            # Calculate the proportion of class 1 among the neighbors (probability for class 1)
            prob = np.mean(neighbors_labels == 1)
            probabilities.append(prob)  # Store the probability
        
        # Return an array of probabilities for class 1
        return np.array(probabilities)

    def _get_distances(self, x_test, X_train):
        # This helper function computes the distances between a test sample and all training samples
        if self.metric == 'euclidean':
            # Euclidean distance: square root of the sum of squared differences
            return np.sqrt(np.sum((X_train - x_test) ** 2, axis=1))
        elif self.metric == 'manhattan':
            # Manhattan distance: sum of absolute differences
            return np.sum(np.abs(X_train - x_test), axis=1)
        else:
            raise ValueError(f"Distance metric {self.metric} not supported.")

    def _find_k_neighbors(self, distances):
        # Get the indices of the k smallest distances (k nearest neighbors)
        return np.argsort(distances)[:self.k]



In [5]:
def preprocess_data(train_path, test_path):
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Target variable for the train data
    target = train_data['Exited']
    
    # Drop unnecessary columns from train and test data
    test_features = test_data.drop(columns=['id', 'CustomerId', 'Surname'])
    train_features = train_data.drop(columns=['Exited', 'id', 'CustomerId', 'Surname'])
    
    # Handle missing values for numeric columns
    numeric_cols = train_features.select_dtypes(include=[np.number]).columns
    train_features[numeric_cols] = train_features[numeric_cols].fillna(train_features[numeric_cols].mean())
    test_features[numeric_cols] = test_features[numeric_cols].fillna(test_features[numeric_cols].mean())
    
    # Concatenate train and test data for consistent preprocessing
    combined_features = pd.concat([train_features, test_features], axis=0)
    
    # One-hot encode categorical variables such as 'Geography' and 'Gender'
    combined_features = pd.get_dummies(combined_features, columns=['Geography', 'Gender'], drop_first=True)
    
    # Ensure no non-numeric columns remain
    remaining_non_numeric = combined_features.select_dtypes(include=[object]).columns
    assert remaining_non_numeric.empty, f"Non-numeric columns detected: {remaining_non_numeric}"
    
    # Standardize numeric columns using mean and standard deviation
    numeric_columns = combined_features.select_dtypes(include=[np.number]).columns
    column_means = combined_features[numeric_columns].mean()
    column_stds = combined_features[numeric_columns].std()
    combined_features[numeric_columns] = (combined_features[numeric_columns] - column_means) / column_stds
    
    # Ensure there are no missing values after preprocessing
    assert not combined_features.isnull().any().any(), "NaN values detected in the preprocessed data."
    
    # Split the processed data back into train and test sets
    X_train_final = combined_features.iloc[:len(train_data), :]
    X_test_final = combined_features.iloc[len(train_data):, :]
    
    # Return processed train data, target, and test data
    return X_train_final.values, target.values, X_test_final.values



In [6]:
# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    # Convert inputs to numpy arrays
    X = np.array(X)
    y = np.array(y)

    # Shuffle the dataset before splitting into n_splits folds
    shuffled_indices = np.arange(len(X))
    np.random.shuffle(shuffled_indices)
    X = X[shuffled_indices]
    y = y[shuffled_indices]

    # Calculate the size of each fold
    fold_size = len(X) // n_splits

    # Array to store the AUC scores for each fold
    auc_scores = []

    # Perform K-fold cross-validation
    for fold in range(n_splits):
        # Define the indices for the test set for the current fold
        test_start = fold * fold_size
        test_end = test_start + fold_size if fold != n_splits - 1 else len(X)

        # Split the data into training and test sets
        X_test_fold = X[test_start:test_end]
        y_test_fold = y[test_start:test_end]
        
        X_train_fold = np.concatenate([X[:test_start], X[test_end:]], axis=0)
        y_train_fold = np.concatenate([y[:test_start], y[test_end:]], axis=0)

        # Train the KNN model on the training set
        knn.fit(X_train_fold, y_train_fold)

        # Predict probabilities or class scores for the test set
        y_pred_fold = knn.predict(X_test_fold)

        # Compute ROC AUC for the current fold
        auc_value = compute_roc_auc(y_test_fold, y_pred_fold)
        auc_scores.append(auc_value)

    # Return all ROC AUC scores as a numpy array
    return np.array(auc_scores)


# Define ROC AUC calculation function
def compute_roc_auc(y_true, y_pred):
    """
    Compute the ROC AUC score manually using numpy.
    
    Parameters:
    y_true: numpy array of true labels (0 or 1)
    y_pred: numpy array of predicted scores or labels (0 or 1)
    
    Returns:
    roc_auc: float, the computed ROC AUC score
    """
    # Ensure inputs are numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Sort by predicted values (descending order)
    sorted_indices = np.argsort(y_pred)[::-1]
    sorted_true = y_true[sorted_indices]

    # Calculate number of positive and negative labels
    n_pos = np.sum(sorted_true == 1)
    n_neg = np.sum(sorted_true == 0)

    # Handle cases where there are no positive or no negative labels
    if n_pos == 0 or n_neg == 0:
        return 0.5  # Return 0.5 if no positive or negative examples

    # Compute cumulative sums for TPR and FPR
    tpr = np.cumsum(sorted_true) / n_pos  # True Positive Rate
    fpr = np.cumsum(1 - sorted_true) / n_neg  # False Positive Rate

    # Compute and return AUC using the trapezoidal rule (in one line)
    return np.trapz(tpr, fpr)




In [7]:
# Load and preprocess data
X, y, X_test = preprocess_data('./train.csv', './test.csv')

# Create and evaluate model
knn = KNN(k=5, distance_metric='euclidean')

# Perform cross-validation
cv_scores = cross_validate(X, y, knn)

print("Cross-validation scores:", cv_scores)

# TODO: hyperparamters tuning
def tune_hyperparameters(X, y, k_values, distance_metrics):
    best_score = 0
    best_k = None
    best_metric = None
    
    for k in k_values:
        for metric in distance_metrics:
            knn = KNN(k=k, distance_metric=metric)
            cv_scores = cross_validate(X, y, knn)  # This returns an array of scores
            mean_cv_score = np.mean(cv_scores)  # Compute the mean of the cross-validation scores
            print(f"k={k}, metric={metric}, Mean CV Score={mean_cv_score}")
            
            # Compare the mean score with the current best score
            if mean_cv_score > best_score:
                best_score = mean_cv_score
                best_k = k
                best_metric = metric
    
    return best_k, best_metric




# Hyperparameter tuning (trying different values of k and metrics)
k_values = [3, 5, 7, 9]  # You can expand this range as needed
distance_metrics = ['euclidean', 'manhattan']  # Try different metrics

best_k, best_metric = tune_hyperparameters(X, y, k_values, distance_metrics)

# TODO: Train on full dataset with optimal hyperparameters and make predictions on test set
knn = KNN(k=best_k, distance_metric=best_metric)
knn.fit(X, y)
test_predictions = knn.predict(X_test)

# Save test predictions
pd.DataFrame({'id': pd.read_csv('./test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)


Cross-validation scores: [0.8772697  0.88087318 0.88325544 0.87561336 0.87730473]
k=3, metric=euclidean, Mean CV Score=0.8480572042323239
k=3, metric=manhattan, Mean CV Score=0.8452072051893795
k=5, metric=euclidean, Mean CV Score=0.8695421188551787
k=5, metric=manhattan, Mean CV Score=0.8732697653540586
k=7, metric=euclidean, Mean CV Score=0.8838304993897287
k=7, metric=manhattan, Mean CV Score=0.8853425052057358
k=9, metric=euclidean, Mean CV Score=0.8870391551160942
k=9, metric=manhattan, Mean CV Score=0.8884834804801199
