In [16]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from scipy import stats

In [17]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours scorer.

    ``predict`` returns, for every query row, the mean label of its ``k``
    closest training rows. With 0/1 labels this is the estimated probability
    of the positive class, not a hard class prediction.

    Parameters
    ----------
    k : int
        Number of neighbours consulted per query row (must be >= 1).
    distance_metric : str
        ``'euclidean'``, ``'manhattan'`` or ``'minkowski'``. The value is only
        checked when a distance is actually computed.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    @staticmethod
    def _to_dense_array(X):
        """Coerce an array-like (ndarray, DataFrame, list, scipy sparse) to a float ndarray."""
        if hasattr(X, 'toarray'):
            # scipy sparse matrices expose ``toarray``; the row-wise loop in
            # ``predict`` and numpy broadcasting need a dense array.
            X = X.toarray()
        return np.asarray(X, dtype=float)

    def fit(self, X, y):
        """Store the training data; no model is built ahead of time.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (ndarray, DataFrame, nested list or sparse matrix).
        y : array-like of length n_samples
            Training labels. A pandas Series keeps its name and dtype but gets
            a fresh 0..n-1 index so it can be addressed by position.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or ``X`` and ``y`` disagree on the
            number of samples.
        """
        if self.k < 1:
            raise ValueError("k must be a positive integer")

        X_train = self._to_dense_array(X)
        if isinstance(y, pd.Series):
            y_train = y.reset_index(drop=True)
        else:
            # Plain arrays / lists have no ``reset_index``; wrap them so that
            # ``predict`` can keep using positional ``.iloc`` lookups.
            y_train = pd.Series(np.asarray(y).ravel())

        if X_train.shape[0] != len(y_train):
            raise ValueError("X and y must contain the same number of samples")

        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X):
        """Return the mean neighbour label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_queries, n_features)

        Returns
        -------
        numpy.ndarray of shape (n_queries,)
            Mean label of the ``k`` nearest training rows per query row.
        """
        queries = self._to_dense_array(X)
        probabilities = []
        for x in queries:
            # Distance from this query row to every training row.
            distances = self.compute_distance(self.X_train, x)
            # Positions of the k closest training rows.
            neighbor_indices = np.argsort(distances)[:self.k]
            neighbor_labels = self.y_train.iloc[neighbor_indices]
            probabilities.append(np.mean(neighbor_labels))
        return np.array(probabilities)

    def compute_distance(self, X1, X2, p=5):
        """Row-wise distance between the rows of ``X1`` and the vector ``X2``.

        Parameters
        ----------
        X1 : array-like of shape (n_samples, n_features)
        X2 : array-like broadcastable against ``X1`` (typically one row)
        p : int or float
            Order of the Minkowski distance; ignored by the other metrics.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        metric = self.distance_metric
        if metric not in ('euclidean', 'manhattan', 'minkowski'):
            raise ValueError("Unsupported distance metric")

        # Subtract in floating point: unsigned integer inputs would otherwise
        # wrap around (e.g. uint8 0 - 1 == 255) and corrupt the distances.
        diff = np.asarray(X1, dtype=float) - np.asarray(X2, dtype=float)

        if metric == 'euclidean':
            return np.sqrt(np.sum(diff ** 2, axis=1))
        if metric == 'manhattan':
            return np.sum(np.abs(diff), axis=1)
        return np.sum(np.abs(diff) ** p, axis=1) ** (1 / p)
        

In [30]:
# Define data preprocessing function
def _add_engineered_features(features):
    """Return a copy of ``features`` with the three derived columns appended.

    Added columns:
    - ``BalanceToSalaryRatio``: ``Balance / EstimatedSalary``; +/-inf from a
      zero salary becomes NaN so the numeric imputer can fill it.
    - ``Geo_IsActive``: string combination of ``Geography`` and ``IsActiveMember``.
    - ``Age_Balance_Interaction``: ``Age * Balance``.
    """
    ratio = features['Balance'] / features['EstimatedSalary']
    return features.assign(
        BalanceToSalaryRatio=ratio.replace([np.inf, -np.inf], np.nan),
        Geo_IsActive=features['Geography'].astype(str) + "_" + features['IsActiveMember'].astype(str),
        Age_Balance_Interaction=features['Age'] * features['Balance'],
    )


def preprocess_data(train_path, test_path):
    """Load the train/test CSVs and turn them into model-ready feature matrices.

    Parameters
    ----------
    train_path : str or path-like
        CSV with the feature columns plus ``CustomerId``, ``Surname`` and the
        target column ``Exited``.
    test_path : str or path-like
        CSV with the same feature columns plus ``CustomerId`` and ``Surname``.

    Returns
    -------
    X_train_processed : numpy.ndarray
        Scaled numeric and one-hot encoded categorical training features.
    X_test_processed : numpy.ndarray
        Test features transformed with the statistics fitted on the training set.
    y_train : pandas.Series
        The ``Exited`` column of the training file.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Separate the target and drop identifier columns that carry no signal.
    y_train = train_data['Exited']
    X_train = train_data.drop(columns=['CustomerId', 'Surname', 'Exited'])
    X_test = test_data.drop(columns=['CustomerId', 'Surname'])

    # Apply the same feature engineering to both splits through one helper so
    # the train and test columns cannot drift apart.
    X_train = _add_engineered_features(X_train)
    X_test = _add_engineered_features(X_test)

    # Numeric features: fill gaps with the column mean, then standardise.
    numerical_features = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts',
                          'EstimatedSalary', 'BalanceToSalaryRatio', 'Age_Balance_Interaction']
    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Categorical features: fill gaps with the mode, then one-hot encode.
    # Categories unseen during fit are encoded as all zeros instead of raising.
    categorical_features = ['Geography', 'Gender', 'Geo_IsActive']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Columns not named in either list are discarded (ColumnTransformer's
    # default remainder='drop').
    # sparse_threshold=0 forces a dense ndarray regardless of how sparse the
    # one-hot block is; the default (0.3) makes the output type data-dependent.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features),
        ],
        sparse_threshold=0,
    )

    # Fit on the training split only, then reuse the fitted statistics for the
    # test split to avoid leaking test information.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, X_test_processed, y_train
    

In [27]:
# Define cross-validation function
def _take_rows(data, positions):
    """Select rows by integer position from a pandas object or an array-like."""
    if hasattr(data, 'iloc'):
        # Plain ``series[int_array]`` is label-based and only matches the
        # KFold positions when the index is a default RangeIndex.
        return data.iloc[positions]
    return data[positions]


def cross_validate(X, y, knn, n_splits=5):
    """Estimate ROC AUC of ``knn`` with shuffled K-fold cross-validation.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (ndarray or DataFrame).
    y : array-like of length n_samples
        Binary target values (pandas Series or ndarray).
    knn : object
        Model exposing ``fit(X, y)`` and ``predict(X)``, where ``predict``
        returns a score per row. It is refitted on every fold.
    n_splits : int
        Number of folds.

    Returns
    -------
    mean_auc : float
        Average ROC AUC across the folds.
    auc_scores : list of float
        Per-fold ROC AUC, in fold order.
    """
    # Fixed seed so repeated runs (and different hyperparameters) see the
    # same folds.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, test_index in kf.split(X):
        # KFold yields integer positions, so select rows positionally.
        X_fold_train, X_fold_test = _take_rows(X, train_index), _take_rows(X, test_index)
        y_fold_train, y_fold_test = _take_rows(y, train_index), _take_rows(y, test_index)

        knn.fit(X_fold_train, y_fold_train)

        # Scores for the held-out fold.
        y_prob = knn.predict(X_fold_test)

        auc_scores.append(roc_auc_score(y_fold_test, y_prob))

    mean_auc = sum(auc_scores) / len(auc_scores)
    return mean_auc, auc_scores
    

In [29]:

# Locations of the labelled training file and the unlabelled test file
train_csv_path = '/Users/xiachuancheng/Documents/CS506/jason123-assignment-5/train.csv'
test_csv_path = '/Users/xiachuancheng/Documents/CS506/jason123-assignment-5/test.csv'

# Read both CSVs and convert them into model-ready feature matrices
X_train, X_test, y_train = preprocess_data(train_csv_path, test_csv_path)

# Starting configuration for the classifier
best_k, best_distance_metric = 5, 'euclidean'
knn = KNN(distance_metric=best_distance_metric, k=best_k)

# Perform cross-validation


# Hyperparameter tuning: grid search over k and the distance metric
def hyperparameter_tuning(X, y, k_values=range(40, 44),
                          distance_metrics=('euclidean', 'manhattan', 'minkowski'),
                          n_splits=5):
    """Grid-search ``k`` and the distance metric by cross-validated ROC AUC.

    Parameters
    ----------
    X : array-like
        Preprocessed training features.
    y : array-like
        Training labels.
    k_values : iterable of int
        Neighbour counts to try. The default, 40 through 43 inclusive, is the
        range this notebook searched originally.
    distance_metrics : iterable of str
        Distance metrics to try for every ``k``.
    n_splits : int
        Number of cross-validation folds used per candidate.

    Returns
    -------
    dict
        ``{'k': ..., 'distance_metric': ...}`` for the candidate with the
        highest mean AUC. On ties the first candidate evaluated wins. If no
        candidate scores above 0, the fallback ``k=5`` / ``'euclidean'`` is
        returned.
    """
    best_score = 0
    best_params = {'k': 5, 'distance_metric': 'euclidean'}

    for k in k_values:
        for metric in distance_metrics:
            knn_temp = KNN(k=k, distance_metric=metric)
            mean_auc, _ = cross_validate(X, y, knn_temp, n_splits=n_splits)
            print(f'k: {k}, distance_metric: {metric}, AUC: {mean_auc}')

            # Strict '>' keeps the earliest candidate when scores tie.
            if mean_auc > best_score:
                best_score = mean_auc
                best_params = {'k': k, 'distance_metric': metric}

    print(f'Best hyperparameters: {best_params}, AUC: {best_score}')
    return best_params

# Search for the best hyperparameters, then refit on the complete training set
best_params = hyperparameter_tuning(X_train, y_train)
knn = KNN(distance_metric=best_params['distance_metric'], k=best_params['k'])
knn.fit(X_train, y_train)

# Score every row of the test set
test_predictions = knn.predict(X_test)

# Write the submission file: one row per test id with its predicted score
test_ids = pd.read_csv('/Users/xiachuancheng/Documents/CS506/jason123-assignment-5/test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'Exited': test_predictions})
submission.to_csv('submissions.csv', index=False)

k: 40, distance_metric: minkowski, AUC: 0.9164420435321012
k: 41, distance_metric: minkowski, AUC: 0.9163366950739563
k: 42, distance_metric: minkowski, AUC: 0.9163026265519598
Best hyperparameters: {'k': 40, 'distance_metric': 'minkowski'}, AUC: 0.9164420435321012
