In [204]:
import numpy as np
import pandas as pd

In [205]:
# Define the KNN class
class KNN:
    """Brute-force k-nearest-neighbours model over numeric feature vectors.

    Two prediction modes are offered:

    * ``predictTrain`` returns a hard label per sample (majority vote among
      the k nearest training samples).
    * ``predictTest`` returns, per sample, the inverse-distance-weighted
      average of the k nearest labels. The labels must therefore be numeric;
      with 0/1 labels the result lies in [0, 1].

    Parameters
    ----------
    k : int
        Number of neighbours consulted for each prediction.
    distance_metric : str
        ``'euclidean'`` or ``'manhattan'``. Any other value makes
        ``compute_distance`` raise ``ValueError``.
    """

    def __init__(self, k, distance_metric):
        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training samples and their labels.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of entries.
        """
        X_arr = np.array(X, dtype=float)  # Ensure float type
        y_arr = np.array(y)
        if len(X_arr) != len(y_arr):
            raise ValueError("X and y must have the same length")
        self.X_train = X_arr
        self.y_train = y_arr

    @staticmethod
    def _as_samples(X):
        """Return ``X`` as a 2-D float array; a 1-D input is one sample."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(1, -1)
        return X

    def _nearest(self, x):
        """Return (indices of the k closest training rows, all distances)."""
        distances = self.compute_distance(self.X_train, x)
        k_indices = np.argsort(distances)[:self.k]
        return k_indices, distances

    def predictTrain(self, X):
        """Predict one label per sample by majority vote of the k neighbours.

        Accepts any array-like (list, ndarray, DataFrame). When several labels
        are tied, the smallest one wins because ``np.unique`` returns sorted
        labels and ``np.argmax`` picks the first maximum.
        """
        predictions = []
        for x in self._as_samples(X):
            k_indices, _ = self._nearest(x)
            k_nearest_labels = self.y_train[k_indices]

            # Manual mode calculation
            unique_labels, counts = np.unique(k_nearest_labels, return_counts=True)
            predictions.append(unique_labels[np.argmax(counts)])

        return np.array(predictions)

    def predictTest(self, X):
        """Return the distance-weighted label average for each sample in ``X``.

        Accepts any array-like, exactly like ``predictTrain``; a 1-D input is
        treated as a single sample.
        """
        return np.array([self._predictTest(x) for x in self._as_samples(X)])

    def _predictTest(self, x):
        """Inverse-distance-weighted mean of the k nearest labels for one sample."""
        k_indices, distances = self._nearest(x)
        k_nearest_labels = self.y_train[k_indices]
        # The small constant keeps the weight finite when a neighbour coincides
        # with the query point (distance 0).
        weights = 1 / (distances[k_indices] + 1e-10)
        return np.sum(weights * k_nearest_labels) / np.sum(weights)

    def compute_distance(self, X1, X2):
        """Distance from every row of ``X1`` to the vector ``X2``.

        Raises
        ------
        ValueError
            If ``self.distance_metric`` is not a supported metric.
        """
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError("Unsupported distance metric")

In [207]:
# Define data preprocessing function
def preprocess_data(train_path, test_path):
    """Load the train/test CSV files and build standardized feature matrices.

    Steps
    -----
    1. Training rows containing any missing value are dropped. Test rows are
       all kept so that the returned test matrix has one row per row of the
       test file, in file order.
    2. ``Gender`` is mapped to 1 (Male) / 0 (Female); ``Geography`` is mapped
       to integer codes in order of first appearance in the training data.
    3. Ratio features and binned group features are added, then the raw
       ``Tenure`` and ``Balance`` columns are removed.
    4. Features are standardized with the training mean and standard
       deviation. Values that are still undefined afterwards (missing test
       values, categories not seen in training) are set to 0, i.e. the
       training mean.

    Parameters
    ----------
    train_path, test_path : str or path-like
        CSV files. Both need the columns ``id``, ``CustomerId``, ``Surname``,
        ``Gender``, ``Geography``, ``Age``, ``Tenure``, ``Balance``,
        ``EstimatedSalary`` and ``CreditScore``; the training file also needs
        ``Exited``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_scaled, y_train, X_test_scaled)``.
    """
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Only the training set may lose rows: dropping test rows would break the
    # one-to-one correspondence between predictions and the ids in the file.
    train_data = train_data.dropna()

    gender_map = {'Male': 1, 'Female': 0}
    geography_map = {category: idx for idx, category in enumerate(train_data['Geography'].unique())}

    # The interior bin edges are fixed; the outer edges are open-ended so that
    # an out-of-range value lands in the first/last group instead of producing
    # an undefined group.
    group_bins = {
        'AgeGroup': ('Age', [-np.inf, 29, 37, 45, np.inf]),
        'TenureGroup': ('Tenure', [-np.inf, 1, 3, 8, 12, np.inf]),
        'BalanceGroup': ('Balance', [-np.inf, 20000, 50000, 100000, 150000, np.inf]),
        'CreditScoreGroup': ('CreditScore', [-np.inf, 550, 650, 750, np.inf]),
    }

    def _engineer(frame):
        """Encode categoricals and add the derived features to a copy of ``frame``."""
        frame = frame.copy()
        frame['Gender'] = frame['Gender'].map(gender_map)
        frame['Geography'] = frame['Geography'].map(geography_map)

        # Interaction terms. The 1e-10 offset avoids division by zero; a zero
        # denominator therefore yields a very large ratio rather than inf.
        frame['Balance_Salary_Ratio'] = frame['Balance'] / (frame['EstimatedSalary'] + 1e-10)
        frame['Age_Tenure_Ratio'] = frame['Age'] / (frame['Tenure'] + 1e-10)

        # labels=False yields the bin position (0, 1, 2, ...) and leaves a
        # missing input as NaN instead of failing on an integer cast.
        for group_col, (source_col, edges) in group_bins.items():
            frame[group_col] = pd.cut(frame[source_col], bins=edges, labels=False)

        # The raw Tenure and Balance columns are replaced by their groups.
        return frame.drop(columns=['Tenure', 'Balance'])

    train_data = _engineer(train_data)
    test_data = _engineer(test_data)

    X_train = train_data.drop(columns=['id', 'CustomerId', 'Surname', 'Exited'])  # Drop unnecessary columns
    y_train = train_data['Exited']
    X_test = test_data.drop(columns=['id', 'CustomerId', 'Surname'])
    # Guarantee that both matrices list the features in the same order.
    X_test = X_test[X_train.columns]

    # Standard scaler fitted on the training data only.
    mean = X_train.mean()
    std = X_train.std()
    # A constant (or single-row) column has zero/undefined spread; dividing by
    # 1 leaves it at 0 after centring instead of turning it into NaN.
    std = std.where(std > 0, 1.0)

    X_train_scaled = ((X_train - mean) / std).fillna(0.0)
    X_test_scaled = ((X_test - mean) / std).fillna(0.0)

    return X_train_scaled.values, y_train.values, X_test_scaled.values
    

In [208]:
def cross_validate(X, y, knn, n_splits=5, random_state=None):
    """Perform shuffled k-fold cross-validation and return per-fold accuracy.

    Parameters
    ----------
    X, y : array-like
        Samples and labels; they must have the same length.
    knn : object
        Model exposing ``fit(X, y)`` and ``predictTrain(X)``. It is refitted
        on every fold, so it is left fitted on the last fold's training part.
    n_splits : int, default 5
        Number of folds. Every fold holds ``len(X) // n_splits`` samples,
        except the last one which also receives the remainder.
    random_state : int or None, default None
        Seed for the shuffle. With ``None`` the global NumPy random state is
        used, so results vary between calls unless the caller seeded it.

    Returns
    -------
    numpy.ndarray
        Accuracy on the validation part of each fold, shape ``(n_splits,)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or if ``n_splits`` is smaller
        than 2 or larger than the number of samples.
    """
    # Convert X and y to NumPy arrays if they are not already
    X = np.array(X)
    y = np.array(y)

    n_samples = len(X)
    if n_samples != len(y):
        raise ValueError("X and y must have the same length")
    if n_splits < 2 or n_splits > n_samples:
        raise ValueError("n_splits must be between 2 and the number of samples")

    fold_size = n_samples // n_splits
    indices = np.arange(n_samples)
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)

    scores = []
    for fold in range(n_splits):
        start = fold * fold_size
        # The last fold absorbs the remainder when n_samples is not divisible.
        stop = start + fold_size if fold < n_splits - 1 else n_samples

        val_indices = indices[start:stop]
        # Train on everything outside [start, stop). Using the same `stop` for
        # both slices keeps the remainder samples out of the training part.
        train_indices = np.concatenate([indices[:start], indices[stop:]])

        X_train, X_val = X[train_indices], X[val_indices]
        y_train, y_val = y[train_indices], y[val_indices]

        # Fit the model and predict
        knn.fit(X_train, y_train)
        y_val_pred = knn.predictTrain(X_val)

        # Accuracy on the held-out part
        scores.append(np.mean(y_val_pred == y_val))

    return np.array(scores)

In [209]:
# Build the standardized feature matrices used by the sweep below.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Evaluate each candidate neighbourhood size and report its mean fold accuracy.
for k in (12, 13):
    knn = KNN(k=k, distance_metric='euclidean')
    cv_scores = cross_validate(X, y, knn)
    avg_cv = cv_scores.mean()
    print("Cross-validation scores:" + str(k), avg_cv)

Cross-validation scores:12 0.8780666666666667
Cross-validation scores:13 0.8807333333333334


In [203]:
# Build the feature matrices from the raw CSV files.
X, y, X_test = preprocess_data('train.csv', 'test.csv')

# Estimate the accuracy of the chosen configuration with cross-validation.
knn = KNN(k=13, distance_metric='euclidean')
cv_scores = cross_validate(X, y, knn)
print("Cross-validation scores:", cv_scores)

# TODO: hyperparameter tuning

# Refit on the full preprocessed training set, then score the test rows.
knn.fit(X, y)
test_predictions = knn.predictTest(X_test)

# Save test predictions: the ids are re-read from the test file and paired
# positionally with the predicted 'Exited' values.
submission = pd.DataFrame({
    'id': pd.read_csv('test.csv')['id'],
    'Exited': test_predictions,
})
submission.to_csv('submissions.csv', index=False)

Cross-validation scores: [0.88166667 0.88066667 0.876      0.87966667 0.88266667]
