In [60]:
import numpy as np
import pandas as pd

In [61]:
# import numpy as np

# class KNN:
#     def __init__(self, k=3, distance_metric='euclidean'):
#         self.k = k
#         self.distance_metric = distance_metric

#     def fit(self, X, y):
#         # Store the training data
#         self.X_train = X
#         self.y_train = y

#     def predict(self, X):
#         # Predict the class for each instance in X
#         predictions = []
#         for x in X:
#             # Compute distances between x and all training samples
#             distances = self.compute_distance(self.X_train, x)
#             # Sort by distance and get indices of the k nearest neighbors
#             k_indices = np.argsort(distances)[:self.k]
#             # Get the labels of the k nearest samples
#             k_nearest_labels = self.y_train[k_indices]
#             # Determine the most common class label
#             unique, counts = np.unique(k_nearest_labels, return_counts=True)
#             predictions.append(unique[np.argmax(counts)])
#         return np.array(predictions)

#     def compute_distance(self, X1, X2):
#         if self.distance_metric == 'euclidean':
#             # Compute Euclidean distance (L2 norm)
#             return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
#         elif self.distance_metric == 'manhattan':
#             # Compute Manhattan distance (L1 norm)
#             return np.sum(np.abs(X1 - X2), axis=1)
#         else:
#             raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [62]:
class KNN:
    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        # Store the training data
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        # Predict probabilities for the positive class (Exited = 1) for each instance in X
        probabilities = []
        for x in X:
            # Compute distances between x and all training samples
            distances = self.compute_distance(self.X_train, x)
            # Sort by distance and get indices of the k nearest neighbors
            k_indices = np.argsort(distances)[:self.k]
            # Get the labels of the k nearest samples
            k_nearest_labels = self.y_train[k_indices]
            # Calculate the probability of the positive class (e.g., Exited = 1)
            prob_positive = np.mean(k_nearest_labels)  # Proportion of 1s among k neighbors
            probabilities.append([1 - prob_positive, prob_positive])  # Return [P(0), P(1)]
        return np.array(probabilities)

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            # Compute Euclidean distance (L2 norm)
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            # Compute Manhattan distance (L1 norm)
            return np.sum(np.abs(X1 - X2), axis=1)
        else:
            raise ValueError(f"Unknown distance metric: {self.distance_metric}")

In [63]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Define your preprocessing function
def preprocess_data(train_path, test_path):
    # Load the datasets
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)
    
    # Drop irrelevant columns (e.g., 'Customer ID', 'Surname')
    train_data = train_data.drop(columns=['CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['CustomerId', 'Surname'])
    
    # Assume 'Exited' is the target variable in train_data
    # Split data into features (X) and target (y)
    X_train = train_data.drop(columns=['Exited'])  # Drop the target column in the training set
    y_train = train_data['Exited']  # Target variable
    
    X_test = test_data.copy()  # Assuming test data doesn't have the target variable
    
    # Perform one-hot encoding on categorical columns (e.g., Geography, Gender)
    categorical_columns = ['Geography', 'Gender']  # Adjust this list based on your dataset
    X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)
    
    # Ensure the test set has the same columns as the training set after one-hot encoding
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
    
    # Initialize StandardScaler for feature scaling
    scaler = StandardScaler()
    
    # Fit the scaler on the training data and transform both training and testing data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Convert the scaled DataFrames to NumPy arrays
    X_train_scaled = X_train_scaled
    X_test_scaled = X_test_scaled
    
    # Print to verify the conversion
    print("X_train_scaled is now a NumPy array")
    print(X_train_scaled[:5])  # Print the first 5 rows of the scaled training data
    print("X_test_scaled is now a NumPy array")
    print(X_test_scaled[:5])  # Print the first 5 rows of the scaled test data
    
    # Return the preprocessed and scaled data
    return X_train_scaled, y_train, X_test_scaled

In [64]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import numpy as np

def cross_validate(X, y, knn, n_splits=5):
    """
    Perform K-Fold cross-validation on the dataset and calculate the AUC-ROC score for each fold.
    
    Parameters:
    - X: Feature dataset (array-like, not DataFrame in this case).
    - y: Target labels (array-like).
    - knn: Instance of the KNN model (with the specified k and distance metric).
    - n_splits: Number of folds for cross-validation (default is 5).
    
    Returns:
    - Mean AUC-ROC score across all folds.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    # Convert X and y to NumPy arrays (in case they are not already)
    X = np.array(X)
    y = np.array(y)

    for train_index, val_index in kf.split(X):
        # Split the dataset into training and validation sets
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Fit the model on the training set
        knn.fit(X_train, y_train)

        # Predict class probabilities for the validation set
        y_pred_proba = knn.predict(X_val)[:, 1]

        # Calculate AUC-ROC score based on predicted probabilities
        auc = roc_auc_score(y_val, y_pred_proba)
        auc_scores.append(auc)

    # Return the average AUC-ROC score across all folds
    return np.mean(auc_scores)

In [65]:
from sklearn.metrics import roc_auc_score

def hyperparameter_tuning(X, y, distance_metrics, initial_k_values=[5, 15, 25, 35, 45], groups = 3, n_splits=5):
    best_score = 0
    best_k = None
    best_metric = None

    for group in range(groups):
        print(f"Group {group + 1}: Testing k values {initial_k_values}")

        for k in initial_k_values:
            for metric in distance_metrics:
                knn = KNN(k=k, distance_metric=metric)
                cv_score = cross_validate(X, y, knn, n_splits)

                # Keep track of the best score and corresponding hyperparameters
                if cv_score > best_score:
                    best_score = cv_score
                    best_k = k
                    best_metric = metric

        print(f"Best result from group {group + 1}: k = {best_k}, metric = {best_metric}, AUC-ROC score = {best_score}")

        # Prepare the next group of k values: best_k, best_k+50, best_k+60, best_k+70, best_k+80, best_k+90
        initial_k_values = [best_k] + [best_k + i * 10 for i in range(5, 10)]

    print("Final Best AUC-ROC Score:", best_score)
    print("Final Best Hyperparameters: k =", best_k, ", distance metric =", best_metric)

    return {'k': best_k, 'distance_metric': best_metric}

In [66]:
# Preprocess the data
X_train_scaled, y_train, X_test_scaled = preprocess_data('train.csv', 'test.csv')

# Reindex to ensure the indices are sequential after preprocessing
X_train_scaled = pd.DataFrame(X_train_scaled).reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)

X_train_scaled is now a NumPy array
[[-1.73193534  0.17081798 -0.57821161  1.0669508   0.99049174 -1.10234839
   0.52984687  1.02224741  1.29671193  2.08554566 -0.52013499  0.87056371]
 [-1.7317044   0.0605992  -1.06919231  0.70902572 -0.72770619  0.77388284
  -1.88733775 -0.97823676  0.97616493 -0.47949082 -0.52013499  0.87056371]
 [-1.73147346  0.70813453 -0.70095678 -1.43852476 -0.72770619  0.77388284
   0.52984687 -0.97823676 -1.59890383 -0.47949082 -0.52013499  0.87056371]
 [-1.73124252  0.37747819  1.75394672  1.42487588  0.65021481 -1.10234839
  -1.88733775 -0.97823676  0.17405566 -0.47949082 -0.52013499  0.87056371]
 [-1.73101158  0.22592737  0.52649497  0.35110064  1.44049596 -1.10234839
   0.52984687  1.02224741 -1.01827995 -0.47949082 -0.52013499  0.87056371]]
X_test_scaled is now a NumPy array
[[ 1.73216628  1.34189251  0.4037498  -1.08059968 -0.72770619  0.77388284
   0.52984687  1.02224741  1.23167424 -0.47949082 -0.52013499 -1.14868101]
 [ 1.73239722  2.63696317 -0.45546

In [67]:
# Define possible distance metrics
distance_metrics = ['euclidean', 'manhattan']

# Perform hyperparameter tuning
best_params = hyperparameter_tuning(X_train_scaled, y_train, distance_metrics)

# Best hyperparameters will be selected based on AUC-ROC score
print(f"Best hyperparameters: k = {best_params['k']}, distance metric = {best_params['distance_metric']}")

Group 1: Testing k values [5, 15, 25, 35, 45]
Best result from group 1: k = 45, metric = manhattan, AUC-ROC score = 0.9065697399189363
Group 2: Testing k values [45, 95, 105, 115, 125, 135]
Best result from group 2: k = 95, metric = manhattan, AUC-ROC score = 0.9081257942054372
Group 3: Testing k values [95, 145, 155, 165, 175, 185]
Best result from group 3: k = 95, metric = manhattan, AUC-ROC score = 0.9081257942054372
Final Best AUC-ROC Score: 0.9081257942054372
Final Best Hyperparameters: k = 95 , distance metric = manhattan
Best hyperparameters: k = 95, distance metric = manhattan


In [68]:
knn = KNN(k=best_params['k'], distance_metric = best_params['distance_metric'])
knn.fit(X_train_scaled, y_train)

# Make predictions on the test set
test_predictions = knn.predict(X_test_scaled)

In [78]:
# Check the shape of the test set before predictions
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")

# Check the shape of the test_predictions
print(f"Shape of test_predictions: {test_predictions.shape}")

# Check the number of ids
ids = pd.read_csv('test.csv')['id'].values
print(f"Number of IDs: {len(ids)}")

Shape of X_test_scaled: (10000, 12)
Shape of test_predictions: (20000,)
Number of IDs: 10000


In [74]:
# # Save test predictions using the correct 'id' column
pd.DataFrame({'id': pd.read_csv('test.csv')['id'], 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

# # Check the shape of test_predictions to verify it's 1D
# print(f"Shape of test_predictions: {test_predictions.shape}")

# # Ensure the id column is also 1D
# ids = pd.read_csv('test.csv')['id'].values  # Ensure id is a NumPy array

# # Check the shape of id and test_predictions
# print(f"Shape of ids: {ids.shape}")
# print(f"Shape of test_predictions: {test_predictions.shape}")

# # Save test predictions using the correct 'id' column
# pd.DataFrame({'id': ids, 'Exited': test_predictions}).to_csv('submissions.csv', index=False)

ValueError: array length 20000 does not match index length 10000