In [99]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score



In [100]:
# Define the KNN class with probability prediction
class KNN:
    def __init__(self, k=5, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def compute_distance(self, X1, X2):
        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum((X1 - X2) ** 2, axis=1))
        elif self.distance_metric == 'manhattan':
            return np.sum(np.abs(X1 - X2), axis=1)

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def predict_proba(self, X):
        probabilities = [self._predict_proba(x) for x in X]
        return np.array(probabilities)

    def _predict(self, x):
        distances = self.compute_distance(self.X_train, x)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        return np.bincount(k_nearest_labels).argmax()

    def _predict_proba(self, x):
        distances = self.compute_distance(self.X_train, x)
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]
        prob_churn = np.sum(k_nearest_labels) / self.k
        return prob_churn


In [101]:
# Define data preprocessing function
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# # Preprocess training data function
# def preprocess_data(train_path):
#     data = pd.read_csv(train_path)
#     data = data.drop(columns=['id', 'CustomerId', 'Surname'])
#     data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)
#     X = data.drop(columns=['Exited']).values
#     y = data['Exited'].values
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#     scaler = StandardScaler()
#     X_train = scaler.fit_transform(X_train)
#     X_test = scaler.transform(X_test)
#     return X_train, X_test, y_train, y_test, scaler

# # Preprocess submission data function
# def preprocess_data_submit(submit_path, scaler):
#     data = pd.read_csv(submit_path)
#     data = data.drop(columns=['id', 'CustomerId', 'Surname'])
#     data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True)
#     X = data.values
#     X = scaler.transform(X)
#     return X


# Preprocess data function
def preprocess_data(data_path, is_train=True):
    # Load the dataset
    data = pd.read_csv(data_path)

    # Store 'id' column if it exists (for test data)
    ids = data['id'] if 'id' in data.columns else None

    # Drop unnecessary columns
    data = data.drop(columns=['id', 'CustomerId', 'Surname']) if is_train else data.drop(columns=['id', 'CustomerId', 'Surname'], errors='ignore')

    # Convert categorical columns using One-Hot Encoding
    data = pd.get_dummies(data, columns=['Geography', 'Gender'], drop_first=True, dummy_na=False)

    # Define X and y
    X = data.drop(columns=['Exited'], errors='ignore').values if is_train else data.values  # Ensure X is a NumPy array
    y = data['Exited'].values if is_train else None  # Convert y to a NumPy array

    return X, y, ids

def preprocess_test_data(train_path, test_path):
    # Load the train dataset to get the columns and apply preprocessing to the test dataset
    train_data = pd.read_csv(train_path)
    test_data = pd.read_csv(test_path)

    # Store 'id' column if it exists (for test data)
    ids = test_data['id']

    # Drop unnecessary columns
    train_data = train_data.drop(columns=['id', 'CustomerId', 'Surname'])
    test_data = test_data.drop(columns=['id', 'CustomerId', 'Surname'], errors='ignore')

    # Convert categorical columns using One-Hot Encoding, using the same categories as the training data
    train_data = pd.get_dummies(train_data, columns=['Geography', 'Gender'], drop_first=True, dummy_na=False)
    test_data = pd.get_dummies(test_data, columns=['Geography', 'Gender'], drop_first=True, dummy_na=False)

    # Ensure both datasets have the same columns, add missing columns with 0 values
    missing_cols = set(train_data.columns) - set(test_data.columns)
    for c in missing_cols:
        test_data[c] = 0
    test_data = test_data[train_data.columns]

    # Define X and y (for test data, y is None)
    X_test = test_data.drop(columns=['Exited'], errors='ignore').values if 'Exited' in test_data.columns else test_data.values

    return X_test, ids

In [102]:
# Define cross-validation function
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Cross-validation function
# def cross_validate(X, y, knn, n_splits=5):
#     kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
#     auc_scores = []

#     for train_index, val_index in kf.split(X):
#         # Extract training and validation data using the indices
#         X_train, X_val = X[train_index], X[val_index]
#         y_train, y_val = y[train_index], y[val_index]

#         # Ensure y_train and y_val are NumPy arrays
#         y_train = np.array(y_train)
#         y_val = np.array(y_val)

#         # Fit the KNN model and predict
#         knn.fit(X_train, y_train)
#         y_pred = knn.predict(X_val)

#         # Calculate AUC score and append
#         auc_score = roc_auc_score(y_val, y_pred)
#         auc_scores.append(auc_score)

#     return auc_scores

# Define cross-validation function
def cross_validate(X, y, knn, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []
    for train_index, val_index in kf.split(X):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_val)
        auc_score = roc_auc_score(y_val, y_pred)
        auc_scores.append(auc_score)
    return auc_scores


In [105]:
# Perform cross-validation (on training set only)
cv_scores = cross_validate(X_train, y_train, knn)
print("Cross-validation AUC scores:", cv_scores)
print("Average AUC:", np.mean(cv_scores))

# Evaluate the model on the test set (Optional)
test_predictions = knn.predict(X_test)
accuracy = accuracy_score(y_test, test_predictions)
print(f"Test set accuracy: {accuracy}")

# Preprocess the full test data for submission
X_submit = preprocess_data_submit('test.csv', scaler)

# Make probability predictions for the entire submission data (all 10,000 samples)
test_probabilities = knn.predict_proba(X_submit)

# Load the test data to extract the 'id' column
test_data = pd.read_csv('test.csv')

# Ensure that the number of predictions matches the number of rows in the test data
if len(test_probabilities) != len(test_data):
    raise ValueError(f"Number of predictions ({len(test_probabilities)}) does not match the number of rows in the test data ({len(test_data)}).")

# Create the submission DataFrame with 'id' and 'Exited' columns
submission = pd.DataFrame({
    'id': test_data['id'],
    'Exited': test_probabilities  # Predicted probabilities of churn (values between 0 and 1)
})

# Save the DataFrame to a CSV file for submission
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv' with 10,000 rows.")


Cross-validation AUC scores: [0.7671842387638033, 0.7502202493900786, 0.7783744474920946, 0.7551052631578946, 0.7576418858045185]
Average AUC: 0.7617052169216778
Test set accuracy: 0.8666666666666667
Submission file saved as 'submission.csv' with 10,000 rows.
