In [1]:
!pip3 install numba



In [8]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from numba import jit  # Import Numba for JIT compilation

# Function to calculate Euclidean distance using Numba
@jit(nopython=True)
def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

# Numba-optimized function to find k nearest neighbors
@jit(nopython=True)
def find_k_nearest_neighbors(X_train, x, k):
    distances = np.empty(X_train.shape[0])
    for i in range(X_train.shape[0]):
        distances[i] = euclidean_distance(x, X_train[i])
    
    k_indices = np.argsort(distances)[:k]
    return k_indices

# KNN class implementation
class KNN:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train.to_numpy()  # Convert to NumPy array for indexing

    def predict(self, X_test):
        predictions = np.empty(X_test.shape[0], dtype=self.y_train.dtype)
        for i in range(X_test.shape[0]):
            predictions[i] = self._predict(X_test[i])
        return predictions

    def predict_proba(self, X_test):
        probabilities = np.empty((X_test.shape[0], 2))  # Two classes: 0 and 1
        for i in range(X_test.shape[0]):
            probabilities[i] = self._predict_proba(X_test[i])
        return probabilities

    def _predict(self, x):
        k_indices = find_k_nearest_neighbors(self.X_train, x, self.k)
        k_nearest_labels = self.y_train[k_indices]  # Fetch labels of k nearest neighbors
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def _predict_proba(self, x):
        k_indices = find_k_nearest_neighbors(self.X_train, x, self.k)
        k_nearest_labels = self.y_train[k_indices]
        total = len(k_nearest_labels)
        prob_1 = np.sum(k_nearest_labels) / total  # Probability of class 1
        prob_0 = 1 - prob_1  # Probability of class 0
        return np.array([prob_0, prob_1])  # Return probabilities for both classes

# Load the train and test datasets
train_path = '/kaggle/input/506-data/train.csv'
test_path = '/kaggle/input/506-data/test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# Drop irrelevant columns
train_data = train_data.drop(['CustomerId', 'Surname'], axis=1)
test_data = test_data.drop(['CustomerId', 'Surname'], axis=1)

# Split the train data into features and target
X = train_data.drop(['Exited', 'id'], axis=1)  # Drop target and ID
y = train_data['Exited']

# Identify categorical columns (such as 'Geography', 'Gender')
categorical_cols = ['Geography', 'Gender']

# Create a pipeline for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(exclude=['object']).columns),  # Scale numeric features
        ('cat', OneHotEncoder(), categorical_cols)  # One-hot encode categorical features
    ])

# Create polynomial features
poly = PolynomialFeatures(interaction_only=True, include_bias=False)

# Combine preprocessing steps into a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', poly),
    ('var_thresh', VarianceThreshold(threshold=0.1))  # Remove low-variance features
])

# Preprocess features and generate polynomial features
X_transformed = pipeline.fit_transform(X)

# Remove highly correlated features
correlation_matrix = pd.DataFrame(X_transformed).corr().abs()
high_correlation = np.where(correlation_matrix > 0.95)  # You can adjust the threshold
to_drop = set()

for i in range(len(high_correlation[0])):
    if high_correlation[0][i] != high_correlation[1][i]:
        to_drop.add(high_correlation[1][i])

X_filtered = np.delete(X_transformed, list(to_drop), axis=1)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_filtered, y, test_size=0.4, random_state=42)

# Initialize and train the KNN model from scratch
knn = KNN(k=5)  # You can adjust k
knn.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = knn.predict(X_val)

# Calculate accuracy for both sets
train_accuracy = accuracy_score(y_train, knn.predict(X_train))
val_accuracy = accuracy_score(y_val, y_val_pred)

# Print accuracy results
print(f'Training Accuracy: {train_accuracy * 100:.2f}%')
print(f'Validation Accuracy: {val_accuracy * 100:.2f}%')

# Overfitting check
if train_accuracy - val_accuracy > 0.1:  # 10% threshold for detecting overfitting, adjust as necessary
    print("Warning: Possible overfitting detected.")
else:
    print("No significant overfitting detected.")

# Preprocess test data using the same pipeline (excluding the target)
X_test = test_data.drop(['id'], axis=1)  # Drop ID
X_test_transformed = pipeline.transform(X_test)

# Remove the same highly correlated features from test data
X_test_filtered = np.delete(X_test_transformed, list(to_drop), axis=1)

# Make probability predictions on the test set
y_test_prob = knn.predict_proba(X_test_filtered)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],  # Include ID in the submission
    'Exited': y_test_prob[:, 1]  # Probability of class 1 (Exited)
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been saved.")


Training Accuracy: 90.60%
Validation Accuracy: 87.05%
No significant overfitting detected.
Submission file 'submission.csv' has been saved.


In [13]:
# Preprocess test data using the same pipeline (excluding the target)
X_test = test_data.drop(['id'], axis=1)  # Drop ID
X_test_transformed = pipeline.transform(X_test)

# Remove the same highly correlated features from test data
X_test_filtered = np.delete(X_test_transformed, list(to_drop), axis=1)

# Make probability predictions on the test set
y_test_prob = knn.predict_proba(X_test_filtered)

# Prepare submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],  # Include ID in the submission
    'Exited': y_test_prob[:, 1]  # Probability of class 1 (Exited)
})

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' has been saved.")


Submission file 'submission.csv' has been saved.
