In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [36]:
BASE_DIR = "../data"

TRAIN_CSV = f"{BASE_DIR}/train.csv"

df_train = pd.read_csv(TRAIN_CSV)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [37]:
print(df_train.describe())

                id    CustomerId    CreditScore            Age         Tenure  \
count  165034.0000  1.650340e+05  165034.000000  165034.000000  165034.000000   
mean    82516.5000  1.569201e+07     656.454373      38.125888       5.020353   
std     47641.3565  7.139782e+04      80.103340       8.867205       2.806159   
min         0.0000  1.556570e+07     350.000000      18.000000       0.000000   
25%     41258.2500  1.563314e+07     597.000000      32.000000       3.000000   
50%     82516.5000  1.569017e+07     659.000000      37.000000       5.000000   
75%    123774.7500  1.575682e+07     710.000000      42.000000       7.000000   
max    165033.0000  1.581569e+07     850.000000      92.000000      10.000000   

             Balance  NumOfProducts      HasCrCard  IsActiveMember  \
count  165034.000000  165034.000000  165034.000000   165034.000000   
mean    55478.086689       1.554455       0.753954        0.497770   
std     62817.663278       0.547154       0.430707        0.

In [38]:
df_train.duplicated().sum()

0

# 0. Preprocess

In [39]:
DROP_COLUMNS = ['id', 'CustomerId', 'Surname']

df = df_train.drop(DROP_COLUMNS, axis=1)

In [40]:
ONE_HOT_ENCODE_COLUMNS = ['Geography', 'Gender']

df = pd.get_dummies(df, columns=ONE_HOT_ENCODE_COLUMNS, drop_first=True)

In [41]:
from sklearn.preprocessing import StandardScaler

NUMERICAL_COLUMNS = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

scaler = StandardScaler()
df[NUMERICAL_COLUMNS] = scaler.fit_transform(df[NUMERICAL_COLUMNS])


In [42]:
from sklearn.preprocessing import StandardScaler


# TIE IT ALL TOGETHER

def preprocess(df):
    DROP_COLUMNS = ['id', 'CustomerId', 'Surname']

    df = df.drop(DROP_COLUMNS, axis=1)
    
    ONE_HOT_ENCODE_COLUMNS = ['Geography', 'Gender']

    df = pd.get_dummies(df, columns=ONE_HOT_ENCODE_COLUMNS, drop_first=True)
    
    NUMERICAL_COLUMNS = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    scaler = StandardScaler()
    df[NUMERICAL_COLUMNS] = scaler.fit_transform(df[NUMERICAL_COLUMNS])
    
    print(df.columns)
    
    return df

# 1.1 Train a Baseline model - Logistic Regression

In [43]:
# Start fresh

df_train = pd.read_csv(TRAIN_CSV)

df_train = preprocess(df_train)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')


In [44]:
from sklearn.model_selection import train_test_split

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
from sklearn.linear_model import LogisticRegression

baseline_model = LogisticRegression(random_state=42)


In [46]:
baseline_model.fit(X_train, y_train)


# 1.2 Train a More Sophisticated Model

In [62]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest Classifier
rf_classifier = RandomForestClassifier(
    n_estimators=150,        # Number of trees in the forest
    max_depth=25,          # Maximum depth of the tree
    min_samples_split=2,     # Minimum number of samples required to split an internal node
    min_samples_leaf=1,      # Minimum number of samples required to be at a leaf node
    max_features=None,     # Number of features to consider when looking for the best split ('auto' uses sqrt(n_features))
    bootstrap=True,          # Whether bootstrap samples are used when building trees
    random_state=42          # Seed for random number generation for reproducibility
)

# Fit the model on the training data
rf_classifier.fit(X_train, y_train)


# 2. Evaluate

In [63]:
# y_pred = baseline_model.predict(X_test)

# y_prob = baseline_model.predict_proba(X_test)[:, 1]  # Probability of class 1

# ###


# Make predictions on the test data
rfc_y_pred = rf_classifier.predict(X_test)




In [64]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# EVALUATE LOGISTIC REGRESSION

print("LOGISTIC REGRESSION")
print("--------------------")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, baseline_model.predict_proba(X_test)[:, 1]))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# EVALUATE RFC 
print("RANDOM FOREST CLASSIFIER")
print("--------------------")
accuracy = accuracy_score(y_test, rfc_y_pred)
classification_rep = classification_report(y_test, rfc_y_pred)
confusion_mat = confusion_matrix(y_test, rfc_y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", confusion_mat)


LOGISTIC REGRESSION
--------------------
Accuracy: 0.8354894416335928
ROC AUC Score: 0.8180634638481705

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90     26052
           1       0.70      0.39      0.50      6955

    accuracy                           0.84     33007
   macro avg       0.78      0.67      0.70     33007
weighted avg       0.82      0.84      0.82     33007


Confusion Matrix:
 [[24874  1178]
 [ 4252  2703]]
RANDOM FOREST CLASSIFIER
--------------------
Accuracy: 0.8563638016178386

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.91     26052
           1       0.70      0.55      0.62      6955

    accuracy                           0.86     33007
   macro avg       0.80      0.74      0.76     33007
weighted avg       0.85      0.86      0.85     33007


Confusion Matrix:
 [[24440  1612]
 [ 3129  3826]]


# 3. Kaggle Competitions Submission

In [65]:
TEST_CSV = f"{BASE_DIR}/test.csv"

df_submission = pd.read_csv(TEST_CSV)

df_submission = preprocess(df_submission)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')


In [66]:
submit_predictions = rf_classifier.predict_proba(df_submission)[:, 1]
submit_predictions


array([7.23295619e-03, 7.21410935e-01, 9.47460868e-03, ...,
       4.26666667e-05, 3.27564053e-01, 1.85661821e-01])

In [67]:
submission_df = pd.DataFrame({'id': pd.read_csv(TEST_CSV)['id'], 'Exited': submit_predictions})

submission_df.to_csv('submission.csv', index=False)



# Kaggle Competition Submissions

Uncomment to make submissions.

In [60]:
# import os

# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# api_key = user_secrets.get_secret("key")
# secret_value_1 = user_secrets.get_secret("username")

# os.environ['KAGGLE_USERNAME'] = secret_value_1
# os.environ['KAGGLE_KEY'] = api_key

In [61]:
# !kaggle competitions submit -c playground-series-s4e1 -f submission.csv -m "v1 baseline"
