In [1]:
# kaggle/python Docker image: https://github.com/kaggle/docker-python

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s4e1/sample_submission.csv
/kaggle/input/playground-series-s4e1/train.csv
/kaggle/input/playground-series-s4e1/test.csv


In [2]:
BASE_DIR = "/kaggle/input/playground-series-s4e1"

TRAIN_CSV = f"{BASE_DIR}/train.csv"

df_train = pd.read_csv(TRAIN_CSV)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [3]:
print(df_train.describe())

                id    CustomerId    CreditScore            Age         Tenure  \
count  165034.0000  1.650340e+05  165034.000000  165034.000000  165034.000000   
mean    82516.5000  1.569201e+07     656.454373      38.125888       5.020353   
std     47641.3565  7.139782e+04      80.103340       8.867205       2.806159   
min         0.0000  1.556570e+07     350.000000      18.000000       0.000000   
25%     41258.2500  1.563314e+07     597.000000      32.000000       3.000000   
50%     82516.5000  1.569017e+07     659.000000      37.000000       5.000000   
75%    123774.7500  1.575682e+07     710.000000      42.000000       7.000000   
max    165033.0000  1.581569e+07     850.000000      92.000000      10.000000   

             Balance  NumOfProducts      HasCrCard  IsActiveMember  \
count  165034.000000  165034.000000  165034.000000   165034.000000   
mean    55478.086689       1.554455       0.753954        0.497770   
std     62817.663278       0.547154       0.430707        0.

In [4]:
df_train.duplicated().sum()

0

# 0. Preprocess

In [5]:
DROP_COLUMNS = ['id', 'CustomerId', 'Surname']

df = df_train.drop(DROP_COLUMNS, axis=1)

In [6]:
ONE_HOT_ENCODE_COLUMNS = ['Geography', 'Gender']

df = pd.get_dummies(df, columns=ONE_HOT_ENCODE_COLUMNS, drop_first=True)

In [7]:
from sklearn.preprocessing import StandardScaler

NUMERICAL_COLUMNS = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

scaler = StandardScaler()
df[NUMERICAL_COLUMNS] = scaler.fit_transform(df[NUMERICAL_COLUMNS])


In [8]:
from sklearn.preprocessing import StandardScaler


# TIE IT ALL TOGETHER

def preprocess(df):
    DROP_COLUMNS = ['id', 'CustomerId', 'Surname']

    df = df.drop(DROP_COLUMNS, axis=1)
    
    ONE_HOT_ENCODE_COLUMNS = ['Geography', 'Gender']

    df = pd.get_dummies(df, columns=ONE_HOT_ENCODE_COLUMNS, drop_first=True)
    
    NUMERICAL_COLUMNS = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']

    scaler = StandardScaler()
    df[NUMERICAL_COLUMNS] = scaler.fit_transform(df[NUMERICAL_COLUMNS])
    
    print(df.columns)
    
    return df

# 1. Train a Baseline model

In [9]:
# Start fresh

df_train = pd.read_csv(TRAIN_CSV)

df_train = preprocess(df_train)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')


In [10]:
from sklearn.model_selection import train_test_split

X = df.drop('Exited', axis=1)
y = df['Exited']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from sklearn.linear_model import LogisticRegression

baseline_model = LogisticRegression(random_state=42)


In [12]:
baseline_model.fit(X_train, y_train)


# 2. Evaluate

In [13]:
y_pred = baseline_model.predict(X_test)

y_pred


array([0, 0, 0, ..., 1, 0, 0])

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, baseline_model.predict_proba(X_test)[:, 1]))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.8353985518223407
ROC AUC Score: 0.8180489488313094

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.95      0.90     26052
           1       0.70      0.39      0.50      6955

    accuracy                           0.84     33007
   macro avg       0.78      0.67      0.70     33007
weighted avg       0.82      0.84      0.82     33007


Confusion Matrix:
 [[24874  1178]
 [ 4255  2700]]


# 3. Generate Submission

In [15]:
TEST_CSV = f"{BASE_DIR}/test.csv"

df_submission = pd.read_csv(TEST_CSV)

df_submission = preprocess(df_submission)

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')


In [16]:
submit_predictions = baseline_model.predict(df_submission)
submit_predictions


array([0, 1, 0, ..., 0, 0, 0])

In [17]:
submission_df = pd.DataFrame({'id': pd.read_csv(TEST_CSV)['id'], 'Exited': submit_predictions})

submission_df.to_csv('submission.csv', index=False)

