## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from io import StringIO  
from imblearn.over_sampling import SMOTE

## Load Data

In [2]:
# the cleaned CSV data file is in the data directory
bank_data = pd.read_csv('../data/bank_data_cleaned.csv')
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


## Separate class for predictions

In [3]:
y=bank_data['Exited']
X=bank_data.drop('Exited', axis=1)
X.shape
bank_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Tenure,10000.0,5.0128,2.892174,0.0,3.0,5.0,7.0,10.0
Balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
HasCrCard,10000.0,0.7055,0.45584,0.0,0.0,1.0,1.0,1.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
EstimatedSalary,10000.0,100090.239881,57510.492818,11.58,51002.11,100193.915,149388.2475,199992.48
Exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


## Convert Categorical features to dummy variables

In [4]:
#Create dummy variables to deal with categorical inputs
X=pd.get_dummies(X, drop_first=True)
X.columns

Index(['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Geography_Germany',
       'Geography_Spain', 'Gender_Male'],
      dtype='object')

## Train/Test split

Split our data into Train and Test datasets with 75/25 proportion. 
Random state=42 for replicability.
All other manipulations with data will be fitted to Train set and applied to Test set in order to avoid data leakage from train set.

In [5]:
# Call train_test_split on X, y. Make the test_size = 0.25, and random_state = 42
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.25, random_state = 42)
print(X_train.shape, y_train.shape)

(7500, 11) (7500,)


## Oversampling with SMOTE

Initial dataset is unbalanced with minority class of Exited users of 20%. In order to improve further classification we can apply oversampling technique - SMOTE (Synthetic Minority Oversampling TEchnique) that that performs data augmentation by creating synthetic data points based on the original data points. The advantage of SMOTE is that you are not generating duplicates, but rather creating synthetic data points that are slightly different from the original data points.

In [18]:
#Use SMOTE to handle unbalanced classes in dataset
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)
print('Shape of X_train: ', X_train.shape, y_train.shape)
print('Exited values count:\n',y_train.value_counts())
X_train.describe().T


Shape of X_train:  (11920, 11) (11920,)
Exited values count:
 1    5960
0    5960
Name: Exited, dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,11920.0,649.945973,91.258949,350.0,589.0,651.0,712.0,850.0
Age,11920.0,40.900168,10.052322,18.0,34.0,40.0,47.0,92.0
Tenure,11920.0,4.785319,2.725579,0.0,3.0,5.0,7.0,10.0
Balance,11920.0,81502.040499,61476.337999,0.0,0.0,102711.71,129581.487509,250898.09
NumOfProducts,11920.0,1.42005,0.575225,1.0,1.0,1.0,2.0,4.0
HasCrCard,11920.0,0.625839,0.483926,0.0,0.0,1.0,1.0,1.0
IsActiveMember,11920.0,0.371393,0.483197,0.0,0.0,0.0,1.0,1.0
EstimatedSalary,11920.0,100958.185881,57350.169178,90.07,52759.501099,100963.183407,150279.98,199970.74
Geography_Germany,11920.0,0.243708,0.429337,0.0,0.0,0.0,0.0,1.0
Geography_Spain,11920.0,0.176258,0.381056,0.0,0.0,0.0,0.0,1.0


## Save splitted and oversampled datasets

In [7]:
# Save to files oversampled train dataset and test dataset
file_path="../data/"

X_train_fpath = file_path + 'preprocessed/X_train.csv'
y_train_fpath = file_path + 'preprocessed/y_train.csv'
X_test_fpath = file_path + 'preprocessed/X_test.csv'
y_test_fpath =  file_path + 'preprocessed/y_test.csv'

X_train.to_csv(X_train_fpath, index=False)
y_train.to_csv(y_train_fpath, index=False)
X_test.to_csv(X_test_fpath, index=False)
y_test.to_csv(y_test_fpath, index=False)

## Scale Data

We need to Scale data for further modelling. But not all models require scaling. So we will have separate dataframes with raw and scaled data.

In [8]:
# Scale data with StandardScaler
ssc=StandardScaler()
X_train_scaled=pd.DataFrame(ssc.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
X_test_scaled=pd.DataFrame(ssc.transform(X_test), index=X_test.index, columns=X_test.columns)

## Save Scales datasets

In [9]:
X_train_scaled_fpath = file_path + 'preprocessed/X_train_scaled.csv'
X_test_scaled_fpath = file_path + 'preprocessed/X_test_scaled.csv'
X_train_scaled.to_csv(X_train_scaled_fpath, index=False)
X_test_scaled.to_csv(X_test_scaled_fpath, index=False)