## Import Libraries

In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from io import StringIO  
from imblearn.over_sampling import SMOTE
# Shared seed: passed to train_test_split and to SMOTE further down so that
# the split and the oversampling are reproducible.
random_state=123

## Load Data

In [20]:
# Read the cleaned CSV from the data directory and print a column/dtype summary
bank_data_path = '../data/bank_data_cleaned_upd.csv'
bank_data = pd.read_csv(bank_data_path)
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CreditScore     10000 non-null  int64  
 1   Geography       10000 non-null  object 
 2   Gender          10000 non-null  object 
 3   Age             10000 non-null  int64  
 4   Balance         10000 non-null  float64
 5   NumOfProducts   10000 non-null  int64  
 6   IsActiveMember  10000 non-null  int64  
 7   Exited          10000 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 625.1+ KB


## Separate class for predictions

In [21]:
# Target: the 'Exited' column; features: every remaining column.
y = bank_data['Exited']
X = bank_data.drop(columns='Exited')
# Sanity check on the feature matrix: same rows as the source frame, one column
# fewer. (A bare `X.shape` here would be evaluated but never displayed, because
# only the last expression of a cell is rendered.)
assert X.shape == (len(bank_data), bank_data.shape[1] - 1)
# Summary statistics of the numeric columns, one row per column
bank_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,10000.0,650.5288,96.653299,350.0,584.0,652.0,718.0,850.0
Age,10000.0,38.9218,10.487806,18.0,32.0,37.0,44.0,92.0
Balance,10000.0,76485.889288,62397.405202,0.0,0.0,97198.54,127644.24,250898.09
NumOfProducts,10000.0,1.5302,0.581654,1.0,1.0,1.0,2.0,4.0
IsActiveMember,10000.0,0.5151,0.499797,0.0,0.0,1.0,1.0,1.0
Exited,10000.0,0.2037,0.402769,0.0,0.0,0.0,0.0,1.0


## Convert Categorical features to dummy variables

In [22]:
# Create dummy variables to deal with categorical inputs.
# drop_first=True drops the first level of each categorical column.
# dtype is pinned to uint8 so the dummies are numeric 0/1 regardless of the
# installed pandas version (newer releases default to bool, which changes
# describe() output and the values written to CSV).
X = pd.get_dummies(X, drop_first=True, dtype='uint8')
X.columns

Index(['CreditScore', 'Age', 'Balance', 'NumOfProducts', 'IsActiveMember',
       'Geography_Germany', 'Geography_Spain', 'Gender_Male'],
      dtype='object')

## Train/Test split

Split our data into Train and Test datasets in an 80/20 proportion (`test_size=0.2`).
The shared `random_state` seed (123) is used for reproducibility.
All further transformations will be fitted on the Train set only and then applied to the Test set, in order to avoid data leakage from the Test set.

In [37]:
# Hold out 20% of the rows as the test set (test_size=0.2); the shared
# random_state seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)
print(X_train.shape, y_train.shape)

(8000, 8) (8000,)


## Oversampling with SMOTE

The initial dataset is unbalanced: the minority class (Exited users) makes up about 20% of the rows. To improve subsequent classification we can apply an oversampling technique, SMOTE (Synthetic Minority Oversampling TEchnique), which performs data augmentation by creating synthetic data points based on the original data points. The advantage of SMOTE is that it does not generate duplicates, but rather creates synthetic data points that are slightly different from the original ones.

In [38]:
# Balance the classes of the training split with SMOTE.
# Only the training data is resampled; the test split is left as it is.
sm = SMOTE(random_state=random_state)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Report the new size and the per-class counts after resampling
print('Shape of X_train: ', X_train.shape, y_train.shape)
exited_counts = y_train.value_counts()
print('Exited values count:\n', exited_counts)

# Summary statistics of the resampled training features
X_train.describe().T


Shape of X_train:  (12754, 8) (12754,)
Exited values count:
 0    6377
1    6377
Name: Exited, dtype: int64


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CreditScore,12754.0,647.826015,93.823149,350.0,584.0,648.5,712.0,850.0
Age,12754.0,40.929904,10.116848,18.0,34.0,40.0,47.0,92.0
Balance,12754.0,81624.141557,61307.243768,0.0,0.0,102491.55,129248.557356,250898.09
NumOfProducts,12754.0,1.427082,0.576513,1.0,1.0,1.0,2.0,4.0
IsActiveMember,12754.0,0.372197,0.483409,0.0,0.0,0.0,1.0,1.0
Geography_Germany,12754.0,0.24902,0.432462,0.0,0.0,0.0,0.0,1.0
Geography_Spain,12754.0,0.177435,0.382051,0.0,0.0,0.0,0.0,1.0
Gender_Male,12754.0,0.419476,0.493493,0.0,0.0,0.0,1.0,1.0


## Save split and oversampled datasets

In [39]:
# Write the oversampled train split and the test split to CSV files
file_path = "../data/"

X_train_fpath = f"{file_path}preprocessed/X_train.csv"
y_train_fpath = f"{file_path}preprocessed/y_train.csv"
X_test_fpath = f"{file_path}preprocessed/X_test.csv"
y_test_fpath = f"{file_path}preprocessed/y_test.csv"

# Same to_csv call for every object; the row index is not written
for frame, destination in (
    (X_train, X_train_fpath),
    (y_train, y_train_fpath),
    (X_test, X_test_fpath),
    (y_test, y_test_fpath),
):
    frame.to_csv(destination, index=False)

## Scale Data

We need to scale the data for further modelling, but not all models require scaling, so we keep separate dataframes with the raw and the scaled data.

In [40]:
# Scale the continuous features with StandardScaler.
# Work on copies: a plain assignment (X_train_scaled = X_train) only creates a
# second name for the same frame, so scaling would overwrite the raw
# X_train / X_test as well, and re-running the cell would scale values that
# were already scaled.
scaled_columns = ['Age', 'Balance', 'CreditScore']
ssc = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Fit the scaler on the training split only ...
X_train_scaled[scaled_columns] = pd.DataFrame(
    ssc.fit_transform(X_train[scaled_columns]),
    index=X_train.index,
    columns=scaled_columns,
)
# ... and apply the already-fitted scaler to the test split.
X_test_scaled[scaled_columns] = pd.DataFrame(
    ssc.transform(X_test[scaled_columns]),
    index=X_test.index,
    columns=scaled_columns,
)

## Save scaled datasets

In [41]:
# Write the scaled train and test feature frames to CSV (row index omitted)
X_train_scaled_fpath = f"{file_path}preprocessed/X_train_scaled.csv"
X_test_scaled_fpath = f"{file_path}preprocessed/X_test_scaled.csv"

for scaled_frame, scaled_destination in (
    (X_train_scaled, X_train_scaled_fpath),
    (X_test_scaled, X_test_scaled_fpath),
):
    scaled_frame.to_csv(scaled_destination, index=False)