In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import seaborn as sns
import warnings as wrn
wrn.filterwarnings('ignore')
from preprocess import breakdown_vars
from preprocess import dummies_ohe
from preprocess import std_z
from preprocess import Xy
from preprocess import standardize_X_test

In [3]:
# Load the raw bank churn workbook (first sheet by default) into a DataFrame.
workbook_path = "bank_churn.xlsx"
df = pd.read_excel(io=workbook_path)

In [4]:
# Persist a CSV copy of the raw data next to the notebook.
# NOTE(review): with the default index=True the row index is written as an
# extra, unnamed first column; pass index=False if no reader relies on it.
df.to_csv(path_or_buf="Churn-data.csv")

In [5]:
# Encode the target: 0 = existing customer, 1 = attrited (churned) customer.
# An explicit mapping is used instead of Series.replace([...], [0, 1]) so the
# result is numeric without relying on replace's deprecated silent downcast
# from object dtype (the deprecation warning is hidden by the global filter).
churn_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
churn_flag = df['attrition_flag'].map(churn_codes)

# map() turns any label missing from churn_codes into NaN; fail loudly here
# rather than carrying unlabeled rows into the train/test split.
if churn_flag.isna().any():
    raise ValueError(
        "attrition_flag contains values other than "
        "'Existing Customer' / 'Attrited Customer'"
    )
df['churn'] = churn_flag.astype('int64')

# Drop the original label column (now encoded as `churn`) together with
# clientnum, avg_open_to_buy and credit_limit. Rebinding instead of
# inplace=True keeps the statement chainable and free of hidden mutation.
df = df.drop(columns=['clientnum', 'avg_open_to_buy', 'attrition_flag', 'credit_limit'])

# Class proportions of the target: the classes are imbalanced.
df['churn'].value_counts(normalize=True)

churn
0    0.83934
1    0.16066
Name: proportion, dtype: float64

In [6]:
# Partition the columns of df into four groups with the project helper.
# Judging by the names, these are presumably categorical, binary,
# non-normally distributed and normally distributed variables -- confirm in
# preprocess.breakdown_vars.
variable_groups = breakdown_vars(df)
cats, binaries, nonormal, normal = variable_groups

In [7]:
# Inspect the first group returned by breakdown_vars(df); these are the
# column names handed to dummies_ohe in the next step.
cats

['gender',
 'education_level',
 'marital_status',
 'income_category',
 'card_category']

In [8]:
# Replace the columns listed in `cats` with indicator (dummy) columns.
# NOTE(review): the helper appears to drop one level per variable (only
# gender_M is produced for gender) -- confirm in preprocess.dummies_ohe.
# NOTE(review): df is rebound, so this cell is presumably not safe to re-run
# without re-running the cells above it.
encoded_df = dummies_ohe(df, cats)
df = encoded_df

In [9]:
# List the columns of df after dummies_ohe(df, cats) to check the encoded
# frame (numeric features, the `churn` target and the new dummy columns).
df.columns

Index(['customer_age', 'dependent_count', 'months_on_book',
       'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'total_revolving_bal', 'total_amt_chng_q4_q1',
       'total_trans_amt', 'total_trans_ct', 'total_ct_chng_q4_q1',
       'avg_utilization_ratio', 'churn', 'gender_M',
       'education_level_Doctorate', 'education_level_Graduate',
       'education_level_High School', 'education_level_Post-Graduate',
       'education_level_Uneducated', 'education_level_Unknown',
       'marital_status_Married', 'marital_status_Single',
       'marital_status_Unknown', 'income_category_$40K - $60K',
       'income_category_$60K - $80K', 'income_category_$80K - $120K',
       'income_category_Less than $40K', 'income_category_Unknown',
       'card_category_Gold', 'card_category_Platinum', 'card_category_Silver'],
      dtype='object')

In [10]:
# Split the encoded frame into the feature matrix X and the target y using
# the project helper; 'churn' is the target column name.
target_column = 'churn'
X, y = Xy(df, target_column)

In [11]:
# Display the target returned by Xy(df, 'churn') as a quick sanity check.
y

0        0
1        0
2        0
3        0
4        0
        ..
10122    0
10123    1
10124    1
10125    1
10126    1
Name: churn, Length: 10127, dtype: int64

In [12]:
# Hold out 20% of the rows as a test set. The split is shuffled with a fixed
# seed for reproducibility and stratified on y so both parts keep the same
# churn proportion.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=666,
    shuffle=True,
    stratify=y,
)

In [13]:
# Location (mean) and scale (sample standard deviation) of customer_age,
# estimated on the training split only.
train_age = X_train['customer_age']
m = train_age.mean()
s = train_age.std()

In [14]:
# Manual z-score of the test ages using the training mean m and std s;
# serves as a reference value for the standardize_X_test output.
X_test['customer_age'].sub(m).div(s)

4018    0.215144
8196   -0.406913
6051    0.463966
8173    0.090732
8153   -0.406913
          ...   
6989    0.463966
5321   -0.780147
2927    0.712789
2784   -1.526615
8489   -0.780147
Name: customer_age, Length: 2026, dtype: float64

In [15]:
# Standardize the test features with the project helper (imported with the
# other preprocess helpers in the notebook's import cell). The displayed
# customer_age values can be compared with the manual (X_test - m) / s check.
# NOTE(review): the returned frame is only displayed, not assigned; unless
# the helper mutates X_test in place, X_test itself presumably stays
# unscaled -- confirm in preprocess.standardize_X_test.
standardize_X_test(X_train, X_test)

customer_age
dependent_count
months_on_book
total_relationship_count
months_inactive_12_mon
contacts_count_12_mon
total_revolving_bal
total_amt_chng_q4_q1
total_trans_amt
total_trans_ct
total_ct_chng_q4_q1
avg_utilization_ratio


Unnamed: 0,customer_age,dependent_count,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,total_revolving_bal,total_amt_chng_q4_q1,total_trans_amt,total_trans_ct,total_ct_chng_q4_q1,avg_utilization_ratio,gender_M,education_level_Doctorate,education_level_Graduate,education_level_High School,education_level_Post-Graduate,education_level_Uneducated,education_level_Unknown,marital_status_Married,marital_status_Single,marital_status_Unknown,income_category_$40K - $60K,income_category_$60K - $80K,income_category_$80K - $120K,income_category_Less than $40K,income_category_Unknown,card_category_Gold,card_category_Platinum,card_category_Silver
4018,0.215144,1.279892,0.891793,-0.521135,-1.323546,-2.221416,1.660353,-0.879859,-0.790282,-1.275998,-1.069409,-0.568356,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8196,-0.406913,0.510437,-0.734821,-1.803552,-0.336067,-2.221416,-1.431730,-0.044243,0.235929,1.031309,-0.145963,-0.999989,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6051,0.463966,0.510437,0.641545,-0.521135,0.651412,0.489867,0.216889,-0.843129,-0.118569,0.091295,0.205030,-0.111332,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8173,0.090732,0.510437,0.516421,-1.803552,0.651412,-2.221416,0.579291,-0.282991,0.105138,0.689485,-0.893912,-0.165740,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
8153,-0.406913,-1.797928,0.015924,-1.162344,-0.336067,-0.413894,-1.431730,-0.599790,0.145381,0.860397,1.099226,-0.999989,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6989,0.463966,-0.259018,0.766669,0.761282,0.651412,-2.221416,0.664056,-0.618155,0.158993,1.244948,0.250993,2.496603,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5321,-0.780147,0.510437,-1.110193,1.402490,-0.336067,1.393629,1.408515,-0.916589,-0.643216,-1.190543,0.200851,-0.071433,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2927,0.712789,0.510437,1.392290,1.402490,0.651412,0.489867,-1.431730,-0.907407,-0.607706,0.048567,-1.533221,-0.999989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2784,-1.526615,-0.259018,-1.610690,1.402490,-0.336067,0.489867,0.750050,2.361596,-0.604156,-0.848719,0.673020,-0.147604,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Per-column location and scale estimated on the training split only:
# locations_scales[column] = [mean, standard deviation].
# Note: the second entry is the standard deviation (.std()), not the variance.
locations_scales = {}
# nonormal and normal come from breakdown_vars(df); their concatenation is
# iterated as column names of X_train.
for column in nonormal + normal:
    locations_scales[column] = [X_train[column].mean(), X_train[column].std()]