# Importing packages and libraries

In [1]:
# One-time setup for environments that do not have these libraries installed yet.
# Uncomment and run once; `%pip` installs into the environment of the running kernel.
# %pip install category_encoders
# %pip install imbalanced-learn

In [2]:
# Packages for data manipulation
import pandas as pd
import numpy as np

# Packages for visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder

# Packages for modelling
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Reading data

In [3]:
# Load the raw customer table; the path is relative to the notebook's directory.
raw_csv_path = "../Data/BankChurners.csv"
data = pd.read_csv(raw_csv_path)

# Preview the first rows
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


# Drop and rename columns

In [4]:
# Columns treated as irrelevant for modelling
irrelevant_cols = [
    "CLIENTNUM",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
    "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
]

# Columns removed due to correlation > 0.7 (VIF could be tried as an alternative criterion)
correlated_cols = ["Customer_Age", "Avg_Open_To_Buy", "Total_Trans_Ct"]

# Remove both groups in a single pass; `data` itself is left untouched
data_dropped = data.drop(columns=irrelevant_cols + correlated_cols)

data_dropped.head()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,1.335,1144,1.625,0.061
1,Existing Customer,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,1.541,1291,3.714,0.105
2,Existing Customer,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,2.594,1887,2.333,0.0
3,Existing Customer,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,1.405,1171,2.333,0.76
4,Existing Customer,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,2.175,816,2.5,0.0


In [5]:
# Rename every column to snake_case. The assignment is positional, so this list
# must stay in the same order as the columns of `data_dropped`.
data_dropped.columns = ['attrition_flag', 'gender', 'dependent_count', 'education_level', 
                'marital_status', 'income_category', 'card_category', 'months_on_book', 
                'total_relationship_count', 'months_inactive_12_month', 'contacts_count_12_month', 
                'credit_limit', 'total_revolving_bal', 'total_amt_change_q4_q1', 'total_trans_amt', 
                'total_count_change_q4_q1', 'avg_utilization_ratio']

# Numeric features: the columns pandas summarises in `describe()`.
numerical_cols = list(data_dropped.describe().columns)

# Non-numeric features, excluding the target. The target has to be referenced by
# its renamed (lower-case) name; the old name "Attrition_Flag" no longer exists
# after the rename above, so filtering on it left the target in this list.
categorical_cols = [
    col for col in data_dropped.columns
    if col not in numerical_cols and col != "attrition_flag"
]

# Encoding categorical columns

### Label Encoding `attrition_flag`, `gender`

In [6]:
# Work on an independent copy so that `data_dropped` is not mutated by the
# encoding steps and this cell can be re-run without errors.
data_enc = data_dropped.copy()

# Binary-encode the two two-level columns. `drop_first=True` keeps a single
# indicator per column; the surviving levels are "Existing Customer" and "M",
# i.e. attrition_flag == 1 means existing customer and gender == 1 means male.
# `dtype=int` pins the result to 0/1 integers regardless of the pandas version
# (the default dtype is uint8 on older releases and bool on newer ones).
label_encoded = pd.get_dummies(
    data_enc[["attrition_flag", "gender"]], drop_first=True, dtype=int
)
label_encoded = label_encoded.rename(
    columns={"attrition_flag_Existing Customer": "attrition_flag", "gender_M": "gender"}
)

# Replace the original string columns with their encoded versions (placed first)
data_enc = pd.concat(
    [label_encoded, data_enc.drop(columns=["attrition_flag", "gender"])], axis=1
)

data_enc.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio
0,1,1,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,1.335,1144,1.625,0.061
1,1,0,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,1.541,1291,3.714,0.105
2,1,1,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,2.594,1887,2.333,0.0
3,1,0,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,1.405,1171,2.333,0.76
4,1,1,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,2.175,816,2.5,0.0


### OneHotEncoding `marital_status`

In [7]:
# TODO: one-hot encoding keeps "Unknown" as its own category; consider
# alternative encodings for marital status.

# The dense/sparse switch of OneHotEncoder was renamed between scikit-learn
# releases (`sparse` -> `sparse_output`), so rely on the default sparse output
# and densify explicitly; this works on every version.
onehot_enc = OneHotEncoder(handle_unknown='ignore')
values = onehot_enc.fit_transform(data_enc[["marital_status"]]).toarray().astype(int)

# Derive the output column names from the categories the encoder actually
# learned instead of assuming their order. An unexpected category raises a
# KeyError here rather than silently mislabelling a column.
marital_status_names = {
    "Divorced": "divorced",
    "Married": "married",
    "Single": "single",
    "Unknown": "unknown_marital_status",
}
labels = [marital_status_names[category] for category in onehot_enc.categories_[0]]

# Reuse data_enc's index so the column-wise concat aligns row by row
marital_status_df = pd.DataFrame(values, columns=labels, index=data_enc.index)

# Swap the original string column for its indicator columns (appended last)
data_enc = pd.concat([data_enc.drop(columns="marital_status"), marital_status_df], axis=1)

data_enc.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status
0,1,1,3,High School,$60K - $80K,Blue,39,5,1,3,12691.0,777,1.335,1144,1.625,0.061,0,1,0,0
1,1,0,5,Graduate,Less than $40K,Blue,44,6,1,2,8256.0,864,1.541,1291,3.714,0.105,0,0,1,0
2,1,1,3,Graduate,$80K - $120K,Blue,36,4,1,0,3418.0,0,2.594,1887,2.333,0.0,0,1,0,0
3,1,0,4,High School,Less than $40K,Blue,34,3,4,1,3313.0,2517,1.405,1171,2.333,0.76,0,0,0,1
4,1,1,3,Uneducated,$60K - $80K,Blue,21,5,1,0,4716.0,0,2.175,816,2.5,0.0,0,1,0,0


### Ordinal Encoding: `education_level`, `income_category`, `card_category`

In [8]:
# Rank tables: a larger number means a higher level; "Unknown" is mapped to the lowest rank.
edu_level_mapper = {
    "Doctorate": 7,
    "Post-Graduate": 6,
    "Graduate": 5,
    "College": 4,
    "High School": 3,
    "Uneducated": 2,
    "Unknown": 1,
}
income_cat_mapper = {
    "$120K +": 6,
    "$80K - $120K": 5,
    "$60K - $80K": 4,
    "$40K - $60K": 3,
    "Less than $40K": 2,
    "Unknown": 1,
}
card_cat_mapper = {"Platinum": 4, "Gold": 3, "Silver": 2, "Blue": 1}

# Apply each rank table to its column
ordinal_mappers = {
    "education_level": edu_level_mapper,
    "income_category": income_cat_mapper,
    "card_category": card_cat_mapper,
}
for ordinal_col, mapper in ordinal_mappers.items():
    data_enc[ordinal_col] = data_enc[ordinal_col].replace(mapper)

data_enc.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status
0,1,1,3,3,4,1,39,5,1,3,12691.0,777,1.335,1144,1.625,0.061,0,1,0,0
1,1,0,5,5,2,1,44,6,1,2,8256.0,864,1.541,1291,3.714,0.105,0,0,1,0
2,1,1,3,5,5,1,36,4,1,0,3418.0,0,2.594,1887,2.333,0.0,0,1,0,0
3,1,0,4,3,2,1,34,3,4,1,3313.0,2517,1.405,1171,2.333,0.76,0,0,0,1
4,1,1,3,2,4,1,21,5,1,0,4716.0,0,2.175,816,2.5,0.0,0,1,0,0


# Creating Train and Test sets

## Use stratify to ensure the percentage of each class remains the same throughout the split

In [9]:
# Separate the target from the feature matrix
target_col = "attrition_flag"
X = data_enc.drop(columns=[target_col])
y = data_enc[target_col]

# 80/20 train/test split, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2021,
    stratify=y,
)

## Log transformation for skewed variables, Scaling of numerical columns

In [10]:
# From EDA
skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

for skewed_col in skewed:
    X_train[skewed_col]= np.where(X_train[skewed_col] > 0 , np.log(X_train[skewed_col]), 0)
    # alt
    # data_log[skewed_col] = np.log(data_log[skewed_col].mask(data_log[skewed_col] <=0)).fillna(0)

scaler = MinMaxScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[skewed_col]= np.where(X_train[skewed_col] > 0 , np.log(X_train[skewed_col]), 0)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


## Perform oversampling on minority class ("Attrited") for training set

In [11]:
# Oversample the minority class of the training set with SMOTE.
# A fixed seed (the same one used for the train/test split) makes the synthetic
# samples, and therefore the written training set, reproducible across runs.
oversampler = SMOTE(random_state=2021)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

# Class counts after resampling
y_train.value_counts()

1    6799
0    6799
Name: attrition_flag, dtype: int64

In [12]:
# From EDA
skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

for skewed_col in skewed:
    X_test[skewed_col]= np.where(X_test[skewed_col] > 0 , np.log(X_test[skewed_col]), 0)
    # alt
    # data_log[skewed_col] = np.log(data_log[skewed_col].mask(data_log[skewed_col] <=0)).fillna(0)

scaler = MinMaxScaler()
X_test[numerical_cols] = scaler.fit_transform(X_test[numerical_cols]) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[skewed_col]= np.where(X_test[skewed_col] > 0 , np.log(X_test[skewed_col]), 0)
  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


# Writing out Train and Test sets to csv files

In [13]:
# Attach the target as the last column of the training features
train_parts = [X_train, y_train]
train = pd.concat(train_parts, axis="columns")

train.head()

Unnamed: 0,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status,attrition_flag
0,1,0.8,3,5,1,0.534884,0.6,0.5,0.0,0.872514,0.861343,0.764093,0.285875,0.836101,0.094094,0,0,0,1,1
1,0,0.8,5,2,1,0.674419,0.6,0.333333,0.0,0.280193,0.674613,0.69522,0.598377,0.636365,0.485485,0,1,0,0,1
2,0,0.4,5,2,1,0.395349,0.4,0.333333,0.5,0.597267,0.43385,0.702522,0.402778,0.6229,0.114114,0,1,0,0,1
3,0,1.0,4,3,1,0.627907,0.4,0.166667,0.166667,0.34933,0.0,0.701949,0.534821,0.753082,0.0,1,0,0,0,1
4,1,0.8,5,4,1,0.534884,0.6,0.333333,0.5,0.287911,0.694875,0.736876,0.636298,0.711124,0.487487,0,1,0,0,1


In [14]:
# Attach the target as the last column of the test features
test_parts = [X_test, y_test]
test = pd.concat(test_parts, axis="columns")

test.head()

Unnamed: 0,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status,attrition_flag
5107,0,0.6,5,3,1,0.418605,1.0,0.5,0.666667,0.735639,0.0,0.63075,0.308028,0.50662,0.0,0,1,0,0,0
2519,1,0.2,5,5,1,0.325581,1.0,0.333333,0.833333,0.915317,0.542312,0.898788,0.424008,0.767976,0.052419,0,0,1,0,1
8231,0,0.2,1,2,1,0.674419,0.0,0.166667,0.333333,0.135159,0.611442,0.794856,0.640698,0.761088,0.701613,0,0,0,1,1
1955,0,0.8,1,2,2,0.44186,1.0,0.5,0.333333,0.709675,0.549861,0.867994,0.564883,0.658549,0.101815,0,0,1,0,1
214,1,0.8,3,3,1,0.534884,0.8,0.5,0.333333,0.69892,0.68375,0.810204,0.211761,0.843122,0.131048,0,1,0,0,1


In [15]:
# Persist both splits next to the raw data. `to_csv` is called with its
# defaults, so the frame index is written as the first (unnamed) column.
for split_frame, csv_path in ((train, "../Data/train.csv"), (test, "../Data/test.csv")):
    split_frame.to_csv(csv_path)