# Importing packages and libraries

In [1]:
# For those who have not installed these libraries before
# !pip install category_encoders
# !pip install imbalanced-learn

In [2]:
# Packages for data manipulation
import pandas as pd
import numpy as np

# Packages for visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder

# Packages for modelling
from sklearn.preprocessing import MinMaxScaler
# from category_encoders import TargetEncoder

from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Reading data

In [3]:
data = pd.read_csv("../Data/BankChurners.csv")
data.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


# Dropping irrelevant columns

In [4]:
data.drop(columns=["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)

data.head()

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


# Dropping columns with correlation > 0.7
- Some variables have very high correlation while some have mild correlation 
- Only 1 of the variables were kept for pairs of variables which have correlation value > 0.7 to avoid multicollinearity

** Can explore using VIF

In [5]:
data.corr(method='pearson')[(data.corr() > 0.2) & (data.corr() != 1)]

# i think this one is pre-determined in EDA, so dunnid to show here

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
Customer_Age,,,0.788912,,,,,,,,,,,
Dependent_count,,,,,,,,,,,,,,
Months_on_book,0.788912,,,,,,,,,,,,,
Total_Relationship_Count,,,,,,,,,,,,,,
Months_Inactive_12_mon,,,,,,,,,,,,,,
Contacts_Count_12_mon,,,,,,,,,,,,,,
Credit_Limit,,,,,,,,,0.995981,,,,,
Total_Revolving_Bal,,,,,,,,,,,,,,0.624022
Avg_Open_To_Buy,,,,,,,0.995981,,,,,,,
Total_Amt_Chng_Q4_Q1,,,,,,,,,,,,,0.384189,


In [6]:
data_dropped = data.drop(columns=["Customer_Age", "Avg_Open_To_Buy", "Total_Trans_Ct"])
data_dropped.head()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,1.335,1144,1.625,0.061
1,Existing Customer,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,1.541,1291,3.714,0.105
2,Existing Customer,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,2.594,1887,2.333,0.0
3,Existing Customer,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,1.405,1171,2.333,0.76
4,Existing Customer,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,2.175,816,2.5,0.0


# Renaming columns

In [7]:
data_dropped.columns = ['attrition_flag', 'gender', 'dependent_count', 'education_level', 'marital_status', 'income_category', 'card_category', 'months_on_book', 'total_relationship_count', 'months_inactive_12_month', 'contacts_count_12_month', 'credit_limit', 'total_revolving_bal', 'total_amt_change_q4_q1', 'total_trans_amt', 'total_count_change_q4_q1', 'avg_utilization_ratio']

# Splitting into numerical and categorical columns

In [8]:
# Determine numerical columns
numerical_cols = list(data_dropped.describe().columns)

# Determine categorical columns
categorical_cols = [i for i in data_dropped.columns if i not in numerical_cols and i != "Attrition_Flag"]

print(f"Numerical Columns:\n{numerical_cols}\n")
print(f"Categorical Columns:\n{categorical_cols}")

Numerical Columns:
['dependent_count', 'months_on_book', 'total_relationship_count', 'months_inactive_12_month', 'contacts_count_12_month', 'credit_limit', 'total_revolving_bal', 'total_amt_change_q4_q1', 'total_trans_amt', 'total_count_change_q4_q1', 'avg_utilization_ratio']

Categorical Columns:
['attrition_flag', 'gender', 'education_level', 'marital_status', 'income_category', 'card_category']


# Feature Engineering

## Log transformation for skewed variables

In [9]:
data_dropped[numerical_cols].skew()[(data_dropped[numerical_cols].skew() > 1) | (data_dropped[numerical_cols].skew() < -1)]

credit_limit                1.666726
total_amt_change_q4_q1      1.732063
total_trans_amt             2.041003
total_count_change_q4_q1    2.064031
dtype: float64

In [10]:
data_log = data_dropped
for skewed_col in ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]:
    data_log[skewed_col]= np.where(data_log[skewed_col] > 0 , np.log(data_log[skewed_col]), 0)
data_log.head()

# Jon: I did this prev in my own log transformation! Just storing here in case need
# skewed_cols = []

# for num in numerical_cols:
#     if df[num].skew() < -1 or df[num].skew() > 1:
#         skewed_cols.append(num)
# print(skewed_cols)

# for col in skewed_cols:
#     df[col] = np.log(df[col].mask(df[col] <=0)).fillna(0)

# df.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,attrition_flag,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio
0,Existing Customer,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,9.448648,777,0.288931,7.042286,0.485508,0.061
1,Existing Customer,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,9.018695,864,0.432432,7.163172,1.312109,0.105
2,Existing Customer,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,8.136811,0,0.953201,7.542744,0.847155,0.0
3,Existing Customer,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,8.105609,2517,0.340037,7.065613,0.847155,0.76
4,Existing Customer,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,8.458716,0,0.777029,6.704414,0.916291,0.0


In [11]:
data_log[numerical_cols].skew()

dependent_count            -0.020826
months_on_book             -0.106565
total_relationship_count   -0.162452
months_inactive_12_month    0.633061
contacts_count_12_month     0.011006
credit_limit                0.457081
total_revolving_bal        -0.148837
total_amt_change_q4_q1     -1.024660
total_trans_amt             0.262210
total_count_change_q4_q1   -1.125235
avg_utilization_ratio       0.718008
dtype: float64

# Scaling columns using normalisation
- To transform the data points to range (0,1)

In [12]:
# plt.rcParams['figure.figsize'] = 5,5
# data_log[numerical_cols].boxplot(rot=90)

In [13]:
data_scaled = data_log
scaler = MinMaxScaler()

data_scaled[numerical_cols] = scaler.fit_transform(data_log[numerical_cols]) 

In [14]:
# plt.rcParams['figure.figsize'] = 5,5
# data_scaled[numerical_cols].boxplot(rot=90)

In [15]:
data_scaled.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio
0,Existing Customer,M,0.6,High School,Married,$60K - $80K,Blue,0.604651,0.8,0.166667,0.5,0.685166,0.308701,0.839748,0.225019,0.83088,0.061061
1,Existing Customer,F,1.0,Graduate,Single,Less than $40K,Blue,0.72093,1.0,0.166667,0.333333,0.549874,0.343266,0.86437,0.25869,1.0,0.105105
2,Existing Customer,M,0.6,Graduate,Married,$80K - $120K,Blue,0.534884,0.6,0.166667,0.0,0.272374,0.0,0.953725,0.364413,0.904872,0.0
3,Existing Customer,F,0.8,High School,Unknown,Less than $40K,Blue,0.488372,0.4,0.666667,0.166667,0.262556,1.0,0.848516,0.231517,0.904872,0.760761
4,Existing Customer,M,0.6,Uneducated,Married,$60K - $80K,Blue,0.186047,0.8,0.166667,0.0,0.373667,0.0,0.923497,0.130911,0.919017,0.0


## Encoding categorical columns
- Converting categorical columns to numeric values for better fitting to the model
- attrition_flag -> nominal, binary
- gender -> nominal, binary
- education_level -> ordinal
- marital_status -> nominal
- income_category -> ordinal
- card_category -> ordinal

In [16]:
# Jon: I think these should be inside the EDA notebook?
# Visualising categorical columns 
# fig, axes = plt.subplots(nrows = 2, ncols=3)
# plt.rcParams['figure.figsize'] = 10, 10

# data["Attrition_Flag"].value_counts().plot.bar(ax=axes[0,0], rot=30)
# data["Gender"].value_counts().plot.bar(ax=axes[0,1], rot=30)
# data["Education_Level"].value_counts().plot.bar(ax=axes[0,2], rot=30)
# data["Marital_Status"].value_counts().plot.bar(ax=axes[1,0], rot=30)
# data["Income_Category"].value_counts().plot.bar(ax=axes[1,1], rot=30)
# data["Card_Category"].value_counts().plot.bar(ax=axes[1,2], rot=30)

In [17]:
# # drop_first: features that only require one column during one hot encoding will use only one. 
# # Rationale: to reduce dimensions further and prevent collinearity between 2 parameters
# features_onehotencoded = pd.get_dummies(data[categorical_cols], drop_first=True)

# # Comparison of what categorical columns:
# print("Number of categorical columns before encoding:", len(categorical_cols))
# print("Number of categorical columns after encoding:", len(features_onehotencoded.columns))

# # Combine numerical columns back
# processed_features = pd.concat([features_onehotencoded, data_scaled[numerical_cols]], axis = 1)

# # Comparison of what columns changed:
# print("\nAll features before encoding:\n" + str(list(data.columns)[1:])) # Don't display target variable
# print("\nAll features after encoding:\n" + str(list(processed_features.columns)))

# processed_features.head()

### Label Encoding `attrition_flag`, `gender`

In [18]:
label_encoded = pd.get_dummies(data_scaled[["attrition_flag", "gender"]], drop_first=True)
label_encoded.rename(columns={"attrition_flag_Existing Customer": "attrition_flag", "gender_M": "gender"}, inplace=True)

data_scaled.drop(columns=["attrition_flag", "gender"], inplace=True)
data_scaled = pd.concat([label_encoded, data_scaled], axis=1)

data_scaled.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio
0,1,1,0.6,High School,Married,$60K - $80K,Blue,0.604651,0.8,0.166667,0.5,0.685166,0.308701,0.839748,0.225019,0.83088,0.061061
1,1,0,1.0,Graduate,Single,Less than $40K,Blue,0.72093,1.0,0.166667,0.333333,0.549874,0.343266,0.86437,0.25869,1.0,0.105105
2,1,1,0.6,Graduate,Married,$80K - $120K,Blue,0.534884,0.6,0.166667,0.0,0.272374,0.0,0.953725,0.364413,0.904872,0.0
3,1,0,0.8,High School,Unknown,Less than $40K,Blue,0.488372,0.4,0.666667,0.166667,0.262556,1.0,0.848516,0.231517,0.904872,0.760761
4,1,1,0.6,Uneducated,Married,$60K - $80K,Blue,0.186047,0.8,0.166667,0.0,0.373667,0.0,0.923497,0.130911,0.919017,0.0


### OneHotEncoding `marital_status`

In [19]:
# Might need to find other ways to encode because not very correct way
onehot_enc = OneHotEncoder()
reshape = np.array(data_scaled["marital_status"]).reshape(-1, 1)
values = onehot_enc.fit_transform(reshape).toarray().astype(int)
labels = np.array(["divorced", "married", "single", "unknown_marital_status"]).ravel()
marital_status_df = pd.DataFrame(values, columns=labels)

data_scaled = data_scaled.join(marital_status_df["divorced"])
data_scaled = data_scaled.join(marital_status_df["married"])
data_scaled = data_scaled.join(marital_status_df["single"])
data_scaled = data_scaled.join(marital_status_df["unknown_marital_status"])

data_scaled.drop("marital_status", axis=1, inplace=True)

data_scaled.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status
0,1,1,0.6,High School,$60K - $80K,Blue,0.604651,0.8,0.166667,0.5,0.685166,0.308701,0.839748,0.225019,0.83088,0.061061,0,1,0,0
1,1,0,1.0,Graduate,Less than $40K,Blue,0.72093,1.0,0.166667,0.333333,0.549874,0.343266,0.86437,0.25869,1.0,0.105105,0,0,1,0
2,1,1,0.6,Graduate,$80K - $120K,Blue,0.534884,0.6,0.166667,0.0,0.272374,0.0,0.953725,0.364413,0.904872,0.0,0,1,0,0
3,1,0,0.8,High School,Less than $40K,Blue,0.488372,0.4,0.666667,0.166667,0.262556,1.0,0.848516,0.231517,0.904872,0.760761,0,0,0,1
4,1,1,0.6,Uneducated,$60K - $80K,Blue,0.186047,0.8,0.166667,0.0,0.373667,0.0,0.923497,0.130911,0.919017,0.0,0,1,0,0


### Ordinal Encoding: `education_level`, `income_category`, `card_category`

In [20]:
edu_level_mapper = {"Doctorate": 7, "Post-Graduate": 6, "Graduate": 5, "College": 4, "High School": 3, "Uneducated": 2, "Unknown": 1}
data_scaled["education_level"] = data_scaled["education_level"].replace(edu_level_mapper)

income_cat_mapper = {"$120K +": 6, "$80K - $120K": 5, "$60K - $80K": 4, "$40K - $60K": 3, "Less than $40K": 2, "Unknown": 1}
data_scaled["income_category"] = data_scaled["income_category"].replace(income_cat_mapper)

card_cat_mapper = {"Platinum": 4, "Gold": 3, "Silver": 2, "Blue": 1}
data_scaled["card_category"] = data_scaled["card_category"].replace(card_cat_mapper)

data_scaled.head()

Unnamed: 0,attrition_flag,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status
0,1,1,0.6,3,4,1,0.604651,0.8,0.166667,0.5,0.685166,0.308701,0.839748,0.225019,0.83088,0.061061,0,1,0,0
1,1,0,1.0,5,2,1,0.72093,1.0,0.166667,0.333333,0.549874,0.343266,0.86437,0.25869,1.0,0.105105,0,0,1,0
2,1,1,0.6,5,5,1,0.534884,0.6,0.166667,0.0,0.272374,0.0,0.953725,0.364413,0.904872,0.0,0,1,0,0
3,1,0,0.8,3,2,1,0.488372,0.4,0.666667,0.166667,0.262556,1.0,0.848516,0.231517,0.904872,0.760761,0,0,0,1
4,1,1,0.6,2,4,1,0.186047,0.8,0.166667,0.0,0.373667,0.0,0.923497,0.130911,0.919017,0.0,0,1,0,0


# Creating Train, Validation, Test sets

## Use stratify to ensure the percentage of each class remains the same throughout the split

In [21]:
y = data_scaled["attrition_flag"]
X = data_scaled.drop(columns="attrition_flag", axis=1)

# Split dataset into training and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2021, stratify=y)
print("Ratio for training + validation set\n", y_train.value_counts(normalize=True), "\n")
print("Ratio for test set\n", y_test.value_counts(normalize=True))

# # Split training dataset into training and validation subsets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2021, stratify=y_train)
print("Ratio for training set\n", y_train.value_counts(normalize=True), "\n")
print("Ratio for cal set\n", y_test.value_counts(normalize=True))

Ratio for training + validation set
 1    0.839279
0    0.160721
Name: attrition_flag, dtype: float64 

Ratio for test set
 1    0.839585
0    0.160415
Name: attrition_flag, dtype: float64
Ratio for training set
 1    0.839342
0    0.160658
Name: attrition_flag, dtype: float64 

Ratio for cal set
 1    0.839585
0    0.160415
Name: attrition_flag, dtype: float64


In [22]:
print(len(data))
print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))
print("Total:", len(X_train) + len(X_val) + len(X_test))

10127
Train size: 6075
Validation size: 2026
Test size: 2026
Total: 10127


## Perform oversampling on minority class ("Attrited") for training set

In [23]:
oversampler = SMOTE()
X_train, y_train = oversampler.fit_resample(X_train, y_train)
y_train.value_counts()

1    5099
0    5099
Name: attrition_flag, dtype: int64

In [24]:
y_train

0        1
1        1
2        1
3        1
4        1
        ..
10193    0
10194    0
10195    0
10196    0
10197    0
Name: attrition_flag, Length: 10198, dtype: uint8

In [25]:
print("\nTrain size after over sampling:", len(y_train))


Train size after over sampling: 10198


# Writing out Train, Validation, Test sets to csv files

In [26]:
train = pd.concat([X_train, y_train], axis=1)
train.head()

Unnamed: 0,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status,attrition_flag
0,0,0.8,1,3,1,0.418605,0.8,0.166667,0.333333,0.123262,0.549464,0.723746,0.571438,0.675667,0.650651,0,0,1,0,1
1,0,0.8,1,3,1,0.581395,0.6,0.333333,0.5,0.174586,0.0,0.751884,0.599208,0.700689,0.0,0,0,0,1,1
2,0,0.6,4,3,1,0.488372,0.4,0.166667,0.333333,0.257095,0.402463,0.7881,0.640195,0.696118,0.311311,0,1,0,0,1
3,0,0.4,5,2,1,0.534884,0.6,0.5,0.5,0.058833,0.392133,0.768042,0.620189,0.713148,0.56957,1,0,0,0,1
4,0,0.4,5,2,1,0.767442,0.8,0.333333,0.333333,0.404833,0.655542,0.730922,0.268858,0.604718,0.317317,0,1,0,0,1


In [27]:
val = pd.concat([X_val, y_val], axis=1)
val.head()

Unnamed: 0,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status,attrition_flag
4922,1,0.8,3,5,1,0.534884,1.0,0.166667,0.0,0.122966,0.524831,0.701375,0.492803,0.731547,0.621622,0,0,0,1,1
9826,0,0.6,4,2,1,0.465116,0.6,0.333333,0.333333,0.433408,0.860151,0.749726,0.941269,0.717796,0.38038,0,0,1,0,1
3447,1,0.4,2,3,1,0.744186,0.4,0.166667,0.5,0.466336,0.564164,0.680913,0.538623,0.562644,0.224224,1,0,0,0,1
5523,0,0.6,3,2,1,0.651163,0.4,0.5,0.333333,0.022925,0.0,0.761274,0.420989,0.487253,0.0,0,1,0,0,0
4223,0,0.6,4,1,1,0.534884,0.6,0.333333,0.666667,0.337953,0.634485,0.736876,0.596527,0.650828,0.379379,0,0,1,0,1


In [28]:
test = pd.concat([X_test, y_test], axis=1)
test.head()

Unnamed: 0,gender,dependent_count,education_level,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,total_amt_change_q4_q1,total_trans_amt,total_count_change_q4_q1,avg_utilization_ratio,divorced,married,single,unknown_marital_status,attrition_flag
5107,0,0.6,5,3,1,0.418605,1.0,0.5,0.666667,0.735639,0.0,0.624601,0.303989,0.527017,0.0,0,1,0,0,0
2519,1,0.2,5,5,1,0.325581,1.0,0.333333,0.833333,0.915317,0.542312,0.847168,0.418448,0.731547,0.052052,0,0,1,0,1
8231,0,0.2,1,2,1,0.674419,0.0,0.166667,0.333333,0.135159,0.611442,0.760867,0.632296,0.726157,0.696697,0,0,0,1,1
1955,0,0.8,1,2,2,0.44186,1.0,0.5,0.333333,0.709675,0.549861,0.821598,0.557476,0.645912,0.101101,0,0,1,0,1
214,1,0.8,3,3,1,0.534884,0.8,0.5,0.333333,0.69892,0.68375,0.773612,0.208984,0.790354,0.13013,0,1,0,0,1


In [29]:
train.to_csv("../Data/train.csv")
val.to_csv("../Data/validation.csv")
test.to_csv("../Data/test.csv")