# CS421: Introduction to Machine Learning
## Project: Predicting Credit Card Customer Churn
### Model: Random Forest
---

# 1. Setting up the notebook

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Use seaborn's "darkgrid" theme for the plots drawn in this notebook.
sns.set_theme(style = "darkgrid")

### Import dataset

In [2]:
# Read the raw churn dataset from the CSV file that sits next to the notebook,
# then preview its first five rows.
data = pd.read_csv(filepath_or_buffer="BankChurners.csv")
data.head(n=5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [3]:
# Drop CLIENTNUM and the two Naive_Bayes_Classifier_* columns from the frame
# in place, then show the first remaining row as a sanity check.
data.drop(
    columns=[
        "CLIENTNUM",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    ],
    inplace=True,
)
data.head(1)

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061


In [4]:
# Map every raw CSV header to the snake_case name used in the rest of the
# notebook. Renaming by name (instead of assigning a positional list to
# `data.columns`) keeps each label attached to the right column even if the
# column order of the CSV differs from what is expected.
column_renames = {
    'Attrition_Flag': 'attrition_flag',
    'Customer_Age': 'customer_age',
    'Gender': 'gender',
    'Dependent_count': 'dependent_count',
    'Education_Level': 'education_level',
    'Marital_Status': 'marital_status',
    'Income_Category': 'income_category',
    'Card_Category': 'card_category',
    'Months_on_book': 'months_on_book',
    'Total_Relationship_Count': 'total_relationship_count',
    'Months_Inactive_12_mon': 'months_inactive_12_month',
    'Contacts_Count_12_mon': 'contacts_count_12_month',
    'Credit_Limit': 'credit_limit',
    'Total_Revolving_Bal': 'total_revolving_bal',
    'Avg_Open_To_Buy': 'avg_open_to_buy',
    'Total_Amt_Chng_Q4_Q1': 'total_amt_change_q4_q1',
    'Total_Trans_Amt': 'total_trans_amt',
    'Total_Trans_Ct': 'total_trans_count',
    'Total_Ct_Chng_Q4_Q1': 'total_count_change_q4_q1',
    'Avg_Utilization_Ratio': 'avg_utilization_ratio',
}
# `rename` ignores keys that are not present, so re-running this cell on an
# already-renamed frame is a no-op.
data = data.rename(columns=column_renames)

# Numeric feature columns.
numerical = ['customer_age', 'dependent_count', 'months_on_book',
             'total_relationship_count', 'months_inactive_12_month',
             'contacts_count_12_month', 'credit_limit', 'total_revolving_bal',
             'avg_open_to_buy', 'total_amt_change_q4_q1', 'total_trans_amt',
             'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']

# String-valued columns; note that this list also contains 'attrition_flag'.
categorical = ['attrition_flag', 'gender','education_level',
                    'marital_status', 'income_category', 'card_category']

# Fail early (instead of mislabelling silently) if any expected column is
# absent after the rename.
missing_columns = [col for col in numerical + categorical if col not in data.columns]
if missing_columns:
    raise KeyError("Columns missing after renaming: " + ", ".join(missing_columns))

data.head(1)

Unnamed: 0,attrition_flag,customer_age,gender,dependent_count,education_level,marital_status,income_category,card_category,months_on_book,total_relationship_count,months_inactive_12_month,contacts_count_12_month,credit_limit,total_revolving_bal,avg_open_to_buy,total_amt_change_q4_q1,total_trans_amt,total_trans_count,total_count_change_q4_q1,avg_utilization_ratio
0,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061


# 2. Feature Engineering

## 2.1 Handling Numerical Features

### Log Transformation 

In [5]:
# Collect the names of the numerical columns whose skewness is outside [-1, 1].
# The skewness Series is computed once and walked as (column, value) pairs, so
# the column name comes straight from the index rather than from a reverse
# lookup on the float value (which would pick the wrong column whenever two
# columns happen to share the same skewness).
skewness = data[numerical].skew()

skewed = []
for column_name, skew_val in skewness.items():
    if abs(skew_val) > 1:
        skewed.append(column_name)
        print(column_name)
        print(round(skew_val, 3))

credit_limit
1.667
avg_open_to_buy
1.662
total_amt_change_q4_q1
1.732
total_trans_amt
2.041
total_count_change_q4_q1
2.064


In [6]:
# Replace three of the skewed columns with their natural logarithm.
# NOTE(review): the two *_change_q4_q1 columns reported as skewed above are not
# transformed here — presumably deliberate; confirm.
# NOTE: this overwrites the columns in place, so running the cell a second time
# would apply the logarithm twice.
log_columns = ['credit_limit', 'avg_open_to_buy', 'total_trans_amt']
data[log_columns] = np.log(data[log_columns])

## 2.2 Handling Categorical Features

### Determining whether each categorical column is ordinal or nominal

In [11]:
# Print the frequency table of every categorical column, each followed by a
# blank line.
for col_name in categorical:
    frequencies = data[col_name].value_counts()
    print(frequencies, end="\n\n")

Existing Customer    8500
Attrited Customer    1627
Name: attrition_flag, dtype: int64

F    5358
M    4769
Name: gender, dtype: int64

Graduate         3128
High School      2013
Unknown          1519
Uneducated       1487
College          1013
Post-Graduate     516
Doctorate         451
Name: education_level, dtype: int64

Married     4687
Single      3943
Unknown      749
Divorced     748
Name: marital_status, dtype: int64

Less than $40K    3561
$40K - $60K       1790
$80K - $120K      1535
$60K - $80K       1402
Unknown           1112
$120K +            727
Name: income_category, dtype: int64

Blue        9436
Silver       555
Gold         116
Platinum      20
Name: card_category, dtype: int64



### Checking cardinality of each category

In [13]:
# Report how many distinct values each categorical column holds, one
# "<column>: <count>" line per column.
cardinalities = data[categorical].nunique()
for col_name, n_unique in cardinalities.items():
    print(f"{col_name}: {n_unique}")

attrition_flag: 2
gender: 2
education_level: 7
marital_status: 4
income_category: 6
card_category: 4


**Conclusions:**<br>
- attrition_flag -> nominal, binary
- gender -> nominal, binary
- education_level -> ordinal
- marital_status -> nominal
- income_category -> ordinal
- card_category -> ordinal