In [None]:
# To help with reading and manipulating data
import pandas as pd
import numpy as np

# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# To be used for missing value imputation
from sklearn.impute import SimpleImputer

# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
    BaggingClassifier,
)
from xgboost import XGBClassifier

# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,
    plot_confusion_matrix,
)

# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# To define maximum number of columns to be displayed in a dataframe (None = no limit)
pd.set_option("display.max_columns", None)

# To suppress scientific notation: floats in a dataframe are shown with 3 decimal places
pd.set_option("display.float_format", lambda x: "%.3f" % x)

# To suppress warnings (every warning category is ignored for the rest of the session)
import warnings

warnings.filterwarnings("ignore")

# This will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black

Description
Background & Context

Thera Bank recently saw a steep decline in the number of users of its credit cards. Credit cards are a good source of income for banks because of the different kinds of fees they charge, such as annual fees, balance transfer fees, cash advance fees, late payment fees, foreign transaction fees, and others. Some fees are charged to every user irrespective of usage, while others are charged under specified circumstances.

Customers leaving the credit card service would cause the bank a loss, so the bank wants to analyze its customer data, identify the customers who are likely to leave the credit card service, and understand their reasons – so that the bank can improve in those areas.

You as a Data scientist at Thera bank need to come up with a classification model that will help the bank improve its services so that customers do not renounce their credit cards

You need to identify the best possible model that will give the required performance

Objective

Explore and visualize the dataset.
Build a classification model to predict if the customer is going to churn or not
Optimize the model using appropriate techniques
Generate a set of insights and recommendations that will help the bank
Data Dictionary:

CLIENTNUM: Client number. Unique identifier for the customer holding the account
Attrition_Flag: Internal event (customer activity) variable - if the account is closed then "Attrited Customer" else "Existing Customer"
Customer_Age: Age in Years
Gender: Gender of the account holder
Dependent_count: Number of dependents
Education_Level:  Educational Qualification of the account holder - Graduate, High School, Unknown, Uneducated, College(refers to a college student), Post-Graduate, Doctorate.
Marital_Status: Marital Status of the account holder
Income_Category: Annual Income Category of the account holder
Card_Category: Type of Card
Months_on_book: Period of relationship with the bank
Total_Relationship_Count: Total no. of products held by the customer
Months_Inactive_12_mon: No. of months inactive in the last 12 months
Contacts_Count_12_mon: No. of Contacts between the customer and bank in the last 12 months
Credit_Limit: Credit Limit on the Credit Card
Total_Revolving_Bal: The balance that carries over from one month to the next is the revolving balance
Avg_Open_To_Buy: Open to Buy refers to the amount left on the credit card to use (Average of last 12 months)
Total_Trans_Amt: Total Transaction Amount (Last 12 months)
Total_Trans_Ct: Total Transaction Count (Last 12 months)
Total_Ct_Chng_Q4_Q1: Ratio of the total transaction count in 4th quarter and the total transaction count in 1st quarter
Total_Amt_Chng_Q4_Q1: Ratio of the total transaction amount in 4th quarter and the total transaction amount in 1st quarter
Avg_Utilization_Ratio: Represents how much of the available credit the customer spent


In [None]:
# Load the dataset; the relative path means the CSV must sit in the notebook's working directory
df = pd.read_csv('BankChurners.csv')

In [None]:
# (rows, columns) of the data as loaded
df.shape

In [None]:
# Preview the first rows of the data
df.head()

* Dependent variable = Attrition_Flag
* The Income_Category attribute contains the value 'abc', which does not fit the other categories; we will look at it further down.
* CLIENTNUM can be dropped: it is not relevant for building this model, although it will be needed when the model is used later.
* Generate One Hots
    - Attrition_Flag
    - Gender
    - Education_Level
    - Marital_Status
    - Income_Category
    - Card_Category

In [None]:
# CLIENTNUM is only an identifier, so it is removed from the modelling data.
# errors='ignore' makes the cell safe to re-run once the column is already gone
# (without it a second run raises KeyError).
df = df.drop(columns='CLIENTNUM', errors='ignore')

# Categorical columns earmarked for one-hot encoding
one_hots = [
    'Attrition_Flag',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Summary statistics, transposed so that each column becomes a row
df.describe().T

In [None]:
# Build a table of missing values per column: percentage ('%') and absolute count ('count'),
# ordered from the highest to the lowest percentage.
null_counts = df.isnull().sum()
missing = (
    (null_counts / df.isnull().count() * 100)
    .sort_values(ascending=False)
    .to_frame(name='%')
)
missing['count'] = null_counts  # aligned on the column-name index

In [None]:
# Display the first five rows of the missing-value table
print('Missing Values in the data frame')
missing.head()

There are a large number of missing values in:
     - Education_Level
     - Marital_Status

We will need to look into these attributes, in addition to Income_Category, to see whether we can impute the data or need to drop those rows.

In [None]:
# Number of distinct values per column, largest first
print('The count of unique values in the data frame')
df.nunique().sort_values(ascending=False)

In [None]:
# Print the value counts of every column except those listed below, which are
# left out because they take a large number of distinct values.
# NOTE(review): Total_Trans_Amt is not excluded although Total_Trans_Ct is — confirm this is intended.
vc = df.columns.drop(
    [
        'Avg_Open_To_Buy',
        'Credit_Limit',
        'Total_Revolving_Bal',
        'Total_Amt_Chng_Q4_Q1',
        'Avg_Utilization_Ratio',
        'Total_Ct_Chng_Q4_Q1',
        'Total_Trans_Ct',
        'Customer_Age',
        'Months_on_book',
    ]
)
for col in vc:
    col_counts = df[col].value_counts().sort_values(ascending=False)
    print(col, 'Has the following Unique Values')
    print(col_counts)
    print('-' * 50)  # visual separator between columns

### Data Initial observations
* CLIENTNUM is not needed for this model; we can use the index and drop this column
* Attrition_Flag is our dependent variable
<br/><br/>

* The following columns have missing values or other issues we will need to fix
   - Education_Level - has a number of missing values, will drop those as there is no hard data to fill in
   - Marital_Status - has a number of missing values, will drop those as there is no hard data to fill in
   - Income_Category - has a category 'abc', those rows might need to be dropped if no correlations can be found to impute the data. 
   <br/><br/>

* The following categorical columns can be dummies
   - Attrition_Flag
   - Gender
   - Education_Level
   - Marital_Status
   - Income_Category
   - Card_Category
   <br/><br/>

* Customer Observation:
   -  Customers range in age from mid 20s to early 70s. The average customer age is mid 40s
   -  Customers have an average of 2.3 dependants
   -  On average, customers have been with the bank for almost exactly 3 years, with the longest relationship being 56 months. This suggests a newer bank.
   -  The minimum time on book for a customer is 13 months, which suggests the bank has not signed any new customers in over a year.
   - The typical customer has 3-4 products from the bank
   - Most inactive customers have been so for just over 2 months, it would appear inactive accounts are closed after 12 months
   - The bank attempts to contact a customer 2-3 times a year
   - The average credit limit is about 8,600, with the maximum being 34,500
   - The typical customer carries over a roughly 1200 balance month to month  
   - Most customers use less than 25% of their available credit    
   <br/><br/>

* Attributes with Possible Outliers:
   - Credit_Limit - has a high end roughly 3 times that of the 75% quartile. While this is most likely an outlier I do not know it is bad for the model. Will dig more into this.
   - Avg_Utilization_Ratio - has a max almost double the 75%. That being said this might skew the model but is data that is important to identifying customer trends.
   - Customer_Age - Has outliers but considering the nature of the attribute will not treat
   - Months_on_book - Has outliers but considering the nature of the attribute will not treat
   - Credit_Limit - Has outliers but considering the nature of the attribute will not treat
   - Total_Trans_Amt - Has outliers but considering the nature of the attribute will not treat
   <br/><br/>

* General Observations:
   - The income category has a value of ‘abc’ that will need treatment
   <br/><br/>


### Fixing Education_Level, Marital_Status & Income_Categories  

- Dropping missing values  
- Evaluating if there is a way to impute Income_Categories from the data available

In [None]:
# Dropping rows with missing values.
# Re-assigning instead of using inplace=True keeps the step explicit and avoids
# mutating the frame object that earlier cells displayed.
df = df.dropna()

# Confirming all missing values have been treated (null count per column, columns in alphabetical order)
df.isnull().sum().sort_index()

In [None]:
# Let's look at Income Category: frequency of each level, least common first
df['Income_Category'].value_counts().sort_values(ascending=True)

In [None]:
# Collect the rows whose Income_Category is the placeholder value 'abc' so they can be inspected
abc = df[df['Income_Category'].eq('abc')]
abc.head()

No useful correlation was found that would let us impute a usable value in place of 'abc', so we will drop all rows with that value.

In [None]:
# Keep only the rows whose Income_Category is something other than 'abc'
df = df.loc[df['Income_Category'].ne('abc')]
df.head()

In [None]:
# (rows, columns) of the data frame at this point in the notebook
df.shape

Between missing value treatments and the bad data for income we have cut out about 30% of our data set. 

## EDA  
-------------------------------
#### Reusable Functions
- Reusing functions provided in the class  
- Functions for both Univariate and Bivariate Analysis

In [None]:
# reusing provided function for generating graphs

def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined, stacked vertically on a shared x-axis

    The boxplot (top, 25% of the height) shows the mean marker; the histogram
    (bottom, 75% of the height) gets a dashed green line at the mean and a
    solid black line at the median.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, i.e. seaborn's default binning)
    """
    _, (ax_box, ax_hist) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # both plots use the same x-axis
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )

    sns.boxplot(data=data, x=feature, ax=ax_box, showmeans=True, color="violet")

    # Forward `bins` only when the caller supplied one; otherwise seaborn's
    # default binning applies. A single call replaces the duplicated
    # conditional expression, and the `palette` argument is dropped because
    # no `hue` variable is used here.
    hist_kwargs = {"bins": bins} if bins else {}
    sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist, **hist_kwargs)

    # Reference lines on the histogram: mean (dashed green) and median (solid black)
    ax_hist.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist.axvline(data[feature].median(), color="black", linestyle="-")

In [None]:
# reusing provided function for generating graphs
# function to create labeled barplots


def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage of each level written above its bar

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)

    Percentages are computed against the full length of the column, even when
    only the top n levels are drawn.
    """
    total = len(data[feature])  # length of the column

    # Figure width scales with the number of bars: n when given, else every distinct level
    n_bars = data[feature].nunique() if n is None else n
    plt.figure(figsize=(n_bars + 1, 5))

    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # most frequent n levels, then shown in sorted label order
        order=data[feature].value_counts().index[:n].sort_values(),
    )

    for bar in ax.patches:
        height = bar.get_height()
        if perc:
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * height / total)
        else:
            # count of each level of the category
            label = height

        # annotate just above the horizontal centre of the bar
        ax.annotate(
            label,
            (bar.get_x() + bar.get_width() / 2, height),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )

    plt.show()  # show the plot

In [None]:
# reusing provided function for generating graphs

def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart

    data: dataframe
    predictor: independent variable
    target: target variable

    Both the printed table and the chart are ordered, descending, by the
    column of the least frequent target class.
    """
    n_levels = data[predictor].nunique()
    # value_counts() is ordered most-to-least frequent, so the last label is the rarest class
    rarest_class = data[target].value_counts().index[-1]

    # Absolute counts, including the 'All' margins, printed as text
    counts = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=rarest_class, ascending=False
    )
    print(counts)
    print("-" * 120)

    # Row-normalised shares, so every bar sums to 1
    shares = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=rarest_class, ascending=False
    )
    shares.plot(kind="bar", stacked=True, figsize=(n_levels + 5, 5))

    # Legend placed outside the plot area, to the right. The earlier
    # plt.legend(loc="lower left", frameon=False) call was removed: this call
    # replaced that legend immediately, so it never had any visible effect.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()

In [None]:
# creating histograms
df.hist(figsize=(14, 14))
plt.show()

In [None]:
# Histogram + boxplot for each numeric feature of interest.
# One loop replaces eight copy-pasted single-call cells; the plots appear in the same order.
numeric_features = [
    'Avg_Utilization_Ratio',
    'Customer_Age',
    'Dependent_count',
    'Months_on_book',
    'Total_Relationship_Count',
    'Credit_Limit',
    'Total_Trans_Amt',
    'Total_Trans_Ct',
]
for numeric_feature in numeric_features:
    histogram_boxplot(df, numeric_feature)
    plt.show()  # render this figure before the next one is created

In [None]:
# Features to draw as labeled bar plots.
# Full candidate list, kept for reference:
# plot = ['Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Total_Relationship_Count', 'Total_Trans_Ct']
# The column is spelled 'Dependent_count' (lower-case "c") in the data;
# 'Dependent_Count' does not exist and raises a KeyError on lookup.
plot = ['Dependent_count']

In [None]:
# Draw a percentage-labeled bar plot for every feature named in `plot`.
# The first argument must be the data frame; passing the list `plot` itself
# (as before) fails as soon as labeled_barplot indexes it by column name.
for feature in plot:
    labeled_barplot(df, feature, perc=True)