In [None]:
#%%

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix

import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [None]:
#%%

df = pd.read_csv('./data/BankChurners.csv')

# Keep the target labels aside before the column is removed from the feature frame.
y = df['Attrition_Flag']

# Every column whose name starts with 'Naive_Bayes_Classifier' is dropped
# together with the target column itself.
cols_to_drop = list(
    filter(lambda name: name.startswith('Naive_Bayes_Classifier'), df.columns)
)
cols_to_drop.append('Attrition_Flag')
df = df.drop(columns=cols_to_drop)

In [None]:
# %%

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,CLIENTNUM,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,739177600.0,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,36903780.0,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,708082100.0,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,713036800.0,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,717926400.0,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,773143500.0,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,828343100.0,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [None]:
# %%

# Single boolean: True if at least one cell anywhere in the frame is missing.
df.isna().any().any()

False

In [None]:
# %%

# Print the index range plus, per column, its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10127 non-null  int64  
 1   Customer_Age              10127 non-null  int64  
 2   Gender                    10127 non-null  object 
 3   Dependent_count           10127 non-null  int64  
 4   Education_Level           10127 non-null  object 
 5   Marital_Status            10127 non-null  object 
 6   Income_Category           10127 non-null  object 
 7   Card_Category             10127 non-null  object 
 8   Months_on_book            10127 non-null  int64  
 9   Total_Relationship_Count  10127 non-null  int64  
 10  Months_Inactive_12_mon    10127 non-null  int64  
 11  Contacts_Count_12_mon     10127 non-null  int64  
 12  Credit_Limit              10127 non-null  float64
 13  Total_Revolving_Bal       10127 non-null  int64  
 14  Avg_Op

In [None]:
#%%

# 1. Exploratory Data Analysis (EDA)
def analyze_feature_types(df, max_unique=10):
    """Group the columns of ``df`` by dtype and flag low-cardinality numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.
    max_unique : int, default 10
        A numeric column with at most this many distinct values is additionally
        reported as potentially categorical.

    Returns
    -------
    dict
        ``{'numerical_features': {'int': [...], 'float': [...]},
        'categorical_features': {'object': [...], 'potential_categorical': [...]}}``.
        The ``int``, ``float`` and ``object`` lists hold column names;
        ``potential_categorical`` holds strings of the form
        ``"<column> (<n> unique values)"``. Columns whose dtype is neither
        integer, float nor object are not listed anywhere.
    """
    numerical = {'int': [], 'float': []}
    categorical = {
        'object': [],
        'potential_categorical': [],  # numerical columns with few unique values
    }

    for column in df.columns:
        dtype = df[column].dtype

        # The same dtype predicates drive both the int/float grouping and the
        # "potentially categorical" check, so every integer or float width
        # (int32, float32, ...) is treated consistently.
        if pd.api.types.is_integer_dtype(dtype):
            numeric_kind = 'int'
        elif pd.api.types.is_float_dtype(dtype):
            numeric_kind = 'float'
        else:
            numeric_kind = None

        if numeric_kind is not None:
            numerical[numeric_kind].append(column)
            n_unique = df[column].nunique()
            if n_unique <= max_unique:
                categorical['potential_categorical'].append(
                    f"{column} ({n_unique} unique values)")
        elif pd.api.types.is_object_dtype(dtype):
            categorical['object'].append(column)

    return {
        'numerical_features': numerical,
        'categorical_features': categorical,
    }

feature_types = analyze_feature_types(df)

# Short aliases for the two top-level groups of the report.
numeric_report = feature_types['numerical_features']
categorical_report = feature_types['categorical_features']

print("\nDetailed Feature Analysis")

print("\nNumerical Features:")
print("Integer columns:", numeric_report['int'])
print("Float columns:", numeric_report['float'])

print("\nCategorical Features:")
print("Object columns:", categorical_report['object'])

print("\nPotentially Categorical (numerical with ≤10 unique values):")
print(categorical_report['potential_categorical'])


Detailed Feature Analysis

Numerical Features:
Integer columns: ['CLIENTNUM', 'Customer_Age', 'Dependent_count', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Total_Revolving_Bal', 'Total_Trans_Amt', 'Total_Trans_Ct']
Float columns: ['Credit_Limit', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']

Categorical Features:
Object columns: ['Gender', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category']

Potentially Categorical (numerical with ≤10 unique values):
['Dependent_count (6 unique values)', 'Total_Relationship_Count (6 unique values)', 'Months_Inactive_12_mon (7 unique values)', 'Contacts_Count_12_mon (7 unique values)']


In [None]:
# %%

# Checking data distribution
# Customer age: box plot stacked above a histogram
fig = make_subplots(rows=2, cols=1)

tr1 = go.Box(
    x=df['Customer_Age'],
    name='Age Box Plot',
    boxmean=True,
)
tr2 = go.Histogram(
    x=df['Customer_Age'],
    name='Age Histogram',
)

# Box plot goes in the top row, histogram in the bottom row.
for row_no, trace in enumerate((tr1, tr2), start=1):
    fig.add_trace(trace, row=row_no, col=1)

fig.update_layout(
    title_text="Distribution of Customer Ages",
    height=700,
    width=1200,
)
fig.show()

In [None]:
#%%

# Dependent count (family size): box plot stacked above a histogram
fig = make_subplots(rows=2, cols=1)

tr1 = go.Box(
    x=df['Dependent_count'],
    name='Dependent count Box Plot',
    boxmean=True,
)
tr2 = go.Histogram(
    x=df['Dependent_count'],
    name='Dependent count Histogram',
)

# Box plot goes in the top row, histogram in the bottom row.
for row_no, trace in enumerate((tr1, tr2), start=1):
    fig.add_trace(trace, row=row_no, col=1)

fig.update_layout(
    title_text="Distribution of Dependent counts (close family size)",
    height=700,
    width=1200,
)
fig.show()

In [None]:
#%%

# Share of customers per education level, drawn as a donut chart (hole=0.33).
# The chart title previously read "Propotion"; the spelling is corrected here.
ex.pie(
    df,
    names='Education_Level',
    title='Proportion Of Education Levels',
    hole=0.33,
)

In [None]:
#%%

# Share of customers per marital status, drawn as a donut chart (hole=0.33).
# The chart title previously read "Propotion"; the spelling is corrected here.
ex.pie(
    df,
    names='Marital_Status',
    title='Proportion Of Different Marriage Statuses',
    hole=0.33,
)

In [None]:
#%% 

# Share of customers per income category, drawn as a donut chart (hole=0.33).
# The chart title previously read "Propotion"; the spelling is corrected here.
ex.pie(
    df,
    names='Income_Category',
    title='Proportion Of Different Income Levels',
    hole=0.33,
)

In [None]:
#%%

# Share of customers per card category, drawn as a donut chart (hole=0.33).
# The chart title previously read "Propotion"; the spelling is corrected here.
ex.pie(
    df,
    names='Card_Category',
    title='Proportion Of Different Card Categories',
    hole=0.33,
)

In [None]:
#%%

# How long each customer has been with the bank: box plot stacked above a histogram
fig = make_subplots(rows=2, cols=1)

tr1 = go.Box(
    x=df['Months_on_book'],
    name='Months on book Box Plot',
    boxmean=True,
)
tr2 = go.Histogram(
    x=df['Months_on_book'],
    name='Months on book Histogram',
)

# Box plot goes in the top row, histogram in the bottom row.
for row_no, trace in enumerate((tr1, tr2), start=1):
    fig.add_trace(trace, row=row_no, col=1)

fig.update_layout(
    title_text="Distribution of months the customer is part of the bank",
    height=700,
    width=1200,
)
fig.show()

In [None]:
#%%

# Number of products held per customer: box plot stacked above a histogram
fig = make_subplots(rows=2, cols=1)

tr1 = go.Box(
    x=df['Total_Relationship_Count'],
    name='Total no. of products Box Plot',
    boxmean=True,
)
tr2 = go.Histogram(
    x=df['Total_Relationship_Count'],
    name='Total no. of products Histogram',
)

# Box plot goes in the top row, histogram in the bottom row.
for row_no, trace in enumerate((tr1, tr2), start=1):
    fig.add_trace(trace, row=row_no, col=1)

fig.update_layout(
    title_text="Distribution of Total no. of products held by the customer",
    height=700,
    width=1200,
)
fig.show()

In [None]:
#%%

# Number of months inactive: box plot stacked above a histogram
fig = make_subplots(rows=2, cols=1)

tr1 = go.Box(
    x=df['Months_Inactive_12_mon'],
    name='number of months inactive Box Plot',
    boxmean=True,
)
tr2 = go.Histogram(
    x=df['Months_Inactive_12_mon'],
    name='number of months inactive Histogram',
)

# Box plot goes in the top row, histogram in the bottom row.
for row_no, trace in enumerate((tr1, tr2), start=1):
    fig.add_trace(trace, row=row_no, col=1)

fig.update_layout(
    title_text="Distribution of the number of months inactive in the last 12 months",
    height=700,
    width=1200,
)
fig.show()

In [None]:
#%% 

# Distribution of the credit limit, then of the total transaction amount.
# Each figure is a box plot stacked above a histogram; the two figures differ
# only in the column plotted and in the layout title.
for column_name, figure_title in (
    ('Credit_Limit', "Distribution of the Credit Limit"),
    ('Total_Trans_Amt', "Distribution of the Total Transaction Amount (Last 12 months)"),
):
    fig = make_subplots(rows=2, cols=1)

    tr1 = go.Box(x=df[column_name], name=f'{column_name} Box Plot', boxmean=True)
    tr2 = go.Histogram(x=df[column_name], name=f'{column_name} Histogram')

    fig.add_trace(tr1, row=1, col=1)
    fig.add_trace(tr2, row=2, col=1)

    fig.update_layout(height=700, width=1200, title_text=figure_title)
    fig.show()

In [None]:
# %%

# Target balance: share of each Attrition_Flag value, drawn as a donut chart
ex.pie(
    data_frame=y,
    names='Attrition_Flag',
    title='Proportion of churn vs not churn customers',
    hole=0.33,
)