# Importing Libraries

Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown

Auxiliary Functions and Configurations

In [None]:
pd.options.display.float_format = '{:.2f}'.format
plt.style.use('ggplot')

# Data Understanding

## Initial Data Collection

In [2]:
df = pd.read_csv('bank_customer_churn_prediction.csv')
df

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,15606229,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,15569892,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,15584532,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,15682355,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


## Data Description

The dataset collected from Kaggle (https://www.kaggle.com/datasets/gauravtopre/bank-customer-churn-dataset?resource=download).

- customer_id, <font color='red'>unused variable</font>.
- credit_score, <font color='chartreuse'>used as input</font>.
- country, <font color='chartreuse'>used as input</font>.
- gender, <font color='chartreuse'>used as input</font>.
- age, <font color='chartreuse'>used as input</font>.
- tenure, <font color='chartreuse'>used as input</font>.
- balance, <font color='chartreuse'>used as input</font>.
- products_number, <font color='chartreuse'>used as input</font>.
- credit_card, <font color='chartreuse'>used as input</font>.
- active_member, <font color='chartreuse'>used as input</font>.
- estimated_salary, <font color='chartreuse'>used as input</font>.
- churn, <font color='orange'>used as the target</font>. 1 if the client has left the bank during some period or 0 if he/she has not.

Data Information

In [4]:
df_info = pd.DataFrame({'Not Null': df.notnull().count(),
              'Null': df.isnull().sum(),
              'Perce Null': df.isnull().sum() / len(df),
              'Dtype': df.dtypes,
             })

df_dtype = pd.DataFrame(df_info['Dtype'].value_counts())
df_dtype['Perce'] = round(df_dtype['Dtype'] / df_dtype['Dtype'].sum(), 2)

text = f'Dataset has {df.shape[0]} rows and {df.shape[1]} columns. From these, we have:'

df_info = df_info.style.background_gradient(cmap='jet', subset=['Perce Null']).format({'Perce Null': '{:.2%}'})
df_dtype = df_dtype.style.background_gradient(cmap='YlGn', subset=['Perce']).format({'Perce': '{:.2%}'})

display(Markdown("<H3 style='text-align:left;float:lfet;'>Information about the Dataset"))
display(Markdown(f'<H5> {text}'))
display(df_info)
display(Markdown("<H3 style='text-align:left;float:lfet;'>About Dtypes we have:"))
display(df_dtype)

<H3 style='text-align:left;float:lfet;'>Information about the Dataset

<H5> Dataset has 10000 rows and 12 columns. From these, we have:

Unnamed: 0,Not Null,Null,Perce Null,Dtype
customer_id,10000,0,0.00%,int64
credit_score,10000,0,0.00%,int64
country,10000,0,0.00%,object
gender,10000,0,0.00%,object
age,10000,0,0.00%,int64
tenure,10000,0,0.00%,int64
balance,10000,0,0.00%,float64
products_number,10000,0,0.00%,int64
credit_card,10000,0,0.00%,int64
active_member,10000,0,0.00%,int64


<H3 style='text-align:left;float:lfet;'>About Dtypes we have:

Unnamed: 0,Dtype,Perce
int64,8,67.00%
object,2,17.00%
float64,2,17.00%


In [26]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
customer_id,10000.0,15690940.57,71936.19,15565701.0,15628528.25,15690738.0,15753233.75,15815690.0
credit_score,10000.0,650.53,96.65,350.0,584.0,652.0,718.0,850.0
age,10000.0,38.92,10.49,18.0,32.0,37.0,44.0,92.0
tenure,10000.0,5.01,2.89,0.0,3.0,5.0,7.0,10.0
balance,10000.0,76485.89,62397.41,0.0,0.0,97198.54,127644.24,250898.09
products_number,10000.0,1.53,0.58,1.0,1.0,1.0,2.0,4.0
credit_card,10000.0,0.71,0.46,0.0,0.0,1.0,1.0,1.0
active_member,10000.0,0.52,0.5,0.0,0.0,1.0,1.0,1.0
estimated_salary,10000.0,100090.24,57510.49,11.58,51002.11,100193.91,149388.25,199992.48
churn,10000.0,0.2,0.4,0.0,0.0,0.0,0.0,1.0


## Data Exploration

In [None]:
plt.figure(figsize=(15, 10))
for i, col in enumerate(df.columns, 1):
    plt.subplot(4,3,i)
    plt.title(f"Distribution of {col} Data")
    sns.histplot(df[col], kde=True)
    plt.tight_layout()
    plt.plot()

In [None]:
fig = px.histogram(df, x="age", y="balance", color="churn",
                   marginal="box", # or violin, rug
                   hover_data=df.columns)

fig.show()