___

<center><h1>Bank Churn Analysis</h1></center>

___

<center><h2>I Cloud 7</h2></center><br>
<center> Data Scientist Jr. Assessment </center>

___
<p></p>
<center style="color: #AA6373; font-weight: 400;"><strong>Presented by:</strong></center>
<center style="color: #AA6373; font-weight: 400;">Jorge Forero L.</center>

<center>September 2024</center>
<p></p>

In [1]:
# Common Modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Data Preparation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelling
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Testing and Evaluation
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, roc_curve, accuracy_score, make_scorer
from sklearn.model_selection import GridSearchCV
import unittest

In [2]:
# Relative path to the raw bank churn dataset
file_path = '../src/Bank+Customer+Churn/Bank_Churn.csv'

# Load the CSV file into a DataFrame.
# Only the read itself sits inside the `try`, and every handler re-raises after
# printing its message: swallowing the error would leave `df_bank_churn`
# undefined and surface later as an unrelated NameError in the cells that use it.
try:
    # on_bad_lines='warn' still skips malformed rows, but reports each one
    # instead of silently dropping data.
    df_bank_churn = pd.read_csv(file_path, encoding='ISO-8859-1', on_bad_lines='warn')
except FileNotFoundError:
    print("Error: File not found at the specified path.")
    raise
except pd.errors.ParserError:
    print("Error: Could not parse the CSV file. Please check the file format.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    # Reached only when the read succeeded: preview the data and its columns
    print("Data loaded successfully. Displaying the first 5 rows:")
    print(df_bank_churn.head())

    # Print the column names
    print("\nColumn names:")
    print(df_bank_churn.columns)

Data loaded successfully. Displaying the first 5 rows:
   CustomerId   Surname  CreditScore Geography  Gender  Age  Tenure  \
0    15634602  Hargrave          619    France  Female   42       2   
1    15647311      Hill          608     Spain  Female   41       1   
2    15619304      Onio          502    France  Female   42       8   
3    15701354      Boni          699    France  Female   39       1   
4    15737888  Mitchell          850     Spain  Female   43       2   

     Balance  NumOfProducts  HasCrCard  IsActiveMember  EstimatedSalary  \
0       0.00              1          1               1        101348.88   
1   83807.86              1          0               1        112542.58   
2  159660.80              3          1               0        113931.57   
3       0.00              2          0               0         93826.63   
4  125510.82              1          1               1         79084.10   

   Exited  
0       1  
1       0  
2       1  
3       0  
4       0  

## Exploratory Data Analysis

#### Volume of Data

In [3]:
# Unpack the frame's shape into its row (record) and column (feature) counts
num_records, num_features = df_bank_churn.shape

# Report the dimensions of the dataset
print(f"The dataset has {num_records} records and {num_features} features.")

The dataset has 10000 records and 13 features.


In [4]:
# Quick structural overview: dimensions, column dtypes and null counts,
# each printed after its label
for label, value in (
    ("Shape of the dataset:", df_bank_churn.shape),
    ("Data types:", df_bank_churn.dtypes),
    ("Missing values in each column:", df_bank_churn.isnull().sum()),
):
    print(label, value)

# Summary statistics of the numeric columns, shown as the cell's output
df_bank_churn.describe()

Shape of the dataset: (10000, 13)
Data types: CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object
Missing values in each column: CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


Unnamed: 0,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


#### Data Cleaning

##### Fixing Data Types

In [5]:
# These columns hold 0/1 flags, so store them as booleans
bool_columns = ['HasCrCard', 'IsActiveMember', 'Exited']

# Cast each flag column in place on the loaded frame
for column in bool_columns:
    df_bank_churn[column] = df_bank_churn[column].astype(bool)

# Confirm the resulting dtypes
print(df_bank_churn[bool_columns].dtypes)

HasCrCard         bool
IsActiveMember    bool
Exited            bool
dtype: object


In [6]:
# Columns removed from the working copy of the data
columns_to_drop = ['Surname', 'Gender', 'CustomerId']

# `drop` returns a new frame; `df_bank_churn` itself is left untouched
df_cleaned = df_bank_churn.drop(columns_to_drop, axis='columns')

# Preview the first five rows of the reduced frame
df_cleaned.head(5)

Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,42,2,0.0,1,True,True,101348.88,True
1,608,Spain,41,1,83807.86,1,False,True,112542.58,False
2,502,France,42,8,159660.8,3,True,False,113931.57,True
3,699,France,39,1,0.0,2,False,False,93826.63,False
4,850,Spain,43,2,125510.82,1,True,True,79084.1,False


#### Categorical Feature Summary

Here we assess the distribution of values across the categorical features: `Geography`, and `NumOfProducts`, which is numeric but takes only four distinct values.

In [7]:
# Show how many records fall under each distinct value of the
# categorical-like features
for col_name in ('Geography', 'NumOfProducts'):
    print(f"\nDistinct counts for {col_name}:")
    value_frequencies = df_cleaned[col_name].value_counts()
    print(value_frequencies)



Distinct counts for Geography:
Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

Distinct counts for NumOfProducts:
NumOfProducts
1    5084
2    4590
3     266
4      60
Name: count, dtype: int64
