In [1]:
import pandas as pd

In [2]:
import numpy as py

In [3]:
import matplotlib.pyplot as plt

In [5]:
%pip install seaborn

In [6]:
import seaborn as sns

In [8]:
#import the dataset
df = pd.read_csv('Bank_churn.csv')

In [9]:
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,Tenure.1,IsActiveMember,Exited
0,15634602,Hargrave,619,FRA,Female,42.0,2,101348.88,0.0,1,Yes,2,Yes,1
1,15647311,Hill,608,Spain,Female,41.0,1,112542.58,83807.86,1,Yes,1,Yes,0
2,15619304,Onio,502,French,Female,42.0,8,113931.57,159660.8,3,No,8,No,1
3,15701354,Boni,699,FRA,Female,39.0,1,93826.63,0.0,2,No,1,No,0
4,15737888,Mitchell,850,Spain,Female,43.0,2,79084.1,125510.82,1,Yes,2,Yes,0


In [12]:
#Need to clean the data
unique_Geography = df['Geography'].unique()

In [13]:
print(unique_Geography)

['FRA' 'Spain' 'French' 'France' 'Germany']


## There are 3 different options for France so we need to replace 'FRA' and 'French' with 'France


In [18]:
df = df.replace(to_replace= ["FRA", "French"], value = "France")

In [19]:
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,Tenure.1,IsActiveMember,Exited
0,15634602,Hargrave,619,France,Female,42.0,2,101348.88,0.0,1,Yes,2,Yes,1
1,15647311,Hill,608,Spain,Female,41.0,1,112542.58,83807.86,1,Yes,1,Yes,0
2,15619304,Onio,502,France,Female,42.0,8,113931.57,159660.8,3,No,8,No,1
3,15701354,Boni,699,France,Female,39.0,1,93826.63,0.0,2,No,1,No,0
4,15737888,Mitchell,850,Spain,Female,43.0,2,79084.1,125510.82,1,Yes,2,Yes,0


In [20]:
unique_Geography = df['Geography'].unique()
print(unique_Geography)

['France' 'Spain' 'Germany']


## Notice we have the column 'tenure' twice in the df so we need to drop one of them

In [22]:
df = df.drop("Tenure.1", axis = 'columns')

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CustomerId       10001 non-null  int64  
 1   Surname          9998 non-null   object 
 2   CreditScore      10001 non-null  int64  
 3   Geography        10001 non-null  object 
 4   Gender           10001 non-null  object 
 5   Age              9998 non-null   float64
 6   Tenure           10001 non-null  int64  
 7   EstimatedSalary  10001 non-null  float64
 8   Balance          10001 non-null  float64
 9   NumOfProducts    10001 non-null  int64  
 10  HasCrCard        10001 non-null  object 
 11  IsActiveMember   10001 non-null  object 
 12  Exited           10001 non-null  int64  
dtypes: float64(3), int64(5), object(5)
memory usage: 820.5+ KB


## Need to check if there are any duplicated CustomerId in the df

In [26]:
df.duplicated('CustomerId').sum()

np.int64(1)

## There is a duplicated CustomerId in the df so let us drop the row of the duplicated Id

In [28]:
df = df.drop_duplicates(subset =['CustomerId'])

In [29]:
df.head()

Unnamed: 0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,15634602,Hargrave,619,France,Female,42.0,2,101348.88,0.0,1,Yes,Yes,1
1,15647311,Hill,608,Spain,Female,41.0,1,112542.58,83807.86,1,Yes,Yes,0
2,15619304,Onio,502,France,Female,42.0,8,113931.57,159660.8,3,No,No,1
3,15701354,Boni,699,France,Female,39.0,1,93826.63,0.0,2,No,No,0
4,15737888,Mitchell,850,Spain,Female,43.0,2,79084.1,125510.82,1,Yes,Yes,0


## Make the CustomerId the df index

In [31]:
df = df.set_index('CustomerId')

In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 15634602 to 15628319
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          9997 non-null   object 
 1   CreditScore      10000 non-null  int64  
 2   Geography        10000 non-null  object 
 3   Gender           10000 non-null  object 
 4   Age              9997 non-null   float64
 5   Tenure           10000 non-null  int64  
 6   EstimatedSalary  10000 non-null  float64
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  object 
 10  IsActiveMember   10000 non-null  object 
 11  Exited           10000 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 820.3+ KB


## We can see now that there are 3 null values for surname and age
### For the surname one could go back to the database and search for the CustomerId and the corresponding surname. However for the purpose of this project, I will replace the null values with 'NotProvided'

### For age, I will replace with a statistical value (the median)


In [33]:
new_surname = { "Surname": "NotProvided"
}
df = df.fillna(value=new_surname)

age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(value=age_median)
df

Unnamed: 0_level_0,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15634602,Hargrave,619,France,Female,42.0,2,101348.88,0.00,1,Yes,Yes,1
15647311,Hill,608,Spain,Female,41.0,1,112542.58,83807.86,1,Yes,Yes,0
15619304,Onio,502,France,Female,42.0,8,113931.57,159660.80,3,No,No,1
15701354,Boni,699,France,Female,39.0,1,93826.63,0.00,2,No,No,0
15737888,Mitchell,850,Spain,Female,43.0,2,79084.10,125510.82,1,Yes,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15606229,Obijiaku,771,France,Male,39.0,5,96270.64,0.00,2,No,No,0
15569892,Johnstone,516,France,Male,35.0,10,101699.77,57369.61,1,Yes,Yes,0
15584532,Liu,709,France,Female,36.0,7,42085.58,0.00,1,Yes,Yes,1
15682355,Sabbatini,772,Germany,Male,42.0,3,92888.52,75075.31,2,No,No,1


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 15634602 to 15628319
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Surname          10000 non-null  object 
 1   CreditScore      10000 non-null  int64  
 2   Geography        10000 non-null  object 
 3   Gender           10000 non-null  object 
 4   Age              10000 non-null  float64
 5   Tenure           10000 non-null  int64  
 6   EstimatedSalary  10000 non-null  float64
 7   Balance          10000 non-null  float64
 8   NumOfProducts    10000 non-null  int64  
 9   HasCrCard        10000 non-null  object 
 10  IsActiveMember   10000 non-null  object 
 11  Exited           10000 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 820.3+ KB


## Let us check that estimated salary is as it should be and that there are no negative values

In [37]:
df.sort_values(by='EstimatedSalary')

Unnamed: 0_level_0,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15580203,NotProvided,674,Spain,Male,37.0,6,-999999.00,120193.42,1,No,No,0
15756954,NotProvided,538,France,Female,37.0,2,-999999.00,0.00,1,Yes,Yes,0
15728693,NotProvided,574,Germany,Female,37.0,3,-999999.00,141349.43,1,Yes,Yes,0
15791053,Lucciano,709,Germany,Male,45.0,4,11.58,122917.71,1,Yes,Yes,1
15679693,Walker,625,France,Male,31.0,5,90.07,0.00,2,Yes,Yes,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15709136,Adams,620,France,Male,28.0,8,199909.32,0.00,2,Yes,Yes,0
15762331,Moss,569,France,Male,37.0,9,199929.17,178755.84,1,No,No,0
15697270,Gannon,608,Spain,Male,27.0,4,199953.33,153325.10,1,Yes,Yes,0
15634359,Dyer,639,Germany,Female,41.0,5,199970.74,98635.77,1,No,No,0


### We can replace the negative values with a statistical value (the median)

In [38]:
est_salary_median = df['EstimatedSalary'].median()
df['EstimatedSalary'] = df['EstimatedSalary'].replace(to_replace = -999999.00, value = est_salary_median)

In [39]:
df.sort_values(by='EstimatedSalary')

Unnamed: 0_level_0,Surname,CreditScore,Geography,Gender,Age,Tenure,EstimatedSalary,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
CustomerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15791053,Lucciano,709,Germany,Male,45.0,4,11.58,122917.71,1,Yes,Yes,1
15679693,Walker,625,France,Male,31.0,5,90.07,0.00,2,Yes,Yes,0
15786463,Hsing,645,Germany,Female,59.0,8,91.75,121669.93,2,No,No,1
15639662,Phillips,710,France,Male,38.0,2,96.27,0.00,2,No,No,0
15602851,Ozioma,629,France,Male,40.0,9,106.67,0.00,1,No,No,0
...,...,...,...,...,...,...,...,...,...,...,...,...
15709136,Adams,620,France,Male,28.0,8,199909.32,0.00,2,Yes,Yes,0
15762331,Moss,569,France,Male,37.0,9,199929.17,178755.84,1,No,No,0
15697270,Gannon,608,Spain,Male,27.0,4,199953.33,153325.10,1,Yes,Yes,0
15634359,Dyer,639,Germany,Female,41.0,5,199970.74,98635.77,1,No,No,0


# Analysing the data

- What attributes are more common among churners than non-churners? Can churn be predicted using the variables in the data?

- What do the overall demographics of the bank's customers look like?

- Is there a difference between German, French, and Spanish customers i-n terms of account behavior?

- What types of segments exist within the bank's customers?

In [40]:
df['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [41]:
df.replace({'Geography':{'France':2 , 'Germany':1 , 'Spain':0}}, inplace=True)

  df.replace({'Geography':{'France':2 , 'Germany':1 , 'Spain':0}}, inplace=True)


In [43]:
df['Gender'].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [44]:
df.replace({'Gender':{'Male':0, 'Female':1}}, inplace=True)

  df.replace({'Gender':{'Male':0, 'Female':1}}, inplace=True)


In [47]:
df['NumOfProducts'].value_counts()

NumOfProducts
1    5084
2    4590
3     266
4      60
Name: count, dtype: int64

## As the count value for 3 and 4 products used is very small compared to 1 and 2 we are going to clump 3 and 4 togther with 2 so we avoid working with small sample sizes

In [48]:
df.replace({'NumOfProducts':{1:0, 2:1 , 3:1 , 4:1}}, inplace=True)

In [50]:
df['HasCrCard'].value_counts()

HasCrCard
Yes    5151
No     4849
Name: count, dtype: int64

In [54]:
df.replace({'HasCrCard':{'Yes':1, 'No': 0}}, inplace=True)

  df.replace({'HasCrCard':{'Yes':1, 'No': 0}}, inplace=True)


In [56]:
df['IsActiveMember'].value_counts()

IsActiveMember
Yes    5151
No     4849
Name: count, dtype: int64