In [2]:
# Importing libraries for the notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Loading the csv file to get the data
df = pd.read_csv("../datasets/german_credit_data.csv",sep=',',index_col=0)
df.tail()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,40,male,3,own,little,little,3857,30,car,good
997,38,male,2,own,little,,804,12,radio/TV,good
998,23,male,2,free,little,little,1845,45,radio/TV,bad
999,27,male,2,own,moderate,moderate,4576,45,car,good


In [6]:
# Checking the null values
df.shape, df.isnull().sum()
# Out of 1000 rows,
# 183 null values for Saving accounts,
# 394 null values for Checking account
df["Checking account"].unique()
# Check how the missing values are spread
# to figure out a way to clean these values
test = pd.crosstab(df["Checking account"],df["Saving accounts"],dropna=False)
test

Saving accounts,little,moderate,quite rich,rich,NaN
Checking account,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
little,219,12,8,6,29
moderate,152,47,11,14,45
rich,41,5,4,3,10
,191,39,40,25,99


In [17]:
# let's fill the values
# I assume I can use the values from one column to fill the other
# eg: if Rich in Checking account, then Rich in Saving accounts
df_filled = df.copy()
df_filled["checking_saving_accounts"] = df["Saving accounts"].combine_first(df["Checking account"])
# we get the nomber of values down to 99 from 183 before

mode = df_filled["checking_saving_accounts"].mode()[0]
df_filled['checking_saving_accounts'] = df_filled['checking_saving_accounts'].fillna(mode)
df_filled
# we filled the remaining categorical values using the mode

# Now we drop the former columns with null values
df_filled = df_filled.drop(["Saving accounts","Checking account"],axis=1)
df_filled.isna().sum()

Age                         0
Sex                         0
Job                         0
Housing                     0
Credit amount               0
Duration                    0
Purpose                     0
Risk                        0
checking_saving_accounts    0
dtype: int64

In [18]:
# I need to make the target numbers as
# they are currently strings ('good' / 'bad')
df_filled['Risk'] = df['Risk'].replace({'bad':1,'good':0})
df_filled.Risk

  df_filled['Risk'] = df['Risk'].replace({'bad':1,'good':0})


0      0
1      1
2      0
3      0
4      1
      ..
995    0
996    0
997    0
998    1
999    0
Name: Risk, Length: 1000, dtype: int64

In [None]:
# Now I want to tackle class imbalance
df_filled.Risk.value_counts()
# 70% good credits vs 30% bad ones
# we need to make it balanced to 
# have correct representation

Risk
0    700
1    300
Name: count, dtype: int64

In [None]:
# We have a clean dataset where we can lead several statistical studies
# First, I want to confirm that the features "Saving accounts"
# and "Checking account" are not independent
# We will perform that through a pearson chi square test
from scipy.stats import chi2_contingency
print(chi2_contingency(pd.crosstab(
    df["Saving accounts"], 
    df["Checking account"]
)))
# We see that the p value is small compared to
# alpha = 0.05 (p=1.097e-06) so we can conclude
# that these variables are highly correlated
# Hence we can keep only one that will have
# more samples and more insightful as the number
# of null values will be smaller.


Chi2ContingencyResult(statistic=np.float64(38.053417890130305), pvalue=np.float64(1.0966700103224331e-06), dof=6, expected_freq=array([[193.37164751, 176.79693487,  41.83141762],
       [ 30.03831418,  27.46360153,   6.49808429],
       [ 10.79501916,   9.8697318 ,   2.33524904],
       [ 10.79501916,   9.8697318 ,   2.33524904]]))


In [None]:
# From this EDA, I can know that:
# - My target is in the right type (numerical) but the classes are imbalanced (70/30)
# - My features have no more null values in order to have maximum training samples for a working model
# - Highly correlated variable like saving & checking accounts are merged to have better results