In [4]:
import pandas as pd
import math
from open_fairprune.data_util import DATA_PATH

In [5]:
def age_to_binary(age):
    try:
        if math.isnan(float(age)):
            return "NaN"
        if int(age) <= 10950:
            return 'young'
        else:
            return 'old'
    except:
        return 'NaN'

In [6]:
df = pd.read_csv(DATA_PATH / "Train_Dataset.csv", low_memory=False)
splits = {
    "train": df.ID % 7 <= 4,
    "dev": df.ID % 7 == 5,
    "test": df.ID % 7 == 6,  # Around 15%
}
df = df[splits['train']]
df['Gender_Marital'] = df['Client_Gender'] + ' ' + df['Client_Marital_Status']
df["Age"] = df["Age_Days"].apply(age_to_binary)


In [7]:
df["Default"].value_counts()
9845/(9845+112011)

0.0807920824579832

In [8]:
print((df.isnull().sum() / len(df)).sort_values(ascending=False))

Own_House_Age                 0.656974
Score_Source_1                0.564028
Social_Circle_Default         0.507732
Client_Occupation             0.339361
Score_Source_3                0.221002
Credit_Bureau                 0.152780
ID_Days                       0.048966
Gender_Marital                0.047208
Score_Source_2                0.046680
Population_Region_Relative    0.039867
Loan_Annuity                  0.039189
Employed_Days                 0.030365
House_Own                     0.030216
Client_Income_Type            0.030078
Client_Housing_Type           0.029998
Credit_Amount                 0.029998
Child_Count                   0.029963
Bike_Owned                    0.029963
Phone_Change                  0.029952
Loan_Contract_Type            0.029929
Active_Loan                   0.029917
Age_Days                      0.029917
Application_Process_Hour      0.029894
Registration_Days             0.029825
Client_Income                 0.029699
Client_Education         

In [9]:
cross_tab = pd.crosstab(df["Client_Marital_Status"], df["Default"])
ratios_matrix = cross_tab.apply(lambda r: r/r.sum(), axis=1)
print(ratios_matrix)
print(df["Client_Marital_Status"].value_counts())

Default                       0         1
Client_Marital_Status                    
D                      0.921455  0.078545
M                      0.920602  0.079398
S                      0.904174  0.095826
W                      0.935521  0.064479
Client_Marital_Status
M    62483
S    12387
D     5360
W     4358
Name: count, dtype: int64


In [10]:
cross_tab = pd.crosstab(df["Client_Gender"], df["Default"])
ratios_matrix = cross_tab.apply(lambda r: r/r.sum(), axis=1)
print(ratios_matrix)
print(df["Client_Gender"].value_counts())

Default               0         1
Client_Gender                    
Female         0.899156  0.100844
Male           0.929552  0.070448
XNA            1.000000  0.000000
Client_Gender
Male      56070
Female    29263
XNA           2
Name: count, dtype: int64


In [11]:
cross_tab = pd.crosstab(df["Client_Gender"], df["Default"])
ratios_matrix = cross_tab.apply(lambda r: r/r.sum(), axis=1)
ratios_matrix

Default,0,1
Client_Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.899156,0.100844
Male,0.929552,0.070448
XNA,1.0,0.0


In [12]:
cross_tab = pd.crosstab(df["Age"], df["Default"])
ratios_matrix = cross_tab.apply(lambda r: r/r.sum(), axis=1)
print(ratios_matrix)
print(df["Age"].value_counts())



Default         0         1
Age                        
NaN      0.919602  0.080398
old      0.924469  0.075531
young    0.888898  0.111102
Age
old      71944
young    12484
NaN       2612
Name: count, dtype: int64


In [13]:
cross_tab = pd.crosstab(df["Gender_Marital"], df["Default"])
ratios_matrix = cross_tab.apply(lambda r: r/r.sum(), axis=1)
print(ratios_matrix)
print(df["Gender_Marital"].value_counts())

Default                0         1
Gender_Marital                    
Female D        0.876182  0.123818
Female M        0.906807  0.093193
Female S        0.868054  0.131946
Female W        0.866667  0.133333
Male D          0.934947  0.065053
Male M          0.928616  0.071384
Male S          0.924304  0.075696
Male W          0.939484  0.060516
XNA M           1.000000  0.000000
Gender_Marital
Male M      38720
Female M    22534
Male S       7649
Female S     4517
Male D       4089
Male W       4032
Female D     1163
Female W      225
XNA M           2
Name: count, dtype: int64
