In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
import altair as alt
alt.data_transformers.enable('data_server')
import math

### Import Data

In [2]:
# Load the two input tables from CSV files in the working directory:
# `df` holds the application records, `credit_df` the credit records.
df = pd.read_csv("application_record.csv")
credit_df = pd.read_csv("credit_record.csv")

In [3]:
# Preview the first rows of both tables. display() is used for the first frame
# because only the last expression of a cell is rendered automatically.
display(df.head())
credit_df.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0


Unnamed: 0,ID,MONTHS_BALANCE,STATUS
0,5001711,0,X
1,5001711,-1,0
2,5001711,-2,0
3,5001711,-3,0
4,5001712,0,C


In [223]:
# Count missing values per column of the application data.
df.isna().sum()

ID                          0
CODE_GENDER                 0
FLAG_OWN_CAR                0
FLAG_OWN_REALTY             0
CNT_CHILDREN                0
AMT_INCOME_TOTAL            0
NAME_INCOME_TYPE            0
NAME_EDUCATION_TYPE         0
NAME_FAMILY_STATUS          0
NAME_HOUSING_TYPE           0
DAYS_BIRTH                  0
DAYS_EMPLOYED               0
FLAG_MOBIL                  0
FLAG_WORK_PHONE             0
FLAG_PHONE                  0
FLAG_EMAIL                  0
OCCUPATION_TYPE        134203
CNT_FAM_MEMBERS             0
dtype: int64

### Preprocessing

In [224]:
# Preprocess the application data held in `df`. Every step is guarded so the
# cell can be re-run on an already-processed frame without raising or
# corrupting values.

# Remove ID Field
# NOTE(review): ID appears to be the only column shared with credit_df; if the
# two tables are joined later, that has to happen before this drop. TODO confirm.
df = df.drop(columns="ID", errors="ignore")

# Improve human readability of certain fields: the (negative) day counts are
# converted to positive year values rounded to two decimals.
if "DAYS_BIRTH" in df.columns:
    df["YEARS_BIRTH"] = (df["DAYS_BIRTH"].abs() / 365).round(2)

if "DAYS_EMPLOYED" in df.columns:
    # Positive DAYS_EMPLOYED values are replaced by 0 days before converting.
    days_employed = df["DAYS_EMPLOYED"].clip(upper=0)
    # abs() instead of dividing by -365 so that 0 days gives 0.0, not -0.0.
    df["YEARS_EMPLOYED"] = (days_employed.abs() / 365).round(2)

df["OCCUPATION_TYPE"] = df["OCCUPATION_TYPE"].fillna("Misc")

df = df.drop(columns=["DAYS_BIRTH", "DAYS_EMPLOYED"], errors="ignore")

# Drop integer columns that contain a single constant value
# (in the output below this removes FLAG_MOBIL).
constant_columns = [
    column
    for column in df.select_dtypes("int64").columns
    if df[column].nunique() == 1
]
df = df.drop(columns=constant_columns)

# Recode the binary contact flags as "Y"/"N", matching the string coding
# already used by FLAG_OWN_CAR and FLAG_OWN_REALTY.
for column in ["FLAG_WORK_PHONE", "FLAG_PHONE", "FLAG_EMAIL"]:
    # Skip columns that were dropped above or were already recoded on an
    # earlier run; mapping "Y"/"N" through the dict again would yield NaN.
    if column in df.columns and pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].map({1: "Y", 0: "N"})

df.head()
    


Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,YEARS_BIRTH,YEARS_EMPLOYED
0,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Y,N,N,Misc,2.0,32.89,12.44
1,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,Y,N,N,Misc,2.0,32.89,12.44
2,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,N,N,N,Security staff,2.0,58.83,3.11
3,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,N,Y,Y,Sales staff,1.0,52.36,8.36
4,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,N,Y,Y,Sales staff,1.0,52.36,8.36


In [225]:
# Summary statistics of the derived YEARS_EMPLOYED column.
df["YEARS_EMPLOYED"].describe()

count    438557.000000
mean          5.952069
std           6.568213
min          -0.000000
25%           1.020000
50%           4.020000
75%           8.500000
max          48.030000
Name: YEARS_EMPLOYED, dtype: float64

Target Data:

MONTHS_BALANCE:   
Month offset of the record relative to the month the data was extracted, counting backwards: 0 is the current month, -1 is the previous month, and so on.

STATUS:  
    0: 1-29 days past due   
    1: 30-59 days past due   
    2: 60-89 days past due   
    3: 90-119 days past due  
    4: 120-149 days past due   
    5: Overdue or bad debts, write-offs for more than 150 days   
    C: Paid off that month   
    X: No loan for the month  

## EDA

### Describe Data

In [226]:
# Column names, non-null counts and dtypes of the preprocessed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   CODE_GENDER          438557 non-null  object 
 1   FLAG_OWN_CAR         438557 non-null  object 
 2   FLAG_OWN_REALTY      438557 non-null  object 
 3   CNT_CHILDREN         438557 non-null  int64  
 4   AMT_INCOME_TOTAL     438557 non-null  float64
 5   NAME_INCOME_TYPE     438557 non-null  object 
 6   NAME_EDUCATION_TYPE  438557 non-null  object 
 7   NAME_FAMILY_STATUS   438557 non-null  object 
 8   NAME_HOUSING_TYPE    438557 non-null  object 
 9   FLAG_WORK_PHONE      438557 non-null  object 
 10  FLAG_PHONE           438557 non-null  object 
 11  FLAG_EMAIL           438557 non-null  object 
 12  OCCUPATION_TYPE      438557 non-null  object 
 13  CNT_FAM_MEMBERS      438557 non-null  float64
 14  YEARS_BIRTH          438557 non-null  float64
 15  YEARS_EMPLOYED   

In [227]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,CNT_FAM_MEMBERS,YEARS_BIRTH,YEARS_EMPLOYED
count,438557.0,438557.0,438557.0,438557.0,438557.0
mean,0.42739,187524.3,2.194465,43.82988,5.952069
std,0.724882,110086.9,0.897207,11.465823,6.568213
min,0.0,26100.0,1.0,20.52,-0.0
25%,0.0,121500.0,2.0,34.28,1.02
50%,0.0,160780.5,2.0,42.82,4.02
75%,1.0,225000.0,3.0,53.38,8.5
max,19.0,6750000.0,20.0,69.04,48.03


In [228]:
# Names of the categorical (object-dtype) columns; the chart cells below
# index into a list built in this same column order.
df.select_dtypes("object").columns

Index(['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
       'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE'],
      dtype='object')

In [229]:
# Build one labelled bar chart per categorical column: bar height is the
# category count and the text above each bar is its share of all rows.
charts = []
totrows = df.shape[0]  # loop-invariant denominator for the percentages

for column in df.select_dtypes("object").columns:
    # Name both output columns explicitly. The default names produced by
    # value_counts().reset_index() changed in pandas 2.0 ("index"/<column>
    # became <column>/"count"), so relying on them makes valcounts[column]
    # hold the category labels instead of the counts on newer versions.
    valcounts = (
        df[column]
        .value_counts()
        .sort_index()
        .rename_axis("category")
        .reset_index(name="count")
    )
    valcounts["percentage"] = (
        (valcounts["count"] * 100 / totrows).round(1).astype(str) + "%"
    )

    base = alt.Chart(valcounts, title=column).encode(
        x=alt.X("category:N", title="Category"),
        y=alt.Y("count:Q", title="Count"),
        text="percentage",
    ).properties(
        width={"step": 35},
        height=300,
    )

    # Layer the bars with their percentage labels, shifted above the bar top.
    charts.append(
        base.mark_bar()
        + base.mark_text(align='center', baseline='line-top', dy=-15)
    )


In [230]:
# Binary columns side by side: CODE_GENDER, FLAG_OWN_CAR, FLAG_OWN_REALTY,
# FLAG_WORK_PHONE, FLAG_PHONE, FLAG_EMAIL (positions in the object-column list).
alt.hconcat(*(charts[position] for position in (0, 1, 2, 7, 8, 9)))

In [231]:
# The four NAME_* columns side by side: income type, education type,
# family status and housing type (positions 3 to 6).
alt.hconcat(*charts[3:7])

In [232]:
# OCCUPATION_TYPE (position 10 in the object-column list), shown on its own.
charts[10]

In [233]:
# Names of the numeric columns; the histogram cells below index into a list
# built in this same column order.
df.select_dtypes(["int64", "float64"]).columns

Index(['CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS', 'YEARS_BIRTH',
       'YEARS_EMPLOYED'],
      dtype='object')

In [234]:
# One binned histogram per numeric column, collected in column order.
cols = df.select_dtypes(["int64", "float64"]).columns.tolist()
numcharts = []

for column in cols:
    histogram = (
        alt.Chart(df[column].reset_index(), title=column)
        .mark_bar()
        .encode(
            x=alt.X(column, bin=True),  # let Altair choose the bin edges
            y='count()',
        )
        .properties(height=300)
    )
    numcharts.append(histogram)

In [235]:
# Histograms of CNT_CHILDREN and AMT_INCOME_TOTAL side by side.
alt.hconcat(*numcharts[:2])

In [236]:
# Histograms of CNT_FAM_MEMBERS and YEARS_BIRTH side by side.
alt.hconcat(*numcharts[2:4])

In [237]:
# Histogram of YEARS_EMPLOYED (last entry of the numeric-column list).
numcharts[4]

In [238]:
# Pairwise correlations of the numeric columns, reshaped to long format
# (one row per ordered pair of variables) for plotting.
cor_data = (
    df.select_dtypes(["int64", "float64"])
    .corr()
    .stack()
    .reset_index()
    .rename(
        columns={
            "level_0": "Variable 1",
            "level_1": "Variable 2",
            0: "Correlation Coefficient",
        }
    )
)
# Two-decimal copy of the coefficient. The column name's spelling is kept
# as-is because the heatmap cell refers to this column by name.
cor_data["coeficient"] = cor_data["Correlation Coefficient"].round(2)

cor_data.head()

Unnamed: 0,Variable 1,Variable 2,Correlation Coefficient,coeficient
0,CNT_CHILDREN,CNT_CHILDREN,1.0,1.0
1,CNT_CHILDREN,AMT_INCOME_TOTAL,0.019177,0.02
2,CNT_CHILDREN,CNT_FAM_MEMBERS,0.884781,0.88
3,CNT_CHILDREN,YEARS_BIRTH,-0.349089,-0.35
4,CNT_CHILDREN,YEARS_EMPLOYED,0.038844,0.04


In [239]:
# Correlation heatmap: one coloured cell per variable pair with the rounded
# coefficient printed on top.
base = alt.Chart(cor_data).encode(
    x='Variable 1:O',
    y='Variable 2:O'
    ).properties(
        width=300,
        height=300
    )

# Text layer with correlation labels
# Colors are for easier readability: white on cells above 0.5, black otherwise.
# The predicate has to reference a field that exists in cor_data; the earlier
# `alt.datum.correlation` named no such field, so the test was never true and
# every label stayed black.
text = base.mark_text().encode(
    text='coeficient',
    color=alt.condition(
        alt.datum.coeficient > 0.5,
        alt.value('white'),
        alt.value('black')
    )
)

# The correlation heatmap itself
cor_plot = base.mark_rect().encode(
    color='Correlation Coefficient:Q'
)

cor_plot + text # The '+' means overlaying the text and rect layer

In [1]:
# Box plot of CNT_CHILDREN.
# Only the plotted column is handed to Altair, so the remaining columns of
# the 438,557-row frame are not serialized for a chart that never reads them.
# This cell needs the import cell at the top of the notebook to have been run
# (the recorded NameError came from executing it on a fresh kernel).
boxplot1 = alt.Chart(df[["CNT_CHILDREN"]]).mark_boxplot().encode(
    x = alt.X("CNT_CHILDREN:Q")
)
boxplot1

NameError: name 'alt' is not defined

0         0
1         0
2         0
3         0
4         0
         ..
438552    0
438553    0
438554    0
438555    0
438556    0
Name: CNT_CHILDREN, Length: 438557, dtype: int64