In [163]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

In [164]:
# Load the credit-risk data from a CSV located relative to the notebook's
# working directory, then preview the first rows as the cell output.
dataset = pd.read_csv('./credit_risk_dataset.csv')
dataset.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [165]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
person_age                    32581 non-null int64
person_income                 32581 non-null int64
person_home_ownership         32581 non-null object
person_emp_length             31686 non-null float64
loan_intent                   32581 non-null object
loan_grade                    32581 non-null object
loan_amnt                     32581 non-null int64
loan_int_rate                 29465 non-null float64
loan_status                   32581 non-null int64
loan_percent_income           32581 non-null float64
cb_person_default_on_file     32581 non-null object
cb_person_cred_hist_length    32581 non-null int64
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [166]:
dataset.isnull().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [167]:
dataset.index

RangeIndex(start=0, stop=32581, step=1)

In [168]:
dataset.shape

(32581, 12)

In [169]:
dataset['person_home_ownership'].unique()

array(['RENT', 'OWN', 'MORTGAGE', 'OTHER'], dtype=object)

In [170]:
dataset['person_home_ownership'].value_counts()

RENT        16446
MORTGAGE    13444
OWN          2584
OTHER         107
Name: person_home_ownership, dtype: int64

In [171]:
dataset['loan_intent'].unique()

array(['PERSONAL', 'EDUCATION', 'MEDICAL', 'VENTURE', 'HOMEIMPROVEMENT',
       'DEBTCONSOLIDATION'], dtype=object)

In [172]:
dataset['loan_intent'].value_counts()

EDUCATION            6453
MEDICAL              6071
VENTURE              5719
PERSONAL             5521
DEBTCONSOLIDATION    5212
HOMEIMPROVEMENT      3605
Name: loan_intent, dtype: int64

In [173]:
dataset['cb_person_default_on_file'].unique()

array(['Y', 'N'], dtype=object)

In [174]:
dataset['cb_person_default_on_file'].value_counts()

N    26836
Y     5745
Name: cb_person_default_on_file, dtype: int64

In [175]:
dataset['loan_grade'].unique()

array(['D', 'B', 'C', 'A', 'E', 'F', 'G'], dtype=object)

In [176]:
dataset['loan_grade'].value_counts()

A    10777
B    10451
C     6458
D     3626
E      964
F      241
G       64
Name: loan_grade, dtype: int64

In [177]:
# Encode the four string-valued columns as integers so they can be used as
# numeric features. The lookup tables are declared first, then applied.
#
# NOTE: Series.map() produces NaN for any value that is not a key of the
# lookup table, so this cell is meant to run once on freshly loaded data;
# running it again on the already-encoded columns would turn them into NaN.

# LOAN_GRADE: 'A' .. 'G' -> 1 .. 7.
loan_grade_mapping = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7,
}

# LOAN_INTENT: one integer code per stated purpose of the loan.
loan_intent_mapping = {
    'PERSONAL': 1,
    'EDUCATION': 2,
    'MEDICAL': 3,
    'VENTURE': 4,
    'HOMEIMPROVEMENT': 5,
    'DEBTCONSOLIDATION': 6,
}

# PERSON_HOME_OWNERSHIP: one integer code per ownership category.
person_home_ownership_mapping = {
    'RENT': 1,
    'OWN': 2,
    'MORTGAGE': 3,
    'OTHER': 4,
}

# CB_PERSON_DEFAULT_ON_FILE: binary flag, 'Y' -> 1 and 'N' -> 0.
cb_person_default_on_file_mapping = {
    'Y': 1,
    'N': 0,
}

# Overwrite each categorical column with its integer-coded version.
dataset['loan_grade'] = dataset['loan_grade'].map(loan_grade_mapping)
dataset['loan_intent'] = dataset['loan_intent'].map(loan_intent_mapping)
dataset['person_home_ownership'] = dataset['person_home_ownership'].map(person_home_ownership_mapping)
dataset['cb_person_default_on_file'] = dataset['cb_person_default_on_file'].map(cb_person_default_on_file_mapping)

# Preview the encoded frame.
dataset.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,1,123.0,1,4,35000,16.02,1,0.59,1,3
1,21,9600,2,5.0,2,2,1000,11.14,0,0.1,0,2
2,25,9600,3,1.0,3,3,5500,12.87,1,0.57,0,3
3,23,65500,1,4.0,3,3,35000,15.23,1,0.53,0,2
4,24,54400,1,8.0,3,3,35000,14.27,1,0.55,1,4


In [178]:
def convert_to_percentage(value):
    """Convert a fractional ratio to a whole-number percentage.

    Args:
        value: A numeric ratio, e.g. ``0.57`` for 57 %.

    Returns:
        int: ``value * 100`` rounded to the nearest integer.
    """
    # Round before casting: binary floating point makes 0.57 * 100 evaluate
    # to 56.99999999999999, which a bare int() would truncate to 56.
    return int(round(value * 100))

In [179]:
# Apply convert_to_percentage to every value of loan_percent_income and
# overwrite the column in place. Because the column is replaced by its own
# converted values, running this cell a second time would convert it again.
dataset['loan_percent_income'] = dataset['loan_percent_income'].apply(convert_to_percentage)

In [180]:
dataset.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,1,123.0,1,4,35000,16.02,1,59,1,3
1,21,9600,2,5.0,2,2,1000,11.14,0,10,0,2
