In [1]:
import sys

import pandas as pd

# Make the project's helper modules importable from this notebook.
# The original value '..src' lacked the path separator, so it named a directory
# literally called "..src" inside the working directory. The dataset is read
# from '../data', so the helpers presumably live in the sibling '../src'.
SRC_DIR = '../src'
# Guard the append so re-running this cell does not add duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from missing_values import fill_missing_values

In [2]:
# Read the credit-risk dataset from the sibling data directory and preview
# the first five rows.
df = pd.read_csv(
    '../data/credit_risk_dataset.csv',
)
df.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [3]:
# (rows, columns) of the frame read from the CSV.
df.shape

(32581, 12)

In [4]:
# Number of missing values in each column (isna is the canonical spelling of isnull).
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [5]:
# Fill the two columns that contain nulls using the helper's 'mean' strategy.
# NOTE(review): this rebinds `df`, so the un-imputed frame is no longer reachable,
# and it runs before the outlier handling further down, so extreme values
# (e.g. a person_emp_length of 123) take part in the fill value - confirm intended.
df = fill_missing_values(
    df,
    method='mean',
    columns=['person_emp_length', 'loan_int_rate'],
)

In [6]:
# Re-count missing values per column to verify the imputation step.
df.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [7]:
# Summary statistics for person_age, used to look for implausible values.
(
    df['person_age']
    .describe()
)

count    32581.000000
mean        27.734600
std          6.348078
min         20.000000
25%         23.000000
50%         26.000000
75%         30.000000
max        144.000000
Name: person_age, dtype: float64

In [8]:
# Summary statistics for person_emp_length, used to look for implausible values.
(
    df['person_emp_length']
    .describe()
)

count    32581.000000
mean         4.789686
std          4.085333
min          0.000000
25%          2.000000
50%          4.000000
75%          7.000000
max        123.000000
Name: person_emp_length, dtype: float64

In [9]:
from outliers import detect_outliers_iqr, detect_outliers_zscores, detect_outliers_isolation_forest, handle_outliers, remove_outliers

In [10]:
# Rows of `df` that the IQR-based detector flags on person_age.
outliers_iqr = detect_outliers_iqr(df, 'person_age')
# End on the bare expression so Jupyter renders the result as a table
# instead of the line-wrapped plain text that print() produces.
outliers_iqr

       person_age  person_income person_home_ownership  person_emp_length  \
81            144         250000                  RENT                4.0   
183           144         200000              MORTGAGE                4.0   
575           123          80004                  RENT                2.0   
747           123          78000                  RENT                7.0   
29121          50         900000              MORTGAGE               11.0   
...           ...            ...                   ...                ...   
32576          57          53000              MORTGAGE                1.0   
32577          54         120000              MORTGAGE                4.0   
32578          65          76000                  RENT                3.0   
32579          56         150000              MORTGAGE                5.0   
32580          66          42000                  RENT                2.0   

             loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status 

In [11]:
# Rows of `df` that the z-score-based detector flags on person_age.
outliers_zscores = detect_outliers_zscores(df, 'person_age')
# End on the bare expression so Jupyter renders the result as a table
# instead of the line-wrapped plain text that print() produces.
outliers_zscores

       person_age  person_income person_home_ownership  person_emp_length  \
81            144         250000                  RENT                4.0   
183           144         200000              MORTGAGE                4.0   
575           123          80004                  RENT                2.0   
747           123          78000                  RENT                7.0   
29121          50         900000              MORTGAGE               11.0   
...           ...            ...                   ...                ...   
32576          57          53000              MORTGAGE                1.0   
32577          54         120000              MORTGAGE                4.0   
32578          65          76000                  RENT                3.0   
32579          56         150000              MORTGAGE                5.0   
32580          66          42000                  RENT                2.0   

             loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status 

In [12]:
# Rows of `df` that the isolation-forest detector flags on person_age.
# NOTE(review): isolation forests are randomized; confirm the helper fixes a
# seed so this cell gives the same rows on every run.
outliers_isolation_forest = detect_outliers_isolation_forest(df, 'person_age')
# End on the bare expression so Jupyter renders the result as a table
# instead of the line-wrapped plain text that print() produces.
outliers_isolation_forest

       person_age  person_income person_home_ownership  person_emp_length  \
0              22          59000                  RENT              123.0   
1              21           9600                   OWN                5.0   
5              21           9900                   OWN                2.0   
9              21          10000                   OWN                6.0   
10             22          85000                  RENT                6.0   
...           ...            ...                   ...                ...   
32576          57          53000              MORTGAGE                1.0   
32577          54         120000              MORTGAGE                4.0   
32578          65          76000                  RENT                3.0   
32579          56         150000              MORTGAGE                5.0   
32580          66          42000                  RENT                2.0   

           loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \

In [13]:
# Rows of `df` that the z-score-based detector flags on person_emp_length.
# NOTE(review): this rebinds `outliers_zscores`, replacing the person_age
# result computed earlier with the same detector.
outliers_zscores = detect_outliers_zscores(df, 'person_emp_length')
# End on the bare expression so Jupyter renders the result as a table
# instead of the line-wrapped plain text that print() produces.
outliers_zscores

       person_age  person_income person_home_ownership  person_emp_length  \
0              22          59000                  RENT              123.0   
210            21         192000              MORTGAGE              123.0   
17834          34         948000              MORTGAGE               18.0   
17877          34         334000                   OWN               18.0   
17888          34          61200                  RENT               18.0   
...           ...            ...                   ...                ...   
32515          53         106000              MORTGAGE               38.0   
32516          52          29000                   OWN               23.0   
32534          76          75000                  RENT               23.0   
32539          61         148000              MORTGAGE               30.0   
32562          61         160000              MORTGAGE               30.0   

             loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status 

In [14]:
# Cut-off handed to remove_outliers for person_age; the summary statistics
# show ages up to 144, which are implausible.
# NOTE(review): the helper appears to keep rows at or below this value
# (the recorded maximum afterwards is 60) - confirm against its definition.
PERSON_AGE_CUTOFF = 60
df_cleaned = remove_outliers(df, 'person_age', PERSON_AGE_CUTOFF)
# Series.max() is vectorized and skips NaN, unlike the builtin max().
df_cleaned['person_age'].max()

60

In [15]:
# Largest remaining person_age after outlier removal.
# Series.max() is vectorized and skips NaN, unlike the builtin max().
# NOTE(review): this repeats the check that ends the previous cell; candidate for removal.
df_cleaned['person_age'].max()

60

In [16]:
# Summary statistics for person_age after the age cut-off was applied.
(
    df_cleaned['person_age']
    .describe()
)

count    32511.000000
mean        27.640214
std          5.956044
min         20.000000
25%         23.000000
50%         26.000000
75%         30.000000
max         60.000000
Name: person_age, dtype: float64

In [17]:
# Apply the helper's 'remove' strategy to person_emp_length outliers.
# NOTE(review): the cell feeds `df_cleaned` back into itself, so running it a
# second time operates on the already-filtered frame and may drop more rows.
df_cleaned = handle_outliers(df_cleaned, 'person_emp_length', method='remove')
# Series.max() is vectorized and skips NaN, unlike the builtin max().
df_cleaned['person_emp_length'].max()

14.0

In [18]:
# Summary statistics for person_emp_length after outlier handling.
(
    df_cleaned['person_emp_length']
    .describe()
)


count    31666.000000
mean         4.442272
std          3.395190
min          0.000000
25%          2.000000
50%          4.000000
75%          7.000000
max         14.000000
Name: person_emp_length, dtype: float64

In [19]:
# Preview the first five rows of the outlier-filtered frame.
df_cleaned.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [20]:
# (rows, columns) of the frame after both outlier-handling steps.
df_cleaned.shape

(31666, 12)

In [21]:
from normalization import normalize_data

In [22]:
# Standardise person_age and person_income (mean=0, std=1) via the helper's
# 'standard' method; all other columns are passed through unlisted.
df_normalize = normalize_data(
    df_cleaned,
    method='standard',
    columns=['person_age', 'person_income'],
)

In [23]:
# Preview the first five rows after scaling.
df_normalize.head(n=5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
1,-1.105402,-1.102053,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,-0.416099,-1.102053,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,-0.760751,0.005339,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,-0.588425,-0.214555,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
5,-1.105402,-1.09611,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,N,2


In [24]:
from encoding import encode_data

In [25]:
# Step 1: one-hot encode the home-ownership and loan-intent columns.
# NOTE(review): the result is stored in `df_label_encoded` although it comes
# from method='onehot'; the variable names are swapped relative to the method.
df_label_encoded = encode_data(
    df_normalize,
    method='onehot',
    columns=['person_home_ownership', 'loan_intent'],
)
# Step 2: label-encode loan_grade and cb_person_default_on_file on top of
# step 1, so `df_onehot_encoded` carries both encodings.
df_onehot_encoded = encode_data(
    df_label_encoded,
    method='label',
    columns=['loan_grade', 'cb_person_default_on_file'],
)

In [26]:
# Preview the first five rows of the fully encoded frame.
df_onehot_encoded.head(n=5)

Unnamed: 0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
1,-1.105402,-1.102053,5.0,1,1000,11.14,0,0.1,0,2,False,False,True,False,False,True,False,False,False,False
2,-0.416099,-1.102053,1.0,2,5500,12.87,1,0.57,0,3,True,False,False,False,False,False,False,True,False,False
3,-0.760751,0.005339,4.0,2,35000,15.23,1,0.53,0,2,False,False,False,True,False,False,False,True,False,False
4,-0.588425,-0.214555,8.0,2,35000,14.27,1,0.55,1,4,False,False,False,True,False,False,False,True,False,False
5,-1.105402,-1.09611,2.0,0,2500,7.14,1,0.25,0,2,False,False,True,False,False,False,False,False,False,True
