In [4]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

In [34]:
# Load the raw credit-risk dataset from the working directory and report its
# (rows, columns) shape.
credit_df = pd.read_csv('credit_risk_dataset.csv')
credit_df.shape

(32581, 12)

In [35]:
# Preview the first five rows of the freshly loaded data.
credit_df.head(5)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [36]:
# List the column names available in the loaded data.
credit_df.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [37]:
# Discard every row that contains at least one missing value.
credit_df = credit_df.dropna(axis=0)

In [38]:
# Shape after dropping rows with missing values.
credit_df.shape

(28638, 12)

In [39]:
# Renumber the rows 0..n-1 and throw away the old index labels.
credit_df = credit_df.reset_index(drop=True)

In [40]:
# Shape check: resetting the index should leave row and column counts unchanged.
credit_df.shape

(28638, 12)

In [41]:
# Keep only applicants younger than 80.
# The explicit .copy() makes the result an independent DataFrame, so the column
# assignments performed later write to this frame directly rather than to a
# slice of the pre-filter frame (which raises SettingWithCopyWarning, currently
# hidden by the blanket warnings filter at the top of the notebook).
credit_df = credit_df[credit_df['person_age'] < 80].copy()

In [42]:
# Shape after the age filter.
credit_df.shape

(28632, 12)

In [43]:
# Peek at the first two rows after the age filter.
credit_df.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2


### Age Group

In [44]:
# Report the largest and smallest applicant age present in the data.
max_, min_ = credit_df['person_age'].max(), credit_df['person_age'].min()
print(f"maximum {max_}", f"minimum {min_}", sep="\n")

maximum 78
minimum 20


In [45]:
# Bucket applicants into labelled age bands.
# right=False makes each bin closed on the left and open on the right, so the
# edges line up with the labels: [20, 26) -> '20-25', [26, 36) -> '26-35', ...
# With pandas' default (right=True) the bins were (20, 26], (26, 36], ..., which
# left age 20 unassigned (NaN) and put boundary ages such as 26 or 36 into the
# band below the one their label describes.
# The final open-ended bin covers ages 66 and above, which previously fell
# outside the last edge and also became NaN.
credit_df['age_group'] = pd.cut(credit_df['person_age'],
                           bins=[20, 26, 36, 46, 56, 66, float('inf')],
                           labels=['20-25', '26-35', '36-45', '46-55', '56-65', '66+'],
                           right=False)

### Income Group 

In [46]:
# Report the largest and smallest applicant income present in the data.
max_, min_ = credit_df['person_income'].max(), credit_df['person_income'].min()

print(f"maximum {max_}", f"minimum {min_}", sep="\n")

maximum 2039784
minimum 4000


In [47]:
def income_group(arr):
    """Print how many rows fall into each consecutive income band.

    ``arr`` is a sequence of band edges. For every adjacent pair of edges
    ``(lo, hi)`` the band runs from ``lo + 1`` to ``hi`` with both ends
    included, and rows are counted on the notebook-level
    ``credit_df['person_income']`` column. Nothing is returned.
    """
    for lower_edge, upper_edge in zip(arr, arr[1:]):
        band_start = lower_edge + 1
        in_band = credit_df['person_income'].between(band_start, upper_edge)
        print(f'There are {in_band.sum()} people with an income between {band_start} and {upper_edge}.')
        
# Count rows per candidate income band ...
income_group([0, 25_000, 50_000, 75_000, 100_000, float('inf')])

# ... and print the total row count so the band counts can be checked against it.
print(len(credit_df))

There are 1972 people with an income between 1 and 25000.
There are 10198 people with an income between 25001 and 50000.
There are 8531 people with an income between 50001 and 75000.
There are 4197 people with an income between 75001 and 100000.
There are 3734 people with an income between 100001 and inf.
28632


In [48]:
# Label each row with its income band; bins are open on the left and closed on
# the right, e.g. (0, 25000] -> 'low'.
credit_df['income_group'] = pd.cut(
    credit_df['person_income'],
    bins=[0, 25_000, 50_000, 75_000, 100_000, float('inf')],
    labels=['low', 'low-middle', 'middle', 'high-middle', 'high'],
    right=True,
)

In [49]:
# Confirm the new age_group and income_group columns are present.
credit_df.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3,20-25,middle
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low


### Loan Amount Group

In [50]:
# Report the largest and smallest loan amount present in the data.
max_loan_amount, min_loan_amount = credit_df['loan_amnt'].max(), credit_df['loan_amnt'].min()

print(f"maximum {max_loan_amount}", f"minimum {min_loan_amount}", sep="\n")

maximum 35000
minimum 500


In [51]:
def loan_amount_group(arr):
    """Print how many rows fall into each consecutive loan-amount band.

    ``arr`` is a sequence of band edges. For every adjacent pair of edges
    ``(lo, hi)`` the band runs from ``lo + 1`` to ``hi`` with both ends
    included, and rows are counted on the notebook-level
    ``credit_df['loan_amnt']`` column. Nothing is returned.
    """
    for lower_edge, upper_edge in zip(arr, arr[1:]):
        band_start = lower_edge + 1
        in_band = credit_df['loan_amnt'].between(band_start, upper_edge)
        print(f'There are {in_band.sum()} people with an loan_amount between {band_start} and {upper_edge}.')
        
# Count rows per candidate loan-amount band ...
loan_amount_group([0, 5_000, 10_000, 15_000, float('inf')])

# ... and print the total row count so the band counts can be checked against it.
print(len(credit_df))

There are 8182 people with an loan_amount between 1 and 5000.
There are 10607 people with an loan_amount between 5001 and 10000.
There are 5430 people with an loan_amount between 10001 and 15000.
There are 4413 people with an loan_amount between 15001 and inf.
28632


In [52]:
# Label each row with its loan-size band; bins are open on the left and closed
# on the right, e.g. (0, 5000] -> 'small'.
credit_df['loan_amount_group'] = pd.cut(
    credit_df['loan_amnt'],
    bins=[0, 5_000, 10_000, 15_000, float('inf')],
    labels=['small', 'medium', 'large', 'very large'],
    right=True,
)

### Valid Employment Length 

In [53]:
# Keep only rows whose employment length is at most 60; larger values (e.g. the
# 123.0 seen in the first row of the data) are treated as invalid.
# The explicit .copy() makes the result an independent DataFrame, so the ratio
# columns added afterwards are written to this frame directly rather than to a
# slice of the pre-filter frame (which raises SettingWithCopyWarning, currently
# hidden by the blanket warnings filter at the top of the notebook).
credit_df = credit_df[credit_df['person_emp_length'] <= 60].copy()

In [54]:
# Shape after the employment-length filter.
credit_df.shape

(28630, 15)

In [56]:
# First two remaining rows. The index was last reset before the row filters ran,
# so its labels may no longer start at 0 or be contiguous.
credit_df.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group,loan_amount_group
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low,small
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low,medium


In [57]:
# Frequency of each home-ownership category.
credit_df['person_home_ownership'].value_counts()

RENT        14547
MORTGAGE    11797
OWN          2192
OTHER          94
Name: person_home_ownership, dtype: int64

### Creating New Features 

In [58]:
# Create loan-to-income ratio
credit_df['loan_to_income_ratio'] = credit_df['loan_amnt'] / credit_df['person_income']

# Create loan-to-employment length ratio
credit_df['loan_to_emp_length_ratio'] =  credit_df['person_emp_length']/ credit_df['loan_amnt'] 

# Create interest rate-to-loan amount ratio
credit_df['int_rate_to_loan_amt_ratio'] = credit_df['loan_int_rate'] / credit_df['loan_amnt']

In [59]:
# NOTE: this binds a second name to the same DataFrame object; it is not a copy,
# so any later in-place change to credit_df is also visible through raw_data.
raw_data = credit_df
raw_data.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group,loan_amount_group,loan_to_income_ratio,loan_to_emp_length_ratio,int_rate_to_loan_amt_ratio
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low,small,0.104167,0.005,0.01114
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low,medium,0.572917,0.000182,0.00234


In [61]:
# Full column list, including the group and ratio columns added in this notebook.
raw_data.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length', 'age_group',
       'income_group', 'loan_amount_group', 'loan_to_income_ratio',
       'loan_to_emp_length_ratio', 'int_rate_to_loan_amt_ratio'],
      dtype='object')

In [64]:
# Write the processed frame to disk, with a header row and without the index.
# The target folder is created first so that a fresh checkout, where ./data does
# not exist yet, does not fail when pandas tries to open the output file.
output_path = Path('./data/pre_processed_credit.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
raw_data.to_csv(output_path, header=True, index=False)

In [67]:
# Read the exported file back in and preview it as a round-trip check.
# NOTE(review): CSV does not store pandas dtypes, so the *_group columns
# presumably come back as plain strings rather than categoricals — confirm if
# downstream code relies on the category ordering.
d = pd.read_csv("./data/pre_processed_credit.csv")
d.head(2)

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,age_group,income_group,loan_amount_group,loan_to_income_ratio,loan_to_emp_length_ratio,int_rate_to_loan_amt_ratio
0,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2,20-25,low,small,0.104167,0.005,0.01114
1,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3,20-25,low,medium,0.572917,0.000182,0.00234


In [68]:
# Shape of the reloaded file, for comparison with the frame that was exported.
d.shape

(28630, 18)