In [1]:
import math
from pathlib import Path

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import shap
from scipy import stats
from sklearn import preprocessing
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [3]:
# Notebook-wide pandas configuration.
# Chained-assignment warnings are switched off (pandas' default is 'warn');
# the display limits are raised so wide/long frames render without truncation.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 150)

In [4]:
# Load the raw loan file into memory.
new_data = pd.read_csv(
    'accepted_2007_to_2018Q4.csv',
    low_memory=False,  # parse the file in one pass instead of chunk-wise type inference
)

In [5]:
# Dimensions of the raw frame as a (rows, columns) tuple.
new_data.shape

(2195670, 151)

In [6]:
# Response/target variable: number of rows recorded under each loan_status value.

new_data.loan_status.value_counts()

Fully Paid                                             1048714
Current                                                 851419
Charged Off                                             259849
Late (31-120 days)                                       20652
In Grace Period                                           8055
Late (16-30 days)                                         4164
Does not meet the credit policy. Status:Fully Paid        1988
Does not meet the credit policy. Status:Charged Off        761
Default                                                     37
Name: loan_status, dtype: int64

In [None]:
# Build the modelling frame: every loan whose status is known and is not 'Current'.
# NOTE(review): this keeps the 'Late', 'In Grace Period' and
# 'Does not meet the credit policy' rows as well, and everything without
# "Fully" in its status is labelled 1 below. If only Fully Paid / Default /
# Charged Off loans are wanted, the filter needs tightening -- confirm intent.
resp = ['Current']
# Rows with a missing loan_status have no observed outcome. They are dropped
# explicitly: otherwise they pass the `~isin` filter and np.where treats the
# NaN returned by str.contains as truthy, silently labelling them 0.
has_outcome = new_data['loan_status'].notna() & ~new_data['loan_status'].isin(resp)
Loan_Main = new_data.loc[has_outcome]  # boolean selection already yields a new frame

# Binary target: 0 = status text contains "Fully", 1 = any other retained status.
Loan_Main['loan_status'] = np.where(Loan_Main['loan_status'].str.contains("Fully"), 0, 1)

# Changing employment length to numerical.
# Missing values are treated as "< 1 year", which is encoded as 0.5.
dict_emp_length = {
    '< 1 year': 0.5,
    '1 year': 1,
    '2 years': 2,
    '3 years': 3,
    '4 years': 4,
    '5 years': 5,
    '6 years': 6,
    '7 years': 7,
    '8 years': 8,
    '9 years': 9,
    '10+ years': 10,
}
# Assign the result back to the column. The previous chained
# `Loan_Main['emp_length'].fillna(..., inplace=True)` form operates on a
# temporary Series under pandas Copy-on-Write and would leave the frame unchanged.
# `.map` also yields a proper float column rather than an object column.
Loan_Main['emp_length'] = Loan_Main['emp_length'].fillna("< 1 year").map(dict_emp_length)
assert Loan_Main['emp_length'].notna().all(), \
    "emp_length contains a label that dict_emp_length does not cover"

# Separating Loan_Main into two frames based on application_type (Individual / Joint App).
grouped = Loan_Main.groupby('application_type')
Individual_Loan = grouped.get_group("Individual")
Joint_Loan = grouped.get_group("Joint App")

In [None]:
# (rows, columns) of the individual and joint-application subsets.
Individual_Loan.shape, Joint_Loan.shape

In [None]:
# Changing purpose into numerical codes using LabelEncoder.
# `enc` stays fitted afterwards, so `enc.classes_` / `enc.inverse_transform`
# can map the integer codes back to the original purpose labels.
col = ['purpose']
enc = LabelEncoder()
# Replace the whole column instead of writing through `.loc[:, col] = ...`:
# on pandas >= 2.0 the `.loc` form writes into the existing object-dtype
# column in place, so the codes would be stored as Python objects rather
# than as an integer column.
Individual_Loan[col[0]] = enc.fit_transform(Individual_Loan[col[0]])

In [None]:
# Loans issued over the years.

# Parse the issue date so the year can be extracted with the .dt accessor.
Individual_Loan['issue_d'] = pd.to_datetime(Individual_Loan['issue_d'])

# One bar per calendar year, in chronological order.
loans_per_year = Individual_Loan['issue_d'].dt.year.value_counts().sort_index()
ax = loans_per_year.plot.bar()
ax.set_title('Loans Issued Per Year')
ax.set_xlabel('Financial Year')
ax.set_ylabel('Count')
plt.tight_layout()

plt.show()

In [None]:
# Out-of-time split: loans issued before the cut-off date go to training,
# loans issued on or after it go to test.
frac = 0.85
# Compute the cut-off once; it was previously recomputed for each side of the split.
split_date = Individual_Loan['issue_d'].quantile(frac)
# Many loans share the same issue_d value, so the realised train share only
# approximates `frac`.
# Boolean-mask selection returns new frames, so no explicit copy is needed.
# Rows whose issue_d is missing satisfy neither comparison and land in neither split.
df_train = Individual_Loan.loc[Individual_Loan['issue_d'] < split_date]
df_test = Individual_Loan.loc[Individual_Loan['issue_d'] >= split_date]

In [12]:
# Save both splits as zip-compressed CSVs to save space
# (pandas infers the compression from the ".zip" suffix).
# Create the output directory first: to_csv does not create missing parent
# directories, so on a fresh checkout the writes would otherwise fail.
Path('data').mkdir(parents=True, exist_ok=True)
df_test.to_csv('data/df_test.csv.zip')
df_train.to_csv('data/df_train.csv.zip')

In [13]:
# (rows, columns) of the training and test splits.
df_train.shape,df_test.shape

((1106465, 151), (209622, 151))