# Lending Club Project

## 1 Introduction:
- #### 1.1 Objective:Using Borrower’s credit profile, historical credit performance, credit behavior and macroeconomic condition to forecast its potential credit default behavior (Charged Off/Fully Paid) 

## 2 Data preprocessing:
- #### 2.1 Load data and select useful target values
- #### 2.2 Feature cleaning and Handle missing value
    - ##### Feature cleaning
        - ##### 2.2(a) Merge joint and not joint features
        - ##### 2.2(b) Covert revol_util to float
        - ##### 2.2(c) Add the value of two applicants for some joint feature
    - ##### Handle missing vaule
        - ##### 2.2(d) Remove meaningless data
- #### 2.3 Feature Engineering
- #### 2.4 Deal with outlier
- #### 2.5 Encode categorial features and feature normalization

## 3 Methology:
- #### 3.0 train test split and normalization
- #### 3.1 Logistic Regression model
- #### 3.2 Neural Network Model: MLP
- #### 3.3 Random Forest Model
- #### 3.4 Decision Tree Model

## 4 Conclusion:

In [4]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import quandl
import quandl

quandl.ApiConfig.api_key = '9y_-mxHB3Tj5WboS_z6W'
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

## 2 Data preprocessing

- ### 2.1 Load data and select useful target values
Only three loan status are used: “Charged-off”, “Default” and “Fully-paid”.
“Charged-off” and “Default” are categorized into the same class “Default”


In [5]:
############## Read raw data
#Load Data from data directory
file_name_list = ['LoanStats3a','LoanStats3b','LoanStats3c','LoanStats3d']
#                  'LoanStats_2016Q1','LoanStats_2016Q2','LoanStats_2016Q3',
#                  'LoanStats_2016Q4','LoanStats_2017Q1','LoanStats_2017Q2',
#                  'LoanStats_2017Q3','LoanStats_2017Q4']

#Create a raw data dataframe
raw_data = pd.DataFrame()

#For all data file in the file_name_list
for i in range(len(file_name_list)):
    #Read CSV file
    #Skip the first row; header=0 denotes the first line of data rather than the first line of the file
    temp = pd.read_csv('./data/'+file_name_list[i]+'.csv', 
                       skiprows=1,header=0, encoding = "ISO-8859-1")
    #Concatenate all all fles together
    raw_data = pd.concat([raw_data, temp])

It is a summary of the raw_data and explaination of features

In [6]:
print("There are",raw_data.shape[1],"features in the raw_data")


There are 144 features in the raw_data


#### Sample ratio portion of data, keep the "Fully Paid" and "Default" entries and remove entires with NaN values
- Only keep a random subset of ratio*total_number of rows
- Remove some rows with NaN values
- Only keep the "Fully Paid" and "Default" entries
- Remove entries with NaN values

In [7]:
ratio = 0.1
samplen = int(ratio*raw_data.shape[0])
#Only keep a random subset of ratio*total_number of rows
raw_data = raw_data.sample(n=samplen)
#re-index the subset
raw_data.reindex()

raw_data.index = np.arange(raw_data.shape[0])

#Remove some rows with NaN values
raw_data = raw_data[~raw_data['acc_open_past_24mths'].isna()]
raw_data = raw_data[~raw_data['tot_coll_amt'].isna()]

#Only keep the "Fully Paid" and "Default" entries 
raw_data = raw_data[(raw_data['loan_status']!='Late (31-120 days)') & 
                    (raw_data['loan_status']!= 'In Grace Period') &
                    (raw_data['loan_status']!='Current') &
                    (raw_data['loan_status']!='Late (16-30 days)') &
                    (~raw_data['loan_status'].isna())]

#Remove entries with NaN values
raw_data = raw_data.dropna(how='all')

In [7]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

### 2.2 Feature cleaning and Handle missing value

- ### Feature cleaning
    - #### 2.2(a) Merge joint and not joint features
    Check the existence of "joint" features; if so, replace the original feature by the joint one 


In [8]:
#Process joint features
raw_data['annual_inc'] = raw_data['annual_inc'].astype(float)
raw_data['annual_inc_joint'] = raw_data['annual_inc_joint'].astype(float)
joint_features1 = ['annual_inc','dti','verification_status','revol_bal']
for ft in joint_features1:
    ft_joint = ft+'_joint'
    #Check the existence of "joint" features; if so, replace the original feature by the joint one 
    if ft != 'verification_status':
        raw_data[ft] = raw_data.apply(lambda x: x[ft] if math.isnan(x[ft_joint]) else x[ft_joint], axis=1)
    else:
        raw_data[ft] = raw_data.apply(lambda x: x[ft_joint] if type(x[ft_joint]) is str else x[ft], axis=1)



   - #### 2.2(b) Covert revol_util to float
converting from XX% to a value between 0 and 1

In [9]:
#Process the revol_util feature: converting from XX% to a value between 0 and 1; Note that x[:-1] is to remove the % symbol
raw_data['revol_util'] = raw_data['revol_util'].apply(lambda x: float(x[:-1])/100 if type(x) is str else x/100)
raw_data['sec_app_revol_util'] = raw_data['sec_app_revol_util']/100


   - #### 2.2(c) Add the value of two applicants for some joint feature

In [10]:
#For some features about two applicants, add their values
joint_features2 = ['earliest_cr_line','inq_last_6mths','mort_acc','open_acc',
                   'revol_util','open_act_il','num_rev_accts',
                   'chargeoff_within_12_mths','collections_12_mths_ex_med',
                   'mths_since_last_major_derog']
for ft in joint_features2:
    ft_joint = 'sec_app_'+ft
    if ft != 'earliest_cr_line':
        raw_data[ft] = raw_data.apply(lambda x: x[ft] if math.isnan(x[ft_joint]) else x[ft]+x[ft_joint], axis=1)


   - #### 2.2(d) Feature cleaning

##### meaning for all the features

In [11]:
#Read the information about each column/feature from the col_info.csv file
col_info = pd.read_csv('./data/data_process.csv', index_col=0, encoding = "ISO-8859-1")
print(col_info)

                                                             description  \
id                         A unique LC assigned ID for the loan listing.   
member_id               A unique LC assigned Id for the borrower member.   
loan_amnt              The listed amount of the loan applied for by t...   
funded_amnt            The total amount committed to that loan at tha...   
funded_amnt_inv        The total amount committed by investors for th...   
...                                                                  ...   
settlement_status      The status of the borrowerâs settlement plan...   
settlement_date        The date that the borrower agrees to the settl...   
settlement_amount      The loan amount that the borrower has agreed t...   
settlement_percentage  The settlement amount as a percentage of the p...   
settlement_term        The number of months that the borrower will be...   

                      delete?        delete reason special process method  \
id        

##### Drop 'earliest_cr_line','sec_app_earliest_cr_line' features for now

In [28]:
#Drop 'earliest_cr_line','sec_app_earliest_cr_line' features for now
#drop the columns in the data; axis=1 means dropping the columns
raw_data = raw_data.drop(['earliest_cr_line','sec_app_earliest_cr_line'], axis=1)
#drop the columns in the feature description table ; axis=0 means dropping the rows
col_info = col_info.drop(['earliest_cr_line','sec_app_earliest_cr_line'], axis=0)

KeyError: "['earliest_cr_line' 'sec_app_earliest_cr_line'] not found in axis"

##### Remove the features that are annotated with "yes" in the "delete?" column in the col_info 

In [13]:
drop_list = col_info.index[col_info['delete?']=='yes']
raw_data = raw_data.drop(drop_list, axis=1)
col_info = col_info.drop(drop_list, axis=0)

##### Remove some joint features

In [27]:
#Remove some joint features
drop_list = col_info.index[(col_info['special process method']=='joint-feature') |
            (col_info['special process method']=='joint-feature;add to first applicant value') |
            (col_info['special process method']=='joint-feature;transform to month until loan issue data, add to first applicant value')]
raw_data = raw_data.drop(drop_list, axis=1)
col_info = col_info.drop(drop_list, axis=0)

### Handle missing value
- ##### 2.2(d) Remove meaningless data
    - ##### Remove features if they contain 50% missing values

In [15]:
#Remove features if they contain 50% missing values 
drop_list = col_info.index[col_info['Misssing Percent%'] > 50]
raw_data = raw_data.drop(drop_list, axis=1)
col_info = col_info.drop(drop_list, axis=0)

   - ##### Process the feature according to the instruction  in col_info

In [16]:
#"if not 0, set as 1"
select_fts = col_info.index[col_info['special process method']==
                               'if not 0, set as 1']
for ft in select_fts:
    raw_data[ft] = raw_data[ft].apply(lambda x: '1' if x!=0 else str(x))

#"if nan, set as 0; else set as 1"
raw_data['emp_title'] = raw_data['emp_title'].apply(lambda x: 0 if type(x) is float else 1)

#"if nan, set as 0; else transform to 1/(1+x)"
select_fts = col_info.index[col_info['special process method']==
                                 'if nan, set as 0; else transform to 1/(1+x)']
for ft in select_fts:
    raw_data[ft] = raw_data[ft].apply(lambda x: 0 if math.isnan(x) else 1/(1+x))

#"if nan use median value of zipcode"
select_fts = col_info.index[col_info['special process method']=='if nan use median value of zipcode']
for ft in select_fts:
     ft_dict = (raw_data[ft].groupby(by=raw_data.zip_code).median()).to_dict()
     raw_data[ft] = raw_data.apply(lambda x: ft_dict[x.zip_code] if math.isnan(x[ft]) else x[ft], axis=1)


 - ##### For home ownership features, merge the two infrequent "ANY" and "NONE" labels

In [17]:
raw_data['home_ownership'] = raw_data['home_ownership'].apply(lambda x: 'OTHER'
                             if x in ['ANY','NONE'] else x)

- ##### Remove policy code feature, because all rows have the same value

In [18]:
#Remove policy code feature, because all rows have the same value
raw_data = raw_data.drop(['policy_code'], axis=1)
col_info = col_info.drop(['policy_code'], axis=0)

In [19]:
number_of_feature_row = raw_data.shape[1]
print("After handling the missing value from",file_name_list,",there are",number_of_feature_row,"features")

After handling the missing value from ['LoanStats3a', 'LoanStats3b', 'LoanStats3c', 'LoanStats3d'] ,there are 60 features


### 2.3 Feature Engineering

#### Make a copy of the raw_data named X_loan and remove entries with NaN value 

In [20]:
X_loan = raw_data.copy()
#Remove entries with NaN values
X_loan.dropna(inplace=True)

#Check the sizes of the raw data, the processed data and the feature info
print("By checking the size of the raw data, there are ",raw_data.shape[0],"rows and",raw_data.shape[1],"columns")
print("By checking the size of the X_loan data, there are",X_loan.shape[0],"rows and",X_loan.shape[1],"columns")
print("By checking the size of the col_info, there are",col_info.shape[0],"rows and",col_info.shape[1],"columns")

By checking the size of the raw data, there are  76038 rows and 60 columns
By checking the size of the X_loan data, there are 69534 rows and 60 columns
By checking the size of the col_info, there are 60 rows and 7 columns


### Zipcode feature engineering
- Map the first three digits to mean household income（x[:3] means the first three digits in the zipcode string）
- Add the feature info and index to col_info（Two "no"s meaning "dont delete" and "non-categorical"）
- Add the marcoeconomic feature 
- Add the price level of state info

In [21]:
def get_zip2inc(path='./data/ZIP.csv'):
    
    """
    Returns:
        zip_to_inc: dict, map the first three digits to mean household income
    """
    
    zip_data = pd.read_csv(path)
    zip_data.Zip = zip_data.Zip.apply(lambda x: "{:0>5}".format(x)) # 100->00100 padding the zip into 5 space number string
    
    def helper(x):

        x = x.split(',')
        new_x = ""
        for s in x:
            new_x += s
        
        try:
            rtn_value = int(new_x)
        except ValueError:
            rtn_value = new_x
            
        return rtn_value
    
    zip_data.Mean = zip_data.apply(lambda x: x.Median if x.Mean=='.' else x.Mean, axis=1)
    for ft in ['Mean','Pop']:
        zip_data[ft] = zip_data[ft].apply(helper)
    
    # map the three digit zip code to population-weighted mean household income
    zip_data['Zip_3'] = zip_data.Zip.apply(lambda x: x[:3])
    
    zip_to_inc = {}
    for zip_code, df in zip_data.groupby('Zip_3'):
        zip_to_inc[zip_code] = (df['Mean'] * df['Pop']).sum() / df['Pop'].sum()
    
    return zip_to_inc


#Start processing the zipcode feature
zip_to_inc = get_zip2inc()

#Get the first three digits and map to mean household income; x[:3] means the first three digits in the zipcode string
#Add to the dataframe
X_loan['mean_household_inc'] = X_loan.zip_code.apply(lambda x: zip_to_inc[x[:3]]
                                    if x[:3] in zip_to_inc.keys() else np.nan)
#Add the feature info and index to col_info
#Two "no"s meaning "dont delete" and "non-categorical"
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['mean_household_inc'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)


#### macroeconomic environment feature:
- #### Barron's Confidence:
yield(AAA) / yiedl(BBB), ML/AAAEY, ML/BBBEY from quandl

In [22]:
def get_confi_idx(df_loan):
    
    """
    MapE the issue_d to the macroeconomic data at that time via quandl.
    Barron's Confidence: yield(AAA) / yiedl(BBB), ML/AAAEY, ML/BBBEY from quandl
    
    Returns:
        pandas.Series
    """
    
    AAA_yield_data = quandl.get("ML/AAAEY", collapse="monthly", start_date="2001-01-01", end_date="2017-12-31")
    BBB_yield_data = quandl.get("ML/BBBEY", collapse="monthly", start_date="2001-01-01", end_date="2017-12-31")
    Barron_confi_idx = AAA_yield_data.BAMLC0A1CAAAEY / BBB_yield_data.BAMLC0A4CBBBEY
    confi_idx_dict = {}
    for d, idx in Barron_confi_idx.iteritems():
        str_d = d.strftime("%b-%Y")
        confi_idx_dict[str_d] = idx
    
    return df_loan.issue_d.apply(lambda x: confi_idx_dict[x])

#Map issue_d to Barron_confi_idx 
#Add confi_idx feature to the dataframe 
X_loan['confi_idx'] = get_confi_idx(X_loan)

#Add the feature info and index to col_info
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]], index=['confi_idx'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)

#### Add the price level of state feature and some extra features
- RPP(regional price parities) --> the regional price parities according to the state
- Satacct_num_to_totacct_num(ratio of satisfactory accounts) --> num of satisfactory/num of all open account
- Revbalgt0_num_to_revtrd_num(ratio between the  number of revolving trades with balance --> 0 and the number of currently active revolving trades) --> num_rev_tl_bal_gt_0/num_actv_rev_tl
- Extra feature: loan_inc_ratio(the ratio of loan amount to annual income) -->  loan amount / annual income
- Extra feature: loan_rev_ratio(the ratio of loan amount to revol_balance) --> loan amount / revol balance 

In [23]:
#Read the price level of state info from states.csv file
states = pd.read_csv("./data/states.csv", index_col=0)
abb_to_full = { str(states.loc[idx][0]) : str(idx)  for idx in states.index}

rpp_data = pd.read_csv("./data/download.csv", header=4, usecols=['GeoName', 'LineCode', '2010'])
rpp_all_data = rpp_data[rpp_data['LineCode'] == 1.0]
full_to_rpp = {rpp_all_data.loc[idx, 'GeoName'] : rpp_all_data.loc[idx, '2010'] for idx in rpp_all_data.index}

def process_RPP_via_abbre_state(df_loan, abb2full, full2RPP):
    
    """
    Process the SARPP Regional Price Parities by states for each loan
    The Regional Price Parities measure the price level of the area. 
    (For the sake of simpilicity, use the 2010 data)
    
    Source: https://www.bea.gov/data/prices-inflation/regional-price-parities-state-and-metro-area
    
    Args:
        df_loan: pandas.DateFrame, the original data
        abb2full: dictionary, map the abbreviation to full name of states, e.g. "AK" : "Alaska"
        full2RPP: dictionary, map the full name of states to the RPP at 2010
        
    Returns:
        RPP: pandas.Series, the regional price parities according to the state   
    """
    
    RPP = df_loan.addr_state.apply(lambda x: full2RPP[ abb2full[x] ] )
    return RPP

#Add the RPP feature to the dataframe
X_loan['RPP'] = process_RPP_via_abbre_state(X_loan, abb_to_full, full_to_rpp)
#Add the feature info and index to col_info
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]], index=['RPP'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)


#Compute the ratio of satisfactory accounts
temp = X_loan['num_sats']/X_loan['open_acc']
#Deal with inf values
temp[(temp==np.inf) | (temp==-np.inf)] = 100
#Add the ratio feature to dataframe
X_loan['satacct_num_to_totacct_num'] = temp
#Add the feature info and index to col_info
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['satacct_num_to_totacct_num'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)


#Compute the ratio between the  number of revolving trades with balance > 0 and the number of currently active revolving trades
temp = X_loan['num_rev_tl_bal_gt_0']/X_loan['num_actv_rev_tl']
#Deal with inf values
temp[(temp==np.inf) | (temp==-np.inf)] = 100
#Add the ratio feature to dataframe
X_loan['revbalgt0_num_to_revtrd_num'] = temp
#Add the feature info and index to col_info
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['revbalgt0_num_to_revtrd_num'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)


#Compute extra feature: loan amount / annual income
temp = X_loan['loan_amnt']/X_loan['annual_inc']
temp[(temp==np.inf) | (temp==-np.inf)] = 20
X_loan['loan_inc_ratio'] = temp
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['loan_inc_ratio'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)

#X_loan['sqrt_loan_inc_ratio'] = np.sqrt(temp)
#ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['sqrt_loan_inc_ratio'], columns=col_info.columns)
#col_info = pd.concat([col_info, ft_inform], axis=0)

#X_loan['ln_loan_inc_ratio'] = np.log(temp)
#temp[(temp==np.inf) | (temp==-np.inf)] = -20
#ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['ln_loan_inc_ratio'], columns=col_info.columns)
#col_info = pd.concat([col_info, ft_inform], axis=0)

##Compute extra feature: 1/dti, sqrt(dti)
#X_loan['r_dti'] = 1/X_loan['dti']
#temp[(temp==np.inf) | (temp==-np.inf)] = 20
#ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['r_dti'], columns=col_info.columns)
#col_info = pd.concat([col_info, ft_inform], axis=0)

#X_loan['sqrt_dti'] = np.sqrt(X_loan['dti'])
#ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['sqrt_dti'], columns=col_info.columns)
#col_info = pd.concat([col_info, ft_inform], axis=0)


#Compute extra feature: loan amount / revol balance
temp = X_loan['loan_amnt']/X_loan['revol_bal']
temp[(temp==np.inf) | (temp==-np.inf)] = 20
X_loan['loan_rev_ratio'] = temp
ft_inform = pd.DataFrame([[np.nan,'no',np.nan,np.nan,'no',0,0]],index=['loan_rev_ratio'], columns=col_info.columns)
col_info = pd.concat([col_info, ft_inform], axis=0)

#Drop some useless features
drop_list = ['issue_d','zip_code','addr_state']
X_loan = X_loan.drop(drop_list, axis=1)
col_info = col_info.drop(drop_list, axis=0)
X_loan = X_loan.drop(['tot_hi_cred_lim'], axis=1)
col_info = col_info.drop(['tot_hi_cred_lim'], axis=0)

X_loan.dropna(inplace=True)
X_loan.fillna(0,inplace=True)


#### Generate labels for moel trainning 

In [24]:
y_loan = X_loan['loan_status'].apply(lambda x: 0 if x=='Fully Paid' else 1)
#Remove label column from data and col_info
X_loan = X_loan.drop(['loan_status'], axis=1)
col_info = col_info.drop(['loan_status'], axis=0)

In [30]:
print("After adding the marcoeconomic environment feature, price level state features and some extra features")
print("There are",X_loan.shape[1],"features")
print(X_loan['loan_status'])

After adding the marcoeconomic environment feature, price level state features and some extra features
There are 62 features


KeyError: 'loan_status'

- ### 2.5 Encode categorial features and feature normalization
 - use the function OneHotEncoder from sklearn.preprocessing to encode the categorial features 

In [26]:
from sklearn.preprocessing  import OneHotEncoder

#Find all unique values of each categorical feature
#Select categorical features in col_info
categorical_list = col_info.index[~(col_info.categorical=='no')]
unique_values = []
for ft in categorical_list:
    unique_values.append(X_loan[ft].unique())

# One hot encoding of categorical features
enc = OneHotEncoder(categories=unique_values)
X_cat = X_loan[categorical_list]
enc.fit(X_cat)
X_cat_transform = enc.transform(X_cat).toarray()

#Add new columns into dataframe
col_list = []
for i in range(len(categorical_list)):
    ft = categorical_list[i]
    uniq_vals = unique_values[i]
    for j in range(len(uniq_vals)):
        col_list.append(ft+'_'+str(uniq_vals[j]))
X_cat_transform = pd.DataFrame(X_cat_transform, index=X_loan.index,
                               columns=col_list)
X_loan = pd.concat([X_loan, X_cat_transform], axis=1)

#Remove the original categorical columns
X_loan = X_loan.drop(categorical_list, axis=1)

print(X_loan.shape)
print(y_loan.shape)

ValueError: Unsorted categories are not supported for numerical categories

## 3 Methology

- #### 3.0 train test split and normalization

In [None]:
from sklearn.model_selection import train_test_split

#Split train/test data
X_train, X_test, y_train, y_test = train_test_split(X_loan, y_loan, test_size = 0.25) 

def standardize_numerical_features(X, features, n_features, c_features):
    
    """
    Use StandardScaler to standardize the numerical features in the dataset.
    
    Args:
        X: pd.DataFrame
        features: all features
        n_features: list, names of numerical features
        c_features: list, names of categorical features
        
    Returns: 
        tuple(scaler object, X_std), X_std is a ndarray (numerical, categorical)
        
    """
    
    from sklearn.preprocessing import StandardScaler
    
    # Use standardize scaler to transform the numerical features
    X = pd.DataFrame(X, columns=features)
    X.fillna(X.mean(), inplace=True)
    scaler = StandardScaler()
    X_std = scaler.fit_transform(X[n_features])
    # Concatenate the numerical and categorical features into ndarray
    X_std = np.append(X_std, X[c_features].values, axis=1)
    
    return (scaler, X_std)

def transform_numerical_features(sc, X, features, n_features, c_features):
    
    """
    Use the fitted StandardScaler to transform the numerical features of dataset
    """
    X = pd.DataFrame(X, columns=features)
    X.fillna(X.mean(), inplace=True)
    X_std = sc.transform(X[n_features])
    X_std = np.append(X_std, X[c_features].values, axis=1)
    
    return X_std



#Fit the normalizer only on training data
numerical_list = X_loan.columns #col_info.index[col_info.categorical=='no']
categorical_list = [] #col_list
sc, X_train_std = standardize_numerical_features(X_train, X_loan.columns,
                                                 numerical_list, categorical_list)
#Perform the scaler on test data
X_test_std = transform_numerical_features(sc, X_test, X_loan.columns,
                                          numerical_list, categorical_list)

##### Transfer the training and test dataset into certain type which could fit in the machine learning model 

In [None]:
def conf_mat_stats(conf_mat, type_str):
    TN = conf_mat[0][0]
    TP = conf_mat[1][1]
    FN = conf_mat[0][1]
    FP = conf_mat[1][0]
    print("In the"+type_str+"set, the confusion matrix is: \n{}".format(conf_mat))

def auc_roc_evaluate(fpr, tpr, type_str):
    
    from sklearn.metrics import auc
    
    roc_auc = auc(fpr, tpr)

    print("AUC: ")
    print(roc_auc)
    print("\n")

    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic: '+type_str)
    plt.legend(loc="lower right")
    plt.show()


def train_test_evaluate(clf):
    
    from sklearn.metrics import roc_curve
    global X_train_std, X_test_std
    
    y_train_predict = clf.predict(X_train_std)
    train_confusion_m = confusion_matrix(y_train, y_train_predict)
    train_acc = accuracy_score(y_train, y_train_predict)
    conf_mat_stats(train_confusion_m, 'train')
    print("\n")
    
    y_test_predict = clf.predict(X_test_std)
    test_confusion_m = confusion_matrix(y_test, y_test_predict)
    test_acc = accuracy_score(y_test, y_test_predict)
    conf_mat_stats(test_confusion_m, 'test')
    print("\n")

    print("Accuracy: Train:{:.2f}% Test:{:.2f}%".format(train_acc*100, test_acc*100))

    # roc curve    
    y_train_prob = clf.predict_proba(X_train_std)
    y_test_prob = clf.predict_proba(X_test_std)
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_prob[:,1])
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_prob[:,1])
    
    auc_roc_evaluate(fpr_train, tpr_train, 'train')
    auc_roc_evaluate(fpr_test, tpr_test, 'test')
    
    return (y_train_predict, y_test_predict)


#Replace inf values; transform numpy array to dataframes
X_train_std = pd.DataFrame(X_train_std, columns=X_loan.columns)
X_test_std = pd.DataFrame(X_test_std, columns=X_loan.columns)
X_train_std.replace([np.inf], 20)                                                                                           
X_train_std.replace([-np.inf], -20)                                                                                           
X_test_std.replace([np.inf], 20)
X_test_std.replace([-np.inf], -20)

#Transform dataframes to numpy arrays
X_train_std = X_train_std.values
X_test_std = X_test_std.values

- #### 3.1 Logistic Regression model



In [None]:
#L2 Regularization hyperparameter
parameters = {'C':[0.01, 1]}

#Logistic regression classifier with L2, n_jobs is the number of threads used. Use 2 or 4 if running on your laptop
lr = LogisticRegression(penalty='l2', n_jobs=40)

#Three fold cross-validation for Grid search; verbose=10 meaning output all information
gscv_lr = GridSearchCV(lr, parameters, cv=3, verbose=10)
gscv_lr.fit(X_train_std, y_train)

#Evaluation
train_test_evaluate(gscv_lr)
print(gscv_lr.best_params_)

- #### 3.2 Neual Network MLP Model

In [None]:
#Neural Network Model: MLP
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier(alpha=1)
parameters = {'alpha':[0.5,1,2,4], 'solver':['lbfgs'], 'hidden_layer_sizes':[5,10,20]}
gscv_mlp = GridSearchCV(MLP, parameters, cv=3, verbose=10)
gscv_mlp.fit(X_train_std, y_train)

train_test_evaluate(gscv_mlp)
print(gscv_mlp.best_params_)

- #### 3.3 Random Forest Model

In [None]:
#Random Forest
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=100, max_depth=15, n_jobs = 40)
parameters = {'n_estimators':[200], 'min_samples_leaf':[5], 'max_depth':[10]}
gscv_rf = GridSearchCV(RF, parameters, cv=3, verbose=10)
gscv_rf.fit(X_train_std, y_train)

train_test_evaluate(gscv_rf)
print(gscv_rf.best_params_)

importances = gscv_rf.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train_std.shape[1]):
        print("%d. feature %s (%f)" % (f + 1, X_loan.columns[indices[f]], importances[indices[f]]))


- #### 3.4 Decision Tree Model

In [None]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier(max_depth=15)
parameters = {'max_depth':[10,20,30], 'min_samples_leaf':[5,10]}
gscv_dt = GridSearchCV(DT, parameters, cv=3, verbose=10)
gscv_dt.fit(X_train_std, y_train)

train_test_evaluate(gscv_dt)
print(gscv_dt.best_params_)

importances = gscv_dt.best_estimator_.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(X_train_std.shape[1]):
        print("%d. feature %s (%f)" % (f + 1, X_loan.columns[indices[f]], importances[indices[f]]))
