In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
borrower_table = pd.read_csv('./borrower_table.csv')
loan_table = pd.read_csv('./loan_table.csv',parse_dates=['date'])

In [3]:
borrower_table.head()

Unnamed: 0,loan_id,is_first_loan,fully_repaid_previous_loans,currently_repaying_other_loans,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number
0,289774,1,,,8000,0.49,3285,1073,0,0,47,3
1,482590,0,1.0,0.0,4500,1.03,636,5299,1,13500,33,1
2,135565,1,,,6900,0.82,2085,3422,1,24500,38,8
3,207797,0,1.0,0.0,1200,0.82,358,3388,0,0,24,1
4,828078,0,0.0,0.0,6900,0.8,2138,4282,1,18100,36,1


In [4]:
borrower_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101100 entries, 0 to 101099
Data columns (total 12 columns):
loan_id                                            101100 non-null int64
is_first_loan                                      101100 non-null int64
fully_repaid_previous_loans                        46153 non-null float64
currently_repaying_other_loans                     46153 non-null float64
total_credit_card_limit                            101100 non-null int64
avg_percentage_credit_card_limit_used_last_year    94128 non-null float64
saving_amount                                      101100 non-null int64
checking_amount                                    101100 non-null int64
is_employed                                        101100 non-null int64
yearly_salary                                      101100 non-null int64
age                                                101100 non-null int64
dependent_number                                   101100 non-null int64
dtypes: fl

In [5]:
loan_table.head()

Unnamed: 0,loan_id,loan_purpose,date,loan_granted,loan_repaid
0,19454,investment,2012-03-15,0,
1,496811,investment,2012-01-17,0,
2,929493,other,2012-02-09,0,
3,580653,other,2012-06-27,1,1.0
4,172419,business,2012-05-21,1,0.0


In [6]:
loan_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101100 entries, 0 to 101099
Data columns (total 5 columns):
loan_id         101100 non-null int64
loan_purpose    101100 non-null object
date            101100 non-null datetime64[ns]
loan_granted    101100 non-null int64
loan_repaid     47654 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 3.9+ MB


# Data Processing

In [7]:
data = borrower_table.merge(loan_table,left_on='loan_id', right_on='loan_id')

In [8]:
# choose data with loan granted 
# the data with loan not granted is not useful for profit calculation
data = data[data['loan_granted']==1]
data.head()

Unnamed: 0,loan_id,is_first_loan,fully_repaid_previous_loans,currently_repaying_other_loans,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number,loan_purpose,date,loan_granted,loan_repaid
2,135565,1,,,6900,0.82,2085,3422,1,24500,38,8,other,2012-07-16,1,1.0
5,423171,1,,,6100,0.53,6163,5298,1,29500,24,1,other,2012-11-07,1,1.0
7,200139,1,,,4000,0.57,602,2757,1,31700,36,8,business,2012-09-19,1,0.0
8,991294,0,1.0,0.0,7000,0.52,2575,2917,1,58900,33,3,emergency_funds,2012-12-04,1,1.0
9,875332,0,1.0,0.0,4300,0.83,722,892,1,5400,32,7,business,2012-01-20,1,1.0


In [9]:
# checking missing values
missing = data.isnull().sum()
missing[missing>0]

fully_repaid_previous_loans                        25789
currently_repaying_other_loans                     25789
avg_percentage_credit_card_limit_used_last_year      903
dtype: int64

In [10]:
# parse date information and extract month, week, and dayofweek information
data['month'] = data['date'].apply(lambda x: x.month)
data['week'] = data['date'].apply(lambda x: x.week)
data['dayofweek'] = data['date'].apply(lambda x: x.dayofweek)

# drop useless features
data = data.drop(labels=['loan_id', 'date', 'loan_granted'], axis=1, inplace=False)

In [11]:
# there is a clear connection between 'is_first_loan' 
# and 'fully_repaid_previous_loans', 'currently_repaying_other_loans'
print(data[data['is_first_loan'] == 1]['fully_repaid_previous_loans'].unique())
print(data[data['is_first_loan'] == 1]['currently_repaying_other_loans'].unique())

[nan]
[nan]


In [12]:
# fill missing values with -1 for 'fully_repaid_previous_loans' and 'currently_repaying_other_loans'
data = data.fillna({'fully_repaid_previous_loans': -1, 'currently_repaying_other_loans': -1})

In [13]:
median = data['avg_percentage_credit_card_limit_used_last_year'].median()
data = data.fillna({'avg_percentage_credit_card_limit_used_last_year':median})

In [14]:
# encoding 'loan_purpose'
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

encoder = LabelEncoder()
data['loan_purpose'] = encoder.fit_transform(data['loan_purpose'])

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [15]:
# define categorical features
categorical_feature = ['loan_purpose', 'is_first_loan', 'fully_repaid_previous_loans', 
                       'currently_repaying_other_loans', 'is_employed']
target = 'loan_repaid'

data[categorical_feature] = data[categorical_feature].astype('category')
data[target] = data[target].astype('category')

In [16]:
# split into training and test data set
X = data.drop('loan_repaid', axis=1)
y = data['loan_repaid']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [17]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=50,
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    #min_weight_fraction_leaf=0.0,
    max_features='auto',
    #max_leaf_nodes=None,
    #min_impurity_decrease=0.0,
    #min_impurity_split=None,
    #bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=0,
    #verbose=0,
    #warm_start=False,
    #class_weight='balanced'
)

In [18]:
clf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [19]:
feature_importance_values = clf.feature_importances_
#feature_importances = pd.DataFrame({'feature': features, 'importance': feature_importance_values})


In [20]:
feature_importance_values

array([1.56999823e-03, 3.65311854e-03, 7.66062977e-02, 7.83588364e-02,
       1.96428151e-02, 3.14336076e-01, 3.00180946e-01, 5.52419069e-02,
       1.39378007e-01, 2.85443407e-04, 7.00226251e-03, 3.10807072e-03,
       2.26483427e-04, 2.43595772e-04, 1.66142507e-04])

In [21]:
predictions = clf.predict(X_test)

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
cv = StratifiedKFold(n_splits=5, random_state=123, shuffle=True)
#results = pd.DataFrame(columns=['training_score', 'test_score'])
fprs, tprs, scores = [], [], []
    
for (train, test), i in zip(cv.split(X_train, y_train), range(5)):
    clf.fit(X.iloc[train], y.iloc[train])
    _, _, auc_score_train = compute_roc_auc(train)
    fpr, tpr, auc_score = compute_roc_auc(test)
    scores.append((auc_score_train, auc_score))
    fprs.append(fpr)
    tprs.append(tpr)

plot_roc_curve(fprs, tprs);
pd.DataFrame(scores, columns=['AUC Train', 'AUC Test'])

NameError: name 'compute_roc_auc' is not defined

In [23]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [56]:
print('Accuracy: ', accuracy_score(y_test, predictions))
print('confusion matrix:', confusion_matrix(y_test, predictions))
print('F1:', f1_score(y_test, predictions))

Accuracy:  0.9121201947288904
confusion matrix: [[3685  552]
 [ 495 7182]]
F1: 0.932061514502628
