In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import itertools
import os
import re
from collections import namedtuple
from pathlib import Path

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# One seeded NumPy RandomState that is handed as `random_state` to the sampling,
# splitting, estimator and search calls in this notebook. It is a single stateful
# object: code that draws from it advances its state, so results are only
# reproducible when the cells are run in the same order from a fresh kernel.
_PRNG = np.random.RandomState(0)

In [2]:
# Location of the raw quarterly LendingClub exports and of the cached,
# already-preprocessed dataset that this cell builds from them.
_DATA_DIR = "./data"
_FILE_PROCESSED_DATASET = "lendingclub_2016-2018_processed.csv"
_FILE_PATH_PROCESSED_DATASET = Path(_DATA_DIR + "/" + _FILE_PROCESSED_DATASET)

# Set to True when the cached CSV was used instead of rebuilding from raw files
_IS_PROCESSED_DATASET = False

df = None
# Read in the data, check if a preprocessed dataset exists
if _FILE_PATH_PROCESSED_DATASET.is_file():
    print("Preprocessed dataset found, reading preprocessed dataset")
    _IS_PROCESSED_DATASET = True
    df = pd.read_csv(_FILE_PATH_PROCESSED_DATASET)
else:
    # Read all data
    print("Preprocessed dataset not found, reading all data")
    _dataset_loan_columns = ["loan_amnt",
                             "int_rate",
                             "term",
                             "grade",
                             "sub_grade",
                             "installment",
                             "annual_inc",
                             "loan_status",
                             "verification_status",
                             "purpose"]

    # One dataframe per raw file, keyed by the "<year>Q<quarter>" tag in its name
    dataset_loans = {}
    for file_name in os.listdir(_DATA_DIR):
        if not file_name.endswith(".csv"):
            continue
        # Quarters run from Q1 to Q4; files without such a tag are not raw exports
        period_match = re.search(r'\d{4}Q[1-4]', file_name)
        if period_match is None:
            print("Skipping (no <year>Q<quarter> tag in file name):", file_name)
            continue
        loan_period = period_match.group(0)
        full_file_path = _DATA_DIR + "/" + file_name
        print(full_file_path)
        # Re-index with the column list so every frame has the same column order
        dataset_loans[loan_period] = pd.read_csv(full_file_path, usecols=_dataset_loan_columns)[_dataset_loan_columns]

    if not dataset_loans:
        raise FileNotFoundError(f"No quarterly LendingClub CSV files found in {_DATA_DIR}")

    # Combine all the data into a single dataframe
    # (a single concat; DataFrame.append is gone from current pandas and copied on every call)
    print("Combining all the data into a single dataframe")
    df = pd.concat(list(dataset_loans.values()), ignore_index=True)

    del dataset_loans

    # Convert column types
    print("Converting column types")
    df["loan_amnt"] = df["loan_amnt"].astype(int)
    df["annual_inc"] = df["annual_inc"].astype(int)
    # int_rate is a string with one trailing character (e.g. "17.27%") that is stripped
    df["int_rate"] = df["int_rate"].apply(lambda int_rate: float(int_rate[:-1]))
    # term is a string containing the number of months; keep the first run of digits
    df["term"] = df["term"].apply(lambda term: int(re.search(r'\d+', term).group(0)))

    # Remove outliers
    # Annual income must be greater than 0 and less than 1 million
    # Assume 0 is an outlier
    # Assume millionaires+ are outliers
    print("Removing outliers")
    # .copy() so the new columns below are added to an independent frame, not a slice
    df = df.loc[(df["annual_inc"] > 0) & (df["annual_inc"] < 1000000)].copy()

    # Convert non-ordinal categorical variables to binary vectors
    # Column order follows first appearance in the data (Series.unique order)
    print("Convert non-ordinal categorical variables to binary vectors (this can take a while)")
    verification_statuses = df["verification_status"].unique()
    purposes = df["purpose"].unique()
    total_columns_to_process = len(verification_statuses) + len(purposes)
    columns_processed = 0
    for verification_status in verification_statuses:
        columns_processed += 1
        print(f"[{columns_processed}/{total_columns_to_process}]", verification_status)
        # Vectorized comparison instead of a Python call per row
        df["vs_" + verification_status] = (df["verification_status"] == verification_status).astype(int)
    for purpose in purposes:
        columns_processed += 1
        print(f"[{columns_processed}/{total_columns_to_process}]", purpose)
        df["p_" + purpose] = (df["purpose"] == purpose).astype(int)

    # The original categorical columns are now fully represented by the vs_*/p_* indicators
    df = df.drop(columns=["verification_status", "purpose"])

    # Save processed dataset for future use
    print("Saving processed dataset")
    df.to_csv(_FILE_PATH_PROCESSED_DATASET, index=False, header=True)

print("Done!")

Preprocessed dataset found, reading preprocessed dataset
Done!


In [3]:
# Report how many loans were loaded, then preview the first five rows
print(f"Number of loans in combined dataset: {len(df)}")
df.head()

Number of loans in combined dataset: 1371066


Unnamed: 0,loan_amnt,int_rate,term,grade,sub_grade,installment,annual_inc,loan_status,vs_Source Verified,vs_Verified,...,p_other,p_major_purchase,p_vacation,p_moving,p_medical,p_car,p_house,p_renewable_energy,p_wedding,p_educational
0,5000,17.27,36,D,D2,178.94,62000,Fully Paid,1,0,...,0,0,0,0,0,0,0,0,0,0
1,22000,6.49,36,A,A2,674.18,134000,Current,0,1,...,0,0,0,0,0,0,0,0,0,0
2,30000,10.75,60,B,B4,648.54,125000,Fully Paid,0,1,...,0,0,0,0,0,0,0,0,0,0
3,10000,16.29,36,D,D1,353.01,40000,Charged Off,1,0,...,0,0,0,0,0,0,0,0,0,0
4,12000,9.75,36,B,B3,385.8,120000,Current,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Show every row that has at least one missing value
rows_with_missing = df.isna().any(axis="columns")
df[rows_with_missing]

Unnamed: 0,loan_amnt,int_rate,term,grade,sub_grade,installment,annual_inc,loan_status,vs_Source Verified,vs_Verified,...,p_other,p_major_purchase,p_vacation,p_moving,p_medical,p_car,p_house,p_renewable_energy,p_wedding,p_educational


### Preprocessing

In [5]:
# Convert ordinal categorical variables to numerical
# LabelEncoder numbers the classes in sorted order, so the letter grades / sub-grades
# keep their alphabetical ordering as integer codes.
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per column first (kept around so codes can be mapped back later) ...
le_grade = LabelEncoder().fit(df["grade"])
le_sub_grade = LabelEncoder().fit(df["sub_grade"])

# ... then replace the string columns with their integer codes
df["grade"] = le_grade.transform(df["grade"])
df["sub_grade"] = le_sub_grade.transform(df["sub_grade"])

In [6]:
# Positive class: loans that were repaid in full
is_fully_paid = df["loan_status"] == "Fully Paid"
# Negative class: loans that defaulted or were charged off
# (every other status is left out of both groups)
is_bad_loan = df["loan_status"].isin(["Default", "Charged Off"])

df_pos = df.loc[is_fully_paid]
df_neg = df.loc[is_bad_loan]

In [7]:
# Sanity check: count the loan statuses present in the positive class
df_pos["loan_status"].value_counts()

Fully Paid    382014
Name: loan_status, dtype: int64

In [8]:
# Sanity check: count the loan statuses present in the negative class
df_neg["loan_status"].value_counts()

Charged Off    108758
Default            35
Name: loan_status, dtype: int64

In [9]:
# Pairwise correlation of the numeric / ordinal-encoded features within the positive class
df_pos[["loan_amnt", "int_rate", "term", "grade", "sub_grade", "installment", "annual_inc"]].corr()

Unnamed: 0,loan_amnt,int_rate,term,grade,sub_grade,installment,annual_inc
loan_amnt,1.0,0.117732,0.353937,0.11421,0.116027,0.958982,0.422926
int_rate,0.117732,1.0,0.368597,0.962965,0.987197,0.155487,-0.099715
term,0.353937,0.368597,1.0,0.35749,0.36607,0.136393,0.079493
grade,0.11421,0.962965,0.35749,1.0,0.971276,0.148621,-0.099893
sub_grade,0.116027,0.987197,0.36607,0.971276,1.0,0.151058,-0.10812
installment,0.958982,0.155487,0.136393,0.148621,0.151058,1.0,0.403238
annual_inc,0.422926,-0.099715,0.079493,-0.099893,-0.10812,0.403238,1.0


In [10]:
# Pairwise correlation of the numeric / ordinal-encoded features within the negative class
df_neg[["loan_amnt", "int_rate", "term", "grade", "sub_grade", "installment", "annual_inc"]].corr()

Unnamed: 0,loan_amnt,int_rate,term,grade,sub_grade,installment,annual_inc
loan_amnt,1.0,0.191178,0.387081,0.187644,0.193173,0.945589,0.45097
int_rate,0.191178,1.0,0.376418,0.962659,0.983896,0.248679,-0.066134
term,0.387081,0.376418,1.0,0.372196,0.383811,0.142168,0.109504
grade,0.187644,0.962659,0.372196,1.0,0.975763,0.238671,-0.068205
sub_grade,0.193173,0.983896,0.383811,0.975763,1.0,0.244191,-0.070253
installment,0.945589,0.248679,0.142168,0.238671,0.244191,1.0,0.419332
annual_inc,0.45097,-0.066134,0.109504,-0.068205,-0.070253,0.419332,1.0


In [11]:
# Draw the same number of loans from each class so the modelling set is balanced
SUBSAMPLE_AMOUNT = 100000

# Sample without replacement (positive class first, then negative, both from the shared
# PRNG) and drop the status column: the class label is implied by which frame a row is in.
df_pos = (
    df_pos
    .sample(n=SUBSAMPLE_AMOUNT, replace=False, random_state=_PRNG)
    .drop(columns=["loan_status"])
)
df_neg = (
    df_neg
    .sample(n=SUBSAMPLE_AMOUNT, replace=False, random_state=_PRNG)
    .drop(columns=["loan_status"])
)

print(f"Subsample pos: {len(df_pos)}")
print(f"Subsample neg: {len(df_neg)}")

Subsample pos: 100000
Subsample neg: 100000


In [12]:
# Stack the two class subsamples (positives first) into one float feature matrix
df_subsample_X = pd.concat([df_pos, df_neg], ignore_index=True, copy=True).astype(float)

# Labels follow the same row order: 1 for every positive row, then 0 for every negative row
n_pos, n_neg = len(df_pos), len(df_neg)
df_subsample_Y = [1] * n_pos + [0] * n_neg

assert len(df_subsample_X) == len(df_subsample_Y), "Dataset and labels must be the same size"

### Linear SVM

In [None]:
# Feature scaling for the SVM.
# Most libraries recommend a hard normalization, mapping the min and max values of a given dimension to [0,1]
# However, a soft normalization (zero mean, unit variance) is also feasible using StandardScaler
# https://neerajkumar.org/writings/svm/
# https://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf

scaler = MinMaxScaler(feature_range=(0, 1))
# Alternative: scaler = StandardScaler()

# NOTE(review): the scaler is fitted here on every row of the subsample, i.e. before any train/test split.
df_subsample_scaled_X = scaler.fit_transform(df_subsample_X)

In [None]:
# Create train/test split
# The training set is much smaller due to computation constraints, SVMs have poor scalability
#
# The *unscaled* features are split first and the scaler chosen above is then re-fitted on
# the training rows only. Fitting the scaler on all rows before splitting would let the
# test rows' min/max leak into the features the model is trained on.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    df_subsample_X,
    df_subsample_Y,
    train_size=0.1,
    test_size=0.2,
    stratify=df_subsample_Y,
    random_state=_PRNG,
)

# Learn the scaling from the training data, then apply the same transform to the test data
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

print("Training set size:", len(X_train))
print("Training label size:", len(Y_train))
print("Test set size:", len(X_test))
print("Test label size:", len(Y_test))

In [None]:
# Use grid search cross-validation to find the best C, using F1-score
# Full search space: C = 2^-5 ... 2^8
parameters = dict(C=[2 ** power for power in range(-5, 9)])

# The full grid is immediately replaced by a single value, so only C=0.0625 is evaluated
parameters = dict(C=[0.0625])

# Linear-kernel SVM; probability=True is what makes predict_proba available afterwards
linear_svc = SVC(kernel="linear", cache_size=4000, probability=True, random_state=_PRNG)
clf = GridSearchCV(
    linear_svc,
    param_grid=parameters,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=1,
)
clf.fit(X_train, Y_train)
print(clf.best_score_, clf.best_params_)
clf.best_estimator_

In [None]:
# Keep the refitted best model from the search under its own name
svm = clf.best_estimator_

# A classifier's score() is mean accuracy on the given data, whereas the search above
# optimised F1, so this number is not directly comparable with clf.best_score_
svm.score(X_test, Y_test)

In [None]:
# List each feature next to its weight in the linear SVM
# (df_pos has the same columns, in the same order, as the training matrix)
feature_weights = svm.coef_.ravel()
for col_name, value in zip(df_pos.columns, feature_weights):
    print(f"{col_name:20s} : {value:.4f}")

In [None]:
# Plot SVM feature weights as a bar chart, largest weight first
coef = svm.coef_.ravel()
# argsort is ascending, so reverse it to get indices from largest to smallest weight
coef_sorted = np.argsort(coef)[::-1]

# Feature names, in the same column order as the training matrix
col = list(df_pos.columns)
# Negative weights in red, non-negative weights in green
colors = ["lightcoral" if feature_weight < 0 else "lightgreen" for feature_weight in coef[coef_sorted]]

# savefig does not create missing directories, so make sure the output folder exists
Path("res/Prediction").mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(16, 8))
plt.bar(list(range(len(coef))), coef[coef_sorted], color=colors)
plt.xticks(np.arange(0, len(coef)), [col[idx] for idx in coef_sorted], rotation=60, ha="right")
plt.xlabel("Feature Name")
plt.ylabel("Feature Weight")
plt.title("LendingClub 2016-2018 - SVM Feature Weights")
plt.savefig("res/Prediction/SVM - Feature Weights.png", bbox_inches='tight')
plt.show()

In [None]:
# Class probabilities for the first ten test rows; columns follow the sorted class
# labels, i.e. [P(label 0), P(label 1)]
svm.predict_proba(X_test[:10])

### Decision Tree

In [None]:
# Standardise the features (zero mean, unit variance); the [0, 1] min-max scaling used
# for the SVM is kept below as a commented-out alternative.
# scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()

# NOTE(review): fitted on the full subsample, i.e. before the train/test split.
# NOTE(review): tree splits depend only on feature ordering, so scaling presumably has
# no effect on this model - confirm before removing.
df_subsample_scaled_X = scaler.fit_transform(df_subsample_X)

In [None]:
# Hold out 12.5% of the balanced subsample for testing, stratified on the label
X_train, X_test, Y_train, Y_test = train_test_split(
    df_subsample_scaled_X,
    df_subsample_Y,
    test_size=0.125,
    stratify=df_subsample_Y,
    random_state=_PRNG,
)
print(f"Training set size: {len(X_train)}")
print(f"Training label size: {len(Y_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Test label size: {len(Y_test)}")

In [None]:
# Use random search to find best hyperparameters
# Full search space
parameters = dict(
    max_depth=range(1, 31),
    min_samples_split=range(2, 21),
    min_samples_leaf=range(1, 11),
    min_impurity_decrease=np.arange(0, 0.05, 0.002),
    max_features=["sqrt", "log2", None],
)

# The full space is immediately replaced by this reduced one (10 depths x 3 feature
# settings = 30 combinations, fewer than the n_iter=250 requested below)
parameters = dict(
    max_depth=range(1, 11),
    max_features=["sqrt", "log2", None],
)

tree_estimator = DecisionTreeClassifier(criterion="gini", random_state=_PRNG)
clf = RandomizedSearchCV(
    tree_estimator,
    param_distributions=parameters,
    n_iter=250,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=-1,
    random_state=_PRNG,
    verbose=1,
)
clf.fit(X_train, Y_train)

print(clf.best_score_, clf.best_params_)
clf.best_estimator_

In [None]:
# Keep the refitted best model from the search under its own name
decision_tree = clf.best_estimator_

# A classifier's score() is mean accuracy on the given data, whereas the search above
# optimised F1, so this number is not directly comparable with clf.best_score_
decision_tree.score(X_test, Y_test)

In [None]:
# Class probabilities, columns [P(label 0), P(label 1)], for the first ten rows.
# NOTE(review): these are *training* rows, whereas the SVM section previews X_test.
decision_tree.predict_proba(X_train[:10])

### RandomForest

In [13]:
# Standardise the features (zero mean, unit variance); the [0, 1] min-max scaling used
# for the SVM is kept below as a commented-out alternative.
# scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()

# NOTE(review): fitted on the full subsample, i.e. before the train/test split.
# NOTE(review): tree splits depend only on feature ordering, so scaling presumably has
# no effect on this model - confirm before removing.
df_subsample_scaled_X = scaler.fit_transform(df_subsample_X)

In [14]:
# Hold out 12.5% of the balanced subsample for testing, stratified on the label
X_train, X_test, Y_train, Y_test = train_test_split(
    df_subsample_scaled_X,
    df_subsample_Y,
    test_size=0.125,
    stratify=df_subsample_Y,
    random_state=_PRNG,
)
print(f"Training set size: {len(X_train)}")
print(f"Training label size: {len(Y_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Test label size: {len(Y_test)}")

Training set size: 175000
Training label size: 175000
Test set size: 25000
Test label size: 25000


In [15]:
# Use random search to find best hyperparameters
# Full search space
parameters = dict(
    n_estimators=range(10, 200, 10),
    max_depth=range(1, 31),
    min_samples_split=range(2, 21),
    min_samples_leaf=range(1, 11),
    min_impurity_decrease=np.arange(0, 0.05, 0.002),
    max_features=["sqrt", "log2", None],
)

# The full space is immediately replaced by this reduced one: range(1, 2) only contains
# depth 1, so just the three max_features settings are compared (fewer than n_iter=250)
parameters = dict(
    max_depth=range(1, 2),
    max_features=["sqrt", "log2", None],
)

# NOTE(review): both the forest and the search request n_jobs=-1
forest_estimator = RandomForestClassifier(n_estimators=100, criterion="gini", n_jobs=-1, random_state=_PRNG)
clf = RandomizedSearchCV(
    forest_estimator,
    param_distributions=parameters,
    n_iter=250,
    scoring="f1",
    cv=5,
    refit=True,
    n_jobs=-1,
    random_state=_PRNG,
    verbose=1,
)

clf.fit(X_train, Y_train)

print(clf.best_score_, clf.best_params_)
clf.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   25.1s finished


0.5922970199090193 {'max_features': 'sqrt', 'max_depth': 1}


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False,
            random_state=<mtrand.RandomState object at 0x00000261D8ABB558>,
            verbose=0, warm_start=False)

In [16]:
# Keep the refitted best model from the search under its own name
random_forest = clf.best_estimator_

# A classifier's score() is mean accuracy on the given data, whereas the search above
# optimised F1, so this number is not directly comparable with clf.best_score_
random_forest.score(X_test, Y_test)

0.63784

In [17]:
# Class probabilities, columns [P(label 0), P(label 1)], for the first ten rows.
# NOTE(review): these are *training* rows, whereas the SVM section previews X_test.
random_forest.predict_proba(X_train[:10])

array([[0.41698937, 0.58301063],
       [0.54554798, 0.45445202],
       [0.54718403, 0.45281597],
       [0.44906729, 0.55093271],
       [0.42957291, 0.57042709],
       [0.54483287, 0.45516713],
       [0.41139824, 0.58860176],
       [0.42393509, 0.57606491],
       [0.57462066, 0.42537934],
       [0.58081604, 0.41918396]])

### Gradient Boosting

In [18]:
# Standardise the features (zero mean, unit variance); the [0, 1] min-max scaling used
# for the SVM is kept below as a commented-out alternative.
# scaler = MinMaxScaler(feature_range=(0, 1))
scaler = StandardScaler()

# NOTE(review): fitted on the full subsample, i.e. before the train/test split.
# NOTE(review): tree splits depend only on feature ordering, so scaling presumably has
# no effect on this model - confirm before removing.
df_subsample_scaled_X = scaler.fit_transform(df_subsample_X)

In [19]:
# Hold out 12.5% of the balanced subsample for testing, stratified on the label
X_train, X_test, Y_train, Y_test = train_test_split(
    df_subsample_scaled_X,
    df_subsample_Y,
    test_size=0.125,
    stratify=df_subsample_Y,
    random_state=_PRNG,
)
print(f"Training set size: {len(X_train)}")
print(f"Training label size: {len(Y_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Test label size: {len(Y_test)}")

Training set size: 175000
Training label size: 175000
Test set size: 25000
Test label size: 25000


In [22]:
# Use random search to find best hyperparameters
# Full search space. "max_depth" used to be listed twice in this literal; a dict keeps only
# the last value for a repeated key, so the shadowed range(1, 31) entry has been removed and
# the effective range(3, 10) kept.
parameters = {
    "learning_rate"         : np.arange(.001, .2, 0.002),
    "subsample"             : np.arange(.5, 1.0, 0.1),
    "min_samples_split"     : range(2, 21),
    "min_samples_leaf"      : range(1, 11),
    "max_depth"             : range(3, 10),
    "min_impurity_decrease" : np.arange(0, 0.05, 0.002),
    "max_features"          : ["sqrt", "log2", None]
}

# The full space is immediately replaced by this reduced one, so only the three
# max_features settings are compared (fewer than the n_iter=250 requested below)
parameters = {
    "max_features"          : ["sqrt", "log2", None]
}

# n_iter_no_change / validation_fraction / tol enable early stopping on a 10% hold-out
# NOTE(review): newer scikit-learn releases name this loss "log_loss" instead of "deviance";
# left unchanged because the installed version is not visible here - confirm before upgrading.
clf = RandomizedSearchCV(GradientBoostingClassifier(loss="deviance", n_estimators=100, random_state=_PRNG, n_iter_no_change=30, validation_fraction=0.1, tol=1e-4), param_distributions=parameters, n_iter=250, scoring="f1", cv=5, refit=True, n_jobs=-1, random_state=_PRNG, verbose=1)

clf.fit(X_train, Y_train)

print(clf.best_score_, clf.best_params_)
clf.best_estimator_

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.1min finished


0.6329155660875708 {'max_features': None}


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=30, presort='auto',
              random_state=<mtrand.RandomState object at 0x0000026190DBC3A8>,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [23]:
# Keep the refitted best model from the search under its own name
gradient_boosting = clf.best_estimator_

# A classifier's score() is mean accuracy on the given data, whereas the search above
# optimised F1, so this number is not directly comparable with clf.best_score_
gradient_boosting.score(X_test, Y_test)

0.64112

In [24]:
# Class probabilities, columns [P(label 0), P(label 1)], for the first ten rows.
# NOTE(review): these are *training* rows, whereas the SVM section previews X_test.
gradient_boosting.predict_proba(X_train[:10])

array([[0.55299676, 0.44700324],
       [0.80681415, 0.19318585],
       [0.69360901, 0.30639099],
       [0.29439012, 0.70560988],
       [0.47558811, 0.52441189],
       [0.46292108, 0.53707892],
       [0.59270883, 0.40729117],
       [0.81502597, 0.18497403],
       [0.52222615, 0.47777385],
       [0.39565172, 0.60434828]])