# Random Forest

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
df = pd.read_csv('data/df_clean.csv')

In [4]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41292 entries, 0 to 41291
Data columns (total 17 columns):
Unnamed: 0              41292 non-null int64
loan_amnt               41292 non-null float64
term_is_36              41292 non-null int64
int_rate                41292 non-null float64
grade                   41292 non-null object
sub_grade               41292 non-null object
emp_length              41292 non-null float64
annual_inc              41292 non-null float64
purpose                 41292 non-null object
addr_state              41292 non-null object
dti                     41292 non-null float64
inq_last_6mths          41292 non-null float64
open_acc                41292 non-null float64
pub_rec                 41292 non-null float64
total_acc               41292 non-null float64
pub_rec_bankruptcies    41292 non-null int64
default                 41292 non-null int64
dtypes: float64(9), int64(4), object(4)
memory usage: 5.4+ MB
None


## Create Dummies

In [8]:
cat_feats = ['grade','sub_grade','purpose','addr_state']

In [9]:
df_final = pd.get_dummies(df, columns=cat_feats, drop_first=True)

In [10]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term_is_36,int_rate,emp_length,annual_inc,dti,inq_last_6mths,open_acc,pub_rec,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,0,5000.0,1,0.1065,10.0,24000.0,27.65,1.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2500.0,0,0.1527,0.0,30000.0,1.0,5.0,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,2400.0,1,0.1596,10.0,12252.0,8.72,2.0,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,10000.0,1,0.1349,10.0,49200.0,20.0,1.0,10.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5,5000.0,1,0.079,3.0,36000.0,11.2,3.0,9.0,0.0,...,0,0,0,0,0,0,0,0,0,0


## Train Test Split

In [46]:
from sklearn.model_selection import train_test_split

X = df_final.drop('default', axis=1)
y = df_final['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101, stratify=y)

## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dtree = DecisionTreeClassifier()

In [14]:
dtree.fit(X_train, y_train)

DecisionTreeClassifier()

In [15]:
predictions = dtree.predict(X_test)
train_predictions = dtree.predict(X_train)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix

In [17]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86     10528
           1       0.23      0.26      0.24      1860

    accuracy                           0.76     12388
   macro avg       0.55      0.55      0.55     12388
weighted avg       0.77      0.76      0.76     12388



In [18]:
print(confusion_matrix(y_test, predictions))

[[8911 1617]
 [1385  475]]


In [22]:
print(classification_report(y_train, train_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24478
           1       1.00      1.00      1.00      4426

    accuracy                           1.00     28904
   macro avg       1.00      1.00      1.00     28904
weighted avg       1.00      1.00      1.00     28904



In [23]:
print(confusion_matrix(y_train, train_predictions))

[[24478     0]
 [    0  4426]]


In [24]:
from sklearn.model_selection import StratifiedKFold

def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        print(len(train_indices),len(val_indices))
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
  
        model.fit(X=x_train,y=y_train)
        train_pred = np.append(train_pred, model.predict(x_val))
        test_pred =  np.append(test_pred,  model.predict(test))
    return test_pred.reshape(-1,1),train_pred

In [25]:
model = dtree
train = X_train
y = y_train
test = X_test
n_fold = 10

In [26]:
a, b = Stacking(model,train,y,test,n_fold)



26013 2891
26013 2891
26013 2891
26013 2891
26014 2890
26014 2890
26014 2890
26014 2890
26014 2890
26014 2890


## Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rnc = RandomForestClassifier(n_estimators = 200)

In [56]:
rnc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, oob_score=True)

In [57]:
pred_train = rnc.predict(X_train)

In [58]:
pred_test = rnc.predict(X_test)

In [59]:
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24504
           1       1.00      1.00      1.00      4400

    accuracy                           1.00     28904
   macro avg       1.00      1.00      1.00     28904
weighted avg       1.00      1.00      1.00     28904



In [60]:
print(confusion_matrix(y_train, pred_train))

[[24504     0]
 [    0  4400]]


In [61]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92     10502
           1       0.46      0.02      0.03      1886

    accuracy                           0.85     12388
   macro avg       0.66      0.51      0.47     12388
weighted avg       0.79      0.85      0.78     12388



In [62]:
print(confusion_matrix(y_test, pred_test))

[[10466    36]
 [ 1855    31]]


## Random Forest Optimization through RandomSearch

In [67]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(10, 500).astype(int),
    'max_depth': [None] + list(np.linspace(3, 100).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
estimator = RandomForestClassifier()

# Create the random search model
rnc = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'roc_auc', cv = 3, 
                        n_iter = 20, verbose = 1)

# Fit 
rnc.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  3.0min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [None, 3, 4, 6, 8, 10, 12,
                                                      14, 16, 18, 20, 22, 24,
                                                      26, 28, 30, 32, 34, 36,
                                                      38, 40, 42, 44, 46, 48,
                                                      50, 52, 54, 56, 58, ...],
                                        'max_features': ['auto', 'sqrt', None,
                                                         0.5, 0.6, 0.7,
                                                         0.7999999999999999,
                                                         0.8999999999999999],
                                        'max_leaf_nodes': [None, 10, 10, 10, 10,
                                                 

In [68]:
best_model = rnc.best_estimator_

In [69]:
train_rf_predictions = best_model.predict(X_train)
train_rf_probs = best_model.predict_proba(X_train)[:, 1]

rf_predictions = best_model.predict(X_test)
rf_probs = best_model.predict_proba(X_test)[:, 1]

In [70]:
print(confusion_matrix(y_test, rf_predictions))

[[10494     8]
 [ 1877     9]]


In [None]:
n_nodes = []
max_depths = []

for ind_tree in best_model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')