# Random Forest

In [79]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [80]:
df = pd.read_csv('data/df_clean.csv')

In [81]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41292 entries, 0 to 41291
Data columns (total 15 columns):
Unnamed: 0              41292 non-null int64
loan_amnt               41292 non-null float64
term_is_36              41292 non-null int64
int_rate                41292 non-null float64
grade                   41292 non-null object
sub_grade               41292 non-null object
emp_length              41292 non-null float64
annual_inc              41292 non-null float64
purpose                 41292 non-null object
addr_state              41292 non-null object
dti                     41292 non-null float64
inq_last_6mths          41292 non-null float64
open_acc                41292 non-null float64
pub_rec_bankruptcies    41292 non-null int64
default                 41292 non-null int64
dtypes: float64(7), int64(4), object(4)
memory usage: 4.7+ MB
None


## Upscaling of the minority class default = 1

Up-sampling is the process of randomly duplicating observations from the minority class in order to reinforce its signal.

There are several heuristics for doing so, but the most common way is to simply resample with replacement.

First, we'll import the resampling module from Scikit-Learn:

In [82]:
from sklearn.utils import resample

Next, we'll create a new DataFrame with an up-sampled minority class. Here are the steps:

First, we'll separate observations from each class into different DataFrames.
Next, we'll resample the minority class with replacement, setting the number of samples to match that of the majority class.
Finally, we'll combine the up-sampled minority class DataFrame with the original majority class DataFrame.

In [83]:
df.default.value_counts()

0    35006
1     6286
Name: default, dtype: int64

In [147]:
# Separate majority and minority classes
df_majority = df[df.default==0]
df_minority = df[df.default==1]

majority_size = df_majority.shape[0]

print(majority_size)

# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=majority_size,    # to match majority class
                                 random_state=123) # reproducible results

df_minority_upsampled.columns
df_minority_upsampled.drop(columns='Unnamed: 0',inplace=True,axis=1)

35006


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [149]:
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.default.value_counts()
df_upsampled.drop(columns='Unnamed: 0',inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [150]:
df_upsampled.head()

Unnamed: 0,addr_state,annual_inc,default,dti,emp_length,grade,inq_last_6mths,int_rate,loan_amnt,open_acc,pub_rec_bankruptcies,purpose,sub_grade,term_is_36
0,AZ,24000.0,0,27.65,10.0,B,1.0,0.1065,5000.0,3.0,0,credit_card,B2,1
2,IL,12252.0,0,8.72,10.0,C,2.0,0.1596,2400.0,2.0,0,small_business,C5,1
3,CA,49200.0,0,20.0,10.0,C,1.0,0.1349,10000.0,10.0,0,other,C1,1
4,AZ,36000.0,0,11.2,3.0,A,3.0,0.079,5000.0,9.0,0,wedding,A4,1
5,NC,47004.0,0,23.51,8.0,C,1.0,0.1596,7000.0,7.0,0,debt_consolidation,C5,0


In [152]:
df_upsampled.shape

(70012, 14)

## Scaling

In [178]:
df_upsampled.columns

Index(['addr_state', 'annual_inc', 'default', 'dti', 'emp_length', 'grade',
       'inq_last_6mths', 'int_rate', 'loan_amnt', 'open_acc',
       'pub_rec_bankruptcies', 'purpose', 'sub_grade', 'term_is_36'],
      dtype='object')

In [141]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaling_list = ["annual_inc", "dti", "int_rate", "loan_amount"]

dfTest[['A', 'B']] = scaler.fit_transform(dfTest[['A', 'B']])

## Create Dummies

In [142]:
cat_feats = ['grade','sub_grade','purpose','addr_state']

In [143]:
df = df_upsampled.copy()

In [144]:
df_final = pd.get_dummies(df, columns=cat_feats, drop_first=True)

In [153]:
df_final.head()

Unnamed: 0.1,Unnamed: 0,loan_amnt,term_is_36,int_rate,emp_length,annual_inc,dti,inq_last_6mths,open_acc,pub_rec_bankruptcies,...,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY
0,0,5000.0,1,0.1065,10.0,24000.0,27.65,1.0,3.0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,2400.0,1,0.1596,10.0,12252.0,8.72,2.0,2.0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,10000.0,1,0.1349,10.0,49200.0,20.0,1.0,10.0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,5000.0,1,0.079,3.0,36000.0,11.2,3.0,9.0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,7000.0,0,0.1596,8.0,47004.0,23.51,1.0,7.0,0,...,0,0,0,0,0,0,0,0,0,0


In [154]:
df_final.shape

(70012, 113)

## Train Test Split

In [155]:
from sklearn.model_selection import train_test_split

X = df_final.drop('default', axis=1)
y = df_final['default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

## Decision Tree

In [156]:
from sklearn.tree import DecisionTreeClassifier

In [157]:
dtree = DecisionTreeClassifier()

In [158]:
dtree.fit(X_train, y_train)

DecisionTreeClassifier()

In [159]:
predictions = dtree.predict(X_test)
train_predictions = dtree.predict(X_train)

In [160]:
from sklearn.metrics import classification_report, confusion_matrix

In [161]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88     10501
           1       0.84      0.98      0.90     10503

    accuracy                           0.89     21004
   macro avg       0.90      0.89      0.89     21004
weighted avg       0.90      0.89      0.89     21004



In [162]:
print(confusion_matrix(y_test, predictions))

[[ 8481  2020]
 [  258 10245]]


In [163]:
print(classification_report(y_train, train_predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24505
           1       1.00      1.00      1.00     24503

    accuracy                           1.00     49008
   macro avg       1.00      1.00      1.00     49008
weighted avg       1.00      1.00      1.00     49008



In [164]:
print(confusion_matrix(y_train, train_predictions))

[[24505     0]
 [    0 24503]]


In [165]:
from sklearn.model_selection import StratifiedKFold

def Stacking(model,train,y,test,n_fold):
    folds=StratifiedKFold(n_splits=n_fold,random_state=1)
    test_pred=np.empty((test.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for train_indices,val_indices in folds.split(train,y.values):
        print(len(train_indices),len(val_indices))
        x_train,x_val=train.iloc[train_indices],train.iloc[val_indices]
        y_train,y_val=y.iloc[train_indices],y.iloc[val_indices]
  
        model.fit(X=x_train,y=y_train)
        train_pred = np.append(train_pred, model.predict(x_val))
        test_pred =  np.append(test_pred,  model.predict(test))
    return test_pred.reshape(-1,1),train_pred

In [166]:
model = dtree
train = X_train
y = y_train
test = X_test
n_fold = 10

In [167]:
a, b = Stacking(model,train,y,test,n_fold)



44107 4901
44107 4901
44107 4901
44107 4901
44107 4901
44107 4901
44107 4901
44107 4901
44108 4900
44108 4900


## Random Forest

In [168]:
from sklearn.ensemble import RandomForestClassifier

In [169]:
rnc = RandomForestClassifier()

In [170]:
rnc.fit(X_train, y_train)

RandomForestClassifier()

In [171]:
pred_train = rnc.predict(X_train)

In [172]:
pred_test = rnc.predict(X_test)

In [173]:
print(classification_report(y_train, pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24505
           1       1.00      1.00      1.00     24503

    accuracy                           1.00     49008
   macro avg       1.00      1.00      1.00     49008
weighted avg       1.00      1.00      1.00     49008



In [174]:
print(confusion_matrix(y_train, pred_train))

[[24505     0]
 [    0 24503]]


In [175]:
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96     10501
           1       0.95      0.97      0.96     10503

    accuracy                           0.96     21004
   macro avg       0.96      0.96      0.96     21004
weighted avg       0.96      0.96      0.96     21004



In [176]:
print(confusion_matrix(y_test, pred_test))

[[10017   484]
 [  295 10208]]


## Random Forest Optimization through RandomSearch

In [177]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_grid = {
    'n_estimators': np.linspace(10, 500).astype(int),
    'max_depth': [None] + list(np.linspace(3, 100).astype(int)),
    'max_features': ['auto', 'sqrt', None] + list(np.arange(0.5, 1, 0.1)),
    'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
    'min_samples_split': [2, 5, 10],
    'bootstrap': [True, False]
}

# Estimator for use in random search
estimator = RandomForestClassifier()

# Create the random search model
rnc = RandomizedSearchCV(estimator, param_grid, n_jobs = -1, 
                        scoring = 'roc_auc', cv = 3, 
                        n_iter = 20, verbose = 1)

# Fit 
rnc.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  4.9min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=20,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [None, 3, 4, 6, 8, 10, 12,
                                                      14, 16, 18, 20, 22, 24,
                                                      26, 28, 30, 32, 34, 36,
                                                      38, 40, 42, 44, 46, 48,
                                                      50, 52, 54, 56, 58, ...],
                                        'max_features': ['auto', 'sqrt', None,
                                                         0.5, 0.6, 0.7,
                                                         0.7999999999999999,
                                                         0.8999999999999999],
                                        'max_leaf_nodes': [None, 10, 10, 10, 10,
                                                 

In [179]:
best_model = rnc.best_estimator_

In [180]:
train_rf_predictions = best_model.predict(X_train)
train_rf_probs = best_model.predict_proba(X_train)[:, 1]

rf_predictions = best_model.predict(X_test)
rf_probs = best_model.predict_proba(X_test)[:, 1]

In [181]:
print(confusion_matrix(y_test, rf_predictions))

[[6703 3798]
 [3217 7286]]


In [None]:
n_nodes = []
max_depths = []

for ind_tree in best_model.estimators_:
    n_nodes.append(ind_tree.tree_.node_count)
    max_depths.append(ind_tree.tree_.max_depth)
    
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')