Goal: visualize the impact of the number of trees and the tree depth on out-of-sample scores.
Understand Out of Bag errors:
https://datascience.stackexchange.com/questions/13151/randomforestclassifier-oob-scoring-method
<br>

Resources:
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
<br>

https://stats.stackexchange.com/questions/233275/multilabel-classification-metrics-on-scikit

<br>
https://stackoverflow.com/questions/31421413/how-to-compute-precision-recall-accuracy-and-f1-score-for-the-multiclass-case


In [1]:
# Utilities
import numpy as np
import time
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Sklearn
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_recall_fscore_support, precision_recall_curve
from sklearn import metrics

from sklearn.datasets import make_classification


# Synthetic 4-class problem with imbalanced class weights: class 0 is requested
# at 60%, classes 1 and 2 at 10% each, class 3 at 20%.
X, y = make_classification(
    n_samples=4000,
    n_features=30,
    n_informative=20,
    n_classes=4,
    weights=[0.6, 0.1, 0.1, 0.2],
    random_state=0,
)

# Class label singled out for threshold tuning and per-class scoring.
LABEL_SELECT = 1

# Test out some metrics

In [2]:
# Baseline forest without class weighting. Seeded so that re-running the cell
# reproduces the same numbers and so that the comparison with the seeded
# class_weight='balanced' run is not confounded by random variation.
clf = RandomForestClassifier(max_depth=4, n_estimators=20, n_jobs=-1, random_state=0)
clf.fit(X, y)

# Predict once and reuse the result for every metric (in-sample predictions).
y_pred = clf.predict(X)
print("Weighted mean ", f1_score(y, y_pred, average='weighted'))
print("Unweighed mean of f1_score ",f1_score(y, y_pred, average='macro'))
# Per-class (precision, recall, f1, support) arrays, one entry per label.
display(precision_recall_fscore_support(y, y_pred, average=None))
print("Actual values")
display(pd.Series(y).value_counts().sort_index())

Weighted mean  0.5599527100658053
Unweighed mean of f1_score  0.3134842462398626


  _warn_prf(average, modifier, msg_start, len(result))


(array([0.64100486, 0.        , 0.        , 0.87583893]),
 array([0.99538591, 0.        , 0.        , 0.32503113]),
 array([0.77982254, 0.        , 0.        , 0.47411444]),
 array([2384,  403,  410,  803], dtype=int64))

Actual values


0    2384
1     403
2     410
3     803
dtype: int64

## Using balanced class weights: big difference for labels 1 and 2

In [3]:
# Same forest settings as the baseline, but with class_weight='balanced'.
clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=4,
    class_weight='balanced',
    random_state=0,
    n_jobs=-1,
)
clf.fit(X, y)

# In-sample predictions, shared by the three metric calls below.
balanced_pred = clf.predict(X)
print("Weighted mean ", f1_score(y, balanced_pred, average='weighted'))
print("Unweighed mean of f1_score ", f1_score(y, balanced_pred, average='macro'))
display(precision_recall_fscore_support(y, balanced_pred, average=None))

Weighted mean  0.6839775597338591
Unweighed mean of f1_score  0.6052768443001172


(array([0.90764706, 0.4234104 , 0.40747029, 0.58096173]),
 array([0.64723154, 0.72704715, 0.58536585, 0.73723537]),
 array([0.75563173, 0.53515982, 0.48048048, 0.64983535]),
 array([2384,  403,  410,  803], dtype=int64))

Applying a probability threshold specifically for label 1, the class I am interested in.

In [4]:
# One-vs-rest view of the selected label: 1 where y == LABEL_SELECT, else 0.
y_proba = clf.predict_proba(X)
y_1 = np.where(y == LABEL_SELECT , 1, 0)

precision, recall, thresholds = precision_recall_curve(y_1, y_proba[:,LABEL_SELECT])

# F1 at every point of the curve. Where precision + recall == 0 the numerator
# is 0 as well, so dividing by 1 instead yields F1 = 0 rather than NaN
# (np.argmax would otherwise select the NaN entry).
pr_sum = precision + recall
f1 = 2 * np.multiply(precision, recall) / np.where(pr_sum > 0, pr_sum, 1.0)

# precision/recall have one more entry than thresholds (the final point with
# recall 0 has no threshold), so only search the entries that map to a threshold.
selected_ind = np.argmax(f1[:-1])
display(thresholds[selected_ind], precision[selected_ind], recall[selected_ind], f1[selected_ind])

0.3634425183789126

0.6479750778816199

0.5161290322580645

0.5745856353591159

In [5]:
# Overlay the tuned threshold on top of the forest's regular predictions.
y_predict = clf.predict(X)

# Rows whose probability for LABEL_SELECT reaches the threshold are relabelled
# as LABEL_SELECT; all other rows keep the regular prediction.
# NOTE(review): rows already predicted as LABEL_SELECT stay LABEL_SELECT even
# when their probability is below the threshold, so this override can only add
# LABEL_SELECT predictions, never remove them.
above_threshold = y_proba[:, LABEL_SELECT] >= thresholds[selected_ind]
y_predict_threshold = np.where(above_threshold, LABEL_SELECT, y_predict)

print("Weighted mean ", f1_score(y, y_predict_threshold, average='weighted'))
print("Unweighed mean of f1_score ", f1_score(y, y_predict_threshold, average='macro'))
print("Unwaited is lower because it treats the f1 on class 1-2-3 at the same weight as on class 0")
display(precision_recall_fscore_support(y, y_predict_threshold, average=None))

Weighted mean  0.6835297221395317
Unweighed mean of f1_score  0.6050624085431068
Unwaited is lower because it treats the f1 on class 1-2-3 at the same weight as on class 0


(array([0.90801887, 0.42241379, 0.40747029, 0.58096173]),
 array([0.64597315, 0.72952854, 0.58536585, 0.73723537]),
 array([0.75490196, 0.53503185, 0.48048048, 0.64983535]),
 array([2384,  403,  410,  803], dtype=int64))

No noticeable difference in the metrics for any label? So is there no need for a threshold?

# Next
Simple grid search. Which metric is the most important? f1_score

In [6]:
# Source: https://stackoverflow.com/questions/61965700/adjust-threshold-cros-val-score-sklearn

class MyRF_multinomial(RandomForestClassifier):
    """Random forest whose predictions for ``LABEL_SELECT`` use a tuned threshold.

    ``fit`` trains the forest as usual and then picks, on the training data,
    the probability threshold for ``LABEL_SELECT`` that maximizes the
    one-vs-rest F1 score. ``predict`` returns the regular forest prediction,
    overwritten with ``LABEL_SELECT`` wherever the predicted probability of
    that label reaches the threshold.

    Attributes set by ``fit``
    -------------------------
    threshold : float
        Selected probability threshold for ``LABEL_SELECT``.
    train_f1 : float
        One-vs-rest F1 obtained at that threshold on the training data.
    """

    def _label_proba(self, X):
        """Return the predicted probability of ``LABEL_SELECT`` for each row of X.

        Raises
        ------
        ValueError
            If ``LABEL_SELECT`` is not among the classes seen during ``fit``.
        """
        # predict_proba columns are ordered like self.classes_, so look the
        # label up instead of assuming its value equals its column position.
        label_col = list(self.classes_).index(LABEL_SELECT)
        return RandomForestClassifier.predict_proba(self, X=X)[:, label_col]

    def fit(self, X, y, **kwargs):
        """Fit the forest, then tune the ``LABEL_SELECT`` threshold on (X, y).

        Extra keyword arguments are forwarded to ``RandomForestClassifier.fit``.
        Returns ``self``.
        """
        RandomForestClassifier.fit(self, X=X, y=y, **kwargs)
        y_proba = self._label_proba(X)
        y_select = np.where(np.asarray(y) == LABEL_SELECT, 1, 0)
        precision, recall, thresholds = metrics.precision_recall_curve(y_select, y_proba)

        # precision/recall carry one trailing point that has no threshold;
        # drop it so every F1 value maps onto an entry of `thresholds`.
        precision, recall = precision[:-1], recall[:-1]

        # Where precision + recall == 0 the numerator is 0 too, so dividing by 1
        # gives F1 = 0 instead of NaN (np.argmax would otherwise pick the NaN).
        pr_sum = precision + recall
        f1 = 2 * np.multiply(precision, recall) / np.where(pr_sum > 0, pr_sum, 1.0)

        best = int(np.argmax(f1))
        self.threshold = thresholds[best]
        self.train_f1 = f1[best]
        return self

    def predict(self, X, **kwargs):
        """Predict labels, forcing ``LABEL_SELECT`` where its probability >= threshold.

        ``**kwargs`` is accepted for signature compatibility and is not used.
        Rows that the forest already assigns to ``LABEL_SELECT`` keep that
        label even when their probability is below the threshold.
        """
        result = RandomForestClassifier.predict(self, X=X)

        # Overwrite the prediction using the specific threshold for LABEL_SELECT
        result_proba = self._label_proba(X)
        predictions = np.where(result_proba >= self.threshold, LABEL_SELECT, result)

        return predictions

def plot_res(final_res, x_labels=None):
    """Plot one line chart per entry of ``final_res``, stacked vertically.

    All panels share the same y-range: the overall min/max of the plotted
    values (seeded with 999 and -1), padded by 0.05 on each side.

    Parameters
    ----------
    final_res : dict
        Maps a key (shown as "Depth = <key>" on the y-axis) to a sequence of
        values to plot.
    x_labels : sequence, optional
        Tick labels for the x positions 0..len(values)-1. When omitted, the
        notebook-level ``num_trees_range`` is used if it exists; if it does
        not, the ticks simply show the positions.
    """
    # squeeze=False always returns a 2-D array of Axes, so a dict with a
    # single entry works too (otherwise a bare Axes is returned and indexing fails).
    fig, axs = plt.subplots(figsize=(8, 10), nrows=len(final_res), ncols=1, squeeze=False)

    if x_labels is None:
        # Backward compatibility with callers relying on the global name.
        x_labels = globals().get('num_trees_range')

    lower, upper = 999, -1
    for values in final_res.values():
        lower = min(np.min(values), lower)
        upper = max(np.max(values), upper)
    y_limits = [lower - 0.05, upper + 0.05]

    for this_ax, (this_key, this_val) in zip(axs[:, 0], final_res.items()):
        this_ax.plot(this_val)
        this_ax.set_xticks(ticks=range(len(this_val)))
        if x_labels is not None:
            this_ax.set_xticklabels(labels=x_labels)
        this_ax.set_ylim(y_limits)

        this_ax.set_ylabel("Depth = {}".format(this_key))
    fig.tight_layout()

def process_cv_results(clf, parameters):
    """Summarize a fitted search as its parameter columns plus the worst split score.

    Parameters
    ----------
    clf : object exposing ``cv_results_``
        The fitted search whose results are tabulated.
    parameters : any
        Not used by this function; kept in the signature for existing callers.

    Returns
    -------
    pandas.DataFrame
        The ``param_*`` columns and ``min_score`` (row-wise minimum over the
        per-split test score columns), sorted ascending by the ``param_*`` columns.
    """
    cv_frame = pd.DataFrame(clf.cv_results_)

    # Worst out-of-sample score across the individual CV splits.
    per_split_cols = [col for col in cv_frame.columns if 'split' in col and '_test_score' in col]
    cv_frame['min_score'] = cv_frame[per_split_cols].min(axis=1)

    param_cols = [col for col in cv_frame.columns if 'param_' in col]
    summary = cv_frame[param_cols + ['min_score']]
    return summary.sort_values(by=param_cols, ascending=True)

In [9]:
# Restrict each metric to LABEL_SELECT only; with a single label the 'macro'
# average is just that label's score.
key_args = dict(labels=[LABEL_SELECT], average='macro')

# Scorers for the selected class, usable as `scoring` entries in a grid search.
scorer_class1 = metrics.make_scorer(f1_score, **key_args)
precision_class1 = metrics.make_scorer(metrics.precision_score, **key_args)
recall_class1 = metrics.make_scorer(metrics.recall_score, **key_args)

In [10]:
# Scoring reference:
# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring
# f1_macro: weights the under-sampled classes the same as the overwhelming class.
# Custom scoring function / multi-metric example:
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py

# Small grid for a quick check of the custom scorer.
# A wider grid used to be assigned here and was immediately overwritten by this
# one; its values are kept for reference:
#   max_depth=[5, 7, 10, 15, 20], n_estimators=[10, 20, 50, 100, 200, 500, 1000],
#   class_weight=[None, 'balanced'], max_samples=[0.5, 0.75]
parameters = {'max_depth': [5, 7],
              'n_estimators': [10, 20],
              'class_weight': [None, 'balanced'],
              'max_samples': [0.5, 0.75]}

# Grid search scored only on the F1 of LABEL_SELECT.
clf = GridSearchCV(RandomForestClassifier(n_jobs=-1), parameters, scoring=scorer_class1)
start = time.time()
clf.fit(X, y)

end = time.time()
# Elapsed wall-clock time in minutes.
print("Run time {}".format((end-start)/60))

Run time 0.25318236351013185


In [11]:
# Parameter columns plus the worst per-split score for each candidate.
results_subset = process_cv_results(clf=clf, parameters=parameters)
results_subset.head()

Unnamed: 0,param_class_weight,param_max_depth,param_max_samples,param_n_estimators,min_score
8,balanced,5,0.5,10,0.352941
9,balanced,5,0.5,20,0.380952
10,balanced,5,0.75,10,0.398148
11,balanced,5,0.75,20,0.445714
12,balanced,7,0.5,10,0.45


The results are on a different level when I use f1_macro, so my custom scorer works.

In [12]:
# Multi-metric grid search: macro F1 next to the LABEL_SELECT-specific scorers.
# refit=False because no single metric is chosen to select a best estimator.
clf_macro = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, class_weight='balanced'),
    param_grid=parameters,
    scoring=dict(
        f1_macro='f1_macro',
        f1_class1=scorer_class1,
        precision_class1=precision_class1,
        recall_class1=recall_class1,
    ),
    refit=False,
)
clf_macro.fit(X, y)

# Raw cv_results_ table: one row per candidate, one column group per metric.
pd.DataFrame(clf_macro.cv_results_).head()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_max_samples,param_n_estimators,params,split0_test_f1_macro,...,std_test_precision_class1,rank_test_precision_class1,split0_test_recall_class1,split1_test_recall_class1,split2_test_recall_class1,split3_test_recall_class1,split4_test_recall_class1,mean_test_recall_class1,std_test_recall_class1,rank_test_recall_class1
0,0.037402,0.0008,0.110837,0.00135,,5,0.5,10,"{'class_weight': None, 'max_depth': 5, 'max_sa...",0.320463,...,0.290593,16,0.0,0.0,0.049383,0.0,0.012346,0.012346,0.019126,14
1,0.067005,0.003848,0.110683,0.001296,,5,0.5,20,"{'class_weight': None, 'max_depth': 5, 'max_sa...",0.32273,...,0.489898,12,0.0,0.0375,0.012346,0.0,0.0,0.009969,0.014572,15
2,0.036002,0.001788,0.109089,0.001125,,5,0.75,10,"{'class_weight': None, 'max_depth': 5, 'max_sa...",0.314326,...,0.411825,9,0.0,0.05,0.0,0.012346,0.037037,0.019877,0.020242,13
3,0.068605,0.00398,0.110211,0.000745,,5,0.75,20,"{'class_weight': None, 'max_depth': 5, 'max_sa...",0.311398,...,0.489898,5,0.0,0.0125,0.024691,0.012346,0.0,0.009907,0.009247,16
4,0.038403,0.001856,0.110377,0.000767,,7,0.5,10,"{'class_weight': None, 'max_depth': 7, 'max_sa...",0.411563,...,0.152294,4,0.0875,0.025,0.148148,0.111111,0.098765,0.094105,0.040127,11


In [13]:
# Full grid for the long runs: depth x number of trees x bootstrap sample fraction.
parameters = dict(
    max_depth=[5, 7, 10, 15, 20],
    n_estimators=[10, 20, 50, 100, 200, 500, 1000],
    max_samples=[0.5, 0.75],
)

In [15]:
# Full multi-metric grid search with the stock random forest (balanced class weights).
start = time.time()
clf_macro = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1, class_weight='balanced'),
    param_grid=parameters,
    scoring=dict(
        f1_macro='f1_macro',
        f1_class1=scorer_class1,
        precision_class1=precision_class1,
        recall_class1=recall_class1,
    ),
    refit=False,
)
clf_macro.fit(X, y)
end = time.time()

# Elapsed wall-clock time in minutes.
print("Run time {}".format((end-start)/60))

# Persist the cv_results_ table so the analysis cells can reload it without re-fitting.
with open('./output_20Q3/1023_gridsearch_multi_rf.p', 'wb') as out_file:
    pickle.dump(pd.DataFrame(clf_macro.cv_results_), out_file)

Run time 3.9833602587382


In [None]:
# Same multi-metric grid search, but with the threshold-tuning forest subclass.
start = time.time()
clf_multi = GridSearchCV(
    estimator=MyRF_multinomial(n_jobs=-1, class_weight='balanced'),
    param_grid=parameters,
    scoring=dict(
        f1_macro='f1_macro',
        f1_class1=scorer_class1,
        precision_class1=precision_class1,
        recall_class1=recall_class1,
    ),
    refit=False,
)
clf_multi.fit(X, y)
end = time.time()

# Elapsed wall-clock time in minutes.
print("Run time {}".format((end-start)/60))

# Persist the cv_results_ table so the analysis cells can reload it without re-fitting.
with open('./output_20Q3/1023_gridsearch_multi_custom.p', 'wb') as out_file:
    pickle.dump(pd.DataFrame(clf_multi.cv_results_), out_file)

In [11]:
# Despite its name this is a list: the metric names used as keys of the
# grid-search `scoring` mapping.
scoring_dict = ['f1_macro', 'f1_class1', 'precision_class1', 'recall_class1']

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('./output_20Q3/1023_gridsearch_multi_rf.p', 'rb') as in_file:
    results = pickle.load(in_file)

# Timing and rank columns are not needed in the exported table.
drop_cols = [this_col for this_col in results
             if 'time' in this_col or 'rank_test_' in this_col]

# Worst score across CV splits for every metric. Only the per-split columns
# (split<k>_test_<metric>) are used; a plain substring match on
# "_test_<metric>" would also pull in the rank_test_<metric> column.
for this_key in scoring_dict:
    select_cols = [x for x in results.columns
                   if x.startswith('split') and x.endswith('_test_' + this_key)]
    results['min_test_' + this_key] = results[select_cols].min(axis=1)

results.drop(columns=drop_cols).to_csv("./output_20Q3/1023_gridsearch_multi_rf.csv", index=False)

In [12]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('./output_20Q3/1023_gridsearch_multi_custom.p', 'rb') as in_file:
    results = pickle.load(in_file)

# Timing and rank columns are not needed in the exported table.
# The test must be made on each column NAME; testing `'time' in results` asks
# whether the frame has a column literally called 'time', which left
# drop_cols empty and exported every column.
drop_cols = [this_col for this_col in results
             if 'time' in this_col or 'rank_test_' in this_col]

# Worst score across CV splits for every metric. Only the per-split columns
# (split<k>_test_<metric>) are used; a plain substring match on
# "_test_<metric>" would also pull in the rank_test_<metric> column.
for this_key in scoring_dict:
    select_cols = [x for x in results.columns
                   if x.startswith('split') and x.endswith('_test_' + this_key)]
    results['min_test_' + this_key] = results[select_cols].min(axis=1)

results.drop(columns=drop_cols).to_csv("./output_20Q3/1023_gridsearch_multi_custom.csv", index=False)