# Random Forest Classifier

Imports

In [4]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import joblib
import shap

Load data and labels into dataframes

In [5]:
df = pd.read_csv('../data/16_lucas_organic_carbon_training_and_test_data.csv')
labels = pd.read_csv('../data/16_lucas_organic_carbon_target.csv')

In [6]:
df

Unnamed: 0,500.0,502.0,504.0,506.0,508.0,510.0,512.0,514.0,516.0,518.0,...,2480.0,2482.0,2484.0,2486.0,2488.0,2490.0,2492.0,2494.0,2496.0,2498.0
0,0.000276,0.000283,0.000289,0.000296,0.000302,0.000309,0.000315,0.000321,0.000327,0.000333,...,-0.000138,-0.000132,-0.000124,-0.000115,-0.000104,-0.000095,-0.000085,-0.000075,-0.000065,-0.000056
1,0.000300,0.000309,0.000318,0.000327,0.000336,0.000345,0.000353,0.000362,0.000369,0.000376,...,-0.000199,-0.000195,-0.000188,-0.000178,-0.000165,-0.000150,-0.000133,-0.000117,-0.000100,-0.000084
2,0.000388,0.000395,0.000403,0.000410,0.000418,0.000426,0.000433,0.000440,0.000446,0.000451,...,-0.000142,-0.000135,-0.000126,-0.000117,-0.000107,-0.000098,-0.000089,-0.000080,-0.000072,-0.000063
3,0.000362,0.000371,0.000379,0.000388,0.000397,0.000406,0.000414,0.000422,0.000429,0.000434,...,-0.000136,-0.000128,-0.000118,-0.000108,-0.000099,-0.000090,-0.000081,-0.000073,-0.000064,-0.000056
4,0.000357,0.000365,0.000372,0.000379,0.000387,0.000394,0.000402,0.000408,0.000414,0.000419,...,-0.000149,-0.000142,-0.000134,-0.000124,-0.000114,-0.000102,-0.000089,-0.000077,-0.000065,-0.000053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,0.000194,0.000201,0.000208,0.000216,0.000223,0.000230,0.000238,0.000245,0.000251,0.000258,...,-0.000097,-0.000093,-0.000088,-0.000082,-0.000074,-0.000067,-0.000059,-0.000050,-0.000042,-0.000034
1947,0.000347,0.000354,0.000361,0.000368,0.000375,0.000382,0.000388,0.000394,0.000398,0.000402,...,-0.000068,-0.000065,-0.000061,-0.000057,-0.000051,-0.000045,-0.000039,-0.000033,-0.000027,-0.000021
1948,0.000224,0.000232,0.000239,0.000247,0.000254,0.000262,0.000269,0.000277,0.000286,0.000294,...,-0.000268,-0.000258,-0.000245,-0.000229,-0.000210,-0.000190,-0.000168,-0.000147,-0.000126,-0.000104
1949,0.000305,0.000310,0.000315,0.000320,0.000325,0.000330,0.000335,0.000339,0.000343,0.000346,...,-0.000188,-0.000177,-0.000164,-0.000148,-0.000130,-0.000111,-0.000092,-0.000073,-0.000054,-0.000035


In [5]:
df.describe()

Unnamed: 0,500.0,502.0,504.0,506.0,508.0,510.0,512.0,514.0,516.0,518.0,...,2480.0,2482.0,2484.0,2486.0,2488.0,2490.0,2492.0,2494.0,2496.0,2498.0
count,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,...,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0,1951.0
mean,0.000272,0.000279,0.000286,0.000292,0.000299,0.000306,0.000313,0.000319,0.000324,0.000329,...,-0.000154,-0.000148,-0.00014,-0.000131,-0.00012,-0.000109,-9.7e-05,-8.5e-05,-7.3e-05,-6.1e-05
std,0.000124,0.000126,0.000129,0.000132,0.000134,0.000137,0.000139,0.000142,0.000144,0.000146,...,5.9e-05,5.8e-05,5.7e-05,5.5e-05,5.3e-05,5e-05,4.8e-05,4.6e-05,4.4e-05,4.3e-05
min,1.6e-05,1.7e-05,1.8e-05,1.9e-05,2.1e-05,2.2e-05,2.3e-05,2.4e-05,2.5e-05,2.6e-05,...,-0.000485,-0.000466,-0.000511,-0.000544,-0.000563,-0.000566,-0.000563,-0.000559,-0.000556,-0.000553
25%,0.000179,0.000184,0.000189,0.000194,0.000199,0.000203,0.000207,0.000212,0.000216,0.000219,...,-0.000185,-0.000179,-0.00017,-0.00016,-0.000148,-0.000136,-0.000122,-0.000109,-9.7e-05,-8.3e-05
50%,0.000266,0.000273,0.00028,0.000287,0.000294,0.0003,0.000307,0.000312,0.000318,0.000323,...,-0.000146,-0.000141,-0.000133,-0.000123,-0.000113,-0.000102,-9e-05,-7.8e-05,-6.7e-05,-5.4e-05
75%,0.000356,0.000365,0.000374,0.000382,0.00039,0.000398,0.000407,0.000414,0.000421,0.000428,...,-0.000114,-0.000109,-0.000102,-9.3e-05,-8.3e-05,-7.3e-05,-6.3e-05,-5.3e-05,-4.2e-05,-3.1e-05
max,0.000777,0.000788,0.000799,0.000809,0.00082,0.000831,0.000842,0.000851,0.000858,0.000865,...,-2.2e-05,-1.9e-05,-1.5e-05,-1e-05,-5e-06,-1e-06,9e-06,2.3e-05,3.9e-05,5.5e-05


In [6]:
df.shape

(1951, 1000)

In [7]:
labels.shape

(1951, 1)

Create Scaler and scala data

In [7]:
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,500.0,502.0,504.0,506.0,508.0,510.0,512.0,514.0,516.0,518.0,...,2480.0,2482.0,2484.0,2486.0,2488.0,2490.0,2492.0,2494.0,2496.0,2498.0
0,0.033550,0.030923,0.028393,0.025957,0.023611,0.021351,0.020134,0.019600,0.020371,0.022724,...,0.275892,0.280295,0.287538,0.297802,0.298202,0.276732,0.244755,0.207759,0.165672,0.118863
1,0.228730,0.240699,0.252153,0.263118,0.273620,0.283682,0.292945,0.302699,0.312571,0.322177,...,-0.761572,-0.805638,-0.843667,-0.865928,-0.860229,-0.825640,-0.773770,-0.711011,-0.636492,-0.550152
2,0.931905,0.920007,0.908408,0.897107,0.886101,0.875386,0.865490,0.855364,0.845065,0.834877,...,0.205335,0.234949,0.253837,0.257953,0.247090,0.213483,0.155301,0.090107,0.018342,-0.058809
3,0.723173,0.725128,0.726874,0.728428,0.729809,0.731031,0.730880,0.729221,0.725947,0.720612,...,0.307128,0.351822,0.390977,0.413210,0.409331,0.376629,0.321965,0.259427,0.189080,0.111728
4,0.685617,0.677358,0.669300,0.661443,0.653785,0.646324,0.638793,0.630961,0.623076,0.615126,...,0.095192,0.106626,0.116113,0.119311,0.122850,0.137148,0.148715,0.160365,0.171653,0.182027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1946,-0.634850,-0.617595,-0.600906,-0.584765,-0.569154,-0.554056,-0.538888,-0.523103,-0.506631,-0.489778,...,0.970707,0.944959,0.923588,0.898918,0.867846,0.832824,0.794746,0.747019,0.688553,0.618903
1947,0.601779,0.591457,0.581432,0.571699,0.562252,0.553084,0.542656,0.530403,0.515762,0.498423,...,1.470333,1.435445,1.394717,1.350529,1.304687,1.259562,1.199300,1.124194,1.032635,0.924024
1948,-0.386278,-0.373224,-0.360616,-0.348438,-0.336674,-0.325309,-0.311088,-0.292803,-0.269534,-0.240856,...,-1.941530,-1.899356,-1.845980,-1.782353,-1.707989,-1.618095,-1.506433,-1.372492,-1.214715,-1.033249
1949,0.268080,0.248295,0.229244,0.210896,0.193222,0.176193,0.159145,0.142823,0.127460,0.112641,...,-0.570091,-0.495802,-0.408980,-0.303210,-0.180018,-0.049235,0.095414,0.253978,0.424406,0.602904


Split Dataset into Training- and Testset

In [8]:
X_train, X_test, y_train, y_test = train_test_split(scaled_data, labels, test_size=0.2, random_state=42)

In [10]:
X_train.shape

(1560, 1000)

In [11]:
X_test.shape

(391, 1000)

In [12]:
y_train.shape

(1560, 1)

In [13]:
y_test.shape

(391, 1)

In [14]:
y_train = y_train['x']

Create Parametergrid for Gridsearch / Randomsearch

In [9]:
param_grid = {
    'n_estimators': [int(x) for x in np.linspace(start = 400, stop = 450, num = 8)],
    'max_depth': [35],
    'min_samples_split': [10],
    'min_samples_leaf': [4],
    'max_features': ['sqrt'],
    'bootstrap': [True]
}
param_grid

{'n_estimators': [400, 407, 414, 421, 428, 435, 442, 450],
 'max_depth': [35],
 'min_samples_split': [10],
 'min_samples_leaf': [4],
 'max_features': ['sqrt'],
 'bootstrap': [True]}

Create Scoring method

In [10]:
scorer = make_scorer(f1_score, average='micro')

Run Grid/Randomsearch


In [11]:
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42, verbose=2),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring=scorer,
    verbose=1
)
 
fit_time = time.perf_counter()
grid_search.fit(X_train, y_train)
fit_time = np.round(time.perf_counter() - fit_time)
print(f'Total fitting time: {fit_time}s')

Fitting 5 folds for each of 8 candidates, totalling 40 fits


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.0s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.4s
[Paral

building tree 1 of 407
building tree 2 of 407
building tree 3 of 407
building tree 4 of 407
building tree 5 of 407
building tree 6 of 407
building tree 7 of 407
building tree 8 of 407
building tree 9 of 407
building tree 10 of 407
building tree 11 of 407
building tree 12 of 407
building tree 13 of 407
building tree 14 of 407
building tree 15 of 407
building tree 16 of 407
building tree 17 of 407
building tree 18 of 407
building tree 19 of 407
building tree 20 of 407
building tree 21 of 407
building tree 22 of 407
building tree 23 of 407
building tree 24 of 407
building tree 25 of 407
building tree 26 of 407
building tree 27 of 407
building tree 28 of 407
building tree 29 of 407
building tree 30 of 407
building tree 31 of 407
building tree 32 of 407
building tree 33 of 407
building tree 34 of 407
building tree 35 of 407
building tree 36 of 407
building tree 37 of 407
building tree 38 of 407
building tree 39 of 407
building tree 40 of 407
building tree 41 of 407
building tree 42 of 407
b

[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.4s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.2s


building tree 1 of 400
building tree 2 of 400
building tree 3 of 400
building tree 4 of 400
building tree 5 of 400
building tree 6 of 400
building tree 7 of 400
building tree 8 of 400
building tree 9 of 400
building tree 10 of 400
building tree 11 of 400
building tree 12 of 400
building tree 13 of 400
building tree 14 of 400
building tree 15 of 400
building tree 16 of 400
building tree 17 of 400
building tree 18 of 400
building tree 19 of 400
building tree 20 of 400
building tree 21 of 400
building tree 22 of 400
building tree 23 of 400
building tree 24 of 400
building tree 25 of 400
building tree 26 of 400
building tree 27 of 400
building tree 28 of 400
building tree 29 of 400
building tree 30 of 400
building tree 31 of 400
building tree 32 of 400
building tree 33 of 400
building tree 34 of 400
building tree 35 of 400
building tree 36 of 400
building tree 37 of 400
building tree 38 of 400
building tree 39 of 400
building tree 40 of 400
building tree 41 of 400
building tree 42 of 400
b

[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    8.7s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    8.8s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    9.7s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    9.1s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    9.6s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    9.6s
Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/s

building tree 258 of 407
building tree 259 of 407
building tree 260 of 407
building tree 261 of 407
building tree 262 of 407
building tree 263 of 407
building tree 264 of 407
building tree 265 of 407
building tree 266 of 407
building tree 267 of 407
building tree 268 of 407
building tree 269 of 407
building tree 270 of 407
building tree 271 of 407
building tree 272 of 407
building tree 273 of 407
building tree 274 of 407
building tree 275 of 407
building tree 276 of 407
building tree 277 of 407
building tree 278 of 407
building tree 279 of 407
building tree 280 of 407
building tree 281 of 407
building tree 282 of 407
building tree 283 of 407
building tree 284 of 407
building tree 285 of 407
building tree 286 of 407
building tree 287 of 407
building tree 288 of 407
building tree 289 of 407
building tree 290 of 407
building tree 291 of 407
building tree 292 of 407
building tree 293 of 407
building tree 294 of 407
building tree 295 of 407
building tree 296 of 407
building tree 297 of 407


Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3

building tree 265 of 421
building tree 266 of 421
building tree 267 of 421
building tree 268 of 421
building tree 269 of 421
building tree 270 of 421
building tree 271 of 421
building tree 272 of 421
building tree 273 of 421
building tree 274 of 421
building tree 275 of 421
building tree 276 of 421
building tree 277 of 421
building tree 278 of 421
building tree 279 of 421
building tree 280 of 421
building tree 281 of 421
building tree 282 of 421
building tree 283 of 421
building tree 284 of 421
building tree 285 of 421
building tree 286 of 421
building tree 287 of 421
building tree 288 of 421
building tree 289 of 421
building tree 290 of 421
building tree 291 of 421
building tree 292 of 421
building tree 293 of 421
building tree 294 of 421
building tree 295 of 421
building tree 296 of 421
building tree 297 of 421
building tree 298 of 421
building tree 299 of 421
building tree 300 of 421
building tree 301 of 421
building tree 302 of 421
building tree 303 of 421
building tree 304 of 421


[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    9.5s
Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^

building tree 265 of 414
building tree 266 of 414
building tree 267 of 414
building tree 268 of 414
building tree 269 of 414
building tree 270 of 414
building tree 271 of 414
building tree 272 of 414
building tree 273 of 414
building tree 274 of 414
building tree 275 of 414
building tree 276 of 414
building tree 277 of 414
building tree 278 of 414
building tree 279 of 414
building tree 280 of 414
building tree 281 of 414
building tree 282 of 414
building tree 283 of 414
building tree 284 of 414
building tree 285 of 414
building tree 286 of 414
building tree 287 of 414
building tree 288 of 414
building tree 289 of 414
building tree 290 of 414
building tree 291 of 414
building tree 292 of 414
building tree 293 of 414
building tree 294 of 414
building tree 295 of 414
building tree 296 of 414
building tree 297 of 414
building tree 298 of 414
building tree 299 of 414
building tree 300 of 414
building tree 301 of 414
building tree 302 of 414
building tree 303 of 414
building tree 304 of 414


Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3

building tree 265 of 414
building tree 266 of 414
building tree 267 of 414
building tree 268 of 414
building tree 269 of 414
building tree 270 of 414
building tree 271 of 414
building tree 272 of 414
building tree 273 of 414
building tree 274 of 414
building tree 275 of 414
building tree 276 of 414
building tree 277 of 414
building tree 278 of 414
building tree 279 of 414
building tree 280 of 414
building tree 281 of 414
building tree 282 of 414
building tree 283 of 414
building tree 284 of 414
building tree 285 of 414
building tree 286 of 414
building tree 287 of 414
building tree 288 of 414
building tree 289 of 414
building tree 290 of 414
building tree 291 of 414
building tree 292 of 414
building tree 293 of 414
building tree 294 of 414
building tree 295 of 414
building tree 296 of 414
building tree 297 of 414
building tree 298 of 414
building tree 299 of 414
building tree 300 of 414
building tree 301 of 414
building tree 302 of 414
building tree 303 of 414
building tree 304 of 414


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    2.9s


building tree 265 of 414
building tree 266 of 414
building tree 267 of 414
building tree 268 of 414
building tree 269 of 414
building tree 270 of 414
building tree 271 of 414
building tree 272 of 414
building tree 273 of 414
building tree 274 of 414
building tree 275 of 414
building tree 276 of 414
building tree 277 of 414
building tree 278 of 414
building tree 279 of 414
building tree 280 of 414
building tree 281 of 414
building tree 282 of 414
building tree 283 of 414
building tree 284 of 414
building tree 285 of 414
building tree 286 of 414
building tree 287 of 414
building tree 288 of 414
building tree 289 of 414
building tree 290 of 414
building tree 291 of 414
building tree 292 of 414
building tree 293 of 414
building tree 294 of 414
building tree 295 of 414
building tree 296 of 414
building tree 297 of 414
building tree 298 of 414
building tree 299 of 414
building tree 300 of 414
building tree 301 of 414
building tree 302 of 414
building tree 303 of 414
building tree 304 of 414


Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 87, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3

building tree 265 of 414
building tree 266 of 414
building tree 267 of 414
building tree 268 of 414
building tree 269 of 414
building tree 270 of 414
building tree 271 of 414
building tree 272 of 414
building tree 273 of 414
building tree 274 of 414
building tree 275 of 414
building tree 276 of 414
building tree 277 of 414
building tree 278 of 414
building tree 279 of 414
building tree 280 of 414
building tree 281 of 414
building tree 282 of 414
building tree 283 of 414
building tree 284 of 414
building tree 285 of 414
building tree 286 of 414
building tree 287 of 414
building tree 288 of 414
building tree 289 of 414
building tree 290 of 414
building tree 291 of 414
building tree 292 of 414
building tree 293 of 414
building tree 294 of 414
building tree 295 of 414
building tree 296 of 414
building tree 297 of 414
building tree 298 of 414
building tree 299 of 414
building tree 300 of 414
building tree 301 of 414
building tree 302 of 414
building tree 303 of 414
building tree 304 of 414


  return fit_method(estimator, *args, **kwargs)
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    2.1s
[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    1.5s
Traceback (most recent call last):
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 345, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/moritzlindner/anaconda3/lib/python3.11/site-packages/sklearn/me

building tree 1 of 400
building tree 2 of 400
building tree 3 of 400
building tree 4 of 400
building tree 5 of 400
building tree 6 of 400
building tree 7 of 400
building tree 8 of 400
building tree 9 of 400
building tree 10 of 400
building tree 11 of 400
building tree 12 of 400
building tree 13 of 400
building tree 14 of 400
building tree 15 of 400
building tree 16 of 400
building tree 17 of 400
building tree 18 of 400
building tree 19 of 400
building tree 20 of 400
building tree 21 of 400
building tree 22 of 400
building tree 23 of 400
building tree 24 of 400
building tree 25 of 400
building tree 26 of 400
building tree 27 of 400
building tree 28 of 400
building tree 29 of 400
building tree 30 of 400
building tree 31 of 400
building tree 32 of 400
building tree 33 of 400
building tree 34 of 400
building tree 35 of 400
building tree 36 of 400
building tree 37 of 400
building tree 38 of 400
building tree 39 of 400
building tree 40 of 400
building tree 41 of 400
building tree 42 of 400
b

[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.9s


building tree 46 of 400
building tree 47 of 400
building tree 48 of 400
building tree 49 of 400
building tree 50 of 400
building tree 51 of 400
building tree 52 of 400
building tree 53 of 400
building tree 54 of 400
building tree 55 of 400
building tree 56 of 400
building tree 57 of 400
building tree 58 of 400
building tree 59 of 400
building tree 60 of 400
building tree 61 of 400
building tree 62 of 400
building tree 63 of 400
building tree 64 of 400
building tree 65 of 400
building tree 66 of 400
building tree 67 of 400
building tree 68 of 400
building tree 69 of 400
building tree 70 of 400
building tree 71 of 400
building tree 72 of 400
building tree 73 of 400
building tree 74 of 400
building tree 75 of 400
building tree 76 of 400
building tree 77 of 400
building tree 78 of 400
building tree 79 of 400
building tree 80 of 400
building tree 81 of 400
building tree 82 of 400
building tree 83 of 400
building tree 84 of 400
building tree 85 of 400
building tree 86 of 400
building tree 87

[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    4.1s


building tree 167 of 400
building tree 168 of 400
building tree 169 of 400
building tree 170 of 400
building tree 171 of 400
building tree 172 of 400
building tree 173 of 400
building tree 174 of 400
building tree 175 of 400
building tree 176 of 400
building tree 177 of 400
building tree 178 of 400
building tree 179 of 400
building tree 180 of 400
building tree 181 of 400
building tree 182 of 400
building tree 183 of 400
building tree 184 of 400
building tree 185 of 400
building tree 186 of 400
building tree 187 of 400
building tree 188 of 400
building tree 189 of 400
building tree 190 of 400
building tree 191 of 400
building tree 192 of 400
building tree 193 of 400
building tree 194 of 400
building tree 195 of 400
building tree 196 of 400
building tree 197 of 400
building tree 198 of 400
building tree 199 of 400
building tree 200 of 400
building tree 201 of 400
building tree 202 of 400
building tree 203 of 400
building tree 204 of 400
building tree 205 of 400
building tree 206 of 400


[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    7.8s


building tree 373 of 400
building tree 374 of 400
building tree 375 of 400
building tree 376 of 400
building tree 377 of 400
building tree 378 of 400
building tree 379 of 400
building tree 380 of 400
building tree 381 of 400
building tree 382 of 400
building tree 383 of 400
building tree 384 of 400
building tree 385 of 400
building tree 386 of 400
building tree 387 of 400
building tree 388 of 400
building tree 389 of 400
building tree 390 of 400
building tree 391 of 400
building tree 392 of 400
building tree 393 of 400
building tree 394 of 400
building tree 395 of 400
building tree 396 of 400
building tree 397 of 400
building tree 398 of 400
building tree 399 of 400
building tree 400 of 400
Total fitting time: 74.0s


Extract the best classifier, its paramester and the score

In [12]:
best_clf = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [13]:
print(f"best params {best_params}")

best params {'bootstrap': True, 'max_depth': 35, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 400}


In [14]:
print(f"best score {best_score}")

best score nan


# Make predictions on the test set and show model metrics

In [15]:
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.5294117647058824
Classification Report:
              precision    recall  f1-score   support

        high       0.58      0.37      0.45        67
         low       0.42      0.56      0.48       106
    moderate       0.45      0.43      0.44       101
   very_high       0.57      0.55      0.56        22
    very_low       0.75      0.72      0.73        95

    accuracy                           0.53       391
   macro avg       0.55      0.52      0.53       391
weighted avg       0.54      0.53      0.53       391

Confusion Matrix:
[[25  9 22  8  3]
 [ 4 59 26  0 17]
 [ 8 46 43  1  3]
 [ 6  0  4 12  0]
 [ 0 27  0  0 68]]


[Parallel(n_jobs=1)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 161 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 364 tasks      | elapsed:    0.0s


Save model

In [16]:
joblib.dump(best_clf, '../new_models/random_forest.pkl')

['../new_models/random_forest.pkl']

Shaply

In [17]:
model = joblib.load('../new_models/random_forest.pkl')
feature_names = df.columns.to_list()
explainer = shap.TreeExplainer(model, feature_names=feature_names)
explanation = explainer(df)

In [18]:
file_path = f"../shapley-values/saved_values/shapley-values/saved_values/randomforest-shapley_values.json"
joblib.dump(explanation, file_path)

['../shapley-values/saved_values/randomforest-shapley_values.json']