In [1]:
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv')

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv')

In [3]:
def print_results(results):
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = sorted(results.cv_results_['mean_test_score'], reverse=True)
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{round(mean,3)} (+/- {round(std * 2, 3)}) for {params}')

In [4]:
rf = RandomForestClassifier()
rfscores = cross_val_score(rf, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=16)
print(rfscores)

[0.37773924 0.38893268 0.38010405 0.38909034 0.38514898]


In [7]:
rfparams = {
    'n_estimators': [25, 50],
    'max_depth': [5, 50, 100, None],
    'min_samples_leaf': [3],
    'max_features': ['sqrt', 'log2', None],
    'max_leaf_nodes': [5, 50, 500, None],
    'bootstrap': [True, False],
    'oob_score': [True, False],
    'warm_start': [True, False],
    'max_samples': [None, 1, 2]
}
cv = GridSearchCV(rf, rfparams, cv=5, n_jobs=16)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

4800 fits failed out of a total of 11520.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
960 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\caden\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\caden\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\ensemble\_forest.py", line 434, in fit
    raise ValueError("Out of bag estimation only available if bootstrap=True")
ValueError: Out of bag estimation only available if bootstrap=True

--------------------------------------------------------------------------------
3840 fits failed with th

BEST PARAMS: {'bootstrap': False, 'max_depth': 100, 'max_features': 'sqrt', 'max_leaf_nodes': 500, 'max_samples': None, 'min_samples_leaf': 3, 'n_estimators': 50, 'oob_score': False, 'warm_start': False}

0.39 (+/- 0.011) for {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'max_samples': None, 'min_samples_leaf': 3, 'n_estimators': 25, 'oob_score': True, 'warm_start': True}
0.39 (+/- 0.013) for {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'max_samples': None, 'min_samples_leaf': 3, 'n_estimators': 25, 'oob_score': True, 'warm_start': False}
0.389 (+/- 0.014) for {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'max_samples': None, 'min_samples_leaf': 3, 'n_estimators': 25, 'oob_score': False, 'warm_start': True}
0.389 (+/- 0.014) for {'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 5, 'max_samples': None, 'min_samples_leaf': 3, 'n_estimators': 25, 'oob_score

In [9]:
rf.fit(tr_features, tr_labels.values.ravel())

In [10]:
y_pred = rf.predict(te_features)
accuracy = round(accuracy_score(te_labels, y_pred), 8)
precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
print(f'A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

A: 0.38479001 / P: 0.37745289 / R: 0.38479001 / F1: 0.37824485
