In [15]:
import pandas as pd
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
warnings.filterwarnings("ignore", category=FutureWarning)

In [16]:
tr_features = pd.read_csv('data/train_features.csv')
tr_labels = pd.read_csv('data/train_labels.csv')

te_features = pd.read_csv('data/test_features.csv')
te_labels = pd.read_csv('data/test_labels.csv')

In [17]:
def print_results(results):
    print(f'BEST PARAMS: {results.best_params_}\n')

    means = sorted(results.cv_results_['mean_test_score'], reverse=True)
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f'{round(mean,3)} (+/- {round(std * 2, 3)}) for {params}')

In [18]:
rf = RandomForestClassifier()
rfscores = cross_val_score(rf, tr_features, tr_labels.values.ravel(), cv=5, n_jobs=32)
print(rfscores)

[0.38436071 0.38751379 0.38877503 0.37915813 0.38152294]


In [19]:
rfparams = {
    'max_depth': [5,50,100,None],
    'max_leaf_nodes': [5,50,500,None]
}
cv = GridSearchCV(rf, rfparams, cv=5, n_jobs=32)
cv.fit(tr_features, tr_labels.values.ravel())

print_results(cv)

BEST PARAMS: {'max_depth': None, 'max_leaf_nodes': 500}

0.393 (+/- 0.011) for {'max_depth': 5, 'max_leaf_nodes': 5}
0.393 (+/- 0.009) for {'max_depth': 5, 'max_leaf_nodes': 50}
0.391 (+/- 0.011) for {'max_depth': 5, 'max_leaf_nodes': 500}
0.385 (+/- 0.009) for {'max_depth': 5, 'max_leaf_nodes': None}
0.384 (+/- 0.016) for {'max_depth': 50, 'max_leaf_nodes': 5}
0.384 (+/- 0.01) for {'max_depth': 50, 'max_leaf_nodes': 50}
0.384 (+/- 0.016) for {'max_depth': 50, 'max_leaf_nodes': 500}
0.383 (+/- 0.014) for {'max_depth': 50, 'max_leaf_nodes': None}
0.379 (+/- 0.014) for {'max_depth': 100, 'max_leaf_nodes': 5}
0.379 (+/- 0.011) for {'max_depth': 100, 'max_leaf_nodes': 50}
0.378 (+/- 0.012) for {'max_depth': 100, 'max_leaf_nodes': 500}
0.378 (+/- 0.011) for {'max_depth': 100, 'max_leaf_nodes': None}
0.363 (+/- 0.015) for {'max_depth': None, 'max_leaf_nodes': 5}
0.363 (+/- 0.01) for {'max_depth': None, 'max_leaf_nodes': 50}
0.362 (+/- 0.014) for {'max_depth': None, 'max_leaf_nodes': 500}
0.3

In [20]:
rf.fit(tr_features, tr_labels.values.ravel())

In [21]:
y_pred = rf.predict(te_features)
accuracy = round(accuracy_score(te_labels, y_pred), 8)
precision = round(precision_score(te_labels, y_pred, average='weighted'), 8)
recall = round(recall_score(te_labels, y_pred, average='weighted'), 8)
f1 = round(f1_score(te_labels, y_pred, average='weighted'), 8)
print(f'A: {accuracy} / P: {precision} / R: {recall} / F1: {f1}')

A: 0.39210493 / P: 0.38512662 / R: 0.39210493 / F1: 0.38628333
