# Random Forest Classifier

Imports

In [1]:
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

import joblib

Load data and labels into dataframes

In [2]:
# Read the feature table and the target labels from the local ./data folder.
features_csv = "./data/lucas_organic_carbon_training_and_test_data_NEW.csv"
target_csv = "./data/lucas_organic_carbon_target.csv"

df = pd.read_csv(features_csv)
labels = pd.read_csv(target_csv)

In [None]:
df

In [None]:
df.describe()

In [None]:
df.shape

In [None]:
labels.shape

Create the scaler and scale the data

In [3]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on the complete table, i.e. before the
# train/test split further down, so test rows contribute to the fitted statistics.
scaler = StandardScaler()
scaler.fit(df)
scaled_data = scaler.transform(df)

# Re-wrap the scaled array with the original column names for display.
scaled_df = pd.DataFrame(data=scaled_data, columns=df.columns)
scaled_df

Unnamed: 0,500.0,500.5,501.0,501.5,502.0,502.5,503.0,503.5,504.0,504.5,...,2495.0,2495.5,2496.0,2496.5,2497.0,2497.5,2498.0,2498.5,2499.0,2499.5
0,-0.317694,-0.318224,-0.318744,-0.319256,-0.319759,-0.320253,-0.320739,-0.321217,-0.321686,-0.322148,...,0.320884,0.311196,0.301124,0.290665,0.279820,0.268590,0.256980,0.244997,0.232651,0.219954
1,0.256876,0.254926,0.252993,0.251076,0.249174,0.247289,0.245420,0.243566,0.241727,0.239904,...,0.953367,0.949280,0.944598,0.939293,0.933338,0.926704,0.919368,0.911309,0.902508,0.892950
2,0.091526,0.090582,0.089648,0.088721,0.087803,0.086892,0.085990,0.085095,0.084209,0.083330,...,-0.016399,-0.031121,-0.046161,-0.061504,-0.077135,-0.093032,-0.109173,-0.125532,-0.142080,-0.158787
3,-0.132203,-0.129195,-0.126215,-0.123263,-0.120339,-0.117442,-0.114573,-0.111730,-0.108914,-0.106124,...,-0.503614,-0.486400,-0.468538,-0.450026,-0.430868,-0.411068,-0.390637,-0.369589,-0.347943,-0.325724
4,0.536071,0.533381,0.530712,0.528063,0.525434,0.522825,0.520236,0.517667,0.515117,0.512587,...,0.194378,0.177311,0.159763,0.141743,0.123262,0.104338,0.084991,0.065242,0.045121,0.024658
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9890,1.068931,1.065211,1.061515,1.057844,1.054196,1.050572,1.046972,1.043396,1.039844,1.036315,...,-1.896294,-1.884456,-1.871355,-1.856935,-1.841145,-1.823937,-1.805269,-1.785104,-1.763413,-1.740174
9891,-0.533231,-0.528060,-0.522935,-0.517854,-0.512817,-0.507825,-0.502875,-0.497969,-0.493105,-0.488283,...,0.317374,0.315407,0.313229,0.310831,0.308203,0.305338,0.302229,0.298870,0.295256,0.291383
9892,0.557028,0.562643,0.568195,0.573686,0.579116,0.584485,0.589796,0.595047,0.600241,0.605379,...,-1.968334,-1.971239,-1.973170,-1.974056,-1.973825,-1.972409,-1.969740,-1.965754,-1.960391,-1.953597
9893,0.121474,0.111586,0.101796,0.092101,0.082501,0.072994,0.063580,0.054258,0.045026,0.035884,...,1.528755,1.514539,1.499200,1.482700,1.465000,1.446070,1.425882,1.404414,1.381652,1.357588


Split the dataset into training and test sets

In [4]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

In [None]:
# Reduce the training label frame to its 'x' column so a 1-D target is passed to fit.
# Guarded so the cell can be re-run: once `y_train` is already a Series, indexing
# it with ['x'] a second time would raise a KeyError.
if isinstance(y_train, pd.DataFrame):
    y_train = y_train['x']

Create the parameter grid for grid search / randomized search

In [None]:
# Only the forest size is actually searched: eight evenly spaced values from
# 400 to 450 (inclusive), truncated to integers. Every other hyperparameter is
# pinned to a single value.
n_estimators_candidates = np.linspace(400, 450, 8).astype(int).tolist()

param_grid = dict(
    n_estimators=n_estimators_candidates,
    max_depth=[35],
    min_samples_split=[10],
    min_samples_leaf=[4],
    max_features=['sqrt'],
    bootstrap=[True],
)
param_grid

Create Scoring method

In [None]:
# Micro-averaged F1, passed to the search below as its model-selection metric.
scorer = make_scorer(score_func=f1_score, average='micro')

Run the grid search / randomized search


In [None]:
# Exhaustive 5-fold cross-validated search over `param_grid`, scored with `scorer`.
# The forest itself is kept quiet (no `verbose=2`): per-tree logging from every
# parallel fit floods the cell output, and the setting would be stored on the
# best estimator so that each later `predict` call prints joblib progress too.
# `n_estimators` is not set on the base estimator because the grid supplies it.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring=scorer,
    verbose=1,
)

# Wall-clock duration of the whole `fit` call, rounded to whole seconds.
# The start timestamp gets its own name so `fit_time` only ever holds a duration.
fit_start = time.perf_counter()
grid_search.fit(X_train, y_train)
fit_time = np.round(time.perf_counter() - fit_start)
print(f'Total fitting time: {fit_time}s')

Extract the best classifier, its parameters, and its score

In [None]:
# Pull the winning estimator, its hyperparameters and its cross-validation
# score out of the fitted search object.
best_clf, best_params, best_score = (
    grid_search.best_estimator_,
    grid_search.best_params_,
    grid_search.best_score_,
)

In [None]:
print(f"best params {best_params}")

In [None]:
print(f"best score {best_score}")

# Make predictions on the test set and show model metrics

In [None]:
# Predict the held-out test rows with the best estimator found by the search.
y_pred = grid_search.best_estimator_.predict(X_test)

# Overall accuracy first ...
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# ... then each remaining summary under its own heading.
for heading, metric in (
    ("Classification Report:", classification_report),
    ("Confusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

Save model

In [None]:
# Persist the tuned forest to disk.
# NOTE(review): the "Load model" cell reads './models/best_model_random_forest.pkl',
# not the file written here — confirm which artifact is intended.
joblib.dump(value=best_clf, filename='./models/random_forest.pkl')

Load model

In [7]:
# Load a previously stored forest.
# NOTE(review): this path differs from the one written by the "Save model" cell
# ('./models/random_forest.pkl') — confirm the file exists on a fresh run.
# joblib files are pickle-based: only load artifacts from a trusted source.
model = joblib.load(filename='./models/best_model_random_forest.pkl')
model

In [8]:
# Evaluate the loaded model on the held-out test rows: compute both summaries
# first, then print them.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)



Accuracy: 0.7842344618494189
Classification Report:
              precision    recall  f1-score   support

        high       0.60      0.52      0.56       130
         low       0.37      0.09      0.15       224
    moderate       0.49      0.36      0.42       190
   very_high       0.71      0.61      0.65        28
    very_low       0.84      0.98      0.90      1407

    accuracy                           0.78      1979
   macro avg       0.60      0.51      0.54      1979
weighted avg       0.73      0.78      0.74      1979


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:    0.2s finished


In [11]:
# Support-weighted F1 for the current `y_pred` against the test labels.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
f1

0.7447171403439661