In [13]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_percentage_error

import pandas as pd
import numpy as np
import itertools
import pickle
from datetime import datetime

In [14]:
df = pd.read_csv("../data/processed/concrete_min_max_scaled.csv")
df.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,1.0,0.0,0.0,0.321086,0.07764,0.694767,0.20572,0.074176,79.99
1,1.0,0.0,0.0,0.321086,0.07764,0.738372,0.20572,0.074176,61.89
2,0.526256,0.396494,0.0,0.848243,0.0,0.380814,0.0,0.739011,40.27
3,0.526256,0.396494,0.0,0.848243,0.0,0.380814,0.0,1.0,41.05
4,0.220548,0.368392,0.0,0.560703,0.0,0.515698,0.580783,0.986264,44.3


In [15]:
y = df["csMPa"]
x = df[["cement", "slag", "flyash", "age", "water", "superplasticizer"]]

In [16]:
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [17]:
parameters = {"n_estimators": [100, 150, 200],
              "criterion": ["mse", "mae"],
              "min_samples_split": [2, 4, 6],
              "max_features": ["auto", "sqrt", "log2"]}

In [18]:
grid_search = list(itertools.product(*parameters.values()))

In [22]:
mape_scores = []
r2_scores = []

for params in grid_search:
    rf = RandomForestRegressor(n_estimators=params[0],
                       criterion=params[1],
                       min_samples_split=params[2],
                       max_features=params[3])
    rf.fit(x_train, y_train)
    pred = rf.predict(x_test)
    mape = mean_absolute_percentage_error(y_test, pred)
    mape_scores.append(mape)
    
    r2 = r2_score(y_test, pred)
    r2_scores.append(r2)

In [26]:
mape_scores[np.argmax(r2_scores)]

0.11957234859465854

In [27]:
r2_scores[np.argmax(r2_scores)]

0.9007555345871645

In [29]:
best_params = grid_search[np.argmax(r2_scores)]
best_model = RandomForestRegressor(n_estimators=best_params[0],
                   criterion=best_params[1],
                   min_samples_split=best_params[2],
                   max_features=best_params[3])
best_model.fit(x_train, y_train)

RandomForestRegressor(criterion='mae')

In [31]:
filename = datetime.now().strftime("%Y_%m_%d_%H_%M")

with open(f"../models/random_forest_regressor/{filename}.pkl", "wb") as f:
    pickle.dump(best_model, f)