In [None]:
pip install -r ./requirements.txt

In [None]:
import time

import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Relative path of the merged book-attributes CSV that this notebook reads with pandas.
merged_book_attributes_gold = './data/gold/merged_book_attributes.csv'

In [None]:
# Load the merged book-attributes CSV into a DataFrame.
merged_book_attributes = pd.read_csv(filepath_or_buffer=merged_book_attributes_gold)

In [None]:
# Predictor columns and the target column used for the regression.
feature_columns = [
    'price',
    'full_review_sentiment_score',
    'review_summary_sentiment_score',
    'books_average_rating',
]
target_columns = ['rating']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# The target is selected with a list, so y_train / y_test are single-column DataFrames.
X_train, X_test, y_train, y_test = train_test_split(
    merged_book_attributes[feature_columns],
    merged_book_attributes[target_columns],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Candidate hyperparameter values that the randomized search samples from.
param_grid = {
    "n_estimators": list(range(200, 2001, 200)),  # 200, 400, ..., 2000
    "max_features": ["log2", "sqrt"],
    "max_depth": list(range(10, 110, 10)),  # 10, 20, ..., 100
    "min_samples_split": [2, 4, 8],
    "bootstrap": [True, False],
}

# Randomized search over the space above, scored by r2 with 5-fold cross-validation
# and using all available cores (n_jobs=-1). n_iter is not set, so the library's
# default number of sampled candidates applies.
# random_state is set on the search itself: seeding only the estimator does not fix
# which parameter combinations get sampled, so without it the selected parameters
# and best score would change from run to run.
best_rf_regressor = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    n_jobs=-1,
    cv=5,
    scoring='r2',
    random_state=42,
)

In [None]:
# Time the full randomized search (all cross-validation fits plus the final refit).
# perf_counter() is a monotonic, high-resolution clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time is adjusted.
start = time.perf_counter()
# The target is a single-column DataFrame; flatten it to the 1-D array the estimator expects.
best_rf_regressor.fit(X_train, y_train.values.ravel())
stop = time.perf_counter()

# Elapsed seconds, rounded to two decimals for reporting.
rf_regressor_training_time = round(stop - start, 2)

In [None]:
# Report how long the search took.
print(f"Random Forests training time: {rf_regressor_training_time}s")

# Report the winning parameter combination and its mean cross-validated r2.
best_params = best_rf_regressor.best_params_
best_cv_r2 = best_rf_regressor.best_score_
print("Best parameters:", best_params)
print("Best r2 score:", best_cv_r2)

In [None]:
# Score the fitted search on the held-out test split and report it.
test_r2 = best_rf_regressor.score(X_test, y_test)
print(f"r2 score on test data: {test_r2}")