# 1. Standardized vs. non-standardized data

In [3]:
from src.train_model import split_data, standardize_data
from src.preprocessing import load_data
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from dotenv import load_dotenv

load_dotenv()


def _required_path_from_env(var_name):
    """Return the path stored in environment variable ``var_name``.

    Forward slashes are replaced with the platform separator (``os.sep``).

    Raises:
        KeyError: if the variable is not set. This replaces the opaque
            ``AttributeError`` that calling ``.replace`` on ``None`` would give.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        raise KeyError(f'Environment variable {var_name} is not set (check your environment / .env file)')
    return raw_value.replace('/', os.sep)


# Configuration is read from the environment (populated by load_dotenv above).
PREPROCESSED_FILE_PATH = _required_path_from_env('PREPROCESSED_DATA_PATH')
# NOTE(review): TARGET_COLUMN is passed through as-is and is None when unset;
# confirm how split_data treats a missing target column.
TARGET_COLUMN = os.getenv('TARGET_COLUMN')
BEST_PARAMS_PATH = _required_path_from_env('BEST_PARAMS_PATH')

data = load_data(PREPROCESSED_FILE_PATH)

X_train, X_test, y_train, y_test = split_data(data, TARGET_COLUMN)

base_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Baseline: fit and evaluate on the raw (non-standardized) features.
# The *_me values are root mean squared errors. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in recent scikit-learn releases.
base_model.fit(X_train, y_train)
non_standardized_score = base_model.score(X_test, y_test)
non_standardized_me = mean_squared_error(y_test, base_model.predict(X_test)) ** 0.5

# Same estimator configuration, refit on standardized features. Calling fit
# again replaces the previous fit, so base_model ends this cell trained on the
# standardized data.
X_train_std, X_test_std = standardize_data(X_train, X_test)
base_model.fit(X_train_std, y_train)
standardized_score = base_model.score(X_test_std, y_test)
standardized_me = mean_squared_error(y_test, base_model.predict(X_test_std)) ** 0.5

print(f'Non-standardized score: {non_standardized_score}')
print(f'Non-standardized ME: {non_standardized_me}')
print(f'Standardized score: {standardized_score}')
print(f'Standardized ME: {standardized_me}')

Loaded dataset from data/preprocessed.csv
Split the data into training and testing sets (80%/20%)
Standardized the data
Non-standardized score: 0.9121564406849627
Non-standardized ME: 19221.480517430282
Standardized score: 0.9044382420997802
Standardized ME: 20048.1331754706


#### => Keeping the data non-standardized seems to yield better performance for random forest regression!

# 2. Hyperparameter Tuning

## 2.1 Random Search with Cross-Validation

In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Untuned estimator that the searches clone and configure per candidate.
model = RandomForestRegressor(random_state=42)

# Sampling space for the randomized search. scipy's randint(low, high) draws
# integers from [low, high), so the upper bounds below are exclusive.
random_grid = {
  'n_estimators': randint(50, 500),
  'max_features': ['sqrt', 'log2', None, 0.1, 0.5],
  'max_depth': randint(5, 30),
  'min_samples_split': randint(2, 10),
  'min_samples_leaf': randint(2, 10),
  'bootstrap': [True, False]
}

# 100 sampled candidates x 3-fold CV = 300 fits, ranked by (negated) MAE.
rf_random = RandomizedSearchCV(
  random_state=42,
  estimator=model,
  param_distributions=random_grid,
  scoring='neg_mean_absolute_error',
  n_iter=100,
  cv=3,
  verbose=1,
  n_jobs=-1
)

rf_random.fit(X_train, y_train)

# With the default refit=True the search has already refit the best candidate
# on the full training set, so best_estimator_ is ready to use. Fitting it
# again would only repeat the same seeded fit.
best_random = rf_random.best_estimator_
best_random_score = best_random.score(X_test, y_test)
# Root mean squared error; the square root is taken explicitly because
# `squared=False` is deprecated in recent scikit-learn releases.
best_random_me = mean_squared_error(y_test, best_random.predict(X_test)) ** 0.5

print(f'Best random score: {best_random_score}')
print(f'Best random ME: {best_random_me}')
print(f'Base model score: {non_standardized_score}')
print(f'Base model ME: {non_standardized_me}')

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best random score: 0.9116627070456951
Best random ME: 19275.422967282422
Base model score: 0.9121564406849627
Base model ME: 19221.480517430282


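#### => Best estimator after random search does not beat the base model yet (score 0.9117 vs. 0.9122, ME 19275 vs. 19221), so its parameters are used as the starting point for a finer grid search!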

## 2.2 Grid Search with Cross-Validation

In [8]:
# Show the winning random-search configuration; the grid search is centred on it.
random_search_best_params = rf_random.best_params_
print(random_search_best_params)

{'bootstrap': False, 'max_depth': 16, 'max_features': 0.1, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 242}


In [9]:
from sklearn.model_selection import GridSearchCV

# Exhaustive grid around the random-search result:
# 4 * 3 * 3 * 3 * 3 * 1 = 324 candidates.
param_grid = {
  'n_estimators': [180, 220, 260, 300],
  'max_features': [0.1, 0.2, 0.3],
  'max_depth': [13, 16, 19],
  'min_samples_split': [5, 6, 7],
  'min_samples_leaf': [1, 2, 3],
  'bootstrap': [False]
}

# Reuses the untuned `model` estimator defined in the random-search cell.
grid_search = GridSearchCV(
  estimator=model,
  param_grid=param_grid,
  scoring='neg_mean_absolute_error',
  cv=3,
  verbose=1,
  n_jobs=-1
)

grid_search.fit(X_train, y_train)

# With the default refit=True the search has already refit the best candidate
# on the full training set, so best_estimator_ is ready to use. Fitting it
# again would only repeat the same seeded fit.
best_grid = grid_search.best_estimator_
best_grid_score = best_grid.score(X_test, y_test)
# Root mean squared error; the square root is taken explicitly because
# `squared=False` is deprecated in recent scikit-learn releases.
best_grid_me = mean_squared_error(y_test, best_grid.predict(X_test)) ** 0.5

print(f'Best grid score: {best_grid_score}')
print(f'Best grid ME: {best_grid_me}')
print(f'Base model score: {non_standardized_score}')
print(f'Base model ME: {non_standardized_me}')

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best grid score: 0.9204163287279854
Best grid ME: 18295.482115763014
Base model score: 0.9121564406849627
Base model ME: 19221.480517430282


In [11]:
import json

# Persist the tuned hyperparameters. The context manager closes the file, so
# the JSON is flushed to disk as soon as the cell finishes.
with open(BEST_PARAMS_PATH, 'w') as best_params_file:
    json.dump(grid_search.best_params_, best_params_file)

#### => Achieved an improvement over the base model of ME: -926€ and R2: +0.826 percentage points (0.9122 → 0.9204)