In [1]:
import quandl
import pandas as pd
import re
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Importing Data

In [None]:
df=pd.read_csv('../1. Data Extraction & Cleaning/df.csv')
df.date=pd.to_datetime(df.date)
df.set_index('date')

In [48]:
predictors = df[['ZABT','ZSFH','NEWY636FIRE','CUURA101SAF11','CUURA101SS47016', 'MICH']]

In [49]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

### Creating Random Forest Model

In [50]:
# define the random forest model
rf = RandomForestRegressor()

# define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# perform grid search to find the best hyperparameters
grid_search = GridSearchCV(rf, param_grid, cv=5)

for params in tqdm(param_grid):
    grid_search.fit(X_train, y_train)

# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# use the best hyperparameters to create the final model
final_model_rf = RandomForestRegressor(**grid_search.best_params_)
final_model_rf.fit(X_train, y_train)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [08:35<00:00, 128.91s/it]


Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}


RandomForestRegressor(max_depth=10, n_estimators=500)

In [51]:
# Evaluate the final model on the testing set
y_pred = final_model_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse=math.sqrt(mse)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
r2 = final_model_rf.score(X_test, y_test)

In [52]:
print("Best hyperparameters:", grid_search.best_params_)
print("MAE:", mae)
print("MSE on testing set:", mse)
print("RMSE on testing set:" , rmse)
print("MAPE:", mape)
print("R-squared on testing set:", r2)

Best hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
MAE: 21.88127018971562
MSE on testing set: 901.5085231945377
RMSE on testing set: 30.02513152668174
MAPE: 0.8220408019399078
R-squared on testing set: 0.9779914674947715


### Using RF Model

In [54]:
### MARCH
features = np.array([[367681.3436, 588200.022840, 810.39730, 315.610, 297.364, 3.9]])
prediction = final_model_rf.predict(features)
print("Prediction: {}".format(prediction))

Prediction: [2985.35266519]


  "X does not have valid feature names, but"


In [53]:
### APRIL
features = np.array([[370384.9491, 588633.772961, 811.15608, 316.115, 296.949,4.1]])
prediction = final_model_rf.predict(features)
print("Prediction: {}".format(prediction))

Prediction: [2987.93545623]


  "X does not have valid feature names, but"
