In [1]:
import quandl
import pandas as pd
import re
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import geopandas as gpd
import seaborn as sns

In [41]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error

### Importing Data

In [None]:
df=pd.read_csv('../1. Data Extraction & Cleaning/df.csv')
df.date=pd.to_datetime(df.date)
df.set_index('date')

In [48]:
predictors = df[['ZABT','ZSFH','NEWY636FIRE','CUURA101SAF11','CUURA101SS47016', 'MICH']]

In [49]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

### Creating Gradient Boosting Regressor

In [55]:
# Define the hyperparameter space to search over
param_grid = {'learning_rate': [0.1, 0.15, 0.2],
              'n_estimators': [300, 350, 400, 450],
              'max_depth': [2, 3, 4],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [1, 2]}

# Create a Gradient Boosting regressor
reg = GradientBoostingRegressor()

# Run grid search to find the best hyperparameters
grid_search = GridSearchCV(reg, param_grid=param_grid, cv=5, n_jobs=-1)

for params in tqdm(param_grid):
    grid_search.fit(X_train, y_train)
    
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# use the best hyperparameters to create the final model
final_model_gb = GradientBoostingRegressor(**grid_search.best_params_)
final_model_gb.fit(X_train, y_train)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:40<00:00, 44.14s/it]

Best hyperparameters: {'learning_rate': 0.15, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 350}





GradientBoostingRegressor(learning_rate=0.15, min_samples_split=4,
                          n_estimators=350)

In [56]:
# Evaluate the final model on the testing set
y_pred = final_model_gb.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse=math.sqrt(mse)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
r2 = final_model_gb.score(X_test, y_test)

In [57]:
print("Best hyperparameters:", grid_search.best_params_)
print("MAE:", mae)
print("MSE on testing set:", mse)
print("RMSE on testing set:" , rmse)
print("MAPE:", mape)
print("R-squared on testing set:", r2)

Best hyperparameters: {'learning_rate': 0.15, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 350}
MAE: 24.911547024147268
MSE on testing set: 920.7749796633082
RMSE on testing set: 30.34427424841972
MAPE: 0.9179753018653927
R-squared on testing set: 0.9775211153876712


### Using GBR Model

In [58]:
### MARCH
features = np.array([[367681.3436, 588200.022840, 810.39730, 315.610, 297.364, 3.9]])
prediction = final_model_gb.predict(features)
print("Prediction: {}".format(prediction))

Prediction: [2979.48562223]


  "X does not have valid feature names, but"


In [59]:
### APRIL
features = np.array([[370384.9491, 588633.772961, 811.15608, 316.115, 296.949,4.1]])
prediction = final_model_gb.predict(features)
print("Prediction: {}".format(prediction))

Prediction: [2979.71812495]


  "X does not have valid feature names, but"
