In [1]:
import pandas as pd
import re
import numpy as np
import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from tqdm import tqdm
import math
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

### Importing Data

In [3]:
df=pd.read_csv('../1. Data Extraction & Cleaning/df.csv')
df.date=pd.to_datetime(df.date)
df.set_index('date')

Unnamed: 0_level_0,ZABT,ZATT,ZSFH,RSNA,NYTPOP,ACTLISCOU35620,NEWLISCOU35620,PENLISCOU35620,PRIREDCOU35620,NEWY636BPPRIV,...,CUURA101SAR,CUURA101SAF116,CUURA101SAA,CUURA101SANL1,CUURA101SS47016,PSAVERT,MICH,FEDFUNDS,T10Y2Y,T10Y3M
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-08-31,263665.000000,7.639080e+05,421874.000000,2515.697593,19336.456,71440.0,21816.0,9263.0,12840.0,2939.0,...,118.985,254.813,121.576,178.913,194.696,6.8,2.7,0.39,0.79,1.18
2016-09-30,264317.000000,7.653900e+05,422882.000000,2517.262497,19336.456,69427.0,17092.0,9077.0,11850.0,4275.0,...,119.552,254.393,129.847,180.747,188.480,6.8,2.5,0.40,0.78,1.25
2016-10-31,265031.000000,7.670250e+05,423987.000000,2512.303863,19336.456,67715.0,19382.0,8934.0,12878.0,4258.0,...,119.874,254.879,133.232,183.620,190.183,6.8,2.4,0.40,0.83,1.31
2016-11-30,265862.000000,7.689390e+05,425312.000000,2507.025158,19336.456,66569.0,16736.0,8943.0,11808.0,2559.0,...,120.142,255.126,130.743,183.239,192.314,6.9,2.4,0.40,0.98,1.50
2016-12-31,266632.000000,7.709340e+05,426762.000000,2496.784727,19336.456,62162.0,15042.0,9062.0,9930.0,3483.0,...,121.092,256.281,124.428,182.221,202.140,7.0,2.4,0.41,1.26,1.89
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-10-31,409337.000000,1.016134e+06,624836.000000,3131.359562,19768.458,39687.0,18358.0,18632.0,8096.0,3438.0,...,141.277,296.748,133.312,214.066,313.305,3.0,4.7,2.56,-0.39,0.50
2022-11-30,410536.000000,1.020136e+06,625394.000000,3099.155451,19768.458,40049.0,14664.0,17675.0,8752.0,2718.0,...,144.058,298.499,129.743,213.962,304.637,3.4,5.0,3.08,-0.41,-0.12
2022-12-31,410411.000000,1.019711e+06,624669.000000,3075.999271,19768.458,38650.0,11362.0,16282.0,7498.0,3549.0,...,142.249,299.317,123.985,215.400,323.524,4.1,4.9,3.78,-0.70,-0.69
2023-01-31,410105.000000,1.019926e+06,623959.000000,3068.361445,19768.458,33983.0,7348.0,14434.0,3820.0,4438.0,...,142.202,302.110,122.084,208.488,302.990,4.4,4.4,4.10,-0.53,-0.54


In [4]:
# Separate the target variable from the features
target = df['RSNA']
features = df[['ZABT','ZSFH','NEWY636FIRE','CUURA101SAF11','CUURA101SS47016', 'MICH']]

# Normalize the features
scaler = MinMaxScaler()
normalized_features = scaler.fit_transform(features)

# Put the normalized features and target back into a new DataFrame
df_normalized = pd.DataFrame(normalized_features, columns=features.columns)
df_normalized['RSNA'] = target

In [5]:
predictors = df_normalized[['ZABT','ZSFH','NEWY636FIRE','CUURA101SAF11','CUURA101SS47016', 'MICH']]

In [6]:
# split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)

### Creating Gradient Boosting Regressor

In [7]:
# Define the hyperparameter space to search over
param_grid = {'learning_rate': [0.1, 0.15, 0.2],
              'n_estimators': [300, 350, 400, 450],
              'max_depth': [2, 3, 4],
              'min_samples_split': [2, 3, 4],
              'min_samples_leaf': [1, 2]}

# Create a Gradient Boosting regressor
reg = GradientBoostingRegressor()

# Run grid search to find the best hyperparameters
grid_search = GridSearchCV(reg, param_grid=param_grid, cv=5, n_jobs=-1)

for params in tqdm(param_grid):
    grid_search.fit(X_train, y_train)
    
# print the best hyperparameters
print("Best hyperparameters:", grid_search.best_params_)

# use the best hyperparameters to create the final model
final_model_gb = GradientBoostingRegressor(**grid_search.best_params_)
final_model_gb.fit(X_train, y_train)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:30<00:00, 18.06s/it]

Best hyperparameters: {'learning_rate': 0.15, 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300}





In [8]:
# Evaluate the final model on the testing set
y_pred = final_model_gb.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse=math.sqrt(mse)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
r2 = final_model_gb.score(X_test, y_test)

In [9]:
print("Best hyperparameters:", grid_search.best_params_)
print("MAE:", mae)
print("MSE on testing set:", mse)
print("RMSE on testing set:" , rmse)
print("MAPE:", mape)
print("R-squared on testing set:", r2)

Best hyperparameters: {'learning_rate': 0.15, 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 300}
MAE: 19.74368489544642
MSE on testing set: 609.9744919916923
RMSE on testing set: 24.697661670524443
MAPE: 0.7283330815899238
R-squared on testing set: 0.9851086893923215


### Using GBR Model

In [10]:
### MARCH
features = np.array([[367681.3436, 588200.022840, 810.39730, 315.610, 297.364, 3.9]])
# Normalize the array
normalized_features = scaler.transform(features)
prediction = final_model_gb.predict(normalized_features)
print("Prediction: {}".format(prediction))

Prediction: [2975.32779671]




In [11]:
normalized_features

array([[0.70821567, 0.81724657, 0.99335012, 1.        , 0.52356132,
        0.54545455]])

In [12]:
### APRIL
features = np.array([[370384.9491, 588633.772961, 811.15608, 316.115, 296.949,4.1]])
normalized_features = scaler.transform(features)
prediction = final_model_gb.predict(features)
print("Prediction: {}".format(prediction))

Prediction: [3119.98033685]




In [13]:
normalized_features

array([[0.7266237 , 0.81937782, 1.00785665, 1.00744223, 0.52156582,
        0.60606061]])