In [25]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt


from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm.notebook import tqdm
import math
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
from sklearn.ensemble import VotingClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [26]:
#data loading
df = pd.read_csv('shortened_oil_data.csv')
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)
daily_cols = ['Brent_Oil', 'USD_CAD', 'Crude_Oil', 'DXY', 'Emerging_Mkt', 'Gold', 
              'Copper', 'Heating_Oil', 'Junk_Bond', 'Transportation', 'Natural_Gas',
              'Oil_Services', 'Gasoline', 'Inflation_Tips', 'Energy_Stocks', 'SP500', 
              'Oil_VIX', 'US10Y', 'Crack_Spread_321', 'Gold_Oil_Ratio', 'Copper_Oil_Ratio',
              'Transport_Oil_Ratio', 'Service_Oil_Ratio',
              'SMA_5', 'SMA_20', 'SMA_50', 'Dist_SMA_20', 'Dist_SMA_50',
              'BB_Upper', 'BB_Lower', 'BB_Width', 'BB_Position', 
              'RSI_14', 'RSI_28', 'MACD', 'MACD_Signal', 'MACD_Hist',
              'Ret_1d', 'Ret_5d', 'Ret_20d', 'Ret_60d',
              'Realized_Vol_20d', 'Realized_Vol_60d', 'Annual_Vol_20d',
              'Vol_Adj_Mom_20d', 'Vol_Adj_Mom_60d',
              'Price_Range_20d', 'ROC_5', 'ROC_20',
              'Oil_Lag1', 'Oil_Lag2', 'Oil_Lag3', 'Oil_Lag5',
              'RSI_14_Lag1', 'MACD_Hist_Lag1', 'Ret_1d_Lag1', 'Ret_1d_Lag2',
              'US_Stocks_Crude', 'US_Stocks_Ex_SPR_Crude', 'US_Stocks_Crude_SPR']

weekly_cols = [col for col in df.columns if '4W_' in col or col.startswith('US_Stocks_') 
               or col.startswith('US_') or col.startswith('PADD') or col.startswith('AK_')
               or col.startswith('L48_') or col.startswith('New_') or col.startswith('Central_')
               or col.startswith('Lower_') or col.startswith('Cushing_')]

weekly_cols = [col for col in weekly_cols if col not in daily_cols]
daily_cols = [col for col in daily_cols if col in df.columns]

df['Year_Week'] = df['Date'].dt.isocalendar().year.astype(str) + '_W' + df['Date'].dt.isocalendar().week.astype(str).str.zfill(2)

# Use as_index=False instead of include_groups=False
df_weekly = df.groupby('Year_Week', as_index=False).apply(
    lambda x: pd.Series({
        'Date': x['Date'].iloc[-1],
        **{col: x[col].iloc[-1] for col in daily_cols if col in x.columns},
        **{col: x[col].bfill().iloc[-1] if x[col].notna().any() else np.nan 
           for col in weekly_cols if col in x.columns}
    })
) # No need for a separate reset_index() if Year_Week is included in the output Series

In [27]:
y = np.log(df_weekly['Crude_Oil']).diff().shift(-1) * 100

available_weekly_cols = [col for col in weekly_cols if col in df_weekly.columns]
X = df_weekly[available_weekly_cols]


valid_idx = y.dropna().index
X = X.loc[valid_idx]
y = y.loc[valid_idx]

X = X.ffill().bfill()  
X = X.fillna(0)


In [28]:
split_point = int(len(X) * 0.8)
X_train = X.iloc[:split_point].copy()
X_test = X.iloc[split_point:].copy()
y_train = y.iloc[:split_point].copy()
y_test = y.iloc[split_point:].copy()

train_start = df_weekly.iloc[0]['Date'].date()
train_end = df_weekly.iloc[split_point-1]['Date'].date()
test_start = df_weekly.iloc[split_point]['Date'].date()
test_end = df_weekly.iloc[-1]['Date'].date()





In [29]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
#before feature selection
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred_randomforest = rf.predict(X_test)
# Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred_randomforest)

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred_randomforest)

# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# R-squared Score (R2)
r2 = r2_score(y_test, y_pred_randomforest)

## 5. Print Results
print(f"Random Forest Regressor Performance:")
print("-" * 40)
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"R-squared Score (R2): {r2:.4f}")

Random Forest Regressor Performance:
----------------------------------------
Mean Absolute Error (MAE): 3.36
Root Mean Squared Error (RMSE): 4.39
R-squared Score (R2): 0.0105


In [30]:
#after feature selection
X_train2 = X_train.filter(regex='PADD3|PADD3_RefBl_NetProd_FinGas|Residual|US_Crude_Stocks_Transit_from_AK', axis=1)
X_test2 = X_test.filter(regex='PADD3|PADD3_RefBl_NetProd_FinGas|Residual|US_Crude_Stocks_Transit_from_AK', axis=1)
rf2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf2.fit(X_train2, y_train)

y_pred_randomforest2 = rf2.predict(X_test2)
# Mean Absolute Error (MAE)
mae2 = mean_absolute_error(y_test, y_pred_randomforest2)

# Mean Squared Error (MSE)
mse2 = mean_squared_error(y_test, y_pred_randomforest2)

# Root Mean Squared Error (RMSE)
rmse2 = np.sqrt(mse2)

# R-squared Score (R2)
r22 = r2_score(y_test, y_pred_randomforest2)

## 5. Print Results
print(f"Random Forest Regressor Performance:")
print("-" * 40)
print(f"Mean Absolute Error (MAE): {mae2:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse2:.2f}")
print(f"R-squared Score (R2): {r22:.4f}")

Random Forest Regressor Performance:
----------------------------------------
Mean Absolute Error (MAE): 3.53
Root Mean Squared Error (RMSE): 4.60
R-squared Score (R2): -0.0838
