In [1]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import ParameterGrid
import pickle

In [None]:
def sliding_window_forecast_with_scaling(df, target_col, model, model_name, model_params,
                                         window_size=25, test_ratio=0.2, drop_cols=None,
                                         log_dir="model_logs", log_filename="prediction_log.csv"):
    """
    Performs sliding window forecasting using standard scaling on both features and target.
    Inverse-transforms predictions back to original units before evaluation.
    Adds early stopping for XGBoost using test set as validation (only during tuning).
    """
    if drop_cols is None:
        drop_cols = []

    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_path = os.path.join(log_dir, log_filename)
    log_entries = []

    df = df.dropna().sort_index()
    n = len(df)

    for start in range(0, n - window_size + 1):
        window_df = df.iloc[start:start + window_size].copy()
        X_window = window_df.drop(columns=[target_col] + drop_cols, errors='ignore')
        y_window = window_df[target_col].values

        train_size = int(len(X_window) * (1 - test_ratio))
        if train_size < 1 or train_size >= len(X_window):
            continue

        X_train = X_window.iloc[:train_size].values
        X_test = X_window.iloc[train_size:].values
        y_train = y_window[:train_size]
        y_test = y_window[train_size:]

        # Scale X
        scaler_X = StandardScaler()
        X_train_scaled = scaler_X.fit_transform(X_train)
        X_test_scaled = scaler_X.transform(X_test)

        # Scale y
        scaler_y = StandardScaler()
        y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).ravel()
        y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).ravel()

        # Fit model on scaled X and y
        if model_name == "XGB":
            model.fit(X_train_scaled, y_train_scaled,
                      eval_set=[(X_test_scaled, y_test_scaled)], verbose=False
            )

        else:
            model.fit(X_train_scaled, y_train_scaled)

        # Predict
        y_pred_scaled = model.predict(X_test_scaled)

        # Inverse-transform prediction and test y
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).ravel()
        y_test_real = scaler_y.inverse_transform(y_test_scaled.reshape(-1, 1)).ravel()

        # Evaluation
        mse = mean_squared_error(y_test_real, y_pred)
        mape = mean_absolute_percentage_error(y_test_real, y_pred)
        r2 = r2_score(y_test_real, y_pred)

        log_entry = {
            "model_name": model_name,
            "model_hyperparameters_dict": json.dumps(model_params),
            "window_size": window_size,
            "test_ratio": test_ratio,
            "start_date": str(df.index[start]),
            "end_date": str(df.index[start + window_size - 1]),
            "test_data_values_list": y_test_real.tolist(),
            "test_data_model_predictions_list": y_pred.tolist(),
            "MSE_score": mse,
            "MAPE_score": mape,
            "R^2_score": r2
        }
        log_entries.append(log_entry)

    log_df = pd.DataFrame(log_entries)
    if os.path.exists(log_path):
        existing = pd.read_csv(log_path)
        log_df = pd.concat([existing, log_df], ignore_index=True)
    log_df.to_csv(log_path, index=False)
    return log_df

In [3]:
def build_model(model_name, params):
    """
    Build a model given its type and hyperparameters.
    """
    if model_name == "RF":
        return RandomForestRegressor(**params, random_state=42, n_jobs=-1)
    elif model_name == "SVM":
        return SVR(**params)
    elif model_name == "GB":
        return GradientBoostingRegressor(**params, random_state=42)
    elif model_name == "XGB":
        return xgb.XGBRegressor(**params, random_state=42, n_jobs=-1)
    else:
        raise ValueError(f"Unknown model type: {model_name}")

In [None]:
def tune_model(df, target_col, model_type, param_grid, window_sizes=[25, 100, 200],
               test_ratio=0.2, drop_cols=None, log_dir="model_logs"):
    """
    Performs hyperparameter tuning for a single model type over the given window sizes.
    Uses a single tqdm progress bar that counts total iterations.
    
    Returns a combined summary DataFrame with:
      - model_name
      - model_hyperparameters_dict (JSON string)
      - window_size
      - test_ratio
      - avg_MSE
      - avg_MAPE
      - avg_R^2
    """
    if drop_cols is None:
        drop_cols = []
    
    # Calculate total iterations over all window sizes and hyperparam combinations.
    total_iterations = sum([len(list(ParameterGrid(param_grid))) for _ in window_sizes])
    pbar = tqdm(total=total_iterations, desc=f"Tuning {model_type}")
    
    summaries = []
    for window in window_sizes:
        for params in ParameterGrid(param_grid):
            model = build_model(model_type, params)
            log_df = sliding_window_forecast_with_scaling(
                df=df,
                target_col=target_col,
                model=model,
                model_name=model_type,
                model_params=params,  # Pass current parameter set.
                window_size=window,
                test_ratio=test_ratio,
                drop_cols=drop_cols,
                log_dir=log_dir,
                log_filename=f"{model_type}_window_{window}.csv"
            )
            pbar.update(1)
            if log_df.empty:
                continue
            avg_mse = log_df["MSE_score"].mean()
            avg_mape = log_df["MAPE_score"].mean()
            avg_r2 = log_df["R^2_score"].mean()
            summary_entry = {
                "model_name": model_type,
                "model_hyperparameters_dict": json.dumps(params),
                "window_size": window,
                "test_ratio": test_ratio,
                "avg_MSE": avg_mse,
                "avg_MAPE": avg_mape,
                "avg_R^2": avg_r2
            }
            summaries.append(summary_entry)
    pbar.close()
    if summaries:
        combined_summary = pd.DataFrame(summaries)
        return combined_summary
    else:
        return pd.DataFrame()

In [None]:
def combine_and_top_logs(log_dir="model_logs", tops=5):
    """
    Reads all CSV files in the given log directory (matching *_window*.csv),
    combines them into a single DataFrame, groups by model_name and hyperparameter
    configuration, then computes the average MSE, MAPE, and R^2.
    Returns a DataFrame containing the top 5 configurations (by MAPE)
    for each model type.
    """
    # List all CSV files in the log directory
    all_files = [os.path.join(log_dir, f) for f in os.listdir(log_dir) if f.endswith(".csv")]
    dfs = [pd.read_csv(f) for f in all_files]
    combined_logs = pd.concat(dfs, ignore_index=True)
    
    # Group by model_name and hyperparameters and compute average metrics
    grouped_summary = combined_logs.groupby(
        ["model_name", "model_hyperparameters_dict"]
    )[["MSE_score", "MAPE_score", "R^2_score"]].mean().reset_index()
    
    # For each model, get the top five (lowest MAPE)
    top_n_list = []
    for model in grouped_summary["model_name"].unique():
        top_n = grouped_summary[grouped_summary["model_name"] == model].sort_values("MAPE_score", ascending=True).head(tops)
        top_n_list.append(top_n)
        
    top_n_combined = pd.concat(top_n_list, ignore_index=True)
    return top_n_combined

In [6]:
param_grids = {
    "RF": {
        "n_estimators": [1200, 1400, 1600, 1800, 2000],
        "max_depth": [8, 9, 10, 11, 12],
        "min_samples_split": [2, 3],
        "max_features": ["sqrt", "log2", 0.5],
        "bootstrap": [True, False],

    },
    "SVM": {
        "C": [0.25, 0.5, 0.75, 1, 1.25, 1.5, 2.55, 5, 5.25, 5.5, 5.75, 6, 6.25, 6.5],
        "epsilon": [1e-5, 1.5e-5, 2e-5, 2.5e-5, 3e-5, 3.5e-5, 4e-5, 4.5e-5, 5e-5],
        "kernel": ["rbf", "poly", "sigmoid", "linear"],
    },
    "GB": {
        "n_estimators": [1600, 3200, 4800, 6400, 8000],
        "learning_rate": [0.0025, 0.005, 0.01, 0.02, 0.04],
        "max_depth": [2, 3],
        "subsample": [0.5, 0.55, 0.6, 0.66, 0.73, 0.81, 0.9],
    },
    "XGB": {
        "n_estimators": [1600, 3200, 4800, 6400, 8000],
        "learning_rate": [0.0025, 0.005, 0.01, 0.02, 0.04],
        "max_depth": [2, 3],
        "subsample": [0.8, 0.85, 0.9, 0.95, 1],
        "colsample_bytree": [0.4, 0.6, 0.8, 1],
        "early_stopping_rounds": [32],
    }
}

In [7]:
with open("temp_output/quarterly_X_y.pkl", "rb") as f:
    data_q = pickle.load(f)

drop_columns = []
print(data_q.keys())  # Should show all the tickers like AAPL, MSFT, etc.
quaterly_data = data_q["AAPL"]

dict_keys(['AAPL', 'MSFT', 'LLY', 'UNH', 'V', 'MA', 'GOOGL', 'META', 'AMZN', 'TSLA', 'PG', 'WMT', 'RTX', 'UNP', 'XOM', 'CVX', 'LIN', 'SHW', 'AMT', 'PLD', 'NEE', 'SO'])


In [None]:
rf_summary_q = tune_model(quaterly_data, target_col="y", model_type="RF",
                          param_grid=param_grids["RF"],
                          window_sizes=[10], #[25, 100, 200],
                          test_ratio=0.2, drop_cols=drop_columns,
                          log_dir="model_logs_q")

svm_summary_q = tune_model(quaterly_data, target_col="y", model_type="SVM",
                           param_grid=param_grids["SVM"],
                           window_sizes=[10], #[25, 100, 200],
                           test_ratio=0.2, drop_cols=drop_columns,
                           log_dir="model_logs_q")

gb_summary_q = tune_model(quaterly_data, target_col="y", model_type="GB",
                          param_grid=param_grids["GB"],
                          window_sizes=[10], #[25, 100, 200],
                          test_ratio=0.2, drop_cols=drop_columns,
                          log_dir="model_logs_q")

xgb_summary_q = tune_model(quaterly_data, target_col="y", model_type="XGB",
                           param_grid=param_grids["XGB"],
                           window_sizes=[10], #[25, 100, 200],
                           test_ratio=0.2, drop_cols=drop_columns,
                           log_dir="model_logs_q")



KeyboardInterrupt: 

In [None]:
combined_summary_q = pd.concat([rf_summary_q, svm_summary_q, gb_summary_q, xgb_summary_q], ignore_index=True)
print("Combined Tuning Summary:")
display(combined_summary_q)

# Alternatively, read all prediction log files from log directory
top5_summary_q = combine_and_top_logs(log_dir="model_logs_q")
print("Top 5 Configurations per Model:")
display(top5_summary_q)

Combined Tuning Summary:


Unnamed: 0,model_name,model_hyperparameters_dict,window_size,test_ratio,avg_MSE,avg_MAPE,avg_R^2
0,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",10,0.2,246363.428892,0.108721,-538.233200
1,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",10,0.2,244892.476689,0.108437,-530.072687
2,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",10,0.2,243915.612811,0.108247,-524.006098
3,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",10,0.2,242931.616866,0.108078,-519.318613
4,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",10,0.2,242383.677396,0.107970,-520.036887
...,...,...,...,...,...,...,...
1341,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",10,0.2,220488.126186,0.102364,-840.318680
1342,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",10,0.2,220454.785781,0.102360,-841.066041
1343,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",10,0.2,220460.950900,0.102363,-841.295238
1344,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",10,0.2,220441.823859,0.102362,-840.902401


Top 5 Configurations per Model:


Unnamed: 0,model_name,model_hyperparameters_dict,MSE_score,MAPE_score,R^2_score
0,GB,"{""learning_rate"": 0.03, ""max_depth"": 2, ""n_est...",207174.181826,0.097607,-214.316423
1,GB,"{""learning_rate"": 0.03, ""max_depth"": 2, ""n_est...",207174.181826,0.097607,-214.316423
2,GB,"{""learning_rate"": 0.03, ""max_depth"": 2, ""n_est...",207174.181826,0.097607,-214.316423
3,GB,"{""learning_rate"": 0.03, ""max_depth"": 2, ""n_est...",207174.181826,0.097607,-214.316423
4,GB,"{""learning_rate"": 0.03, ""max_depth"": 2, ""n_est...",207174.181826,0.097607,-214.316423
5,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",239498.131213,0.107405,-528.865757
6,RF,"{""max_depth"": 10, ""min_samples_split"": 2, ""n_e...",239498.131213,0.107405,-528.865757
7,RF,"{""max_depth"": 15, ""min_samples_split"": 2, ""n_e...",239498.131213,0.107405,-528.865757
8,RF,"{""max_depth"": 12, ""min_samples_split"": 2, ""n_e...",239498.131213,0.107405,-528.865757
9,RF,"{""max_depth"": 10, ""min_samples_split"": 2, ""n_e...",240191.919515,0.107542,-522.909981


In [None]:
with open("temp_output/daily_X_y.pkl", "rb") as f:
    data_d = pickle.load(f)
    
drop_columns = []
print(data_d.keys())  # Should show all the tickers like AAPL, MSFT, etc.
daily_data = data_d["AAPL"]
daily_data = daily_data.iloc[-128:,:]
daily_data

FileNotFoundError: [Errno 2] No such file or directory: 'temp_output/daily_X_y.pkl'

In [None]:
rf_summary_d = tune_model(daily_data, target_col="y", model_type="RF",
                        param_grid=param_grids["RF"],
                        window_sizes=[25], #[25, 100, 200],
                        test_ratio=0.2, drop_cols=drop_columns,
                        log_dir="model_logs_d")

svm_summary_d = tune_model(daily_data, target_col="y", model_type="SVM",
                            param_grid=param_grids["SVM"],
                        window_sizes=[25], #[25, 100, 200],
                            test_ratio=0.2, drop_cols=drop_columns,
                            log_dir="model_logs_d")

gb_summary_d = tune_model(daily_data, target_col="y", model_type="GB",
                        param_grid=param_grids["GB"],
                        window_sizes=[25], #[25, 100, 200],
                        test_ratio=0.2, drop_cols=drop_columns,
                        log_dir="model_logs_d")

xgb_summary_d = tune_model(daily_data, target_col="y", model_type="XGB",
                            param_grid=param_grids["XGB"],
                        window_sizes=[25], #[25, 100, 200],
                            test_ratio=0.2, drop_cols=drop_columns,
                            log_dir="model_logs_d")

Tuning RF: 100%|██████████| 48/48 [34:52<00:00, 43.59s/it]
Tuning SVM: 100%|██████████| 798/798 [03:36<00:00,  3.68it/s]
Tuning GB: 100%|██████████| 50/50 [51:04<00:00, 61.29s/it] 
Tuning XGB: 100%|██████████| 450/450 [58:51:49<00:00, 470.91s/it]   


In [None]:
combined_summary_d = pd.concat([rf_summary_d, svm_summary_d, gb_summary_d, xgb_summary_d], ignore_index=True)
print("Combined Tuning Summary:")
display(combined_summary_d)

# Alternatively, read all prediction log files from log directory
top5_summary_d = combine_and_top_logs(log_dir="model_logs_d")
print("Top 5 Configurations per Model:")
display(top5_summary_d)

Combined Tuning Summary:


Unnamed: 0,model_name,model_hyperparameters_dict,window_size,test_ratio,avg_MSE,avg_MAPE,avg_R^2
0,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",25,0.2,8093.615250,0.012474,-13.696438
1,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",25,0.2,8107.226922,0.012491,-13.726088
2,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",25,0.2,8110.555142,0.012492,-13.709650
3,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",25,0.2,8110.358956,0.012490,-13.685047
4,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",25,0.2,8109.757096,0.012486,-13.670188
...,...,...,...,...,...,...,...
1341,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",25,0.2,8694.347973,0.012499,-13.523623
1342,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",25,0.2,8698.003511,0.012501,-13.529404
1343,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",25,0.2,8696.666381,0.012500,-13.530422
1344,XGB,"{""colsample_bytree"": 0.9, ""learning_rate"": 0.0...",25,0.2,8696.996090,0.012500,-13.531451


Top 5 Configurations per Model:


Unnamed: 0,model_name,model_hyperparameters_dict,MSE_score,MAPE_score,R^2_score
0,GB,"{""learning_rate"": 0.01, ""max_depth"": 2, ""n_est...",10301.119094,0.013675,-17.436255
1,GB,"{""learning_rate"": 0.01, ""max_depth"": 2, ""n_est...",10302.557729,0.013676,-17.432459
2,GB,"{""learning_rate"": 0.01, ""max_depth"": 2, ""n_est...",10302.589749,0.013676,-17.43246
3,GB,"{""learning_rate"": 0.01, ""max_depth"": 2, ""n_est...",10302.590307,0.013676,-17.43246
4,GB,"{""learning_rate"": 0.01, ""max_depth"": 2, ""n_est...",10302.590315,0.013676,-17.43246
5,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",8107.349653,0.012471,-13.610755
6,RF,"{""max_depth"": 12, ""min_samples_split"": 2, ""n_e...",8107.229734,0.012471,-13.611072
7,RF,"{""max_depth"": 15, ""min_samples_split"": 2, ""n_e...",8107.229734,0.012471,-13.611072
8,RF,"{""max_depth"": 10, ""min_samples_split"": 2, ""n_e...",8107.244017,0.012471,-13.611075
9,RF,"{""max_depth"": 8, ""min_samples_split"": 2, ""n_es...",8093.61525,0.012474,-13.696438


In [None]:
top_summary_q = combine_and_top_logs(log_dir="model_logs_q", tops=25)
for d in top_summary_q['model_hyperparameters_dict']:
    print(json.loads(d))
    
print("="*100)

top_summary_d = combine_and_top_logs(log_dir="model_logs_d", tops=25)
for d in top_summary_d['model_hyperparameters_dict']:
    print(json.loads(d))

{'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 8000}
{'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 6400}
{'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 4800}
{'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 3200}
{'learning_rate': 0.03, 'max_depth': 2, 'n_estimators': 1600}
{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 8000}
{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 6400}
{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 4800}
{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 3200}
{'learning_rate': 0.04, 'max_depth': 2, 'n_estimators': 1600}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 4800}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 6400}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 8000}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 3200}
{'learning_rate': 0.01, 'max_depth': 2, 'n_estimators': 1600}
{'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 1600}
{'learni