## 1. Load Necessary Libraries

In [13]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os
import datetime as dt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error


## 2. Set seed for reproducibility

In [14]:
# -----------------------------
# Reproducibility
# -----------------------------
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)

sns.set_style("darkgrid")


## 3. Set paths

In [15]:
# -----------------------------
# Project Root Resolution
# -----------------------------
PROJECT_ROOT = Path.cwd().resolve().parents[0]

DATA_RAW_DIR = PROJECT_ROOT / "data" / "raw"
DATA_INTERIM_DIR = PROJECT_ROOT / "data" / "interim"
DATA_PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
FIGURES_DIR = PROJECT_ROOT / "reports" / "figures"

DATA_INTERIM_DIR.mkdir(parents=True, exist_ok=True)
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

DATA_RAW_DIR, DATA_INTERIM_DIR, FIGURES_DIR


(WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/raw'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/data/interim'),
 WindowsPath('C:/Users/Kinjal Mitra/Documents/stock-price-prediction-ff/reports/figures'))

## 4. Load Data from data/processed/

In [16]:
path = DATA_PROCESSED_DIR /"processed_dataset.csv"
df = pd.read_csv(path)
df = df.drop(columns=["Unnamed: 0"])

In [17]:
df

Unnamed: 0,Date,Data_Value,StockPrice,daily_return,log_return,price_change,volatility_7d,MA_7,MA_30,MA_50,...,bollinger_lower,momentum_5d,momentum_20d,price_lag_1,price_lag_2,price_lag_3,price_lag_4,price_lag_5,rolling_max_20d,rolling_min_20d
0,2010-01-04,0.700,1178.00,-0.002962,-0.002967,-3.50,0.004291,1184.892857,1152.920000,1160.837,...,1115.877196,-0.011538,0.027475,1181.50,1182.25,1186.75,1190.75,1191.75,1195.0,1119.75
1,2010-01-05,0.699,1181.50,-0.000634,-0.000635,-0.75,0.004301,1186.714286,1151.945000,1161.577,...,1113.758332,-0.001479,0.031202,1182.25,1186.75,1190.75,1191.75,1183.25,1195.0,1119.75
2,2010-01-06,0.694,1182.25,-0.003792,-0.003799,-4.50,0.006435,1188.571429,1151.053333,1162.252,...,1111.799951,-0.007138,0.064131,1186.75,1190.75,1191.75,1183.25,1190.75,1195.0,1119.75
3,2010-01-07,0.692,1186.75,-0.003359,-0.003365,-4.00,0.007695,1188.464286,1150.161667,1162.812,...,1103.950485,-0.006488,0.070108,1190.75,1191.75,1183.25,1190.75,1194.50,1195.0,1111.00
4,2010-01-08,0.691,1190.75,-0.000839,-0.000839,-1.00,0.008593,1189.642857,1149.161667,1163.397,...,1097.085188,0.007829,0.077358,1191.75,1183.25,1190.75,1194.50,1181.50,1195.0,1109.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3748,2025-01-08,2.782,5959.25,0.002439,0.002436,14.50,0.009858,5927.357143,6056.241667,5950.645,...,5854.670147,-0.004967,-0.024034,5944.75,5866.25,5874.50,5882.25,5989.00,6152.0,5866.25
3749,2025-01-09,2.789,5944.75,0.013382,0.013293,78.50,0.009941,5937.964286,6058.566667,5945.825,...,5861.877525,-0.005146,-0.017315,5866.25,5874.50,5882.25,5989.00,5975.50,6152.0,5866.25
3750,2025-01-10,2.766,5866.25,-0.001404,-0.001405,-8.25,0.006703,5957.892857,6060.433333,5943.240,...,5872.408570,-0.027720,-0.036543,5874.50,5882.25,5989.00,5975.50,6033.50,6152.0,5866.25
3751,2025-01-13,2.785,5874.50,-0.001318,-0.001318,-7.75,0.006418,5994.214286,6063.891667,5942.445,...,5903.923835,-0.034474,-0.035742,5882.25,5989.00,5975.50,6033.50,6084.25,6152.0,5874.50


In [18]:
X_scaled = np.load(DATA_PROCESSED_DIR / "X_features.npy")
y = np.load(DATA_PROCESSED_DIR / "y_target.npy")

## XGBoost

In [23]:
# ============================================================
# Leakage-Safe XGBoost Regression Baseline
# Assumes df is already initialized, cleaned, and sorted by Date
# Target: next-interval log_return
# ============================================================

import numpy as np
import itertools
from xgboost import XGBRegressor
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score
)

# ============================================================
# 0. SHIFT ALL PRICE-DERIVED INDICATORS (CRITICAL)
#    Ensures features only use information up to t-1
# ============================================================

indicator_cols = [
    "volatility_7d",
    "MA_7", "MA_30", "MA_50",
    "RSI_14",
    "MACD", "MACD_signal",
    "bollinger_upper", "bollinger_lower",
    "momentum_5d", "momentum_20d",
    "rolling_max_20d", "rolling_min_20d"
]

df[indicator_cols] = df[indicator_cols].shift(1)

# Drop rows introduced by shifting
df = df.dropna().reset_index(drop=True)

# ============================================================
# 1. PREPARE FEATURES & TARGET (STRICT — NO LEAKAGE)
# ============================================================

TARGET_COL = "log_return"

DROP_COLS = [
    "Date",
    "StockPrice",
    "log_return",
    "daily_return",
    "price_change"
]

X = df.drop(columns=DROP_COLS).values
y = df[TARGET_COL].values

# ============================================================
# 2. TIME-AWARE TRAIN / VAL / TEST SPLIT
# ============================================================

n = len(X)

train_end = int(0.70 * n)
val_end = int(0.85 * n)

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

# ============================================================
# 3. HYPERPARAMETER GRID (FOCUSED & SAFE)
# ============================================================

param_grid = {
    "n_estimators": [300, 500],
    "max_depth": [3, 4, 5],
    "learning_rate": [0.03, 0.05, 0.1],
    "subsample": [0.7, 0.9],
    "colsample_bytree": [0.6, 0.8],
    "reg_alpha": [0.0, 0.1],
    "reg_lambda": [1.0, 2.0]
}

# ============================================================
# 4. MANUAL GRID SEARCH (VERSION-SAFE)
# ============================================================

best_model = None
best_val_mae = float("inf")
best_params = None

for params in itertools.product(*param_grid.values()):
    param_dict = dict(zip(param_grid.keys(), params))

    model = XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        tree_method="hist",
        **param_dict
    )

    model.fit(X_train, y_train)

    val_pred = model.predict(X_val)
    val_mae = mean_absolute_error(y_val, val_pred)

    if val_mae < best_val_mae:
        best_val_mae = val_mae
        best_model = model
        best_params = param_dict

print("\n==============================")
print("BEST VALIDATION RESULT")
print("==============================")
print("Best Validation MAE:", round(best_val_mae, 6))
print("Best Hyperparameters:", best_params)

# ============================================================
# 5. FINAL TEST EVALUATION
# ============================================================

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
directional_accuracy = (np.sign(y_pred) == np.sign(y_test)).mean()

print("\n==============================")
print("XGBOOST TEST PERFORMANCE")
print("==============================")
print(f"MAE  : {mae:.6f}")
print(f"RMSE : {rmse:.6f}")
print(f"R²   : {r2:.4f}")
print(f"Directional Accuracy: {directional_accuracy:.2%}")



BEST VALIDATION RESULT
Best Validation MAE: 0.006028
Best Hyperparameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.7, 'colsample_bytree': 0.8, 'reg_alpha': 0.1, 'reg_lambda': 2.0}

XGBOOST TEST PERFORMANCE
MAE  : 0.005170
RMSE : 0.006863
R²   : 0.3729
Directional Accuracy: 70.16%


In [24]:
# ============================================================
# Walk-Forward / Rolling Window Evaluation for XGBoost
# Leakage-safe, version-safe
# ============================================================

import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -----------------------------
# 1. Prepare features & target
# -----------------------------
TARGET_COL = "log_return"

DROP_COLS = [
    "Date",
    "StockPrice",
    "log_return",
    "daily_return",
    "price_change"
]

X = df.drop(columns=DROP_COLS).values
y = df[TARGET_COL].values

n = len(X)

# -----------------------------
# 2. Walk-forward parameters
# -----------------------------
train_frac = 0.60
test_frac = 0.05

train_size = int(train_frac * n)
test_size = int(test_frac * n)

start = 0

# -----------------------------
# 3. Best hyperparameters (from tuning)
# -----------------------------
best_params = {
    "n_estimators": 500,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 1.0
}

# -----------------------------
# 4. Metrics storage
# -----------------------------
mae_list = []
rmse_list = []
r2_list = []
direction_list = []

# -----------------------------
# 5. Walk-forward loop
# -----------------------------
while start + train_size + test_size <= n:

    train_start = start
    train_end = start + train_size
    test_end = train_end + test_size

    X_train, y_train = X[train_start:train_end], y[train_start:train_end]
    X_test, y_test = X[train_end:test_end], y[train_end:test_end]

    model = XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        tree_method="hist",
        **best_params
    )

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    directional_acc = (np.sign(y_pred) == np.sign(y_test)).mean()

    mae_list.append(mae)
    rmse_list.append(rmse)
    r2_list.append(r2)
    direction_list.append(directional_acc)

    start += test_size

# -----------------------------
# 6. Aggregate results
# -----------------------------
print("\n==============================")
print("WALK-FORWARD EVALUATION RESULTS")
print("==============================")

print(f"MAE  : {np.mean(mae_list):.6f} ± {np.std(mae_list):.6f}")
print(f"RMSE : {np.mean(rmse_list):.6f} ± {np.std(rmse_list):.6f}")
print(f"R²   : {np.mean(r2_list):.4f}")
print(f"Directional Accuracy: {np.mean(direction_list):.2%}")

print("\nNumber of folds:", len(mae_list))



WALK-FORWARD EVALUATION RESULTS
MAE  : 0.006222 ± 0.002138
RMSE : 0.008677 ± 0.003675
R²   : 0.3098
Directional Accuracy: 71.06%

Number of folds: 8
