# A (Feeble) Baseline Model Using `xgboost`

# TLDR

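Baseline model using xgboost for interpolating the stock price of TSLA. More precisely, the model is trained on the data before and after a pre-defined time period and is evaluated on how well it predicts the closing price within that held-out period.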

**RESULTS**
- Metrics
  - RMSE: 24.11
  - MAE:  21.05
- Most important features:
  - avg_price_TSLA
  - delta_price_AAPL
  - volume_AAPL
  - high_TSLA
  - volume_MSFT
  - price_ratio_AAPL
  - close_TSLA_diff
  - low_TSLA
  - volume_NVDA
  - close_TSLA_log_return

In [None]:
# Additional requirements

# NOTE Move to requirements.txt if finally incorporated
!pip install xgboost

In [None]:
# Imports

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [None]:
# Configs

# Date-based split point, currently disabled (not referenced by the visible cells).
# SPLIT_DATE = "2023-01-01"

# Ticker whose closing price column (f"close_{TARGET_TICKER}") is the prediction target.
TARGET_TICKER = "TSLA"

In [None]:
# Load the Data

# Read the CSV and convert its "date" column to pandas datetimes in one
# chained expression; all other columns are left as parsed.
df = pd.read_csv("../data/processed_combined_data.csv").assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)

In [None]:
# Feature and target extraction

# Target: the closing-price column of the configured ticker.
col_y = f"close_{TARGET_TICKER}"

# Features: every column except the target itself and the date.
# (Single-feature alternative: cols_X = [f"open_{TARGET_TICKER}"].)
cols_X = [name for name in df.columns if name != col_y and name != "date"]

y = df[col_y]
X = df[cols_X]

In [None]:
# Custom Train-Test Split Based on Fifths ("Pentiles") of the Rows

# Size of one fifth of the dataset, in rows (integer division).
n_pentile = len(df) // 5

# The third fifth is held out for testing; training uses the two fifths
# before it and everything after it (including any remainder rows).
test_idxs = np.arange(2 * n_pentile, 3 * n_pentile)
train_idxs_0 = np.arange(2 * n_pentile)
train_idxs_1 = np.arange(3 * n_pentile, len(df))
train_idxs = np.concatenate((train_idxs_0, train_idxs_1))

# Positional selection of the feature and target rows for each side.
X_train, X_test = X.iloc[train_idxs], X.iloc[test_idxs]
y_train, y_test = y.iloc[train_idxs], y.iloc[test_idxs]



In [None]:
# Hyperparameter grid

# Candidate values per hyperparameter: 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    max_depth=[3, 5, 7],  # tree depth
    learning_rate=[0.01, 0.1, 0.3],  # learning rate (eta)
    n_estimators=[50, 100, 200],  # number of trees
)

In [None]:
# Model Setup and Fit

# Gradient-boosted regressor with a squared-error objective and RMSE as the
# evaluation metric.
xgb_regressor = xgb.XGBRegressor(objective="reg:squarederror", eval_metric="rmse")

# Exhaustive search over `param_grid` using 3-fold cross-validation on the
# training rows; candidates are scored by negative mean squared error.
grid_search = GridSearchCV(
    xgb_regressor,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
).fit(X_train, y_train)

# Estimator with the best-scoring parameter combination found by the search.
model = grid_search.best_estimator_

In [None]:
# Prediction

# Predict for the complete dataset, the training rows and the test rows
# (in that order).
y_pred, y_train_pred, y_test_pred = map(model.predict, (X, X_train, X_test))

In [None]:
# Evaluation

# Errors of the test-row predictions against the true target values.
mae = mean_absolute_error(y_test, y_test_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

# One line per item, both metrics rounded to two decimals.
print(
    "Evaluation Results:",
    f"RMSE: {rmse:.2f}",
    f"MAE: {mae:.2f}",
    sep="\n",
)

In [None]:
# Plot Actual vs Predicted Values
#
# Draws the true target and the model's predictions over the whole date range
# and shades the training and test periods so the held-out span is visible.

# x-axis values: the date of every row.
t = df["date"]

# Plot Actual vs Predicted Values with Annotations for Train and Test
plt.figure(figsize=(12, 6))

# Plot Actual Values
plt.plot(
    t,
    y,
    label="Actual Values",
    color="blue",
    linewidth=2,
)

# Plot predictions for all rows (training and test rows alike) as a dashed line.
plt.plot(
    t,
    y_pred,
    label="Predicted Values",
    color="orange",
    linewidth=2,
    linestyle="--",
)

# Shade the first training span (from its first to its last row's date).
plt.axvspan(
    t.iloc[train_idxs_0[0]],
    t.iloc[train_idxs_0[-1]],
    color="lightgreen",
    alpha=0.3,
    label="Training Period",
)

# Shade the second training span in the same colour.
plt.axvspan(
    t.iloc[train_idxs_1[0]],
    t.iloc[train_idxs_1[-1]],
    color="lightgreen",
    alpha=0.3,
    # Label left out on purpose so "Training Period" is listed only once in the legend.
    # label="Training Period",
)

# Annotate Test Period
plt.axvspan(
    t.iloc[test_idxs[0]],
    t.iloc[test_idxs[-1]],
    color="lightcoral",
    alpha=0.3,
    label="Test Period",
)

plt.title("Actual vs Predicted Values (with Train/Test Annotation)")
plt.xlabel("Date")
plt.ylabel("Target")
plt.legend()
plt.grid(True)
plt.show()

# Optional export of the figure, currently disabled.
# NOTE(review): if re-enabled, this presumably has to run before plt.show() — confirm.
# plt.savefig("predictions_vs_actual.png", dpi=300)

In [None]:
# Feature importance

# Per-feature scores from the fitted booster, using the "weight" importance type.
importance = model.get_booster().get_score(importance_type="weight")

# Rank the (feature, score) pairs from highest to lowest score ...
sorted_importance = sorted(
    importance.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# ... and keep at most the first 30 of them.
top_features = sorted_importance[:30]

# Text report, one "<feature>: <score>" line per entry.
print("Top 30 Most Important Features:")
for feature, score in top_features:
    print(f"{feature}: {score}")

# Chart of the same model's importances, limited to 30 features as well.
xgb.plot_importance(model, max_num_features=30)

In [None]:
class StopExecution(Exception):
    """Raised on purpose to halt the notebook at the end of its active part."""

# Deliberately abort execution here (e.g. when running all cells): the cells
# that follow are deprecated and are not meant to run.
raise StopExecution("Logical end of this notebook. Remaining cells are deprecated.")

## (Currently) Deprecated Code Snippets

In [None]:
# Custom Implementation of Blocking Time Series

class BlockingTimeSeriesSplit:
    """Cross-validator that cuts the samples into consecutive, disjoint blocks.

    The first ``n_splits * (len(X) // n_splits)`` samples are divided into
    ``n_splits`` equally sized blocks; samples left over at the end are not
    used by any fold. Inside each block the leading 80 % of the positions
    become the training indices and the remaining positions (after skipping
    ``margin`` of them) become the test indices, so no fold ever sees
    samples from another block.

    Args:
        n_splits: Number of blocks, i.e. number of (train, test) pairs.
        margin: Number of positions to leave out between the end of the
            training part and the start of the test part of each block.
            Defaults to 0, which places the test part directly after the
            training part.

    Raises:
        ValueError: If ``margin`` is negative (train and test would overlap).
    """

    def __init__(self, n_splits, margin=0):
        if margin < 0:
            raise ValueError(f"margin must be non-negative, got {margin}")
        self.n_splits = n_splits
        self.margin = margin

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the number of folds.

        All arguments are ignored; they exist (with ``None`` defaults) so the
        method can be called both without arguments and in the
        ``get_n_splits(X, y, groups)`` form.
        """
        return self.n_splits

    def split(self, X, y=None, groups=None):
        """Yield one ``(train_indices, test_indices)`` pair per block.

        Args:
            X: Any sized collection; only ``len(X)`` is used.
            y: Ignored.
            groups: Ignored.

        Yields:
            Tuples of two integer NumPy arrays holding positional indices.
        """
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            # 80/20 cut inside the block; int() truncates towards zero.
            mid = int(0.8 * (stop - start)) + start
            # If mid + margin reaches past `stop`, the test slice is empty.
            yield indices[start:mid], indices[mid + self.margin:stop]

# btscv = BlockingTimeSeriesSplit(n_splits=TS_NSPLITS)


In [None]:
# Set up the time series split
# Number of splits passed to scikit-learn's TimeSeriesSplit below.
TS_NSPLITS = 20
tscv = TimeSeriesSplit(n_splits=TS_NSPLITS)

In [None]:
# Trend-Seasonal-Noise Decomposition

from statsmodels.tsa.seasonal import seasonal_decompose

# Additive decomposition with a cycle length of 12 observations.
# NOTE(review): period=12 was picked as an example of monthly seasonality;
# whether 12 observations span one year depends on the sampling of the data — confirm.
# NOTE(review): the decomposition runs on the full series `y`, although the
# components are stored under `y_train_*` names.
result = seasonal_decompose(y, model="additive", period=12)
y_train_trend, y_train_seasonal, y_train_residual = (
    result.trend,
    result.seasonal,
    result.resid,
)

# Display the trend component as the cell output.
y_train_trend

In [None]:
# Fourier Decomposition
#
# Transforms the test-period target into the frequency domain, plots the
# amplitude spectrum, zeroes the slow components and transforms back.

from scipy.fft import fft
from scipy.fft import fftfreq
from scipy.fft import ifft

time_series = y_test.values
n_samples = len(time_series)
# Sample spacing handed to fftfreq; with 1, frequencies are in cycles per sample.
delta_t = 1

fft_values = fft(time_series)      # Compute Fourier coefficients
frequencies = fftfreq(n_samples, d=delta_t)  # Get corresponding frequencies

import matplotlib.pyplot as plt
plt.figure(figsize=(10, 4))
# Only the first half of both arrays is plotted, amplitudes as absolute values.
plt.stem(frequencies[:n_samples // 2], np.abs(fft_values[:n_samples // 2]))
plt.title("Frequency Spectrum")
plt.xlabel("Frequency")
plt.ylabel("Amplitude")
plt.show()

# Filter: Remove low frequencies (trend) and/or specific seasonal frequencies
# The mask keeps only components with |frequency| > 0.01.
# NOTE(review): the second clause is redundant as written — 1/365 (about 0.0027)
# is below 0.01, so every frequency passing the first test already differs from it.
filter_mask = (np.abs(frequencies) > 0.01) & (np.abs(frequencies) != 1/365)
# Multiplying by the boolean mask zeroes the rejected coefficients.
filtered_fft_values = fft_values * filter_mask

# Reconstruct the time series (detrended and deseasonalized)
# Only the real part of the inverse transform is kept.
reconstructed_signal = ifft(filtered_fft_values).real

In [None]:
# Deprecated evaluation loop: runs one grid search per fold produced by `tscv`
# and records the test-fold errors and the best estimator of every fold.
#
# NOTE: the loop rebinds `train_idxs`, `X_train`, `X_test`, `y_train`, `y_test`,
# `xgb_regressor`, `grid_search`, `model`, `y_train_pred`, `y_test_pred`,
# `rmse` and `mae`, overwriting the values set by earlier cells.

# Set up lists for the evaluation scores
rmses = []
maes = []
models = []

# `fold` is not used inside the loop body.
for fold, (train_idxs, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X.iloc[train_idxs], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idxs], y.iloc[test_idx]

    # Fresh regressor per fold, same settings as in the main model cell.
    xgb_regressor = xgb.XGBRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
    )

    # Inner 3-fold grid search, run on this fold's training rows only.
    grid_search = GridSearchCV(
        estimator=xgb_regressor,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=3,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    model = grid_search.best_estimator_
    models.append(model)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate performance
    rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    mae = mean_absolute_error(y_test, y_test_pred)
    rmses.append(rmse)
    maes.append(mae)

rmses = np.array(rmses)
maes = np.array(maes)

# The "best" model is the one whose own test fold had the lowest RMSE.
best_model_index = np.argmin(rmses)
best_model = models[best_model_index]

In [None]:
# Report the fold with the lowest RMSE (1-based fold number) ...
print(
    f"Best Model is from Fold {best_model_index + 1}",
    f"Best RMSE: {rmses[best_model_index]:.2f}",
    sep="\n",
)

# ... followed by the averages over all folds.
print(
    "Cross-Validation Results:",
    f"Average RMSE: {np.mean(rmses):.2f}",
    f"Average MAE: {np.mean(maes):.2f}",
    sep="\n",
)

    