# Time Series Decomposition

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller, kpss

import importlib
import ipynbrc

In [None]:
# Re-import the helper module so that edits made to ipynbrc are picked up
# by this notebook session.
importlib.reload(ipynbrc)
# Star import of the notebook helpers.
# NOTE(review): later cells use `pd`, `cartprod`, `STOCK_TICKERS`,
# `load_df_from_csv` and `store_df_as_csv` without any explicit import;
# presumably they are all provided by this star import -- verify.
from ipynbrc import *

# Notebook number handed to the load/store helpers.
NB_NUMBER = 5

# NOTE(review): presumably resolved by the helper relative to a data
# directory that depends on NB_NUMBER -- confirm against ipynbrc.
relpath = "main.csv"
df = load_df_from_csv(relpath, NB_NUMBER)

## STL-Decomposition

**Seasonal-Trend decomposition using Loess (STL)** splits each time series feature into three additive components:

$$
\text{Original}_t = \text{Trend}_t + \text{Seasonality}_t + \text{Residual}_t
$$

where:

- **Original**: The original time series feature.
- **Trend**: Long-term smooth movement (extracted via Loess).
- **Seasonality**: Repeating patterns (e.g., yearly cycles, period = 252 trading days).
- **Residual**: Leftover noise after removing trend and seasonality.

In [None]:
# Trend-Seasonality-Residual Decomposition

# Seasonal period handed to STL: a year has roughly 252 trading days
PERIOD = 252

# Features to decompose: the closing price of every stock ticker,
# followed by the remaining columns listed below.
cols_0 = list(cartprod("close", STOCK_TICKERS))
cols_0 += [
    "s&p500_index",
    "dow_jones_index",
    "nasdaq_composite",
    "russell2000_index",
    "vix_index",
    "dollar_index_dxy",
    "gold_futures",
    "wti_oil_futures",
    "copper_futures",
    "brent_crude_futures",
    "tech_sector_etf",
    "energy_sector_etf",
    "financial_sector_etf",
    "consumerdiscretionary_etf",
    "lithium_etf",
    "semiconductor_etf",
    "electricity_proxy",
    "cpi",
    "fed_rate",
    "consumer_confidence",
    "oil",
    "nonfarm_payrolls",
    "treasury_yield",
    "industrial_production",
    "retail_sales",
    "pmi",
]

# Fit STL on the non-missing values of each feature and store the three
# components next to the original column as <col>_trend, <col>_seasonal
# and <col>_residual.
for col in cols_0:
    fitted = STL(df[col].dropna(), period=PERIOD).fit()
    parts = {
        "trend": fitted.trend,
        "seasonal": fitted.seasonal,
        "residual": fitted.resid,
    }
    for part_name, part in parts.items():
        df[f"{col}_{part_name}"] = part

# The components are aligned on the index, so rows where the input value was
# missing get NaN components. Keep only rows without a NaN in any column.
df = df.dropna()

In [None]:
# Plot the decomposition of a few selected features

# Subset of the decomposed columns to plot
cols_0 = [
    "close_AAPL",
    "close_MSFT",
    "close_GOOGL",
    "s&p500_index",
    # "gold_futures",
    # "cpi",
    # Feel free to add more columns ...
]

# One stacked panel per entry: (panel label, line color, column suffix).
# The empty suffix selects the original, undecomposed column.
components = (
    ("Original", "blue", ""),
    ("Trend", "green", "_trend"),
    ("Seasonal", "orange", "_seasonal"),
    ("Residual", "red", "_residual"),
)

for col in cols_0:
    # One figure per feature, with all panels sharing the date axis
    fig, axes = plt.subplots(
        nrows=len(components), ncols=1, figsize=(12, 10), sharex=True
    )
    fig.suptitle(f"Decomposition of {col}")

    for panel, (label, color, suffix) in zip(axes, components):
        panel.plot(df["date"], df[col + suffix], color=color, label=label)
        panel.set_ylabel(label)

    plt.xlabel("Date")
    plt.tight_layout()
    plt.show()

In [None]:
# Prepare hypothesis tests

# Default threshold below which a test's null hypothesis is rejected
SIGNIFICANCE_LEVEL = 0.05


def get_adf_kpss_stats(ser: pd.Series, s_level=SIGNIFICANCE_LEVEL) -> dict:
    """Run the ADF and KPSS tests on ``ser`` and combine their verdicts.

    Missing values are dropped before each test. A test that raises is
    reported on stdout and replaced by a sentinel p-value that makes the
    series count as non-stationary for that test.

    Args:
        ser: Series to test.
        s_level: Significance level; a p-value below it rejects the
            respective null hypothesis.

    Returns:
        Dict with the keys ``adf_pvalue`` and ``kpss_pvalue`` (rounded to
        four decimals), ``adf_likely_stationary`` (ADF rejects its null),
        ``kpss_likely_non_stationary`` (KPSS rejects its null) and
        ``is_stationary`` (ADF rejects and KPSS does not).
    """
    # ADF -- null hypothesis: the series is non-stationary
    try:
        adf_pvalue = adfuller(ser.dropna(), autolag="AIC")[1]
    except Exception as e:
        print(f"ADF fails: {e}")
        # Sentinel above every threshold: ADF never rejects its null
        adf_pvalue = 1.1

    # KPSS -- null hypothesis: the series is stationary
    try:
        kpss_pvalue = kpss(ser.dropna(), regression="c", nlags="auto")[1]
    except Exception as e:
        print(f"KPSS fails: {e}")
        # Sentinel below every positive threshold: KPSS rejects its null
        kpss_pvalue = -0.1

    adf_rejects = adf_pvalue < s_level
    kpss_rejects = kpss_pvalue < s_level

    return {
        "adf_pvalue": round(adf_pvalue, 4),
        "adf_likely_stationary": adf_rejects,
        "kpss_pvalue": round(kpss_pvalue, 4),
        "kpss_likely_non_stationary": kpss_rejects,
        # Stationary only if ADF rejects non-stationarity and KPSS does
        # not reject stationarity
        "is_stationary": adf_rejects and not kpss_rejects,
    }


In [None]:
# Simultaneous augmented Dickey-Fuller (ADF) and
# Kwiatkowski–Phillips–Schmidt–Shin (KPSS) test

# Every numeric column of df is tested
cols_0 = df.select_dtypes(include="number").columns.tolist()

# One row per feature: its name followed by the ADF/KPSS statistics
summaries = pd.DataFrame(
    [
        {"Feature": col, **get_adf_kpss_stats(df[col].dropna())}
        for col in cols_0
    ]
)
# Stationary features first
summaries = summaries.sort_values("is_stationary", ascending=False)
is_stationary = summaries["is_stationary"]
# Number of stationary vs. non-stationary features
is_stationary.value_counts()

## Advanced Stationarity Fix: Second Differencing, Residuals, Log Returns

In [None]:
# Try to make features identified as non-stationary
# by the ADF and KPSS tests stationary

# Features that failed the combined ADF/KPSS check
is_non_stationary = ~summaries["is_stationary"].astype(bool)
non_stationary_cols = summaries.loc[is_non_stationary, "Feature"].tolist()

# Maps each non-stationary feature to the name of the transformation that was
# applied to it; the transformed series is stored in df as "<col>_<method>".
methods = {}

for col in non_stationary_cols:
    ser = df[col].dropna()

    # Strictly positive series: log returns, log(x_t / x_{t-1}).
    # Otherwise: second (backward) differencing.
    if (ser > 0).all():
        ser_trans = np.log(ser / ser.shift(1))
        method = "log_trafo"
    else:
        ser_trans = ser.diff().diff()
        method = "diff_2"

    # The transformation is considered unusable when its output is
    # degenerate: constant (zero standard deviation) or mostly missing.
    # This is a sanity check on the output, not a test for remaining trend.
    is_degenerate = (
        ser_trans.dropna().std() == 0
        or ser_trans.isna().mean() > 0.5
    )

    # Fall back to the STL residual, using the same seasonal period as the
    # decomposition cell (PERIOD) instead of a second hard-coded 252.
    if is_degenerate:
        ser_trans = STL(ser, period=PERIOD).fit().resid
        method = "stl_residual"

    # Store the transformed series next to the original column
    df[f"{col}_{method}"] = ser_trans

    methods[col] = method

In [None]:
# Check whether the transformed columns are stationary

# One record per fixed feature: original name, applied method and the
# ADF/KPSS statistics of the transformed column "<col>_<method>"
summaries_fixed_li = [
    {
        "feature": col,
        "fix_method": method,
        **get_adf_kpss_stats(df[f"{col}_{method}"]),
    }
    for col, method in methods.items()
]

# Stationary features first, with a fresh 0..n-1 index
fixed_stationarity_df = (
    pd.DataFrame(summaries_fixed_li)
    .sort_values("is_stationary", ascending=False)
    .reset_index(drop=True)
)
is_stationary = fixed_stationarity_df["is_stationary"]
# Number of features that are / are not stationary after the fix
is_stationary.value_counts()


In [None]:
# Display the per-feature stationarity results of the transformed columns
fixed_stationarity_df

In [None]:
# Hand the augmented dataframe (original columns plus the derived ones) to
# the store helper, using the same relative path and notebook number as the
# load at the top.
# NOTE(review): where the file ends up, and whether it overwrites the loaded
# input, depends on store_df_as_csv -- confirm against ipynbrc.
relpath = "main.csv"
store_df_as_csv(df, relpath, NB_NUMBER)