In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

import datetime

from arch import arch_model

import numpy.linalg as la
from pandas.tseries.offsets import BDay
import matplotlib.pyplot as plt

from collections import defaultdict

from joblib import Parallel, delayed

from tqdm import tqdm

In [2]:
df_clustered = pd.read_csv("knn-dataset.csv")
df_crsp = pd.read_csv('stock_daily.csv')
df_crsp['date'] = pd.to_datetime(df_crsp['date'])
df_crsp

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx
0,1975-01-02,23924,CFSR,-11.62500,0.000000,1591.0,2.0000,,0.025240,0.025212
1,1975-01-02,23931,NSP,17.12500,0.070313,23233.0,4.0000,12300.0,0.025240,0.025212
2,1975-01-02,23975,CLRK,-21.00000,0.000000,1279.0,45.5625,,0.025240,0.025212
3,1975-01-02,23990,ROH,47.87500,0.035135,12871.0,18.0000,4600.0,0.025240,0.025212
4,1975-01-02,24002,DEW,9.25000,0.000000,16487.0,1.5000,12200.0,0.025240,0.025212
...,...,...,...,...,...,...,...,...,...,...
88359665,2024-12-31,92396,ECH,25.04000,0.000000,18950.0,1.0000,67015.0,-0.003392,-0.003541
88359666,2024-12-31,92397,BKF,36.49050,-0.002550,1850.0,1.0000,2732.0,-0.003392,-0.003541
88359667,2024-12-31,92398,AIA,67.83000,-0.005571,10500.0,1.0000,38260.0,-0.003392,-0.003541
88359668,2024-12-31,92402,MSCI,600.01001,0.000600,78371.0,1.0000,223964.0,-0.003392,-0.003541


In [3]:
# Only keeping the permno that exist in df_clustered 
# -------------------------------------------------------------
# Create a reference key from df_clustered
valid_keys = df_clustered[['permno', 'trading_start']].drop_duplicates()

# Assign trading_start to df_crsp
df_crsp['trading_start'] = df_crsp['date'].dt.to_period('Q').dt.start_time

valid_keys['trading_start'] = pd.to_datetime(valid_keys['trading_start'])
df_crsp['trading_start'] = pd.to_datetime(df_crsp['trading_start'])

# Keep only CRSP rows for stocks in df_clustered for that trading quarter
df_crsp_filtered = pd.merge(df_crsp, valid_keys, on=['permno', 'trading_start'], how='inner')
df_crsp_filtered

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,77900.0,-0.007779,-0.007821,1975-04-01
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,47200.0,-0.007779,-0.007821,1975-04-01
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,16300.0,-0.007779,-0.007821,1975-04-01
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,29200.0,-0.007779,-0.007821,1975-04-01
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,15400.0,-0.007779,-0.007821,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...
14830463,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,1007467.0,-0.003392,-0.003541,2024-10-01
14830464,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,828475.0,-0.003392,-0.003541,2024-10-01
14830465,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,465628.0,-0.003392,-0.003541,2024-10-01
14830466,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,1063685.0,-0.003392,-0.003541,2024-10-01


In [4]:
df_clustered['trading_start'] = pd.to_datetime(df_clustered['trading_start'])

df_merged = pd.merge(
    df_crsp_filtered,
    df_clustered[['permno', 'trading_start', 'group_id']],
    on=['permno', 'trading_start'],
    how='left'
)
df_merged

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,77900.0,-0.007779,-0.007821,1975-04-01,1975-Q1-07
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,47200.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,16300.0,-0.007779,-0.007821,1975-04-01,1975-Q1-08
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,29200.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,15400.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
...,...,...,...,...,...,...,...,...,...,...,...,...
14831791,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,1007467.0,-0.003392,-0.003541,2024-10-01,2024-Q3-10
14831792,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,828475.0,-0.003392,-0.003541,2024-10-01,2024-Q3-00
14831793,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,465628.0,-0.003392,-0.003541,2024-10-01,2024-Q3-08
14831794,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,1063685.0,-0.003392,-0.003541,2024-10-01,2024-Q3-10


In [5]:
df_merged['vol'] = df_merged['vol'].fillna(0) / 100           # ← changed
# rolling 20 day average volumne
df_merged['adv20'] = (df_merged.groupby('permno')['vol']
                      .rolling(20, min_periods=1).mean()
                      .reset_index(level=0, drop=True))

# adjusted price
df_merged['adj_prc'] = df_merged['prc'] / df_merged['cfacpr']

In [6]:
df_merged[df_merged['ticker'] == 'MMM'].sort_values(by = 'date')

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id,adv20,adj_prc
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,779.00,-0.007779,-0.007821,1975-04-01,1975-Q1-07,779.000000,5.203797
112,1975-04-02,22592,MMM,50.50000,-0.012225,114200.0,9.824557,1253.00,-0.001893,-0.001946,1975-04-01,1975-Q1-07,1016.000000,5.140181
273,1975-04-03,22592,MMM,48.00000,-0.049505,114200.0,9.824557,2720.00,-0.010225,-0.010248,1975-04-01,1975-Q1-07,1377.750000,4.885716
266,1975-04-04,22592,MMM,48.00000,0.000000,114200.0,9.824557,759.00,-0.005813,-0.005955,1975-04-01,1975-Q1-07,930.333333,4.885716
427,1975-04-07,22592,MMM,47.37500,-0.013021,114200.0,9.824557,633.00,-0.006551,-0.006602,1975-04-01,1975-Q1-07,1228.800000,4.822100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14826866,2024-12-24,22592,MMM,130.36000,0.010699,544559.0,1.000000,8032.16,0.010566,0.010521,2024-10-01,2024-Q3-00,30552.376500,130.360000
14827707,2024-12-26,22592,MMM,131.17999,0.006290,544559.0,1.000000,14851.03,0.000346,0.000282,2024-10-01,2024-Q3-00,29754.779000,131.179990
14828970,2024-12-27,22592,MMM,130.17999,-0.007623,544559.0,1.000000,18428.23,-0.010692,-0.010775,2024-10-01,2024-Q3-00,29386.388000,130.179990
14830077,2024-12-30,22592,MMM,129.13000,-0.008066,544559.0,1.000000,21539.94,-0.009878,-0.009900,2024-10-01,2024-Q3-00,29654.244000,129.130000


In [7]:
# Compute group-level average return
group_avg_return = df_merged.groupby(['date', 'group_id'])['retx'].mean().reset_index()
group_avg_return.rename(columns={'retx': 'group_avg_retx'}, inplace=True)

# Merge with main df
df_merged = df_merged.merge(group_avg_return, on=['date', 'group_id'], how='left')

# Compute peer-relative return
df_merged['retx_relative'] = df_merged['retx'] - df_merged['group_avg_retx']

# Create lagged peer-relative return for signal generation (based on t-1 info)
df_merged['retx_relative_lag1'] = df_merged.groupby('permno')['retx_relative'].shift(1)

df_merged

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id,adv20,adj_prc,group_avg_retx,retx_relative,retx_relative_lag1
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,779.00,-0.007779,-0.007821,1975-04-01,1975-Q1-07,779.0000,5.203797,0.008906,-0.006455,
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,472.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,472.0000,1.843373,-0.016955,-0.009448,
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,163.00,-0.007779,-0.007821,1975-04-01,1975-Q1-08,163.0000,5.806934,0.000978,-0.000978,
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,292.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,292.0000,1.878906,-0.016955,0.019038,
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,154.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,154.0000,5.666667,-0.016955,0.035682,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14831791,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,10074.67,-0.003392,-0.003541,2024-10-01,2024-Q3-10,14424.1330,24.890000,0.000467,-0.000467,0.018788
14831792,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,8284.75,-0.003392,-0.003541,2024-10-01,2024-Q3-00,7927.2985,31.150000,-0.002312,0.006503,-0.010173
14831793,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,4656.28,-0.003392,-0.003541,2024-10-01,2024-Q3-08,10146.9905,434.929990,-0.001960,0.000881,-0.004185
14831794,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,10636.85,-0.003392,-0.003541,2024-10-01,2024-Q3-10,18799.4110,18.740000,0.000467,0.009231,0.009399


In [8]:
def add_classic_z_scores(df, horizons=[1, 5, 10, 20], n_jobs=-1):
    """
    Adds classic z-scores and their lagged versions for multiple horizons using parallel processing.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing 'retx_relative' and group identifiers.
    horizons : list of int
        Time horizons over which to calculate classic z-scores.
    n_jobs : int
        Number of parallel jobs. Default is -1 (use all cores).

    Returns
    -------
    df : pd.DataFrame
        DataFrame with added z-score and lagged z-score columns for each horizon.
    """

    def compute_group_std(horizon):
        # Compute rolling standard deviation within each group_id over the specified horizon
        return (
            df.groupby(['date', 'group_id'])
            .agg({'retx_relative': lambda x: x.rolling(window=horizon, min_periods=1).std().iloc[-1]})
            .rename(columns={'retx_relative': f'group_std_retx_relative_{horizon}d'})
            .reset_index()
        )

    # Compute rolling std dev for each horizon in parallel
    group_stats_list = Parallel(n_jobs=n_jobs)(
        delayed(compute_group_std)(h) for h in horizons
    )

    # For each horizon, merge std, compute z-score, and add lag
    for idx, horizon in enumerate(horizons):
        std_col = f'group_std_retx_relative_{horizon}d'
        z_col = f'z_score_classic_{horizon}d'
        z_lag_col = f'{z_col}_lag1'

        # Merge group std back into DataFrame
        df = df.merge(group_stats_list[idx], on=['date', 'group_id'], how='left')

        # Compute Z-Score
        df[z_col] = df['retx_relative'] / df[std_col]
        df[z_col] = df[z_col].replace([np.inf, -np.inf], np.nan).fillna(0)

        # Add Lagged Z-Score
        df[z_lag_col] = df.groupby('permno')[z_col].shift(1)
        df[z_lag_col] = df[z_lag_col].replace([np.inf, -np.inf], np.nan).fillna(0)

    return df

df_merged = add_classic_z_scores(df_merged, horizons=[1, 5, 10, 20], n_jobs=-1)

In [9]:
# ---------------------------------------------
# Estimate OU Process Parameters θ and μ in Parallel
# ---------------------------------------------

def estimate_ou_params(series):
    series = series.dropna()
    if len(series) < 30:
        return None  # Insufficient data

    x_lag = series.shift(1).dropna()
    x_current = series.loc[x_lag.index]

    # OLS Regression: x_current = beta * x_lag + c
    X = np.vstack([x_lag.values, np.ones(len(x_lag))]).T
    beta, c = np.linalg.lstsq(X, x_current.values, rcond=None)[0]

    if beta <= 0 or beta >= 1:
        θ = 0.05  # Assume very slow reversion
        μ = np.mean(series)
        return θ, μ


    dt = 1  # Assuming daily data
    θ = -np.log(beta) / dt
    μ = c / (1 - beta)

    return θ, μ

# Parallel computation of OU parameters
permno_groups = list(df_merged.groupby('permno')['retx_relative_lag1'])

ou_param_results = Parallel(n_jobs=-1)(
    delayed(estimate_ou_params)(series) for _, series in permno_groups
)

ou_param_dict = {
    permno: result
    for (permno, _), result in zip(permno_groups, ou_param_results)
    if result is not None
}


theta_vals = [v[0] for v in ou_param_dict.values()]
mu_vals    = [v[1] for v in ou_param_dict.values()]

print(f"θ  (speed) – min: {min(theta_vals):.3f}, max: {max(theta_vals):.3f}, mean: {np.mean(theta_vals):.3f}")
print(f"μ (mean ) – min: {min(mu_vals):.4f}, max: {max(mu_vals):.4f}, mean: {np.mean(mu_vals):.4f}")

θ  (speed) – min: 0.050, max: 10.367, mean: 1.271
μ (mean ) – min: -0.0287, max: 0.0751, mean: -0.0006


In [10]:
print(df_merged['retx'].describe())
print(df_merged['retx'].isna().mean())  # Percentage of NaNs

count    1.483180e+07
mean     4.026628e-04
std      4.487741e-02
min     -8.974030e-01
25%     -1.515200e-02
50%      0.000000e+00
75%      1.403200e-02
max      1.900000e+01
Name: retx, dtype: float64
0.0


In [11]:
total_permnos = df_merged['permno'].nunique()
valid_permnos = len(ou_param_dict)

print(f"Parameter coverage: {valid_permnos} / {total_permnos} stocks"
      f"  ({valid_permnos/total_permnos:.1%})")

Parameter coverage: 12724 / 12746 stocks  (99.8%)


In [12]:
def add_forecast_metrics(df, horizons=[1, 5, 10], ou_params_dict=None, n_jobs=-1, verbose=True):
    """
    Compute GARCH volatility, OU forecast, and Z-Scores for multiple horizons in parallel.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with financial data.
    horizons : list of int
        Forecast horizons (e.g., [1, 5, 10, 20]).
    ou_params_dict : dict
        Precomputed OU parameters by permno.
    n_jobs : int
        Number of parallel jobs.
    verbose : bool
        Show progress logs.

    Returns
    -------
    pd.DataFrame with added forecast columns.
    """
    if 'garch_vol_daily' not in df.columns:
        if verbose: print("📊 Computing Daily GARCH Volatility...")

        def estimate_garch_daily(series):
            if len(series.dropna()) < 50:
                return pd.Series(index=series.index, data=np.nan)
            try:
                series_scaled = series * 100
                model = arch_model(series_scaled, vol='Garch', p=1, q=1)
                res = model.fit(disp="off")
                vol_scaled = res.conditional_volatility / 100
                return vol_scaled.reindex(series.index, fill_value=np.nan)
            except:
                return pd.Series(index=series.index, data=np.nan)

        permno_groups = list(df.groupby('permno')['retx'])
        garch_results = Parallel(n_jobs=n_jobs)(
            delayed(estimate_garch_daily)(series) for _, series in tqdm(permno_groups, disable=not verbose)
        )
        garch_series = pd.concat(garch_results)
        df['garch_vol_daily'] = garch_series
        df['garch_vol_daily_lag1'] = df.groupby('permno')['garch_vol_daily'].shift(1)

    permno_groups = list(df.groupby('permno'))

    # Process all horizons in parallel
    for horizon in horizons:
        if verbose: print(f"\n🚀 Processing Horizon: {horizon} Days")

        # 1. GARCH Volatility for Horizon
        garch_col = f"garch_vol_{horizon}d_lag1"
        df[garch_col] = df['garch_vol_daily_lag1'] * np.sqrt(horizon)

        # 2. OU Forecast for Horizon
        if verbose: print(f"📈 Computing OU Forecast for {horizon} Days...")

        ou_forecast_col = f"ou_forecast_{horizon}d"

        def compute_ou_forecast_group(permno, group):
            θμ = ou_params_dict.get(permno, None)
            if θμ is None:
                return pd.Series(np.nan, index=group.index)
            θ, μ = θμ
            retx_rel_lag1 = group['retx_relative_lag1']
            return μ + (retx_rel_lag1 - μ) * np.exp(-θ * horizon)

        ou_results = Parallel(n_jobs=n_jobs)(
            delayed(compute_ou_forecast_group)(permno, group) 
            for permno, group in tqdm(permno_groups, disable=not verbose)
        )
        ou_forecast_series = pd.concat(ou_results).reindex(df.index)
        df[ou_forecast_col] = ou_forecast_series

        # 3. Z-Score Calculation
        z_score_col = f"z_score_{horizon}d"
        df[z_score_col] = (
            df[ou_forecast_col] - df['retx_relative_lag1']
        ) / df[garch_col]

        # Clean NaNs and Inf values
        df[z_score_col] = df[z_score_col].replace([np.inf, -np.inf], np.nan).fillna(0)

        if verbose: 
            print(f"✅ Added: {garch_col}, {ou_forecast_col}, {z_score_col}")

    return df

In [13]:
problem_series = []

for permno, series in df_merged.groupby('permno')['retx']:
    if series.var() < 1e-6:  # Arbitrary low variance threshold
        problem_series.append(permno)

print(f"Assets with near-zero variance: {problem_series}")


Assets with near-zero variance: [10058, 10538, 11815, 15083, 37947, 48996, 49884, 60418, 88366]


In [14]:
low_var_permnos = [10058, 10538, 11815, 15083, 37947, 48996, 49884, 60418, 88366]

row_counts = df_merged[df_merged['permno'].isin(low_var_permnos)]['permno'].value_counts()
print(row_counts)
print(f"Total rows with low variance assets: {row_counts.sum()}")
print(f"Total rows in dataset: {len(df_merged)}")
print(f"Percentage of rows: {row_counts.sum() / len(df_merged) * 100:.2f}%")

permno
11815    136
37947    125
48996     65
10538     64
10058     50
88366     43
49884     36
15083     35
60418     34
Name: count, dtype: int64
Total rows with low variance assets: 588
Total rows in dataset: 14831796
Percentage of rows: 0.00%


In [15]:
df_merged = df_merged[~df_merged['permno'].isin(low_var_permnos)]

In [16]:
df_merged.isnull().sum()

date                                  0
permno                                0
ticker                             1789
prc                                   0
retx                                  0
shrout                                0
cfacpr                                0
vol                                   0
vwretd                                0
vwretx                                0
trading_start                         0
group_id                              0
adv20                                 0
adj_prc                               0
group_avg_retx                        0
retx_relative                         0
retx_relative_lag1                12737
group_std_retx_relative_1d     14831208
z_score_classic_1d                    0
z_score_classic_1d_lag1               0
group_std_retx_relative_5d        18195
z_score_classic_5d                    0
z_score_classic_5d_lag1               0
group_std_retx_relative_10d       18195
z_score_classic_10d                   0


In [17]:
df_merged['retx_relative_lag1'] = (
    df_merged.groupby('permno')['retx_relative'].shift(1)
)

# << run this once >>
nan_ratio = df_merged['retx_relative_lag1'].isna().mean()
print(f"retx_relative_lag1 NaN ratio = {nan_ratio:.2%}")

retx_relative_lag1 NaN ratio = 0.09%


In [18]:
# Get permno for TSLA (or any ticker you know should have forecasts)
pno = df_merged.loc[df_merged['ticker'] == 'TSLA', 'permno'].iat[0]
tsla_grp = df_merged[df_merged['permno'] == pno].sort_values('date')

# Check the estimated θ and μ for TSLA
θμ = ou_param_dict.get(pno, None)
print("θ, μ for TSLA:", θμ)

if θμ:
    θ, μ = θμ
    h = 5  # Horizon (change if needed)
    manual_forecast = μ + (tsla_grp['retx_relative_lag1'] - μ) * np.exp(-θ * h)
    print(manual_forecast.head(10))
    print("\nForecast Column Values:\n", tsla_grp[f'ou_forecast_{h}d'].head(10))

θ, μ for TSLA: (0.05, np.float64(0.0017221140766372502))
10611096         NaN
10612206    0.003030
10613014    0.023642
10613908   -0.013038
10615052   -0.017371
10615840    0.000365
10616733   -0.009228
10617849   -0.006295
10618650   -0.008543
10619508    0.004146
Name: retx_relative_lag1, dtype: float64


KeyError: 'ou_forecast_5d'

In [None]:
df_merged = add_forecast_metrics(
    df_merged, 
    horizons=[1, 5, 10, 20], 
    ou_params_dict=ou_param_dict, 
    n_jobs=-1, 
    verbose=True
)

In [None]:
# ---------------------------------------------
# Filter to Keep Only Groups (Portfolios) with Sufficient Stocks for Reliable Trading Signals
# ---------------------------------------------

valid_groups = (
    # For each date and group (peer portfolio), count the number of available stocks (permno)
    df_merged.groupby(['date', 'group_id'])['permno'].count()
    .reset_index(name='group_size')  # Flatten the result and rename the count column to 'group_size'
    .query('group_size >= 10')       # Keep only groups that have at least 10 stocks available
)

# Keep only the trades that belong to valid groups (with at least 10 stocks)
df_trade = df_merged.merge(
    valid_groups[['date', 'group_id']],  # Only need 'date' and 'group_id' columns to filter
    on=['date', 'group_id'], 
    how='inner'  # Inner join ensures only trades from valid groups are kept
)

# Display the cleaned trading DataFrame with only sufficiently large groups
df_trade

In [None]:
df_fedfunds = pd.read_csv('fed_rates.csv')
df_fedfunds['date'] = pd.to_datetime(df_fedfunds['date'])
df_fedfunds

In [None]:
df_final = df_trade.merge(df_fedfunds[['date', 'fed_funds_rate']], on='date', how='left')
df_final

In [None]:
# Identify (date, permno) combinations that appear more than once
dup_keys = (
    df_final.groupby(['date', 'permno'])
            .size()
            .reset_index(name='count')
            .query('count > 1')[['date', 'permno']]
)

# Remove all rows that match those duplicate keys
df_final_clean = df_final.merge(dup_keys, on=['date', 'permno'], how='left', indicator=True)
df_final_clean = df_final_clean[df_final_clean['_merge'] == 'left_only'].drop(columns=['_merge'])

# Reset index if needed
df_final_clean.reset_index(drop=True, inplace=True)

In [None]:
duplicates = (
    df_final_clean.groupby(['date', 'permno'])
            .size()
            .reset_index(name='count')
            .query('count > 1')
)

if not duplicates.empty:
    print("Duplicates found:")
    # Count total number of duplicate rows based on (date, permno)
    total_duplicates = df_final_clean.duplicated(subset=['date', 'permno'], keep=False).sum()

    print(f"Total duplicate rows based on (date, permno): {total_duplicates}")

else:
    print("No duplicates found.")

In [None]:
def add_actual_volatility(
    df: pd.DataFrame,
    horizons=(1, 5, 10, 20),
    *,
    keep_daily=False,         # set to True if you also want the daily σ_t column
    min_obs_ratio: float = 1  # require a *full* window by default
) -> pd.DataFrame:
    """
    Append rolling realised vol columns that are on the *same scale*
    as your GARCH-derived horizon vol (σ_daily × √h).

    Parameters
    ----------
    df : DataFrame
        Must contain 'date', 'permno', 'retx' (simple daily return in **decimal** form).
    horizons : iterable[int]
        Rolling windows (in trading days) – e.g. (1, 5, 10, 20).
    keep_daily : bool
        If True, also keep the rolling daily σ_t column ('actual_vol_1d').
    min_obs_ratio : float
        Fraction of the window that must be present before a value is emitted.
        min_obs = max(1, int(min_obs_ratio * h))

    Returns
    -------
    df : DataFrame  (same object, modified in-place and returned for chaining)
    """
    # --- housekeeping -------------------------------------------------------
    df = df.sort_values(['permno', 'date'])          # guarantee time ordering
    df['retx'] = pd.to_numeric(df['retx'], errors='coerce')  # just in case

    # --- realised σ ---------------------------------------------------------
    for h in horizons:
        col = f"actual_vol_{h}d"         # e.g. actual_vol_5d
        col_lag = f"{col}_lag1"          # one-day lag to avoid look-ahead
        min_obs = max(1, int(min_obs_ratio * h))

        # rolling stdev of *daily* returns …
        roll_std = (
            df.groupby('permno')['retx']
              .rolling(window=h, min_periods=min_obs)
              .std()
              .reset_index(level=0, drop=True)
        )

        # … scaled to an h-day horizon (same scaling you used for GARCH)
        df[col] = roll_std * np.sqrt(h)

        # lagged version for back-tests
        df[col_lag] = df.groupby('permno')[col].shift(1)

    # optionally drop the 1-day series if you don’t need it explicitly
    if not keep_daily and 1 in horizons:
        df.drop(columns=['actual_vol_1d'], errors='ignore', inplace=True)

    return df

df_final_clean = add_actual_volatility(df_final_clean)

In [None]:
df = df_final_clean.copy()

In [None]:
df['realized_return_5'] = (
        df.groupby('permno')['retx']
        .transform(lambda x: x.shift(-1).rolling(5).sum())
    )

In [None]:
df[df['ticker'] == 'TSLA'][['date', 'retx_relative_lag1', 'ou_forecast_1d', 'ticker', 
                            'ou_forecast_5d', 'realized_return_5', 'ou_forecast_20d', 
                            'retx']].sort_values('date')

In [None]:
subset = df[df['ticker'] == 'TSLA'][['ou_forecast_1d', 'retx', 'ou_forecast_5d', 'realized_return_5', 'ou_forecast_20d']].dropna()

# Compute correlation matrix
correlation_matrix = subset.corr()

print(correlation_matrix)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_forecasts_vs_actual(df, ticker, days, start_date=None):
    """
    Plots GARCH vs Actual Volatility and OU Forecast vs Actual Returns for a given ticker and forecast horizon.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe containing the relevant columns.
    ticker : str
        Ticker symbol to filter the data.
    days : int
        Forecast horizon (e.g., 1, 5, 10, 20).
    start_date : str or pd.Timestamp, optional
        Start date for the plot (format: 'YYYY-MM-DD'). Plots from this date onwards.
    """
    df['date'] = pd.to_datetime(df['date'])
    valid_days = [1, 5, 10, 20]
    if days not in valid_days:
        raise ValueError(f"Invalid 'days' value. Choose from {valid_days}.")

    stock_df = df[df['ticker'] == ticker].sort_values('date').copy()
    if stock_df.empty:
        print(f"No data found for ticker: {ticker}")
        return

    # Apply start date filter if provided
    if start_date:
        stock_df = stock_df[stock_df['date'] >= pd.to_datetime(start_date)]
        if stock_df.empty:
            print(f"No data available for {ticker} after {start_date}.")
            return

    # Build dynamic column names
    garch_col = f"garch_vol_{days}d_lag1"
    ou_col = f"ou_forecast_{days}d"
    
    if days == 1:
        actual_vol_col = vol
    else:
        actual_vol_col = f"actual_vol_{days}d"

    # Compute realized returns over the horizon for proper OU comparison
    stock_df['realized_return'] = (
        stock_df.groupby('permno')['retx']
        .transform(lambda x: x.shift(-1).rolling(days).sum())
    )

    plot_df = stock_df[['date', garch_col, ou_col, actual_vol_col, 'realized_return']].dropna()

    if plot_df.empty:
        print(f"No valid data to plot for ticker '{ticker}' and horizon {days}d.")
        return

    # Plot 1: GARCH vs Actual Volatility
    plt.figure(figsize=(12, 5))
    plt.plot(plot_df['date'], plot_df[garch_col], label=f'GARCH Volatility ({days}d)', linestyle='--', color='red', alpha=0.5)
    plt.plot(plot_df['date'], plot_df[actual_vol_col], label=f'Actual Volatility ({days}d)', linestyle='-', color='green', alpha=0.5)
    plt.title(f'GARCH vs Actual Volatility ({days}d) - {ticker}')
    plt.xlabel('Date')
    plt.ylabel('Volatility')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    # Plot 2: OU Forecast vs Realized Return
    plt.figure(figsize=(12, 5))
    plt.plot(plot_df['date'], plot_df[ou_col], label=f'OU Forecasted Return ({days}d)', linestyle='--', color='tab:green', alpha=0.8)
    plt.plot(plot_df['date'], plot_df['realized_return'], label=f'Realized Return ({days}d)', linestyle='-', color='tab:red', alpha=0.7)
    plt.title(f'OU Forecast vs Realized Return ({days}d) - {ticker}')
    plt.xlabel('Date')
    plt.ylabel('Return')
    plt.legend()
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

In [None]:
plot_forecasts_vs_actual(df, ticker='AAPL', days=5, start_date='2005-01-01')

In [None]:
df_final_clean.to_csv("final_dataset_backtest.csv", index = False)