In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans

import datetime

from arch import arch_model

import numpy.linalg as la
from pandas.tseries.offsets import BDay
import matplotlib.pyplot as plt

from collections import defaultdict

from joblib import Parallel, delayed

from tqdm import tqdm

In [2]:
df_clustered = pd.read_csv("knn-dataset.csv")
df_crsp = pd.read_csv('stock_daily.csv')
df_crsp['date'] = pd.to_datetime(df_crsp['date'])
df_crsp

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx
0,1975-01-02,23924,CFSR,-11.62500,0.000000,1591.0,2.0000,,0.025240,0.025212
1,1975-01-02,23931,NSP,17.12500,0.070313,23233.0,4.0000,12300.0,0.025240,0.025212
2,1975-01-02,23975,CLRK,-21.00000,0.000000,1279.0,45.5625,,0.025240,0.025212
3,1975-01-02,23990,ROH,47.87500,0.035135,12871.0,18.0000,4600.0,0.025240,0.025212
4,1975-01-02,24002,DEW,9.25000,0.000000,16487.0,1.5000,12200.0,0.025240,0.025212
...,...,...,...,...,...,...,...,...,...,...
88359665,2024-12-31,92396,ECH,25.04000,0.000000,18950.0,1.0000,67015.0,-0.003392,-0.003541
88359666,2024-12-31,92397,BKF,36.49050,-0.002550,1850.0,1.0000,2732.0,-0.003392,-0.003541
88359667,2024-12-31,92398,AIA,67.83000,-0.005571,10500.0,1.0000,38260.0,-0.003392,-0.003541
88359668,2024-12-31,92402,MSCI,600.01001,0.000600,78371.0,1.0000,223964.0,-0.003392,-0.003541


In [3]:
# Only keeping the permno that exist in df_clustered 
# -------------------------------------------------------------
# Create a reference key from df_clustered
valid_keys = df_clustered[['permno', 'trading_start']].drop_duplicates()

# Assign trading_start to df_crsp
df_crsp['trading_start'] = df_crsp['date'].dt.to_period('Q').dt.start_time

valid_keys['trading_start'] = pd.to_datetime(valid_keys['trading_start'])
df_crsp['trading_start'] = pd.to_datetime(df_crsp['trading_start'])

# Keep only CRSP rows for stocks in df_clustered for that trading quarter
df_crsp_filtered = pd.merge(df_crsp, valid_keys, on=['permno', 'trading_start'], how='inner')
df_crsp_filtered

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,77900.0,-0.007779,-0.007821,1975-04-01
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,47200.0,-0.007779,-0.007821,1975-04-01
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,16300.0,-0.007779,-0.007821,1975-04-01
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,29200.0,-0.007779,-0.007821,1975-04-01
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,15400.0,-0.007779,-0.007821,1975-04-01
...,...,...,...,...,...,...,...,...,...,...,...
14830463,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,1007467.0,-0.003392,-0.003541,2024-10-01
14830464,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,828475.0,-0.003392,-0.003541,2024-10-01
14830465,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,465628.0,-0.003392,-0.003541,2024-10-01
14830466,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,1063685.0,-0.003392,-0.003541,2024-10-01


In [4]:
df_clustered['trading_start'] = pd.to_datetime(df_clustered['trading_start'])

df_merged = pd.merge(
    df_crsp_filtered,
    df_clustered[['permno', 'trading_start', 'group_id']],
    on=['permno', 'trading_start'],
    how='left'
)
df_merged

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,77900.0,-0.007779,-0.007821,1975-04-01,1975-Q1-07
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,47200.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,16300.0,-0.007779,-0.007821,1975-04-01,1975-Q1-08
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,29200.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,15400.0,-0.007779,-0.007821,1975-04-01,1975-Q1-04
...,...,...,...,...,...,...,...,...,...,...,...,...
14831791,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,1007467.0,-0.003392,-0.003541,2024-10-01,2024-Q3-10
14831792,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,828475.0,-0.003392,-0.003541,2024-10-01,2024-Q3-00
14831793,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,465628.0,-0.003392,-0.003541,2024-10-01,2024-Q3-08
14831794,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,1063685.0,-0.003392,-0.003541,2024-10-01,2024-Q3-10


In [5]:
# rolling 20 day average volumne
df_merged['adv20']   = (df_merged.groupby('permno')['vol']
                        .rolling(20, min_periods=1).mean()
                        .reset_index(level=0, drop=True))

# adjusted price
df_merged['adj_prc'] = df_merged['prc'] / df_merged['cfacpr']

In [6]:
df_merged[df_merged['ticker'] == 'MMM'].sort_values(by = 'date')

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id,adv20,adj_prc
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,77900.0,-0.007779,-0.007821,1975-04-01,1975-Q1-07,7.790000e+04,5.203797
112,1975-04-02,22592,MMM,50.50000,-0.012225,114200.0,9.824557,125300.0,-0.001893,-0.001946,1975-04-01,1975-Q1-07,1.016000e+05,5.140181
273,1975-04-03,22592,MMM,48.00000,-0.049505,114200.0,9.824557,272000.0,-0.010225,-0.010248,1975-04-01,1975-Q1-07,1.377750e+05,4.885716
266,1975-04-04,22592,MMM,48.00000,0.000000,114200.0,9.824557,75900.0,-0.005813,-0.005955,1975-04-01,1975-Q1-07,9.303333e+04,4.885716
427,1975-04-07,22592,MMM,47.37500,-0.013021,114200.0,9.824557,63300.0,-0.006551,-0.006602,1975-04-01,1975-Q1-07,1.228800e+05,4.822100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14826866,2024-12-24,22592,MMM,130.36000,0.010699,544559.0,1.000000,803216.0,0.010566,0.010521,2024-10-01,2024-Q3-00,3.055238e+06,130.360000
14827707,2024-12-26,22592,MMM,131.17999,0.006290,544559.0,1.000000,1485103.0,0.000346,0.000282,2024-10-01,2024-Q3-00,2.975478e+06,131.179990
14828970,2024-12-27,22592,MMM,130.17999,-0.007623,544559.0,1.000000,1842823.0,-0.010692,-0.010775,2024-10-01,2024-Q3-00,2.938639e+06,130.179990
14830077,2024-12-30,22592,MMM,129.13000,-0.008066,544559.0,1.000000,2153994.0,-0.009878,-0.009900,2024-10-01,2024-Q3-00,2.965424e+06,129.130000


In [7]:
# Compute group-level average return
group_avg_return = df_merged.groupby(['date', 'group_id'])['retx'].mean().reset_index()
group_avg_return.rename(columns={'retx': 'group_avg_retx'}, inplace=True)

# Merge with main df
df_merged = df_merged.merge(group_avg_return, on=['date', 'group_id'], how='left')

# Compute peer-relative return
df_merged['retx_relative'] = df_merged['retx'] - df_merged['group_avg_retx']

# Create lagged peer-relative return for signal generation (based on t-1 info)
df_merged['retx_relative_lag1'] = df_merged.groupby('permno')['retx_relative'].shift(1)

df_merged['vol']  = df_merged['vol'].fillna(0) / 100           # ← changed
df_merged

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,trading_start,group_id,adv20,adj_prc,group_avg_retx,retx_relative,retx_relative_lag1
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,779.00,-0.007779,-0.007821,1975-04-01,1975-Q1-07,77900.00,5.203797,0.008906,-0.006455,
1,1975-04-01,22752,MRK,73.75000,-0.026403,75389.0,40.008190,472.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,47200.00,1.843373,-0.016955,-0.009448,
2,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,163.00,-0.007779,-0.007821,1975-04-01,1975-Q1-08,16300.00,5.806934,0.000978,-0.000978,
3,1975-04-01,25013,SGP,60.12500,0.002083,53929.0,32.000000,292.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,29200.00,1.878906,-0.016955,0.019038,
4,1975-04-01,25478,CRK,34.00000,0.018727,8000.0,6.000000,154.00,-0.007779,-0.007821,1975-04-01,1975-Q1-04,15400.00,5.666667,-0.016955,0.035682,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14831791,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,10074.67,-0.003392,-0.003541,2024-10-01,2024-Q3-10,1442413.30,24.890000,0.000467,-0.000467,0.018788
14831792,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,8284.75,-0.003392,-0.003541,2024-10-01,2024-Q3-00,792729.85,31.150000,-0.002312,0.006503,-0.010173
14831793,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,4656.28,-0.003392,-0.003541,2024-10-01,2024-Q3-08,1014699.05,434.929990,-0.001960,0.000881,-0.004185
14831794,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,10636.85,-0.003392,-0.003541,2024-10-01,2024-Q3-10,1879941.10,18.740000,0.000467,0.009231,0.009399


In [8]:
def add_classic_z_scores(df, horizons=[1, 5, 10, 20], n_jobs=-1):
    """
    Adds classic z-scores and their lagged versions for multiple horizons using parallel processing.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing 'retx_relative' and group identifiers.
    horizons : list of int
        Time horizons over which to calculate classic z-scores.
    n_jobs : int
        Number of parallel jobs. Default is -1 (use all cores).

    Returns
    -------
    df : pd.DataFrame
        DataFrame with added z-score and lagged z-score columns for each horizon.
    """

    def compute_group_std(horizon):
        # Compute rolling standard deviation within each group_id over the specified horizon
        return (
            df.groupby(['date', 'group_id'])
            .agg({'retx_relative': lambda x: x.rolling(window=horizon, min_periods=1).std().iloc[-1]})
            .rename(columns={'retx_relative': f'group_std_retx_relative_{horizon}d'})
            .reset_index()
        )

    # Compute rolling std dev for each horizon in parallel
    group_stats_list = Parallel(n_jobs=n_jobs)(
        delayed(compute_group_std)(h) for h in horizons
    )

    # For each horizon, merge std, compute z-score, and add lag
    for idx, horizon in enumerate(horizons):
        std_col = f'group_std_retx_relative_{horizon}d'
        z_col = f'z_score_classic_{horizon}d'
        z_lag_col = f'{z_col}_lag1'

        # Merge group std back into DataFrame
        df = df.merge(group_stats_list[idx], on=['date', 'group_id'], how='left')

        # Compute Z-Score
        df[z_col] = df['retx_relative'] / df[std_col]
        df[z_col] = df[z_col].replace([np.inf, -np.inf], np.nan).fillna(0)

        # Add Lagged Z-Score
        df[z_lag_col] = df.groupby('permno')[z_col].shift(1)
        df[z_lag_col] = df[z_lag_col].replace([np.inf, -np.inf], np.nan).fillna(0)

    return df

df_merged = add_classic_z_scores(df_merged, horizons=[1, 5, 10, 20], n_jobs=-1)

In [9]:
# ---------------------------------------------
# Estimate OU Process Parameters θ and μ in Parallel
# ---------------------------------------------

def estimate_ou_params(x):
    x = x.dropna()
    dx = x.diff().dropna()
    x_lag = x.shift(1).dropna()

    if len(x_lag) < 30:
        return None

    X = np.vstack([x_lag, np.ones(len(x_lag))]).T
    θ_mu, c = np.linalg.lstsq(X, dx, rcond=None)[0]

    θ = -θ_mu
    μ = -c / θ_mu if θ_mu != 0 else np.nan

    return θ, μ

# Parallel computation of OU parameters
permno_groups = list(df_merged.groupby('permno')['retx_relative_lag1'])

ou_param_results = Parallel(n_jobs=-1)(
    delayed(estimate_ou_params)(series) for _, series in permno_groups
)

ou_param_dict = {
    permno: result
    for (permno, _), result in zip(permno_groups, ou_param_results)
    if result is not None
}

In [19]:
def add_forecast_metrics(df, horizons=[1, 5, 10, 20], ou_params_dict=None, n_jobs=-1, verbose=True):
    """
    Compute GARCH volatility, OU forecast, and Z-Scores for multiple horizons in parallel.

    Parameters
    ----------
    df : pd.DataFrame
        Input DataFrame with financial data.
    horizons : list of int
        Forecast horizons (e.g., [1, 5, 10, 20]).
    ou_params_dict : dict
        Precomputed OU parameters by permno.
    n_jobs : int
        Number of parallel jobs.
    verbose : bool
        Show progress logs.

    Returns
    -------
    pd.DataFrame with added forecast columns.
    """
    if 'garch_vol_daily' not in df.columns:
        if verbose: print("📊 Computing Daily GARCH Volatility...")

        def estimate_garch_daily(series):
            if len(series.dropna()) < 100:
                return pd.Series(index=series.index, data=np.nan)
            try:
                series_scaled = series * 100
                model = arch_model(series_scaled, vol='Garch', p=1, q=1)
                res = model.fit(disp="off", method="BFGS", options={"maxiter": 1000})
                vol_scaled = res.conditional_volatility / 100
                return vol_scaled.reindex(series.index, fill_value=np.nan)
            except:
                return pd.Series(index=series.index, data=np.nan)

        permno_groups = list(df.groupby('permno')['retx'])
        garch_results = Parallel(n_jobs=n_jobs)(
            delayed(estimate_garch_daily)(series) for _, series in tqdm(permno_groups, disable=not verbose)
        )
        garch_series = pd.concat(garch_results)
        df['garch_vol_daily'] = garch_series
        df['garch_vol_daily_lag1'] = df.groupby('permno')['garch_vol_daily'].shift(1)

    permno_groups = list(df.groupby('permno'))

    # Process all horizons in parallel
    for horizon in horizons:
        if verbose: print(f"\n🚀 Processing Horizon: {horizon} Days")

        # 1. GARCH Volatility for Horizon
        garch_col = f"garch_vol_{horizon}d_lag1"
        df[garch_col] = df['garch_vol_daily_lag1'] * np.sqrt(horizon)

        # 2. OU Forecast for Horizon
        if verbose: print(f"📈 Computing OU Forecast for {horizon} Days...")

        ou_forecast_col = f"ou_forecast_{horizon}d"

        def compute_ou_forecast_group(permno, group):
            θμ = ou_params_dict.get(permno, None)
            if θμ is None:
                return pd.Series(np.nan, index=group.index)
            θ, μ = θμ
            retx_rel_lag1 = group['retx_relative_lag1']
            return μ + (retx_rel_lag1 - μ) * np.exp(-θ * horizon)

        ou_results = Parallel(n_jobs=n_jobs)(
            delayed(compute_ou_forecast_group)(permno, group) 
            for permno, group in tqdm(permno_groups, disable=not verbose)
        )
        ou_forecast_series = pd.concat(ou_results)
        df[ou_forecast_col] = ou_forecast_series

        # 3. Z-Score Calculation
        z_score_col = f"z_score_{horizon}d"
        df[z_score_col] = (
            df[ou_forecast_col] - df['retx_relative_lag1']
        ) / df[garch_col]

        # Clean NaNs and Inf values
        df[z_score_col] = df[z_score_col].replace([np.inf, -np.inf], np.nan).fillna(0)

        if verbose: 
            print(f"✅ Added: {garch_col}, {ou_forecast_col}, {z_score_col}")

    return df

In [13]:
problem_series = []

for permno, series in df_merged.groupby('permno')['retx']:
    if series.var() < 1e-6:  # Arbitrary low variance threshold
        problem_series.append(permno)

print(f"Assets with near-zero variance: {problem_series}")


Assets with near-zero variance: [10058, 10538, 11815, 15083, 37947, 48996, 49884, 60418, 88366]


In [16]:
low_var_permnos = [10058, 10538, 11815, 15083, 37947, 48996, 49884, 60418, 88366]

row_counts = df_merged[df_merged['permno'].isin(low_var_permnos)]['permno'].value_counts()
print(row_counts)
print(f"Total rows with low variance assets: {row_counts.sum()}")
print(f"Total rows in dataset: {len(df_merged)}")
print(f"Percentage of rows: {row_counts.sum() / len(df_merged) * 100:.2f}%")

permno
11815    136
37947    125
48996     65
10538     64
10058     50
88366     43
49884     36
15083     35
60418     34
Name: count, dtype: int64
Total rows with low variance assets: 588
Total rows in dataset: 14831796
Percentage of rows: 0.00%


In [17]:
df_merged = df_merged[~df_merged['permno'].isin(low_var_permnos)]

In [11]:
df_merged.isnull().sum()

date                                  0
permno                                0
ticker                             1789
prc                                   0
retx                                  0
shrout                                0
cfacpr                                0
vol                                   0
vwretd                                0
vwretx                                0
trading_start                         0
group_id                              0
adv20                            114737
adj_prc                               0
group_avg_retx                        0
retx_relative                         0
retx_relative_lag1                12746
group_std_retx_relative_1d     14831796
z_score_classic_1d                    0
z_score_classic_1d_lag1               0
group_std_retx_relative_5d        18195
z_score_classic_5d                    0
z_score_classic_5d_lag1               0
group_std_retx_relative_10d       18195
z_score_classic_10d                   0


In [20]:
df_merged = add_forecast_metrics(
    df_merged, 
    horizons=[1, 5, 10, 20], 
    ou_params_dict=ou_param_dict, 
    n_jobs=-1, 
    verbose=True
)


🚀 Processing Horizon: 1 Days
📈 Computing OU Forecast for 1 Days...



[A%|                                                 | 0/12737 [00:00<?, ?it/s]
[A%|                                        | 20/12737 [00:00<05:05, 41.63it/s]
[A%|                                        | 30/12737 [00:00<04:29, 47.19it/s]
[A%|▏                                       | 40/12737 [00:00<04:28, 47.28it/s]
[A%|▏                                       | 50/12737 [00:01<04:32, 46.62it/s]
[A%|▏                                       | 60/12737 [00:01<04:33, 46.35it/s]
[A%|▏                                       | 70/12737 [00:01<04:40, 45.17it/s]
[A%|▎                                       | 80/12737 [00:01<04:45, 44.32it/s]
[A%|▎                                       | 90/12737 [00:02<04:47, 44.04it/s]
[A%|▎                                      | 100/12737 [00:02<05:19, 39.59it/s]
[A%|▎                                      | 110/12737 [00:02<05:11, 40.52it/s]
[A%|▎                                      | 120/12737 [00:02<05:04, 41.48it/s]
[A%|▍                     

✅ Added: garch_vol_1d_lag1, ou_forecast_1d, z_score_1d

🚀 Processing Horizon: 5 Days
📈 Computing OU Forecast for 5 Days...



[A%|                                                 | 0/12737 [00:00<?, ?it/s]
[A%|                                       | 40/12737 [00:00<01:42, 123.52it/s]
[A%|▏                                      | 60/12737 [00:00<01:59, 105.78it/s]
[A%|▎                                       | 80/12737 [00:00<02:09, 97.64it/s]
[A%|▎                                      | 100/12737 [00:01<02:15, 93.13it/s]
[A%|▎                                      | 120/12737 [00:01<02:29, 84.36it/s]
[A%|▍                                      | 140/12737 [00:01<02:27, 85.28it/s]
[A%|▍                                      | 160/12737 [00:01<02:26, 86.01it/s]
[A%|▌                                      | 180/12737 [00:01<02:26, 85.67it/s]
[A%|▌                                      | 200/12737 [00:02<02:26, 85.37it/s]
[A%|▋                                      | 220/12737 [00:02<02:26, 85.57it/s]
[A%|▋                                      | 240/12737 [00:02<02:25, 85.63it/s]
[A%|▊                     

✅ Added: garch_vol_5d_lag1, ou_forecast_5d, z_score_5d

🚀 Processing Horizon: 10 Days
📈 Computing OU Forecast for 10 Days...



[A%|                                                 | 0/12737 [00:00<?, ?it/s]
[A%|                                       | 40/12737 [00:00<01:32, 137.53it/s]
[A%|▏                                      | 60/12737 [00:00<01:52, 112.55it/s]
[A%|▏                                      | 80/12737 [00:00<02:03, 102.25it/s]
[A%|▎                                      | 100/12737 [00:00<02:11, 95.99it/s]
[A%|▎                                      | 120/12737 [00:01<02:18, 91.09it/s]
[A%|▍                                      | 140/12737 [00:01<02:18, 91.25it/s]
[A%|▍                                      | 160/12737 [00:01<02:20, 89.62it/s]
[A%|▌                                      | 180/12737 [00:01<02:21, 88.47it/s]
[A%|▌                                      | 200/12737 [00:02<02:21, 88.70it/s]
[A%|▋                                      | 220/12737 [00:02<02:21, 88.74it/s]
[A%|▋                                      | 240/12737 [00:02<02:20, 88.72it/s]
[A%|▊                     

✅ Added: garch_vol_10d_lag1, ou_forecast_10d, z_score_10d

🚀 Processing Horizon: 20 Days
📈 Computing OU Forecast for 20 Days...



[A%|                                                 | 0/12737 [00:00<?, ?it/s]
[A%|                                       | 40/12737 [00:00<01:34, 134.58it/s]
[A%|▏                                      | 60/12737 [00:00<01:51, 113.76it/s]
[A%|▏                                      | 80/12737 [00:00<02:03, 102.84it/s]
[A%|▎                                      | 100/12737 [00:00<02:12, 95.40it/s]
[A%|▎                                      | 120/12737 [00:01<02:22, 88.68it/s]
[A%|▍                                      | 140/12737 [00:01<02:23, 87.89it/s]
[A%|▍                                      | 160/12737 [00:01<02:24, 87.29it/s]
[A%|▌                                      | 180/12737 [00:01<02:25, 86.54it/s]
[A%|▌                                      | 200/12737 [00:02<02:22, 87.95it/s]
[A%|▋                                      | 220/12737 [00:02<02:24, 86.45it/s]
[A%|▋                                      | 240/12737 [00:02<02:24, 86.58it/s]
[A%|▊                     

✅ Added: garch_vol_20d_lag1, ou_forecast_20d, z_score_20d


In [21]:
# ---------------------------------------------
# Filter to Keep Only Groups (Portfolios) with Sufficient Stocks for Reliable Trading Signals
# ---------------------------------------------

valid_groups = (
    # For each date and group (peer portfolio), count the number of available stocks (permno)
    df_merged.groupby(['date', 'group_id'])['permno'].count()
    .reset_index(name='group_size')  # Flatten the result and rename the count column to 'group_size'
    .query('group_size >= 10')       # Keep only groups that have at least 10 stocks available
)

# Keep only the trades that belong to valid groups (with at least 10 stocks)
df_trade = df_merged.merge(
    valid_groups[['date', 'group_id']],  # Only need 'date' and 'group_id' columns to filter
    on=['date', 'group_id'], 
    how='inner'  # Inner join ensures only trades from valid groups are kept
)

# Display the cleaned trading DataFrame with only sufficiently large groups
df_trade

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,...,z_score_1d,garch_vol_5d_lag1,ou_forecast_5d,z_score_5d,garch_vol_10d_lag1,ou_forecast_10d,z_score_10d,garch_vol_20d_lag1,ou_forecast_20d,z_score_20d
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,779.00,-0.007779,-0.007821,...,0.000000,,,0.000000,,,0.000000,,,0.000000
1,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,163.00,-0.007779,-0.007821,...,0.000000,,,0.000000,,,0.000000,,,0.000000
2,1975-04-01,26390,SKL,55.25000,0.004545,14851.0,8.000000,44.00,-0.007779,-0.007821,...,0.000000,,,0.000000,,,0.000000,,,0.000000
3,1975-04-01,10890,BGH,89.12500,-0.020604,39573.0,0.300000,221.00,-0.007779,-0.007821,...,0.000000,,,0.000000,,,0.000000,,,0.000000
4,1975-04-01,11308,KO,76.00000,-0.037975,59849.0,97.836357,788.00,-0.007779,-0.007821,...,0.000000,,,0.000000,,,0.000000,,,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14689061,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,10074.67,-0.003392,-0.003541,...,-0.432848,0.056864,0.001559,-0.302994,0.080418,0.001447,-0.215642,0.113729,0.001446,-0.152488
14689062,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,8284.75,-0.003392,-0.003541,...,0.291384,0.047868,-0.000206,0.208215,0.067696,-0.000128,0.148382,0.095737,-0.000128,0.104928
14689063,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,4656.28,-0.003392,-0.003541,...,0.135689,0.048177,0.000505,0.097362,0.068133,0.000543,0.069404,0.096355,0.000544,0.049079
14689064,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,10636.85,-0.003392,-0.003541,...,-0.256846,0.053665,-0.000401,-0.182623,0.075894,-0.000475,-0.130101,0.107331,-0.000475,-0.092001


In [22]:
df_fedfunds = pd.read_csv('fed_rates.csv')
df_fedfunds['date'] = pd.to_datetime(df_fedfunds['date'])
df_fedfunds

Unnamed: 0,date,fed_funds_rate
0,1970-01-01,5.00
1,1970-01-02,9.63
2,1970-01-03,9.63
3,1970-01-04,9.63
4,1970-01-05,9.75
...,...,...
20084,2024-12-27,4.33
20085,2024-12-28,4.33
20086,2024-12-29,4.33
20087,2024-12-30,4.33


In [23]:
df_final = df_trade.merge(df_fedfunds[['date', 'fed_funds_rate']], on='date', how='left')
df_final

Unnamed: 0,date,permno,ticker,prc,retx,shrout,cfacpr,vol,vwretd,vwretx,...,garch_vol_5d_lag1,ou_forecast_5d,z_score_5d,garch_vol_10d_lag1,ou_forecast_10d,z_score_10d,garch_vol_20d_lag1,ou_forecast_20d,z_score_20d,fed_funds_rate
0,1975-04-01,22592,MMM,51.12500,0.002451,114200.0,9.824557,779.00,-0.007779,-0.007821,...,,,0.000000,,,0.000000,,,0.000000,5.48
1,1975-04-01,23819,HAL,145.50000,0.000000,19167.0,25.056250,163.00,-0.007779,-0.007821,...,,,0.000000,,,0.000000,,,0.000000,5.48
2,1975-04-01,26390,SKL,55.25000,0.004545,14851.0,8.000000,44.00,-0.007779,-0.007821,...,,,0.000000,,,0.000000,,,0.000000,5.48
3,1975-04-01,10890,BGH,89.12500,-0.020604,39573.0,0.300000,221.00,-0.007779,-0.007821,...,,,0.000000,,,0.000000,,,0.000000,5.48
4,1975-04-01,11308,KO,76.00000,-0.037975,59849.0,97.836357,788.00,-0.007779,-0.007821,...,,,0.000000,,,0.000000,,,0.000000,5.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14689061,2024-12-31,92245,AROC,24.89000,0.000000,175154.0,1.000000,10074.67,-0.003392,-0.003541,...,0.056864,0.001559,-0.302994,0.080418,0.001447,-0.215642,0.113729,0.001446,-0.152488,4.33
14689062,2024-12-31,92293,TDC,31.15000,0.004191,95700.0,1.000000,8284.75,-0.003392,-0.003541,...,0.047868,-0.000206,0.208215,0.067696,-0.000128,0.148382,0.095737,-0.000128,0.104928,4.33
14689063,2024-12-31,92322,ULTA,434.92999,-0.001079,46569.0,1.000000,4656.28,-0.003392,-0.003541,...,0.048177,0.000505,0.097362,0.068133,0.000543,0.069404,0.096355,0.000544,0.049079,4.33
14689064,2024-12-31,92326,CVI,18.74000,0.009698,100531.0,1.000000,10636.85,-0.003392,-0.003541,...,0.053665,-0.000401,-0.182623,0.075894,-0.000475,-0.130101,0.107331,-0.000475,-0.092001,4.33


In [24]:
# Identify (date, permno) combinations that appear more than once
dup_keys = (
    df_final.groupby(['date', 'permno'])
            .size()
            .reset_index(name='count')
            .query('count > 1')[['date', 'permno']]
)

# Remove all rows that match those duplicate keys
df_final_clean = df_final.merge(dup_keys, on=['date', 'permno'], how='left', indicator=True)
df_final_clean = df_final_clean[df_final_clean['_merge'] == 'left_only'].drop(columns=['_merge'])

# Reset index if needed
df_final_clean.reset_index(drop=True, inplace=True)

In [25]:
duplicates = (
    df_final_clean.groupby(['date', 'permno'])
            .size()
            .reset_index(name='count')
            .query('count > 1')
)

if not duplicates.empty:
    print("Duplicates found:")
    # Count total number of duplicate rows based on (date, permno)
    total_duplicates = df_final_clean.duplicated(subset=['date', 'permno'], keep=False).sum()

    print(f"Total duplicate rows based on (date, permno): {total_duplicates}")

else:
    print("No duplicates found.")

No duplicates found.


In [26]:
def add_actual_volatility(
    df: pd.DataFrame,
    horizons=(1, 5, 10, 20),
    *,
    keep_daily=False,         # set to True if you also want the daily σ_t column
    min_obs_ratio: float = 1  # require a *full* window by default
) -> pd.DataFrame:
    """
    Append rolling realised vol columns that are on the *same scale*
    as your GARCH-derived horizon vol (σ_daily × √h).

    Parameters
    ----------
    df : DataFrame
        Must contain 'date', 'permno', 'retx' (simple daily return in **decimal** form).
    horizons : iterable[int]
        Rolling windows (in trading days) – e.g. (1, 5, 10, 20).
    keep_daily : bool
        If True, also keep the rolling daily σ_t column ('actual_vol_1d').
    min_obs_ratio : float
        Fraction of the window that must be present before a value is emitted.
        min_obs = max(1, int(min_obs_ratio * h))

    Returns
    -------
    df : DataFrame  (same object, modified in-place and returned for chaining)
    """
    # --- housekeeping -------------------------------------------------------
    df = df.sort_values(['permno', 'date'])          # guarantee time ordering
    df['retx'] = pd.to_numeric(df['retx'], errors='coerce')  # just in case

    # --- realised σ ---------------------------------------------------------
    for h in horizons:
        col = f"actual_vol_{h}d"         # e.g. actual_vol_5d
        col_lag = f"{col}_lag1"          # one-day lag to avoid look-ahead
        min_obs = max(1, int(min_obs_ratio * h))

        # rolling stdev of *daily* returns …
        roll_std = (
            df.groupby('permno')['retx']
              .rolling(window=h, min_periods=min_obs)
              .std()
              .reset_index(level=0, drop=True)
        )

        # … scaled to an h-day horizon (same scaling you used for GARCH)
        df[col] = roll_std * np.sqrt(h)

        # lagged version for back-tests
        df[col_lag] = df.groupby('permno')[col].shift(1)

    # optionally drop the 1-day series if you don’t need it explicitly
    if not keep_daily and 1 in horizons:
        df.drop(columns=['actual_vol_1d'], errors='ignore', inplace=True)

    return df

df_final_clean = add_actual_volatility(df_final_clean)

In [None]:
df_final_clean.to_csv("final_dataset_backtest.csv", index = False)