#### COINTEGRATION FILTER TO FIND STOCKS   
- Goal: Find only the most stable, highly significant pairs
- Risk: Fewer opportunities, more selective

Parameters:
- Correlation threshold: 0.98
- P-value threshold: 0.01
- ADF Statistic threshold:  loweest 10 pairs. 
- Time period: 15 years
- Min correlation with individual correlations > 0.98

In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy import stats
from datetime import datetime, timedelta
from statsmodels.tsa.stattools import coint
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.stattools import adfuller
import warnings
from GetFreshMarketData import *
from tqdm import tqdm
import polars as pl
import glob
import os
tqdm.pandas()
warnings.filterwarnings('ignore')


trading_vol = 500000
end_date = datetime.today()
start_date = end_date - timedelta(days = 365 * 1)
corr_threshold = 0.7

window = 13

In [3]:
folder_path = r"C:\Users\ksdee\Documents\Trading\Data\bhavcopy"
# Find all CSV files in the folder
list_of_files = glob.glob(os.path.join(folder_path, '*.csv'))
latest_file = max(list_of_files, key=os.path.getmtime)
index_symbols = pd.read_csv(latest_file)
index_symbols = index_symbols.loc[((index_symbols.SctySrs=='EQ') & (index_symbols.TtlTradgVol>=trading_vol)),'TckrSymb'].to_list()


INDEX_NAME = "NIFTY_500"
index_symbols_1  = pd.read_csv(fr"C:\Users\ksdee\Documents\Trading\Data\index\constituents\{INDEX_NAME}.csv")
index_symbols_1 = index_symbols_1.loc[((index_symbols_1.symbol!='DUMMYHDLVR') ),:] 
index_symbols_1 = index_symbols_1.loc[index_symbols_1.series == 'EQ','symbol'].to_list()


index_symbols = list(set(index_symbols).union(set(index_symbols_1)))



In [4]:
stock_dict = {}
for sym in tqdm(index_symbols):
    file = STOCK_DIR/f'{sym}.csv'
    if file.exists():
        df = pd.read_csv(file, low_memory=False)
        df.date = df.date.apply(lambda x : datetime.strptime(x,'%Y-%m-%d'))
        df = df.loc[df.date >= start_date,:]
        df = df.sort_values(by='date')
        stock_dict[sym] = df

stock_pairs = list(combinations(index_symbols, 2))
combinations_df = pd.DataFrame(stock_pairs, columns=['Stock_1', 'Stock_2'])


  0%|          | 0/833 [00:00<?, ?it/s]

100%|██████████| 833/833 [00:29<00:00, 27.95it/s]


In [5]:
def generate_corr(row):
    stock1 = row['Stock_1']
    stock2 = row['Stock_2']
    df1 = stock_dict[stock1]
    df2 = stock_dict[stock2]
    if df1.date.min() >= df2.date.min():
        df1 = df1.loc[df1.date.isin(df2.date),:]
    else:
        df2 = df2.loc[df2.date.isin(df1.date),:]

    is_equal = np.array_equal(df1.date.values, df2.date.to_numpy())
    if is_equal:
        df1 = df1.set_index('date')
        df1 = df1.sort_index()
        df2 = df2.set_index('date')
        df2 = df2.sort_index()

        df1['returns'] = np.log(df1.close/df1.close.shift(1))
        df2['returns'] = np.log(df2.close/df2.close.shift(1))
        correlation = df1.returns.dropna().corr(df2.returns.dropna())


    else:
        correlation = np.nan
    return correlation
combinations_df['Correlation']=combinations_df.progress_apply(lambda x: generate_corr(x),axis=1)


  0%|          | 0/346528 [00:00<?, ?it/s]

100%|██████████| 346528/346528 [14:09<00:00, 408.03it/s] 


In [6]:
combinations_df = combinations_df.dropna(subset='Correlation')
combinations_df = combinations_df.loc[combinations_df.Correlation.apply(lambda x : abs(x))>corr_threshold,:]

In [7]:
def find_coint(row):
    stock1 = row['Stock_1']
    stock2 = row['Stock_2']
    df1 = np.log(stock_dict[stock1].close)
    df2 = np.log(stock_dict[stock2].close)
    score, p_value, crit_values = coint(df1, df2)
    return score,p_value,crit_values

combinations_df[['score','p_value','crit_values']]=combinations_df.progress_apply(lambda x: pd.Series(find_coint(x)),axis=1)
combinations_df = combinations_df.loc[combinations_df.p_value<0.05,:]


100%|██████████| 894/894 [00:09<00:00, 94.62it/s] 


In [8]:
def build_reg(row):
    stock1 = row['Stock_1']
    stock2 = row['Stock_2']
    y = np.log(stock_dict[stock1].close)
    x = np.log(stock_dict[stock2].close)
    x_with_constant = add_constant(x)
    model = OLS(y.values, x_with_constant.values).fit()

    alpha = model.params[0]  # Intercept
    beta = model.params[1]   # Hedge Ratio (Slope)

    return alpha, beta

combinations_df[['alpha','beta']]=combinations_df.progress_apply(lambda x: pd.Series(build_reg(x)),axis=1)

100%|██████████| 284/284 [00:00<00:00, 613.47it/s]


In [9]:
def find_z_score(row):
    global window
    stock1 = row['Stock_1']
    stock2 = row['Stock_2']
    alpha = row['alpha']
    beta = row['beta']

    y = np.log(stock_dict[stock1].close)
    x = np.log(stock_dict[stock2].close)
    spread = y - (beta * x + alpha)

    
    rolling_mean = spread.rolling(window=window).mean()
    rolling_std = spread.rolling(window=window).std()
    z_score = (spread - rolling_mean) / rolling_std
    return z_score.values[-1]

combinations_df['z_score'] = combinations_df.progress_apply(lambda x: find_z_score(x),axis=1)


100%|██████████| 284/284 [00:00<00:00, 933.10it/s]


In [11]:
combinations_df.loc[~combinations_df.z_score.isna(),:]

Unnamed: 0,Stock_1,Stock_2,Correlation,score,p_value,crit_values,alpha,beta,z_score
2181,HINDPETRO,IOC,0.772281,-3.574337,0.02635033,"[-3.941513937471082, -3.361080551920153, -3.06...",0.290513,1.156078,-1.078649
30015,TOP100CASE,MID150CASE,0.798669,-3.963862,0.008080336,"[-3.941513937471082, -3.361080551920153, -3.06...",0.668279,0.718642,-0.330412
79464,FACT,RCF,0.743421,-3.900117,0.009915722,"[-3.941513937471082, -3.361080551920153, -3.06...",-0.945985,1.54833,0.3103
81186,JINDALSTEL,TATASTEEL,0.705208,-4.605631,0.0008168368,"[-3.941513937471082, -3.361080551920153, -3.06...",3.302657,0.704997,-1.033471
84088,SAIL,TATASTEEL,0.764272,-3.509256,0.0315836,"[-3.941513937471082, -3.361080551920153, -3.06...",0.519652,0.852266,-1.278067
94951,GOLDBEES,GOLDBETA,0.937088,-9.707434,1.365146e-15,"[-3.941513937471082, -3.361080551920153, -3.06...",0.046568,0.986612,0.431187
94969,GOLDBEES,HDFCGOLD,0.98358,-9.487784,4.943431e-15,"[-3.941513937471082, -3.361080551920153, -3.06...",-0.016433,0.996666,0.468124
95092,GOLDBEES,GOLD1,0.985209,-6.186815,6.350123e-07,"[-3.941513937471082, -3.361080551920153, -3.06...",-0.006237,0.999864,1.865364
95264,GOLDBEES,GOLDIETF,0.981868,-4.862697,0.0002920727,"[-3.941513937471082, -3.361080551920153, -3.06...",-0.002355,0.993263,0.274794
95292,GOLDBEES,SETFGOLD,0.984282,-10.066711,1.685237e-16,"[-3.941513937471082, -3.361080551920153, -3.06...",-0.022284,0.998189,-0.144574
