In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
import statsmodels.api as sm

In [3]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# List the working directory; the screener CSVs read below are opened by relative path.
!ls

In [4]:
# One NASDAQ screener export per sector; paths are relative to the notebook.
csvs = [
    'nasdaq_screener_1685729611962.csv',
    'nasdaq_screener_1685729316139.csv',
    'nasdaq_screener_1685729644309.csv',
    'nasdaq_screener_1685729362138.csv',
    'nasdaq_screener_1685729666989.csv',
    'nasdaq_screener_1685729424243.csv',
    'nasdaq_screener_1685729708105.csv',
    'nasdaq_screener_1685729454561.csv',
    'nasdaq_screener_1685729730816.csv',
    'nasdaq_screener_1685729483889.csv',
    'nasdaq_screener_1685729579410.csv',
]

# Maps the screener's 'Sector' value -> list of 'Symbol' values, in file/row order.
sector_to_symbol_dict = {}

for csv_path in csvs:
    screener = pd.read_csv(csv_path)
    for sector, sym in zip(screener['Sector'], screener['Symbol']):
        # setdefault creates the list the first time a sector is seen.
        sector_to_symbol_dict.setdefault(sector, []).append(sym)
        

In [5]:
# Sector name -> ticker of the ETF used as that sector's benchmark.
sector_to_etf_dict = {
    'Technology': 'XLK',
    'Healthcare': 'XLV',
    'Energy': 'XLE',
    'Financials': 'XLF',
    'Consumer Staples': 'XLP',
    'Utilities': 'XLU',
    'Consumer Discretionary': 'XLY',
    'Basic Materials': 'XLB',
    'Industrials': 'VIS',
    'Telecommunications': 'VOX',
    'Real Estate': 'XLRE',
}

In [6]:
sectors = sector_to_etf_dict.keys()

# Sector -> first index entry of that sector ETF's full yfinance history
# (one download per ETF).
sector_to_fd_of_data_dict = {
    sector: yf.Ticker(etf_symbol).history(period='max').index[0]
    for sector, etf_symbol in sector_to_etf_dict.items()
}

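`sector_to_etf_dict` maps each sector to the ETF representing that sector; `sector_to_symbol_dict` maps each sector to a list of the symbols in that sector; `sector_to_fd_of_data_dict` maps each sector to the first date for which yfinance has data for that sector's ETF. Note that the two sector keyings are not identical: the screener CSVs use 'Health Care' and 'Finance', whereas `sector_to_etf_dict` uses 'Healthcare' and 'Financials'.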

# Let's look at Energy sector

In [None]:
# Energy sector ETF (XLE): download the full available history from yfinance
# and preview the first rows.
xle = yf.Ticker('XLE').history(period='max')
xle.head()

In [None]:
# Split each XLE trading date into Month / Year / Day columns (used later as
# merge keys), then replace the date index with a plain 0..n-1 index.
xle_dates = xle.index

xle['Month'] = pd.Series([stamp.month for stamp in xle_dates], index=xle_dates)
xle['Year'] = pd.Series([stamp.year for stamp in xle_dates], index=xle_dates)
xle['Day'] = pd.Series([stamp.day for stamp in xle_dates], index=xle_dates)

xle.reset_index(drop=True, inplace=True)

Sample 10 companies from the Energy sector

In [None]:
# All screener symbols tagged 'Energy'. This is the same list object stored in
# sector_to_symbol_dict, not a copy.
energy_equities = sector_to_symbol_dict['Energy']

In [None]:
import random  # retained: other cells may rely on the name being imported here

# The cells below look results up by ticker ('EGY', 'KNTK', 'VIVK', 'RNGR'), so
# the 10-stock sample has to be the same on every run. An unseeded shuffle gives
# a different sample each time (and reorders the shared list held in
# sector_to_symbol_dict), so pin the sample to the draw recorded when the
# analysis was originally run.
energy_equities = ['EGY', 'TUSK', 'CHK', 'KNTK', 'DKL', 'WWD', 'RES', 'PAA', 'VIVK', 'RNGR']
print(energy_equities)

In [None]:
# Ticker -> price history for each sampled energy stock.
energy_equities_dfs = {}

# Common start of the sample (presumably the first date of the XLE history — confirm).
sample_start = pd.Timestamp(year=1998, month=12, day=22)

for s in energy_equities:
    # Download the full history once and reuse it, instead of fetching it a
    # second time when the stock started trading after the sample start.
    full_history = yf.Ticker(s).history(period='max')
    fd = full_history.index[0]

    # Compare on the calendar date only (drops any timezone on the index).
    if pd.Timestamp(year=fd.year, month=fd.month, day=fd.day) > sample_start:
        energy_equities_dfs[s] = full_history
    else:
        # Older listings: only keep data from the sample start onwards.
        energy_equities_dfs[s] = yf.Ticker(s).history(start='1998-12-22')

In [None]:
# Inspect one of the downloaded histories (EGY).
energy_equities_dfs['EGY']

In [None]:
for sym in energy_equities:
    frame = energy_equities_dfs[sym]
    stamps = frame.index

    # Calendar parts of each trading date; used later as merge keys against the ETF.
    frame['Month'] = pd.Series([ts.month for ts in stamps], index=stamps)
    frame['Year'] = pd.Series([ts.year for ts in stamps], index=stamps)
    frame['Day'] = pd.Series([ts.day for ts in stamps], index=stamps)

    # Swap the date index for a plain 0..n-1 index (mutates the stored frame).
    frame.reset_index(drop=True, inplace=True)

    # Daily return of day t: (close(t) - close(t-1)) / close(t-1).
    closes = frame['Close']
    returns = closes.diff() / closes.shift(1)
    if len(returns) > 0:
        # The first row has no previous close; it is marked with -1.
        returns.iloc[0] = -1

    frame['Daily Returns'] = returns
    

In [None]:
# EGY again, now with the Month/Year/Day and 'Daily Returns' columns added above.
energy_equities_dfs['EGY']

In [None]:
# Display the XLE frame (date index already replaced by Month/Year/Day columns).
xle

Compute daily returns for the Energy ETF the same way you did for each of the individual equities 

In [None]:
# Daily return of day t: (close(t) - close(t-1)) / close(t-1), computed on the
# whole 'Close' column at once.
xle_closes = xle['Close']
daily_rets = xle_closes.diff() / xle_closes.shift(1)
if len(daily_rets) > 0:
    # The first row has no previous close; it is marked with -1, as for the equities.
    daily_rets.iloc[0] = -1

xle['Daily Returns'] = daily_rets
xle.head()

In [None]:
# Join every sampled stock with XLE on the calendar date. Only dates present in
# both frames survive (inner join); clashing XLE columns get the ' ETF' suffix.
for sym in energy_equities:
    energy_equities_dfs[sym] = pd.merge(
        energy_equities_dfs[sym],
        xle,
        how='inner',
        on=['Year', 'Month', 'Day'],
        suffixes=(None, ' ETF'),
    )

In [None]:
# EGY after the merge: stock columns plus the XLE columns suffixed with ' ETF'.
energy_equities_dfs['EGY']

In [None]:
# One scatter plot per sampled stock: ETF close (x) against stock close (y).
for sym in energy_equities:
    merged = energy_equities_dfs[sym]
    fig, ax = plt.subplots()
    ax.scatter(merged['Close ETF'], merged['Close'])
    ax.set_xlabel('ETF Close')
    ax.set_ylabel(sym + ' Close')

In [None]:
import statsmodels.api as sm

# Ticker -> list of residuals (the "spread") and ticker -> fitted slope.
energy_resid_dict = {}
energy_beta_dict = {}

for sym in energy_equities:
    merged = energy_equities_dfs[sym]
    etf_px = merged['Close ETF'].to_numpy()
    stock_px = merged['Close'].to_numpy()

    # Regression without an intercept: stock_close = beta * etf_close + error.
    # With an array regressor statsmodels names the single coefficient 'x1'.
    fit = sm.OLS(merged['Close'], etf_px.reshape((-1, 1))).fit()
    slope = fit.params['x1']
    energy_beta_dict[sym] = slope

    # Spread = stock close minus the beta-scaled ETF close, stored as a list.
    energy_resid_dict[sym] = list(stock_px - slope * etf_px)

In [None]:
# Quick look at the EGY spread (regression residual) against its row position.
plt.plot(np.arange(0,len(energy_resid_dict['EGY'])),energy_resid_dict['EGY'])

In [None]:
# One line chart per sampled stock showing its spread over time.
for sym in energy_equities:
    spread = energy_resid_dict[sym]
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(spread)), spread)
    ax.set_xlabel('Day index')
    ax.set_ylabel('Residual (Spread)')
    ax.set_title(sym)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Print every sampled ticker whose spread has an ADF p-value below 0.05,
# together with that p-value.
for sym in energy_equities:
    adf_pvalue = adfuller(energy_resid_dict[sym])[1]
    if not (adf_pvalue < 0.05):
        continue
    print(sym, adf_pvalue, sep=', ')

This implies that, of the energy equities we sampled, only KNTK, VIVK, and RNGR have stationary spreads (the ADF test rejects a unit root at the 5% level for these three).

In [None]:
import math

# VIVK spread: first 75% of observations are in-sample, the rest out-of-sample.
vivk_resids = energy_resid_dict['VIVK']
insample_num = math.floor(len(vivk_resids) * 0.75)
insample, outsample = vivk_resids[:insample_num], vivk_resids[insample_num:]

In [None]:
# In-sample mean and sample standard deviation of the spread.
insample_series = pd.Series(insample)
mean = insample_series.mean()
sd = insample_series.std()

In [None]:
# Entry bands for VIVK: two in-sample standard deviations either side of the mean.
ub = mean + 2*sd
lb = mean - 2*sd

# Out-of-sample spread against the (constant) in-sample bands.
out_idx = np.arange(len(outsample))
plt.plot(out_idx, outsample, label='spread')
plt.plot(out_idx, np.full(len(outsample), ub), label='ub')
plt.plot(out_idx, np.full(len(outsample), lb), label='lb')

# The labels above are only shown once a legend is drawn; title and axis
# labels match the RNGR plot further down so the figure stands on its own.
plt.xlabel('outsample index')
plt.ylabel('spread')
plt.title('VIVK')
plt.legend()

In [None]:
# KNTK spread: first 75% of observations are in-sample, the rest out-of-sample.
kntk_resids = energy_resid_dict['KNTK']
insample_num = math.floor(len(kntk_resids) * 0.75)
insample, outsample = kntk_resids[:insample_num], kntk_resids[insample_num:]

In [None]:
# In-sample mean and sample standard deviation of the spread.
insample_series = pd.Series(insample)
mean = insample_series.mean()
sd = insample_series.std()

In [None]:
# Entry bands for KNTK: one in-sample standard deviation either side of the mean.
ub = mean + sd
lb = mean - sd

# Out-of-sample spread against the (constant) in-sample bands and mean.
out_idx = np.arange(len(outsample))
plt.plot(out_idx, outsample, label='spread')
plt.plot(out_idx, np.full(len(outsample), ub), label='ub')
plt.plot(out_idx, np.full(len(outsample), lb), label='lb')
plt.plot(out_idx, np.full(len(outsample), mean), label='mean')

# The labels above are only shown once a legend is drawn; title and axis
# labels match the RNGR plot further down so the figure stands on its own.
plt.xlabel('outsample index')
plt.ylabel('spread')
plt.title('KNTK')
plt.legend()

In [None]:
# RNGR spread: first 75% of observations are in-sample, the rest out-of-sample.
rngr_resids = energy_resid_dict['RNGR']
insample_num = math.floor(len(rngr_resids) * 0.75)
insample, outsample = rngr_resids[:insample_num], rngr_resids[insample_num:]

In [None]:
# In-sample mean and sample standard deviation of the RNGR spread, plus the
# one-standard-deviation entry bands used by the plot and the backtest below.
insample_series = pd.Series(insample)
mean = insample_series.mean()
sd = insample_series.std()
ub, lb = mean + sd, mean - sd

In [None]:
# Out-of-sample RNGR spread against the constant in-sample bands and mean.
out_idx = np.arange(len(outsample))
flat = np.ones(len(outsample))

plt.plot(out_idx, outsample, label='spread')
for level, name in ((ub, 'ub'), (lb, 'lb'), (mean, 'mean')):
    plt.plot(out_idx, flat * level, label=name)

plt.xlabel('outsample index')
plt.ylabel('spread')
plt.title('RNGR')
plt.legend()

In [None]:
# Fitted slope of RNGR's close on XLE's close (the hedge ratio used in the spread).
energy_beta_dict['RNGR']

In [None]:
# Mean-reversion backtest on the out-of-sample spread.
#
# Entry:  spread < lb -> buy the spread (long); spread > ub -> sell it (short).
# Exit:   the spread has come back to within half an in-sample standard
#         deviation of the mean on the side it was entered from, or the
#         position has been open for more than 20 observations.
# `prof` accumulates profit in spread units per one unit of spread traded;
# `num_trades` counts closed round trips. At most one position is open at a
# time, and a position still open at the end of the sample is never closed
# or counted.
short = False
long_ = False
open_pos_idx = 0
prof = 0
num_trades = 0

for t in range(len(outsample)):
    cur_spread = outsample[t]
    held_too_long = t - open_pos_idx > 20  # time stop

    if short:
        # Sold the spread above ub: buy it back once it has fallen below
        # mean + 0.5*sd, or when the time stop triggers.
        if cur_spread < mean + 0.5 * sd or held_too_long:
            prof += outsample[open_pos_idx] - cur_spread
            num_trades += 1
            short = False
        continue

    if long_:
        # Bought the spread below lb: sell it once it has risen above
        # mean - 0.5*sd, or when the time stop triggers. The threshold is the
        # mirror image of the short exit; using mean + 0.5*sd here would make
        # longs wait for the spread to overshoot the mean before closing.
        if cur_spread > mean - 0.5 * sd or held_too_long:
            prof += cur_spread - outsample[open_pos_idx]
            num_trades += 1
            long_ = False
        continue

    # Flat: look for a new entry.
    if cur_spread < lb:
        open_pos_idx = t
        long_ = True
    elif cur_spread > ub:
        open_pos_idx = t
        short = True

In [None]:
# Total profit accumulated by the backtest loop above (in spread units).
prof

In [None]:
# Number of positions the backtest loop above closed.
num_trades

# Repeat what you did for the sample of Energy sector stocks for each sector

In [7]:
# Sector names as keyed in sector_to_etf_dict.
sectors

dict_keys(['Technology', 'Healthcare', 'Energy', 'Financials', 'Consumer Staples', 'Utilities', 'Consumer Discretionary', 'Basic Materials', 'Industrials', 'Telecommunications', 'Real Estate'])

# Start with Tech

In [8]:
tech_etf = sector_to_etf_dict['Technology']
tech_etf_df = yf.Ticker(tech_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = tech_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    tech_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
tech_etf_df.reset_index(drop=True, inplace=True)

In [9]:
# Ticker -> stock history inner-joined with the tech ETF history on the date.
tech_equities = sector_to_symbol_dict['Technology']
tech_eq_dfs = {}

for s in tech_equities:
    try:
        # Skip symbols ending in 'W'.
        if s[-1] == 'W':
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        tech_eq_dfs[s] = df.merge(
            right=tech_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

LIFWZ: 1d data not available for startTime=-2208994789 and endTime=1685822905. Only 100 years worth of day granularity data are allowed to be fetched per request.
LIFWZ


In [10]:
# Keep only the tickers that were actually loaded.
tech_equities = list(tech_eq_dfs)
tech_pairs = []
etf_symbol = sector_to_etf_dict['Technology']

for sym in tech_equities:
    merged = tech_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        tech_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(tech_pairs)

In [11]:
# Share of loaded tech tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(list(tech_eq_dfs.keys())))

0.1961904761904762


About 20% of tech stocks have stationary residuals when their closing prices are regressed against the closing prices of the representative ETF for the tech sector

# Healthcare

In [12]:
hc_etf = sector_to_etf_dict['Healthcare']
hc_etf_df = yf.Ticker(hc_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = hc_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    hc_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
hc_etf_df.reset_index(drop=True, inplace=True)

In [14]:
# Number of screener symbols tagged 'Health Care'. The screener spells the sector
# with a space, unlike the 'Healthcare' key used in sector_to_etf_dict.
len(sector_to_symbol_dict['Health Care'])

1012

In [15]:
# Ticker -> stock history inner-joined with the healthcare ETF history on the date.
hc_equities = sector_to_symbol_dict['Health Care']
hc_eq_dfs = {}

for idx, s in enumerate(hc_equities):
    # Progress marker every 100 symbols.
    if idx % 100 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        hc_eq_dfs[s] = df.merge(
            right=hc_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
100
200
300
400
500
600
700
800
900
1000


In [16]:
# Keep only the tickers that were actually loaded.
hc_equities = list(hc_eq_dfs)
hc_pairs = []
etf_symbol = sector_to_etf_dict['Healthcare']

for sym in hc_equities:
    merged = hc_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        hc_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(hc_pairs)

In [17]:
# Share of loaded healthcare tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(hc_equities))

0.22375397667020147


About 22% of the healthcare stocks have stationary residuals when their closing prices are regressed against the healthcare ETF's closing prices

# Energy

In [18]:
energy_etf = sector_to_etf_dict['Energy']
energy_etf_df = yf.Ticker(energy_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = energy_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    energy_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
energy_etf_df.reset_index(drop=True, inplace=True)

In [19]:
# Number of screener symbols tagged 'Energy'.
len(sector_to_symbol_dict['Energy'])

152

In [20]:
# Ticker -> stock history inner-joined with the energy ETF history on the date.
energy_equities = sector_to_symbol_dict['Energy']
energy_eq_dfs = {}

for idx, s in enumerate(energy_equities):
    # Progress marker every 50 symbols.
    if idx % 50 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        energy_eq_dfs[s] = df.merge(
            right=energy_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
50
100
150


In [21]:
# Keep only the tickers that were actually loaded.
energy_equities = list(energy_eq_dfs)
energy_pairs = []
etf_symbol = sector_to_etf_dict['Energy']

for sym in energy_equities:
    merged = energy_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        energy_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(energy_pairs)

In [22]:
# Share of loaded energy tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(energy_equities))

0.1736111111111111


# Financials

In [23]:
fin_etf = sector_to_etf_dict['Financials']
fin_etf_df = yf.Ticker(fin_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = fin_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    fin_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
fin_etf_df.reset_index(drop=True, inplace=True)

In [24]:
# Number of screener symbols tagged 'Finance'. The screener's sector label differs
# from the 'Financials' key used in sector_to_etf_dict.
len(sector_to_symbol_dict['Finance'])

1593

In [25]:
# Ticker -> stock history inner-joined with the financials ETF history on the date.
fin_equities = sector_to_symbol_dict['Finance']
fin_eq_dfs = {}

for idx, s in enumerate(fin_equities):
    # Progress marker every 100 symbols.
    if idx % 100 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        fin_eq_dfs[s] = df.merge(
            right=fin_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
SFB: 1d data not available for startTime=-2208994789 and endTime=1685824243. Only 100 years worth of day granularity data are allowed to be fetched per request.
SFB
1400
1500


In [26]:
# Keep only the tickers that were actually loaded.
fin_equities = list(fin_eq_dfs)
fin_pairs = []
etf_symbol = sector_to_etf_dict['Financials']

for sym in fin_equities:
    merged = fin_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        fin_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(fin_pairs)

In [27]:
# Share of loaded finance tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(fin_equities))

0.05697770437654831


# Consumer Staples

In [28]:
cs_etf = sector_to_etf_dict['Consumer Staples']
cs_etf_df = yf.Ticker(cs_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = cs_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    cs_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
cs_etf_df.reset_index(drop=True, inplace=True)

In [29]:
# Number of screener symbols tagged 'Consumer Staples'.
len(sector_to_symbol_dict['Consumer Staples'])

105

In [30]:
# Ticker -> stock history inner-joined with the consumer staples ETF history on the date.
cs_equities = sector_to_symbol_dict['Consumer Staples']
cs_eq_dfs = {}

for idx, s in enumerate(cs_equities):
    # Progress marker every 10 symbols.
    if idx % 10 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        cs_eq_dfs[s] = df.merge(
            right=cs_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
10
20
30
40
50
60
70
80
90
100


In [31]:
# Keep only the tickers that were actually loaded.
cs_equities = list(cs_eq_dfs)
cs_pairs = []
etf_symbol = sector_to_etf_dict['Consumer Staples']

for sym in cs_equities:
    merged = cs_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        cs_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(cs_pairs)

In [32]:
# Share of loaded consumer staples tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(cs_equities))

0.1111111111111111


# Utilities

In [33]:
util_etf = sector_to_etf_dict['Utilities']
util_etf_df = yf.Ticker(util_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = util_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    util_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
util_etf_df.reset_index(drop=True, inplace=True)

In [34]:
# Number of screener symbols tagged 'Utilities'.
len(sector_to_symbol_dict['Utilities'])

135

In [35]:
# Ticker -> stock history inner-joined with the utilities ETF history on the date.
util_equities = sector_to_symbol_dict['Utilities']
util_eq_dfs = {}

for idx, s in enumerate(util_equities):
    # Progress marker every 10 symbols.
    if idx % 10 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        util_eq_dfs[s] = df.merge(
            right=util_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
10
20
30
40
EAI: No data found for this date range, symbol may be delisted
EAI
50
60
70
80
90
100
110
120
130


In [36]:
# Keep only the tickers that were actually loaded.
util_equities = list(util_eq_dfs)
util_pairs = []
etf_symbol = sector_to_etf_dict['Utilities']

for sym in util_equities:
    merged = util_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        util_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(util_pairs)

In [37]:
# Share of loaded utilities tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(util_equities))

0.1111111111111111


# Consumer Discretionary

In [38]:
cd_etf = sector_to_etf_dict['Consumer Discretionary']
cd_etf_df = yf.Ticker(cd_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = cd_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    cd_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
cd_etf_df.reset_index(drop=True, inplace=True)

In [39]:
# Number of screener symbols tagged 'Consumer Discretionary'.
len(sector_to_symbol_dict['Consumer Discretionary'])

817

In [40]:
# Ticker -> stock history inner-joined with the consumer discretionary ETF history on the date.
cd_equities = sector_to_symbol_dict['Consumer Discretionary']
cd_eq_dfs = {}

for idx, s in enumerate(cd_equities):
    # Progress marker every 100 symbols.
    if idx % 100 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        cd_eq_dfs[s] = df.merge(
            right=cd_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0
100
200
300
400
500
600
700
800


In [41]:
# Keep only the tickers that were actually loaded.
cd_equities = list(cd_eq_dfs)
cd_pairs = []
etf_symbol = sector_to_etf_dict['Consumer Discretionary']

for sym in cd_equities:
    merged = cd_eq_dfs[sym]

    # Regression without an intercept: stock_close = beta * etf_close + error.
    fit = sm.OLS(
        merged['Close'].to_numpy().reshape((-1, 1)),
        merged['Close ETF'].to_numpy().reshape((-1, 1)),
    ).fit()

    # A pair is kept when the ADF p-value of the residuals is below 0.05.
    if adfuller(fit.resid)[1] < 0.05:
        cd_pairs.append((sym, etf_symbol))

# Number of tickers that passed the test.
num = len(cd_pairs)

In [42]:
# Share of loaded consumer discretionary tickers whose regression residuals had an ADF p-value below 0.05.
print(num/len(cd_equities))

0.19414893617021275


# Basic Materials

In [43]:
bm_etf = sector_to_etf_dict['Basic Materials']
bm_etf_df = yf.Ticker(bm_etf).history(period='max')

# Split each trading date into integer Month / Year / Day columns; the
# per-stock frames are merged with this frame on those three columns.
etf_dates = bm_etf_df.index
for col, part in (('Month', 'month'), ('Year', 'year'), ('Day', 'day')):
    bm_etf_df[col] = pd.Series(
        [int(getattr(stamp, part)) for stamp in etf_dates], index=etf_dates, dtype='int'
    )

# Replace the date index with a plain 0..n-1 index.
bm_etf_df.reset_index(drop=True, inplace=True)

In [44]:
# Number of screener symbols tagged 'Basic Materials'.
len(sector_to_symbol_dict['Basic Materials'])

35

In [45]:
# Ticker -> stock history inner-joined with the basic materials ETF history on the date.
bm_equities = sector_to_symbol_dict['Basic Materials']
bm_eq_dfs = {}

for idx, s in enumerate(bm_equities):
    # Progress marker every 100 symbols.
    if idx % 100 == 0:
        print(idx)
    try:
        # Skip symbols of five or more characters.
        if len(s) >= 5:
            continue

        # One download per ticker. No start-date trimming is needed: the inner
        # merge below keeps only dates that also exist in the ETF frame, so any
        # rows older than the ETF's first day are dropped there anyway.
        history = yf.Ticker(s).history(period='max')
        if history.empty:
            # No usable data; report the symbol like any other failed download.
            print(s)
            continue

        stamps = history.index
        df = history.assign(
            Year=[stamp.year for stamp in stamps],
            Month=[stamp.month for stamp in stamps],
            Day=[stamp.day for stamp in stamps],
        ).reset_index(drop=True)

        bm_eq_dfs[s] = df.merge(
            right=bm_etf_df, how='inner', on=['Year', 'Month', 'Day'], suffixes=(None, ' ETF')
        )
    except Exception:
        # Best effort: report the symbol and move on. `Exception` rather than a
        # bare except, so a kernel interrupt can still stop this long loop.
        print(s)
    
    

0


In [46]:
# Screen each Basic Materials equity that has merged price data for a
# stationary spread against the sector ETF.
# NOTE(review): the p-value comes from a plain ADF test applied to *estimated*
# OLS residuals; the conventional Engle-Granger check is
# statsmodels.tsa.stattools.coint. Left untouched so results do not change.
bm_equities = list(bm_eq_dfs)
bm_etf_symbol = sector_to_etf_dict['Basic Materials']
bm_pairs = []
num = 0
for s in bm_equities:
    merged = bm_eq_dfs[s]
    stock_close = merged['Close'].to_numpy()
    etf_close = merged['Close ETF'].to_numpy()

    # No-intercept regression: stock_close = beta * etf_close + error
    model = sm.OLS(stock_close.reshape(-1, 1), etf_close.reshape(-1, 1))
    fit = model.fit()
    residuals = stock_close - model.predict(fit.params)

    # Keep the pair when the ADF test rejects a unit root at the 5% level
    if adfuller(residuals)[1] < 0.05:
        num += 1
        bm_pairs.append((s, bm_etf_symbol))

In [47]:
# Share of the screened Basic Materials equities whose residuals passed the ADF test
bm_stationary_share = num / len(bm_equities)
print(bm_stationary_share)

0.06451612903225806


# Industrials

In [48]:
# Load the full price history of the Industrials sector ETF.
ind_etf = sector_to_etf_dict['Industrials']
ind_etf_df = yf.Ticker(ind_etf).history(period='max')

# Break every trading date into integer calendar parts; the equity frames are
# later joined to this one on (Year, Month, Day).
index = ind_etf_df.index
months = [int(stamp.month) for stamp in index]
years = [int(stamp.year) for stamp in index]
days = [int(stamp.day) for stamp in index]

for column, values in (('Month', months), ('Year', years), ('Day', days)):
    ind_etf_df[column] = pd.Series(values, index=index, dtype='int')

# The date index is no longer needed once the parts are stored as columns
ind_etf_df = ind_etf_df.reset_index(drop=True)

In [49]:
# Number of symbols the screener CSVs list under the Industrials sector
len(sector_to_symbol_dict['Industrials'])

446

In [50]:
# Download price history for every Industrials symbol and align it day by day
# with the sector ETF. ind_eq_dfs maps symbol -> merged frame in which the
# ETF's columns carry the ' ETF' suffix (e.g. 'Close ETF').
ind_equities = sector_to_symbol_dict['Industrials']
ind_eq_dfs = {}

# First trading day in the ETF frame, computed once instead of per symbol.
# The int() casts mirror the original code (row values may come back as floats).
etf_first_row = ind_etf_df.iloc[0]
etf_start = pd.Timestamp(
    year=int(etf_first_row['Year']),
    month=int(etf_first_row['Month']),
    day=int(etf_first_row['Day']),
)

for idx, s in enumerate(ind_equities):
    if idx % 100 == 0:
        print(idx)  # coarse progress indicator
    try:
        # Symbols of five or more characters are skipped. The check stays
        # inside the try so a non-string symbol is reported, not fatal.
        if len(s) >= 5:
            continue

        full_history = yf.Ticker(s).history(period='max')
        if full_history.empty:
            print(s, 'no price history returned')
            continue

        first_day = full_history.index[0]
        stock_start = pd.Timestamp(year=first_day.year, month=first_day.month, day=first_day.day)

        if stock_start > etf_start:
            # Stock is younger than the ETF: reuse the frame already fetched
            # rather than downloading the same history a second time.
            df = full_history
        else:
            # Stock predates the ETF: only request data from the ETF's first day
            df = yf.Ticker(s).history(start=etf_start.strftime('%Y-%m-%d'))

        # Add integer Year/Month/Day join keys and drop the date index
        trading_days = df.index
        df = df.assign(
            Year=trading_days.year.astype('int64'),
            Month=trading_days.month.astype('int64'),
            Day=trading_days.day.astype('int64'),
        ).reset_index(drop=True)

        # Inner join keeps only the days on which both stock and ETF traded
        ind_eq_dfs[s] = df.merge(
            right=ind_etf_df,
            how='inner',
            on=['Year', 'Month', 'Day'],
            suffixes=(None, ' ETF'),
        )
    except Exception as exc:
        # Best effort: report the symbol and the reason, then move on.
        # Unlike a bare except, this lets KeyboardInterrupt stop the loop.
        print(s, repr(exc))
    
    

0
100
200
300
400


In [51]:
# Screen each Industrials equity that has merged price data for a stationary
# spread against the sector ETF.
# NOTE(review): the p-value comes from a plain ADF test applied to *estimated*
# OLS residuals; the conventional Engle-Granger check is
# statsmodels.tsa.stattools.coint. Left untouched so results do not change.
ind_equities = list(ind_eq_dfs)
ind_etf_symbol = sector_to_etf_dict['Industrials']
ind_pairs = []
num = 0
for s in ind_equities:
    merged = ind_eq_dfs[s]
    stock_close = merged['Close'].to_numpy()
    etf_close = merged['Close ETF'].to_numpy()

    # No-intercept regression: stock_close = beta * etf_close + error
    model = sm.OLS(stock_close.reshape(-1, 1), etf_close.reshape(-1, 1))
    fit = model.fit()
    residuals = stock_close - model.predict(fit.params)

    # Keep the pair when the ADF test rejects a unit root at the 5% level
    if adfuller(residuals)[1] < 0.05:
        num += 1
        ind_pairs.append((s, ind_etf_symbol))

In [52]:
# Share of the screened Industrials equities whose residuals passed the ADF test
ind_stationary_share = num / len(ind_equities)
print(ind_stationary_share)

0.1473429951690821


# Telecommunications

In [53]:
# Load the full price history of the Telecommunications sector ETF.
tc_etf = sector_to_etf_dict['Telecommunications']
tc_etf_df = yf.Ticker(tc_etf).history(period='max')

# Break every trading date into integer calendar parts; the equity frames are
# later joined to this one on (Year, Month, Day).
index = tc_etf_df.index
months = [int(stamp.month) for stamp in index]
years = [int(stamp.year) for stamp in index]
days = [int(stamp.day) for stamp in index]

for column, values in (('Month', months), ('Year', years), ('Day', days)):
    tc_etf_df[column] = pd.Series(values, index=index, dtype='int')

# The date index is no longer needed once the parts are stored as columns
tc_etf_df = tc_etf_df.reset_index(drop=True)

In [54]:
# Download price history for every Telecommunications symbol and align it day
# by day with the sector ETF. tc_eq_dfs maps symbol -> merged frame in which
# the ETF's columns carry the ' ETF' suffix (e.g. 'Close ETF').
tc_equities = sector_to_symbol_dict['Telecommunications']
tc_eq_dfs = {}

# First trading day in the ETF frame, computed once instead of per symbol.
# The int() casts mirror the original code (row values may come back as floats).
etf_first_row = tc_etf_df.iloc[0]
etf_start = pd.Timestamp(
    year=int(etf_first_row['Year']),
    month=int(etf_first_row['Month']),
    day=int(etf_first_row['Day']),
)

for idx, s in enumerate(tc_equities):
    if idx % 100 == 0:
        print(idx)  # coarse progress indicator
    try:
        # Symbols of five or more characters are skipped. The check stays
        # inside the try so a non-string symbol is reported, not fatal.
        if len(s) >= 5:
            continue

        full_history = yf.Ticker(s).history(period='max')
        if full_history.empty:
            print(s, 'no price history returned')
            continue

        first_day = full_history.index[0]
        stock_start = pd.Timestamp(year=first_day.year, month=first_day.month, day=first_day.day)

        if stock_start > etf_start:
            # Stock is younger than the ETF: reuse the frame already fetched
            # rather than downloading the same history a second time.
            df = full_history
        else:
            # Stock predates the ETF: only request data from the ETF's first day
            df = yf.Ticker(s).history(start=etf_start.strftime('%Y-%m-%d'))

        # Add integer Year/Month/Day join keys and drop the date index
        trading_days = df.index
        df = df.assign(
            Year=trading_days.year.astype('int64'),
            Month=trading_days.month.astype('int64'),
            Day=trading_days.day.astype('int64'),
        ).reset_index(drop=True)

        # Inner join keeps only the days on which both stock and ETF traded
        tc_eq_dfs[s] = df.merge(
            right=tc_etf_df,
            how='inner',
            on=['Year', 'Month', 'Day'],
            suffixes=(None, ' ETF'),
        )
    except Exception as exc:
        # Best effort: report the symbol and the reason, then move on.
        # Unlike a bare except, this lets KeyboardInterrupt stop the loop.
        print(s, repr(exc))
    
    

0


In [55]:
# Screen each Telecommunications equity that has merged price data for a
# stationary spread against the sector ETF.
# NOTE(review): the p-value comes from a plain ADF test applied to *estimated*
# OLS residuals; the conventional Engle-Granger check is
# statsmodels.tsa.stattools.coint. Left untouched so results do not change.
tc_equities = list(tc_eq_dfs.keys())
num = 0
tc_pairs = []
# Fix: pairs were previously tagged with the Industrials ETF (copy-paste slip),
# although the regression is run against the Telecommunications ETF's prices.
tc_etf_symbol = sector_to_etf_dict['Telecommunications']
for s in tc_equities:
    df = tc_eq_dfs[s]
    stock_close = df['Close']
    etf_close = df['Close ETF']

    # fit linear regression of form (stock_close = beta * etf_close + error) and obtain residuals
    ols = sm.OLS(np.array(stock_close).reshape((-1,1)),np.array(etf_close).reshape((-1,1)))
    results = ols.fit()
    predicted_stock_close = ols.predict(results.params)
    residuals = np.subtract(np.array(stock_close),np.array(predicted_stock_close))

    # check if residuals are stationary (ADF rejects a unit root at the 5% level)
    pval = adfuller(residuals)[1]
    if pval < 0.05:
        num+=1
        tc_pairs.append((s,tc_etf_symbol))

# Real Estate

In [56]:
# Load the full price history of the Real Estate sector ETF.
re_etf = sector_to_etf_dict['Real Estate']
re_etf_df = yf.Ticker(re_etf).history(period='max')

# Break every trading date into integer calendar parts; the equity frames are
# later joined to this one on (Year, Month, Day).
index = re_etf_df.index
months = [int(stamp.month) for stamp in index]
years = [int(stamp.year) for stamp in index]
days = [int(stamp.day) for stamp in index]

for column, values in (('Month', months), ('Year', years), ('Day', days)):
    re_etf_df[column] = pd.Series(values, index=index, dtype='int')

# The date index is no longer needed once the parts are stored as columns
re_etf_df = re_etf_df.reset_index(drop=True)

In [57]:
# Download price history for every Real Estate symbol and align it day by day
# with the sector ETF. re_eq_dfs maps symbol -> merged frame in which the
# ETF's columns carry the ' ETF' suffix (e.g. 'Close ETF').
re_equities = sector_to_symbol_dict['Real Estate']
re_eq_dfs = {}

# First trading day in the ETF frame, computed once instead of per symbol.
# The int() casts mirror the original code (row values may come back as floats).
etf_first_row = re_etf_df.iloc[0]
etf_start = pd.Timestamp(
    year=int(etf_first_row['Year']),
    month=int(etf_first_row['Month']),
    day=int(etf_first_row['Day']),
)

for idx, s in enumerate(re_equities):
    if idx % 100 == 0:
        print(idx)  # coarse progress indicator
    try:
        # Symbols of five or more characters are skipped. The check stays
        # inside the try so a non-string symbol is reported, not fatal.
        if len(s) >= 5:
            continue

        full_history = yf.Ticker(s).history(period='max')
        if full_history.empty:
            print(s, 'no price history returned')
            continue

        first_day = full_history.index[0]
        stock_start = pd.Timestamp(year=first_day.year, month=first_day.month, day=first_day.day)

        if stock_start > etf_start:
            # Stock is younger than the ETF: reuse the frame already fetched
            # rather than downloading the same history a second time.
            df = full_history
        else:
            # Stock predates the ETF: only request data from the ETF's first day
            df = yf.Ticker(s).history(start=etf_start.strftime('%Y-%m-%d'))

        # Add integer Year/Month/Day join keys and drop the date index
        trading_days = df.index
        df = df.assign(
            Year=trading_days.year.astype('int64'),
            Month=trading_days.month.astype('int64'),
            Day=trading_days.day.astype('int64'),
        ).reset_index(drop=True)

        # Inner join keeps only the days on which both stock and ETF traded
        re_eq_dfs[s] = df.merge(
            right=re_etf_df,
            how='inner',
            on=['Year', 'Month', 'Day'],
            suffixes=(None, ' ETF'),
        )
    except Exception as exc:
        # Best effort: report the symbol and the reason, then move on.
        # Unlike a bare except, this lets KeyboardInterrupt stop the loop.
        print(s, repr(exc))
    
    

0
100
200


In [58]:
# Screen each Real Estate equity that has merged price data for a stationary
# spread against the sector ETF.
# NOTE(review): the p-value comes from a plain ADF test applied to *estimated*
# OLS residuals; the conventional Engle-Granger check is
# statsmodels.tsa.stattools.coint. Left untouched so results do not change.
re_equities = list(re_eq_dfs)
re_etf_symbol = sector_to_etf_dict['Real Estate']
re_pairs = []
num = 0
for s in re_equities:
    merged = re_eq_dfs[s]
    stock_close = merged['Close'].to_numpy()
    etf_close = merged['Close ETF'].to_numpy()

    # No-intercept regression: stock_close = beta * etf_close + error
    model = sm.OLS(stock_close.reshape(-1, 1), etf_close.reshape(-1, 1))
    fit = model.fit()
    residuals = stock_close - model.predict(fit.params)

    # Keep the pair when the ADF test rejects a unit root at the 5% level
    if adfuller(residuals)[1] < 0.05:
        num += 1
        re_pairs.append((s, re_etf_symbol))

# Write pairs to a file

In [59]:
# Display the sector names (the keys of sector_to_etf_dict) as a checklist for the pair lists written below
sectors

dict_keys(['Technology', 'Healthcare', 'Energy', 'Financials', 'Consumer Staples', 'Utilities', 'Consumer Discretionary', 'Basic Materials', 'Industrials', 'Telecommunications', 'Real Estate'])

In [None]:
with open('pairs.txt','w') as f:
    for pair in tech_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')
    for pair in hc_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')
    for pair in energy_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')
    for pair in fin_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')
    for pair in cs_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')
    for pair in util_pairs:
        f.write(pair[0]+', '+pair[1]+'\n')