In [2]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [3]:
# Ticker universe for the analysis, grouped by sector (see the section comments).
# Intended as the `tickers` argument of the yfinance price download.
# NOTE(review): symbols are hard-coded; tickers that were renamed or delisted may
# come back empty from the data provider -- confirm against the cached data.
univ = [
    # --- TECHNOLOGY ---
    'MSFT', 'AAPL', 'INTC', 'CSCO', 'ORCL', 'IBM', 'ADBE', 'TXN', 'NVDA', 'QCOM',
    'AMAT', 'ADI', 'MU', 'LRCX', 'KLAC', 'AMD', 'APH', 'GLW', 'HPQ', 'MSI',
    'ADP', 'PAYX', 'FISV', 'FIS', 'CTSH', 'INTU', 'ADSK', 'SNPS', 'CDNS', 'MCHP',
    'STM', 'UMC', 'TSM', 'ASX', 'TEL', 'TER', 'NTAP', 'STX', 'WDC', 'ZBRA',
    'TRMB', 'TYL', 'AKAM', 'VRSN',

    # --- HEALTHCARE ---
    'JNJ', 'PFE', 'MRK', 'LLY', 'UNH', 'ABT', 'BMY', 'AMGN', 'GILD', 'BIIB',
    'SYK', 'MDT', 'BAX', 'BDX', 'CVS', 'CI', 'HUM', 'MCK', 'CAH', 'LH',
    'TMO', 'DHR', 'ISRG', 'EW', 'BSX', 'ZBH', 'STE', 'COO', 'HOLX', 'XRAY',
    'DGX', 'CNC', 'MOH', 'A', 'MTD', 'WAT', 'TECH', 'BIO',
    'RGEN', 'VRTX', 'REGN', 'INCY',

    # --- FINANCIALS ---
    'JPM', 'BAC', 'WFC', 'C', 'AXP', 'GS', 'MS', 'USB', 'BK', 'STT',
    'PGR', 'ALL', 'AIG', 'HIG', 'TRV', 'MMC', 'AON', 'BEN', 'SCHW', 'MCO',
    'SPGI', 'PNC', 'TFC', 'KEY', 'FITB', 'MTB', 'HBAN', 'RF', 'CMA', 'ZION',
    'L', 'CINF', 'WRB', 'AFL', 'PRU', 'MET', 'PFG', 'LNC', 'UNM',
    'RJF', 'SEIC', 'TROW', 'IVZ', 'AMG', 'BLK', 'ICE', 'CME', 'NDAQ', 'JKHY',
    'BRO', 'AJG', 'WTM', 'FAF',

    # --- CONSUMER STAPLES ---
    'KO', 'PEP', 'PG', 'WMT', 'COST', 'CL', 'MO', 'SYY', 'K', 'GIS',
    'HSY', 'CLX', 'MKC', 'TSN', 'CAG', 'TAP', 'EL', 'ADM', 'HRL', 'SJM',
    'KMB', 'CPB', 'SBUX', 'KR', 'DG', 'DLTR', 'TGT',
    'CHD', 'STZ', 'BF-B', 'NWL',

    # --- CONSUMER DISCRETIONARY ---
    'HD', 'LOW', 'MCD', 'NKE', 'F', 'DIS', 'TJX', 'VFC', 'YUM', 'DRI',
    'CMG', 'MAR', 'HLT', 'CCL', 'RCL', 'HAS', 'MAT', 'BBY', 'GPC',
    'AZO', 'ORLY', 'KMX', 'ROST', 'LB', 'M', 'KSS', 'DDS',
    'LEG', 'MHK', 'WHR', 'LEN', 'PHM',

    # --- INDUSTRIALS ---
    'GE', 'BA', 'CAT', 'HON', 'LMT', 'RTX', 'GD', 'MMM', 'UNP', 'FDX',
    'UPS', 'DE', 'EMR', 'ITW', 'ETN', 'PH', 'DOV', 'CMI', 'PCAR', 'NSC',
    'CSX', 'GWW', 'FAST', 'VMI', 'RSG', 'WM', 'CTAS', 'GPN', 'EFX', 'JCI',
    'TXT', 'NOC', 'LHX', 'HII', 'TDG', 'AME', 'ROK', 'SWK', 'SNA', 'MAS',

    # --- ENERGY & MATERIALS ---
    'XOM', 'CVX', 'COP', 'SLB', 'HAL', 'VLO', 'OXY', 'DVN',
    'EOG', 'APA', 'BKR', 'NEM', 'FCX', 'APD', 'ECL', 'SHW', 'PPG',
    'LYB', 'DOW', 'DD', 'IP', 'NUE',

    # --- UTILITIES ---
    'NEE', 'DUK', 'SO', 'AEP', 'ED', 'PEG', 'XEL', 'EIX', 'ETR', 'D',
    'WEC', 'ES', 'AWK', 'SRE', 'FE', 'CMS', 'DTE', 'PPL', 'CNP', 'NI',

    # --- REAL ESTATE ---
    'PLD', 'SPG', 'PSA', 'O', 'VTR', 'BXP', 'AVB', 'EQR', 'ESS',
    'MAA', 'UDR', 'HST', 'VNO', 'SLG'
]

In [4]:
# Daily adjusted close and volume since 2000.
# The download is cached on disk so a fresh "Restart & Run All" works both with
# and without an existing cache file, and does not hit the network every run.
# NOTE: only unpickle files you created yourself -- pickle can execute code.
try:
    price_volume_data = pd.read_pickle('price_volume_data.pkl')
except FileNotFoundError:
    # No cache yet: download once and persist Close/Volume for later runs.
    data = yf.download(tickers=univ,
                       start='2000-01-01',
                       interval='1d',
                       auto_adjust=True)
    price_volume_data = data[['Close', 'Volume']]
    price_volume_data.to_pickle('price_volume_data.pkl')

# Calculate daily returns (no forward-filling of gaps; drop dates with no data at all)
rets = price_volume_data['Close'].pct_change(fill_method=None).dropna(how='all')

In [5]:
def PCA_process(rets, start_trading_date, window_PCA, num_factors):
    """Build PCA eigenportfolios from the returns preceding a trading date.

    Parameters
    ----------
    rets : pd.DataFrame
        Returns, one row per date and one column per ticker.
    start_trading_date : index label
        Label present in ``rets.index``. The estimation window is the
        ``window_PCA`` rows strictly before this label, so the date itself is
        never used for estimation.
    window_PCA : int
        Number of rows in the estimation window.
    num_factors : int
        Number of leading eigenvectors (largest eigenvalues first) to keep.
        If it exceeds the number of usable tickers, all eigenvectors are kept.

    Returns
    -------
    rets_window : pd.DataFrame
        The estimation window restricted to tickers that have no NaN and a
        strictly positive standard deviation inside the window.
    eigenportfolio_rets : pd.DataFrame
        Returns of the eigenportfolios over the window; columns are labelled
        ``1..k`` with ``1`` being the factor with the largest eigenvalue.

    Raises
    ------
    KeyError
        If ``start_trading_date`` is not in ``rets.index`` (from ``get_loc``).
    ValueError
        If the window is shorter than two rows, if fewer than ``window_PCA``
        rows precede the date, or if no ticker survives the filters.
    """
    if window_PCA < 2:
        raise ValueError("window_PCA must be at least 2 to estimate correlations")

    current_idx = rets.index.get_loc(start_trading_date)
    # Without this guard a negative start would wrap around in iloc and
    # silently select the wrong (usually empty) window.
    if current_idx < window_PCA:
        raise ValueError(
            f"Only {current_idx} rows precede {start_trading_date!r}; "
            f"window_PCA={window_PCA} rows are required"
        )

    rets_window = rets.iloc[current_idx - window_PCA:current_idx]

    # Keep only tickers with a complete history inside the window
    rets_window = rets_window.loc[:, rets_window.notna().all()]

    # A constant series has undefined correlations and would produce infinite
    # weights below, so it cannot take part in the decomposition.
    std = rets_window.std()
    has_variance = std > 0
    rets_window = rets_window.loc[:, has_variance]
    std = std[has_variance]
    if rets_window.shape[1] == 0:
        raise ValueError("No ticker has complete, non-constant returns in the window")

    # Empirical correlation matrix and its eigen decomposition
    corr_matrix = rets_window.corr()
    _, eigenvectors = np.linalg.eigh(corr_matrix.to_numpy())

    # eigh returns eigenvalues in ascending order; reverse the columns so that
    # the first column belongs to the largest eigenvalue.
    eigenvectors = eigenvectors[:, ::-1]
    n_keep = max(0, min(int(num_factors), eigenvectors.shape[1]))
    eigenvectors_selected = pd.DataFrame(eigenvectors[:, :n_keep],
                                         index=rets_window.columns,
                                         columns=np.arange(1, n_keep + 1))

    # Eigenportfolio weights are the eigenvector entries scaled by 1/std of
    # each ticker; factor returns are the weighted sums of ticker returns.
    eigen_weights = eigenvectors_selected.div(std, axis=0)
    eigenportfolio_rets = rets_window @ eigen_weights

    return rets_window, eigenportfolio_rets

In [99]:
# def calc_eigenportfolio_rets(eigen_weights, rets_window):
#     eigenportfolio_rets = rets_window @ eigen_weights
#     return eigenportfolio_rets

In [10]:
def OU_process(eigenportfolio_rets, rets_window, window_OU, num_factors):
    # Fit linear regression model using estimation window
    rets_window = rets_window.tail(window_OU)
    eigenportfolio_rets_window = eigenportfolio_rets.tail(window_OU)

    # Initialize residuals & beta DataFrame
    residuals = pd.DataFrame(index=eigenportfolio_rets_window.index,
                             columns=rets_window.columns)
    beta = pd.DataFrame(index=np.arange(1, num_factors + 1),
                         columns=rets_window.columns)
    
    # Calculate residuals for each ticker
    for ticker in rets_window.columns:
        y = rets_window[ticker]
        model = LinearRegression()
        model.fit(eigenportfolio_rets_window, y)
        y_pred = model.predict(eigenportfolio_rets_window)
        residuals[ticker] = y - y_pred
        beta[ticker] = model.coef_

    # Calculate cumulative residuals
    cum_residuals = residuals.cumsum()

    # Initialize OU parameters
    zeta = pd.DataFrame(index=cum_residuals.index[1:],
                  columns=cum_residuals.columns)
    a = pd.Series(index=cum_residuals.columns)
    b = pd.Series(index=cum_residuals.columns)

    # Calculate OU parameters for each ticker
    for ticker in cum_residuals.columns:
        X_lag = cum_residuals[ticker].iloc[:-1].values.reshape(-1, 1)
        y = cum_residuals[ticker].iloc[1:].values.reshape(-1, 1)
        model = LinearRegression()
        model.fit(X_lag, y)
        y_pred = model.predict(X_lag)
        zeta[ticker] = y - y_pred
        a[ticker] = model.intercept_[0]
        b[ticker] = model.coef_[0][0]

    kappa = -np.log(b) * 252
    m = a / (1 - b)
    sigma_eq = zeta.std(ddof=0) / np.sqrt(1 - b**2)

    # Calculate s-score for signal construction
    centered_m = m - m.mean()
    s_score = -centered_m / sigma_eq
    
    return s_score, kappa, sigma_eq

In [101]:
# def calc_OU_params(residuals, cum_residuals):
#     # Initialize OU parameters
#     zeta = pd.DataFrame(index=cum_residuals.index[1:],
#                   columns=cum_residuals.columns)
#     a = pd.Series(index=cum_residuals.columns)
#     b = pd.Series(index=cum_residuals.columns)

#     # Calculate OU parameters for each ticker
#     for ticker in cum_residuals.columns:
#         X_lag = cum_residuals[ticker].iloc[:-1].values.reshape(-1, 1)
#         y = cum_residuals[ticker].iloc[1:].values.reshape(-1, 1)
#         model = LinearRegression()
#         model.fit(X_lag, y)
#         y_pred = model.predict(X_lag)
#         zeta[ticker] = y - y_pred
#         a[ticker] = model.intercept_[0]
#         b[ticker] = model.coef_[0][0]

#     kappa = -np.log(b) * 252
#     m = a / (1 - b)
#     sigma_eq = zeta.std(ddof=0) / np.sqrt(1 - b**2)
    
#     return m, sigma_eq, kappa, a, b, zeta

In [102]:
# def signal_construction(m, sigma_eq):
#     # Calculate s-score for signal construction
#     centered_m = m - m.mean()
#     s_score = -centered_m / sigma_eq
    
#     return s_score

In [7]:
# Strategy parameters, kept in one cell so they are easy to tune
window_PCA = 252  # number of return rows PCA_process uses to estimate the correlation matrix
window_OU = 60  # number of trailing rows OU_process uses for the factor regression and AR(1) fit
start_trading_date = '2024-01-05'  # first date for which signals are computed
num_factors = 15  # number of leading eigenportfolios kept as factors

In [11]:
# Single-date run: estimate the factors as of start_trading_date, then fit the
# OU model on the resulting window to obtain s-scores for that date.
rets_window, eigenportfolio_rets = PCA_process(
    rets=rets,
    start_trading_date=start_trading_date,
    window_PCA=window_PCA,
    num_factors=num_factors,
)
s_score, kappa, sigma_eq = OU_process(
    eigenportfolio_rets=eigenportfolio_rets,
    rets_window=rets_window,
    window_OU=window_OU,
    num_factors=num_factors,
)

In [None]:
# Walk forward over every date from start_trading_date onwards and store the
# cross-section of s-scores computed from data strictly before that date.
days = 0
Dates = rets.index[rets.index >= start_trading_date]
s_score_by_date = {}

for days, date in enumerate(Dates, start=1):
    rets_window, eigenportfolio_rets = PCA_process(rets=rets,
                                                   start_trading_date=date,
                                                   window_PCA=window_PCA,
                                                   num_factors=num_factors)
    # OU_process returns three values; the per-date sigma_eq is not stored here
    s_score, kappa, _sigma_eq = OU_process(eigenportfolio_rets=eigenportfolio_rets,
                                           rets_window=rets_window,
                                           window_OU=window_OU,
                                           num_factors=num_factors)

    s_score_by_date[date] = s_score

    if days % 30 == 0:
        print(f'Processed {days} days')

# Assemble once (float dtype) instead of writing rows into an object-dtype
# frame; tickers that were filtered out on a given date stay NaN.
S_score = (pd.DataFrame.from_dict(s_score_by_date, orient='index')
           .reindex(index=Dates, columns=rets.columns))

# Printed unconditionally so it is not skipped when the last day is a multiple of 30
print('Processing complete.')

KeyboardInterrupt: 

Exception ignored in: 'zmq.backend.cython._zmq.Frame.__dealloc__'
Traceback (most recent call last):
  File "zmq/backend/cython/_zmq.py", line 179, in zmq.backend.cython._zmq._check_rc
    PyErr_CheckSignals()
KeyboardInterrupt: 


In [None]:
# # Check for any open positions

# # Filter by kappa > 8.4 for quick reversion times
# mask = kappa > 8.4
# valid_tickers1 = kappa[mask].index

# # Filter by s_scores
# # Initialize s_score signals
# sig_open_buy = -1.25
# sig_open_sell = 1.25
# sig_close_short = 0.75
# sig_close_long = -0.5

# # If the ticker already has an open position
# if s_score < sig_close_short:
#     # Close the short position
#     # buy back the ticker position of $
# elif s_score > sig_close_long:
#     # Close the long position
#     # sell the ticker position of $

# # If the ticker does not have an open position
# if s_score < sig_open_buy:
#     # Open a long position
#     # buy the ticker position of $
# elif s_score > sig_open_sell:
#     # Open a short position
#     # short the ticker position of $

# # Calculate hedge to ensure beta-neutrality
# # *** Not sure how to implement for factors>2 because I am not sure what those factors represent ***

In [12]:
# Display the per-ticker sigma_eq Series returned by OU_process in the single-date run
sigma_eq

Ticker
A       0.025093
AAPL    0.019622
ABT     0.009503
ADBE    0.011073
ADI     0.013923
          ...   
XRAY    0.017542
YUM     0.009939
ZBH     0.018869
ZBRA    0.019037
ZION    0.016656
Length: 297, dtype: float64

In [None]:
# The AR(1) slope `b` is local to OU_process and is not returned, so a bare `b`
# raises NameError on a fresh kernel. Recover it from kappa = -ln(b) * 252.
b = np.exp(-kappa / 252)
b