#### Import Packages

In [101]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import ray

from sklearn.cluster import KMeans

from class_model.model_prep import ModelPrep
from core.operation import *
from core.system import *

In [273]:
# NOTE(review): this cell reads `live`, `start` and `end` from the notebook
# namespace. `live` and `start` are assigned in a later cell and `end` is not
# assigned in any visible cell -- confirm where it comes from before relying on
# Restart & Run All.
factor_data = pd.read_parquet(get_parquet(live) / 'data_price.parquet.brotli')
risk_free = pd.read_parquet(get_parquet(live) / 'data_rf.parquet.brotli')

# One-period returns on a deep copy of the price data, pivoted so that the
# first index level becomes the columns
pca_ret = create_return(factor_data.copy(deep=True), windows=[1])
ret = pca_ret['RET_01'].unstack(pca_ret.index.names[0])

# Rolling PCA on the return matrix
window_size, num_components = 21, 5
pca_data = rolling_pca(data=ret, window_size=window_size, num_components=num_components, name='Return')

# Append the risk-free rate, trim to the sample period, forward-fill RF and
# zero-fill whatever is still missing
pca_data = (
    pd.concat([pca_data, risk_free['RF']], axis=1)
    .loc[start:end]
    .assign(RF=lambda frame: frame['RF'].ffill())
    .fillna(0)
)

# Every column except the last one (RF) is a PCA factor
factor_col = pca_data.columns[:-1]

@ray.remote
def function(self, splice_data):
    """Add rolling-OLS outputs against the PCA factors to one slice of price data.

    Reads the module-level ``pca_data`` and ``factor_col`` defined earlier in
    this cell.

    Args:
        self: Unused; kept so the signature stays unchanged.
        splice_data: Price frame passed to ``create_return``.

    Returns:
        ``splice_data`` with the return column(s) added, NaNs replaced by 0 and
        the result of ``rolling_ols_parallel`` joined on for each window.
    """
    T = [1]
    windows = [21, 126]
    factor_cols = factor_col.tolist()

    splice_data = create_return(splice_data, T)
    splice_data = splice_data.fillna(0)

    for t in T:
        ret = f'RET_{t:02}'
        for window in windows:
            betas = rolling_ols_parallel(data=splice_data, ret=ret, factor_data=pca_data, factor_cols=factor_cols, window=window, name=f'ret_pca_{t:02}')
            splice_data = splice_data.join(betas)

    # The original body fell off the end without handing the result back
    return splice_data

#### Data

In [2]:
# Run configuration shared by the cells below
live = True
start = '2005-01-01'
# Today's date as a YYYY-MM-DD string; used as the end of the sample
current_date = f"{date.today():%Y-%m-%d}"
# Stock universe read from the live permno file
stock = read_stock(get_large(live) / 'permno_live.csv')

In [3]:
# Raw price data
price_data = pd.read_parquet(get_parquet(live) / 'data_price.parquet.brotli')

# Build the factor_sb_pca factor for the stock universe (not saved to disk)
sb_pca = ModelPrep(
    live=live,
    factor_name='factor_sb_pca',
    group='permno',
    interval='M',
    kind='fundamental',
    stock=stock,
    div=False,
    start=start,
    end=current_date,
    save=False,
).prep()

Creating factor_sb_pca ------------------------------------- | [92m✔[0m
Shape: (3114097, 24)


In [4]:
# SPY adjusted close, relabelled 'Close' before being passed to create_return
spy = get_data_fmp(ticker_list=['SPY'], start=start, current_date=current_date)[['Adj Close']].set_axis(['Close'], axis=1)

# Keep only the one-period return and label it 'spy'
spy = create_return(spy, [1])[['RET_01']].set_axis(['spy'], axis=1)

Fetching data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.85s/ticker]


In [5]:
def create_multi_index(factor_data, stock):
    """Broadcast a date-indexed frame to every stock.

    Args:
        factor_data: DataFrame whose index becomes the ``date`` level.
        stock: Sized sequence of stock identifiers for the ``permno`` level.

    Returns:
        DataFrame indexed by ``(permno, date)`` holding one copy of
        ``factor_data``'s values per stock, with the same columns. An empty
        ``stock`` yields an empty frame instead of raising.
    """
    print("Creating multi-index...")
    # Tile the raw values once instead of concatenating len(stock) DataFrame
    # copies; np.tile also copes with len(stock) == 0, where pd.concat raised.
    factor_values = np.tile(factor_data.values, (len(stock), 1))
    multi_index = pd.MultiIndex.from_product([stock, factor_data.index])
    multi_index_factor = pd.DataFrame(factor_values, columns=factor_data.columns, index=multi_index)
    multi_index_factor.index = multi_index_factor.index.set_names(['permno', 'date'])
    return multi_index_factor

# Drop the first index level of the SPY returns, then repeat them for every stock.
# NOTE(review): `spy` is overwritten with its own transform, so this cell is not
# idempotent -- re-running it feeds the already-broadcast frame back in.
spy = create_multi_index(spy.reset_index(level=0, drop=True), stock)

Creating multi-index...


In [6]:
# Pass the price data through create_return for a 1-period window; the merge
# below reads its 'RET_01' column.
# NOTE(review): `price_data` is reassigned from itself -- re-running this cell
# applies create_return to its own output.
price_data = create_return(price_data, [1])

In [7]:
# Left-join the stock returns and the broadcast SPY returns onto the 126-window
# PCA residual, matching rows on the index
analysis = (
    sb_pca['epsil_ret_pca_01_126']
    .to_frame()
    .merge(price_data['RET_01'], left_index=True, right_index=True, how='left')
    .merge(spy, left_index=True, right_index=True, how='left')
)

#### K-Means Clustering

In [122]:
# Number of rows in each rolling window (used for the rolling z-score and for
# rolling_kmean below); this rebinds the `window_size` set in the PCA cell
window_size = 21
# Number of clusters passed to rolling_kmean
n_clusters = 10

In [123]:
# Per-stock rolling statistics of the PCA residual
rolling_resid = analysis.groupby('permno')['epsil_ret_pca_01_126'].rolling(window_size)
resid_mean = rolling_resid.mean().droplevel(0)
resid_std = rolling_resid.std().droplevel(0)

analysis['epsil_mean'] = resid_mean
analysis['epsil_std'] = resid_std

# Rolling z-score of the residual
analysis['epsil'] = analysis['epsil_ret_pca_01_126'].sub(analysis['epsil_mean']).div(analysis['epsil_std'])

In [136]:
# Subset of `analysis` used for clustering, keeping only the z-score and return.
# NOTE(review): `window_data` is not defined in the visible cells (likely a
# star-imported helper); presumably it selects 200 rows per stock ending at
# 2024-02-05 -- TODO confirm. The date and length are hard-coded.
present = window_data(analysis, '2024-02-05', 200)[['epsil', 'RET_01']]

In [127]:
def rolling_kmean(data, column, window_size, n_clusters, name, num_cpus=16):
    """Cluster stocks with K-Means over a rolling window of dates.

    ``data[column]`` is unstacked on ``permno`` into a date x stock matrix. For
    every run of ``window_size`` consecutive rows, each stock's values in the
    window form one sample; stocks with a NaN anywhere in the window are left
    out of that window's fit.

    Args:
        data: DataFrame with an index level named ``permno``; after unstacking
            the remaining index level is expected to be named ``date``.
        column: Column of ``data`` to cluster on.
        window_size: Number of consecutive rows per window.
        n_clusters: Number of clusters per fit.
        name: Unused; kept so existing callers keep working.
        num_cpus: CPUs handed to ``ray.init`` (defaults to the previous
            hard-coded 16).

    Returns:
        DataFrame indexed by ``(permno, date)`` with one ``cluster`` column,
        where ``date`` is the last row of the window the label came from.
        Labels come from independent fits, so the same number in two windows
        does not denote the same group.

    Raises:
        ValueError: If there are fewer than ``window_size`` rows to cluster.
    """
    @ray.remote
    def exec_kmean(i, wide, windowSize, n_clusters):
        # Get window data and flip it so that every stock is one sample
        window = wide.iloc[i:i + windowSize]
        samples = window.T.dropna()

        # Run KMean
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=0, n_init=10)
        labels = kmeans.fit_predict(samples)

        # Label only the stocks that survived dropna(); indexing with every
        # window column raised a shape mismatch whenever a stock was dropped.
        return pd.DataFrame(labels, columns=[f'kCluster_{n_clusters}'], index=samples.index)

    # Unstack Data
    wide = data[column].unstack('permno')

    n_windows = len(wide) - window_size + 1
    if n_windows < 1:
        raise ValueError(f'Need at least {window_size} rows to form a window, got {len(wide)}')

    # Execute parallel processing
    ray.init(num_cpus=num_cpus, ignore_reinit_error=True)
    try:
        # Store the matrix once in the object store instead of serialising it
        # again for every task
        wide_ref = ray.put(wide)
        clusters_list = ray.get([exec_kmean.remote(i, wide_ref, window_size, n_clusters) for i in range(n_windows)])
    finally:
        # Shut Ray down even when a task fails
        ray.shutdown()

    # Concat all the window labels, keyed by the last date of each window
    results_clusters_combined = pd.concat(clusters_list, keys=wide.index[window_size - 1:]).swaplevel()
    results_clusters_combined.columns = ['cluster']

    # Rearrange data to groupby stock
    return results_clusters_combined.sort_index(level=['permno', 'date'])

In [128]:
# Rolling K-Means on the residual z-score; the last argument is the `name`
# parameter, which rolling_kmean does not use
kmean = rolling_kmean(present, 'epsil', window_size, n_clusters, 'epsil')

2024-02-06 21:20:15,311	INFO worker.py:1642 -- Started a local Ray instance.


#### Analysis

In [149]:
# Pivot the cluster labels so each stock (permno) becomes a column
kmean_unstack = kmean.unstack('permno')

In [166]:
# Keep only the second column level (the permno) as the column labels.
# NOTE(review): mutates `kmean_unstack` in place; re-running this cell without
# re-running the unstack above fails because the columns then have one level.
kmean_unstack.columns = kmean_unstack.columns.get_level_values(1)

In [195]:
# Display the date x permno table of cluster labels
kmean_unstack

permno,10104,10107,10138,10145,10258,10333,10516,10933,11308,11394,...,93089,93094,93096,93101,93126,93296,93299,93312,93429,93436
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-05-18,1,5,9,7,0,5,6,6,2,3,...,3,6,1,1,8,1,6,9,3,9
2023-05-19,8,6,6,7,5,8,4,7,3,6,...,5,7,1,4,9,8,7,6,5,2
2023-05-22,3,4,5,6,7,4,6,6,2,9,...,9,6,3,0,4,0,6,5,4,8
2023-05-23,7,4,4,8,3,4,1,7,7,4,...,4,1,7,2,3,2,1,4,4,5
2023-05-24,6,4,0,3,8,9,0,6,1,6,...,8,2,6,9,8,9,3,0,8,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-01-30,0,1,5,9,3,3,4,6,4,6,...,6,0,4,8,7,5,0,9,6,6
2024-01-31,6,7,1,0,5,5,8,0,2,6,...,1,6,2,4,3,1,6,4,2,8
2024-02-01,4,7,7,9,2,2,8,9,8,1,...,4,0,8,5,9,5,0,2,8,1
2024-02-02,6,0,0,2,4,7,9,2,9,1,...,1,5,9,1,5,1,5,4,9,8


In [191]:
# NOTE(review): `identify_continuous_groups` is not defined in any visible cell
# -- confirm where it comes from so the notebook runs on a fresh kernel.
continuous_clusters_df = identify_continuous_groups(kmean_unstack)

In [182]:
# Spot check: first two rows of the labels for permnos 12449 and 14655
kmean_unstack.head(2)[[12449, 14655]]

permno,12449,14655
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-05-18,0,0
2023-05-19,0,0


In [185]:
# Inspect the entry keyed 0 in the first row of the result.
# NOTE(review): the same expression is evaluated again in a later cell with a
# different output, so the two were run against different states of
# `continuous_clusters_df`; keep only one.
continuous_clusters_df.iloc[0][0]

[]

In [194]:
# Inspect the entry keyed 0 in the first row of the result.
# NOTE(review): the output is a very long nested structure; consider
# summarising it rather than displaying it in full.
continuous_clusters_df.iloc[0][0]

{0: [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 1: [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 2: [2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,