In [38]:
# Import statements

# Sklearn
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cross_decomposition import CCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.cluster import KMeans, OPTICS, DBSCAN, SpectralClustering

# Other relevant
import umap
import statsmodels

# Standard
import pandas as pd
import numpy as np

In [2]:
# Read in dataset
# Every processed panel gets the same treatment: parse the `date` column,
# order rows by (date, ticker), then move `date` into the index.
# NOTE(review): the saved cell output echoes the firm_ratios read line, which
# suggests pandas emitted a warning for that file -- confirm whether an explicit
# `dtype=` is needed there.
def _read_panel(csv_path):
    """Load one processed CSV, sorted by date then ticker and indexed by date."""
    frame = pd.read_csv(csv_path, parse_dates=['date'])
    frame = frame.sort_values(['date', 'ticker'])
    return frame.set_index('date')


daily = _read_panel('../data/final_processed/daily_prices.csv')
ratios = _read_panel('../data/final_processed/firm_ratios.csv')
sectors = _read_panel('../data/final_processed/sectors.csv')
short = _read_panel('../data/final_processed/short_interest_rate.csv')

  ratios = pd.read_csv('../data/final_processed/firm_ratios.csv', parse_dates=['date']).sort_values(['date','ticker']).set_index('date')


In [3]:
# Merge
# Starting from the daily prices, join the ratio, short-interest and sector
# panels one after another, matching rows on the (ticker, date) pair.
merge_keys = ['ticker', 'date']
df = daily
for panel in (ratios, short, sectors):
    df = df.merge(panel, on=merge_keys)

In [21]:
# Formation periods of interest: quarter, 6 months, 1 year, 2 years
# This cell selects the window from 2005-01 through 2005-03.
df_q = df.loc['2005-01':'2005-03']

# A ticker is removed entirely if any of its rows in the window has a missing value
has_missing = df_q.isna().any(axis=1)
to_drop = df_q.loc[has_missing, 'ticker'].unique()
keep_rows = ~df_q['ticker'].isin(to_drop)
df_q = df_q.loc[keep_rows]

In [None]:
# Get index values and set up data set
# Rows are ordered by (ticker, date) so each ticker's observations are contiguous
# and chronological. `idx` keeps the identifiers that are removed from the
# feature matrix so they can be re-attached to the PCA scores afterwards.
df_train = df_q.reset_index().sort_values(['ticker', 'date'])
idx = df_train[['ticker', 'date']]
df_train = df_train.drop(['date', 'ticker'], axis=1)

# One hot encoding -> np array
# Only `gicdesc` is encoded; every other column is passed through unchanged.
ohe_column = 'gicdesc'
ohe_categories = df_train[ohe_column].unique().tolist()
enc = OneHotEncoder(sparse_output=False, categories=[ohe_categories])
transformer = make_column_transformer((enc, [ohe_column]), remainder='passthrough')
X_train = transformer.fit_transform(df_train)

# PCA (should generalize this)
num_components = 2
pc_columns = [f'pc {i+1}' for i in range(num_components)]

# Standardize every feature (including the one-hot columns) before projecting
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
# Fixed seed: PCA may select a randomized SVD solver depending on the data shape
# and sklearn version, which would otherwise make the scores vary between runs.
pca = PCA(num_components, random_state=0)
components = pca.fit_transform(X_train)

# Create a new df with components
components_df = pd.DataFrame(data=components, columns=pc_columns)

# Merge the principal components with indices then groupby tickers
# (`idx` is re-indexed 0..n-1 so it lines up positionally with `components_df`)
merged_df = pd.concat([idx.reset_index(drop=True), components_df], axis=1)
grouped_df = merged_df.groupby('ticker')

# Concatenate the components for each ticker into a single vector laid out as
# [pc 1 over all dates, pc 2 over all dates, ...]
vecs = {
    ticker: np.concatenate(group[pc_columns].values.T)
    for ticker, group in grouped_df
}

# Create df with a single row for each ticker and vectorized components
vectorized_df = pd.DataFrame(list(vecs.items()), columns=['ticker', 'components'])

# Get mode of the vector lengths
vector_lengths = [len(vector) for vector in vectorized_df['components']]
lengths_series = pd.Series(vector_lengths)
mode_length = lengths_series.mode().iloc[0]

# Drop every ticker whose vector length differs from the most common length
# (presumably tickers lacking observations on some dates -- TODO confirm).
# A single drop call replaces the row-by-row loop; the surviving rows keep their
# original index labels.
odd_rows = lengths_series.index[lengths_series != mode_length]
vectorized_df = vectorized_df.drop(index=odd_rows)

In [37]:
# Extract the vectors from the vectorized_df DataFrame
# Stacks the per-ticker vectors into a 2-D array with one row per ticker.
vectors = np.array(vectorized_df['components'].tolist())

# K-means
# `random_state` makes the cluster assignment reproducible across runs, and
# `n_init` is pinned so the number of restarts does not depend on the
# installed sklearn version's default.
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
vectorized_df['cluster'] = kmeans.fit_predict(vectors)

# Group the tickers by the assigned cluster labels
clusters = vectorized_df.groupby('cluster')['ticker'].apply(list)
print(clusters)

cluster
0    [BBY, BC, CAR, DIS, GPC, GWW, HAS, HD, JWN, LE...
1    [ABT, AGN, BAX, BCR, BDX, BMY, JNJ, LLY, MRK, ...
2    [APA, APC, ASH, COP, CVX, HAL, KMG, MRO, OXY, ...
3    [APD, ATI, BMS, CCK, CHA, ECL, EMN, FCX, IFF, ...
4    [AEE, CMS, CNP, DUK, ED, EIX, ETR, EXC, OKE, P...
5               [ADI, CAH, INTC, MCK, QCOM, TXN, XLNX]
6       [AVP, BFO, CLX, COST, CVS, RAD, SVU, SYY, WMT]
7    [AVY, BA, CAT, CCU, CMI, CR, DHR, DOV, EMR, FD...
8                                 [ADP, GLW, IBM, XRX]
9    [ADM, CAG, CL, CTB, DDS, DLX, GIS, GT, HSY, KM...
Name: ticker, dtype: object


