In [87]:
import pandas as pd
import numpy as np
import json
import pickle as pkl

from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.cluster import KMeans, OPTICS, SpectralClustering
from sklearn.model_selection import ParameterGrid

import umap
from statsmodels.tsa.stattools import coint
from itertools import combinations

from tqdm import tqdm
import multiprocess as mp


import os
# When the kernel is started from inside a folder named 'clustering', move two
# levels up (presumably to the project root, so that the relative 'data/...' and
# './results/...' paths used below resolve — TODO confirm the repository layout).
# os.path.basename is used instead of splitting on '/' so the check also works
# where the path separator is a backslash.
if os.path.basename(os.getcwd()) == 'clustering':
    os.chdir('../../')

In [88]:
def _read_sorted_by_date(csv_path):
    """Read one CSV, parse its 'date' column, sort by (date, ticker) and index it by date."""
    frame = pd.read_csv(csv_path, parse_dates=['date'])
    frame = frame.sort_values(['date', 'ticker'])
    return frame.set_index('date')


daily = _read_sorted_by_date('data/final_processed/daily_prices.csv')
ratios = _read_sorted_by_date('data/final_processed/firm_ratios.csv')
sectors = _read_sorted_by_date('data/final_processed/sectors.csv')
short = _read_sorted_by_date('data/final_processed/short_interest_rate.csv')


# Merge the four sources on (ticker, date). These are pandas' default inner
# joins, so only (ticker, date) pairs present in every source are kept.
df = (
    daily
    .merge(ratios, on=['ticker', 'date'])
    .merge(short, on =['ticker', 'date'])
    .merge(sectors, on=['ticker', 'date'])
)

  ratios = pd.read_csv('data/final_processed/firm_ratios.csv', parse_dates=['date']).sort_values(['date','ticker']).set_index('date')


In [89]:
# Define the date range of the dataset (year-month strings, parsed by pandas)
start_date = '2002-01'
end_date = '2019-12'

# Length of one formation period, written as '<number of months>M'.
# The string form is kept because main() recovers the month count with
# int(formation_period[:-1]).
formation_period = '3M'

# Month-end timestamps spaced one formation period apart; main() uses each one
# as a period start. The frequency is passed as an explicit MonthEnd offset
# rather than the 'M' alias string: the alias has been deprecated since
# pandas 2.2, while the offset object produces the same grid without relying on it.
date_range = pd.date_range(
    start=start_date,
    end=end_date,
    freq=pd.offsets.MonthEnd(int(formation_period[:-1])),
)

In [118]:
# Candidate hyper-parameter grids for the two pipeline stages. The values are
# provisional and still need to be tuned.
# Each entry has a display 'name', the estimator class under 'method', and a
# 'params' grid mapping parameter name -> list of candidate values.
# Note: main() currently iterates only over 'n_components' of each reducer grid.
dim_reduction_methods = [
    {
        'name': 'PCA',
        'method': PCA,
        'params': {'n_components': [2, 3, 4]},
    },
    {
        'name': 'KPCA',
        'method': KernelPCA,
        'params': {
            'n_components': [2, 3, 4],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        },
    },
    {
        'name': 'UMAP',
        'method': umap.UMAP,
        'params': {
            'n_components': [2, 3, 4],
            'n_neighbors': [5, 10, 15],
        },
    },
]

clustering_algorithms = [
    {
        'name': 'KMeans',
        'method': KMeans,
        'params': {'n_clusters': [3, 5, 7, 9, 10], 'n_init': ['auto']},
    },
    # OPTICS is currently disabled:
    # {'name': 'OPTICS', 'method': OPTICS, 'params': {'min_samples': [3, 5, 7]}},
    {
        'name': 'SpectralClustering',
        'method': SpectralClustering,
        'params': {'n_clusters': [3, 5, 7, 9, 10]},
    },
]

In [121]:
class DimensionalityReduction:
    """Prepare one formation period of the panel and project it to a few components.

    Parameters
    ----------
    df : pandas.DataFrame
        Panel with a 'ticker' column and a 'gicdesc' column; it is sliced by
        'YYYY-MM' labels, so it is assumed to carry a sorted date index
        (TODO confirm against the loading cell).
    date_range : sequence of period starts (stored, not used by the methods here).
    formation_period : str
        Formation-period specification (stored, not used by the methods here).
    """

    def __init__(self, df, date_range, formation_period):
        self.df = df
        self.date_range = date_range
        self.formation_period = formation_period
        # A single scaler instance is reused; preprocess() refits it on every call.
        self.scaler = StandardScaler()

    def __subset_data(self, period_start, period_end):
        """Return the rows between the two months (inclusive) for complete tickers only.

        A ticker is removed entirely if any of its rows in the window contains
        a missing value.
        """
        # period_end = period_start + pd.DateOffset(months=int(formation_period[:-1]))
        window = self.df.loc[period_start.strftime('%Y-%m'):period_end.strftime('%Y-%m')]
        incomplete = window.loc[window.isna().any(axis=1), 'ticker'].unique()
        return window.loc[~window['ticker'].isin(incomplete)]

    def preprocess(self, period_start, period_end):
        """Build the standardised feature matrix for one formation period.

        Returns
        -------
        X_train : numpy.ndarray
            One row per (ticker, date) observation, ordered by ticker then
            date. The one-hot columns of 'gicdesc' come first, followed by the
            remaining columns; every column is standardised.
        idx : numpy.ndarray
            Two-column array with the (ticker, date) of each row of X_train.
        """
        df_period = self.__subset_data(period_start, period_end)
        df_train = df_period.reset_index().sort_values(['ticker', 'date'])
        idx = df_train[['ticker', 'date']].values
        df_train = df_train.drop(['date', 'ticker'], axis=1)

        # One-hot encode the sector label; every other column passes through unchanged.
        ohe_column = 'gicdesc'
        ohe_categories = df_train[ohe_column].unique().tolist()
        enc = OneHotEncoder(sparse_output=False, categories=[ohe_categories])
        transformer = make_column_transformer((enc, [ohe_column]), remainder='passthrough')
        X_train = transformer.fit_transform(df_train)

        X_train = self.scaler.fit_transform(X_train)
        return X_train, idx

    def __flatten_data(self, reduced_data, idx):
        """Pivot per-observation components into one column per ticker.

        Rows of the result are (date, component) pairs and columns are
        tickers; tickers lacking any (date, component) value are dropped.
        """
        # Concatenating with the string/timestamp identifiers forces an
        # object-dtype array, so the coordinates are cast back to float at the
        # end instead of handing an object-dtype frame to the clustering step.
        merged_df = pd.DataFrame(np.concatenate((idx, reduced_data), axis=1))\
            .set_index([0, 1]).stack().unstack(0).dropna(axis=1, how='any')
        return merged_df.astype(float)

    def dimensionality_reduction(self, X_train, idx, model, n_components, **model_kwargs):
        """Fit ``model`` on ``X_train`` and return the per-ticker flattened embedding.

        Parameters
        ----------
        X_train : array-like, one row per (ticker, date) observation.
        idx : array of (ticker, date) pairs aligned with the rows of ``X_train``.
        model : estimator class accepting ``n_components`` and exposing ``fit_transform``.
        n_components : int
            Number of components to keep.
        **model_kwargs
            Extra constructor arguments forwarded to ``model`` (for example
            ``kernel`` or ``n_neighbors``). Defaults to none, which reproduces
            the earlier behaviour of passing ``n_components`` only.

        Returns
        -------
        pandas.DataFrame
            Float frame with (date, component) rows and ticker columns.
        """
        reducer = model(n_components=n_components, **model_kwargs)
        reduced_data = reducer.fit_transform(X_train)
        return self.__flatten_data(reduced_data, idx)


class Clustering:
    """Run one clustering algorithm over the columns (tickers) of a frame.

    Parameters
    ----------
    model : estimator class whose instances expose ``fit_predict``.
    name : str
        Display name used as the prefix of every run label.
    df : pandas.DataFrame
        Frame whose columns are the items to cluster; it is transposed before
        fitting, so each column becomes one sample.
    """

    def __init__(self, model, name, df):
        self.model = model
        self.name = name
        self.df = df
        self.tickers = df.columns.tolist()

    def train_clustering(self, param, name):
        """Fit ``self.model(**param)`` and return the tickers grouped by cluster.

        Returns a Series named ``name``, indexed by cluster label, whose values
        are the lists of tickers assigned to each cluster.
        """
        clustering_method = self.model(**param)
        cluster_labels = clustering_method.fit_predict(self.df.T)
        labeled_df = pd.DataFrame({'ticker': self.tickers, 'cluster': cluster_labels})

        # Group the tickers by the assigned cluster labels
        clusters = labeled_df.groupby('cluster')['ticker'].apply(list).rename(name)
        return clusters

    @staticmethod
    def _grid_label(prefix, params, param_grid):
        """Build the run label for one parameter combination.

        The label is ``prefix`` followed by the values of every grid key that
        has more than one candidate. If no key varies, the first key of the
        grid is used, so single-parameter sweeps keep the '<name>_<value>' form.
        """
        varying = [key for key, candidates in param_grid.items() if len(candidates) > 1]
        keys = varying or list(param_grid)[:1]
        return '_'.join([prefix] + [str(params[key]) for key in keys])

    def clustering_param_tuning(self, param_list):
        """Fit the model for every combination in ``param_list`` in parallel.

        Parameters
        ----------
        param_list : dict
            Grid mapping parameter name -> list of candidate values.

        Returns
        -------
        list
            One Series per combination (see ``train_clustering``), in
            ParameterGrid order.
        """
        # Each label is derived from the parameter dict it belongs to. Pairing
        # the grid with a separate iterator over the first key's values breaks
        # as soon as the grid has more combinations than that key has values,
        # or when ParameterGrid's key ordering differs from the dict's.
        tasks = [
            (params, self._grid_label(self.name, params, param_list))
            for params in ParameterGrid(param_list)
        ]
        if not tasks:
            return []
        # No point in starting more workers than there are fits to run.
        workers = max(1, min(len(tasks), os.cpu_count() or 1))
        with mp.Pool(workers) as pool:
            clusterings = pool.starmap(self.train_clustering, iterable=tasks)
        return clusterings

In [122]:
def main():
    """Run the reduction + clustering sweep and pickle every clustering result.

    For each processed formation period, each reducer in
    ``dim_reduction_methods`` (over its 'n_components' values only) and each
    entry of ``clustering_algorithms``, the list of clusterings returned by
    ``Clustering.clustering_param_tuning`` is written to
    './results/clusterings/' as one pickle file.

    Returns
    -------
    dict
        Maps each written pickle path to the clusterings stored in it, so the
        caller gets the results directly instead of ``None``.
    """
    results = {}
    # Make sure the target folder exists before the first open(..., "wb").
    os.makedirs('./results/clusterings', exist_ok=True)
    # A period spans `formation_period` months including its first month, hence the -1.
    months_after_start = int(formation_period[:-1]) - 1

    reduced_data_generator = DimensionalityReduction(df, date_range, formation_period)
    # NOTE: only the first formation period is processed (date_range[:1]).
    for period_start in tqdm(date_range[:1]):
        period_end = period_start + pd.DateOffset(months=months_after_start)
        X_train, idx = reduced_data_generator.preprocess(period_start, period_end)
        for dr in dim_reduction_methods:
            for dr_param_value in dr['params']['n_components']:
                merged_df = reduced_data_generator.dimensionality_reduction(X_train, idx, dr['method'], dr_param_value)
                for cl in clustering_algorithms:
                    cluster_generator = Clustering(cl['method'], cl['name'], merged_df)
                    clusters = cluster_generator.clustering_param_tuning(cl['params'])
                    out_path = f"./results/clusterings/{period_start.strftime('%Y-%m')}-{period_end.strftime('%Y-%m')}_{dr['name']}-{dr_param_value}_{cl['name']}.pkl"
                    with open(out_path, "wb") as f:
                        pkl.dump(clusters, f)
                    results[out_path] = clusters
    return results

clusters = main()

  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
[2.09421685e-13 2.43156866e-07 2.08674469e-06 1.45897573e-06]
not reaching the requested tolerance 1.6242265701293945e-06.
Use iteration 598 instead with accuracy 
9.472193750429129e-07.

  _, diffusion_map = lobpcg(
[1.22286184e-14 2.43156912e-07 2.08674486e-06 1.45897549e-06]
not reaching the requested tolerance 1.6242265701293945e-06.
  _, diffusion_map = lobpcg(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
  est = KMeans(
[2.23137218e-15 8.65178583e-06 1.57131028e-06 3.95514388e-06]
not reaching the requested tolerance 4.0084123611450195e-06.
Use iteration 144 instead with accuracy 
3.415239033183754e-

In [124]:
# Scale a per-period figure of 2 + 16/60 by the number of formation periods
# (presumably the measured run time of one period, 2 min 16 s, giving a total
# in minutes — TODO confirm).
per_period_cost = 2 + 16/60
len(date_range) * per_period_cost

163.2

In [103]:

def f(a, b, c):
    """Return ``b + c``; ``a`` is accepted but not used."""
    total = b + c
    return total
    

def main():
    """Scratch check that ``Pool.starmap`` accepts a generator of argument tuples.

    Note: this re-uses the name ``main`` of the pipeline function defined in an
    earlier cell and replaces it once this cell is executed.
    """
    # Yields (0, 0, 0), (1, 1, 1), ..., (9, 9, 9).
    arg_tuples = ((i, i, i) for i in range(10))

    with mp.Pool(os.cpu_count()) as pool:
        return pool.starmap(f, iterable=arg_tuples)

res = main()
res

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]

In [30]:
# Stand-alone worker pool with one process per reported CPU, kept in `pool`
# for interactive inspection; no close()/terminate() call appears in the cells shown.
pool = mp.Pool(processes=os.cpu_count())

In [32]:
# Interactive lookup of the signature and docstring of Pool.map
# (uses the `pool` object created in the previous cell).
help(pool.map)

Help on method map in module multiprocess.pool:

map(func, iterable, chunksize=None) method of multiprocess.pool.Pool instance
    Apply `func` to each element in `iterable`, collecting the results
    in a list that is returned.



In [335]:
# Number of parameter combinations in the grid stored under cl['params'].
# NOTE(review): `cl` is not assigned at module level in the cells shown here
# (it only appears as a loop variable inside main()), so this presumably relies
# on leftover kernel state and would fail on a fresh kernel — TODO confirm.
len(ParameterGrid(cl['params']))

4