In [1]:
import os
import sys
import numpy as np 
import pandas as pd 
import math
from sklearn.cluster import KMeans
from tqdm.notebook import tqdm
from seaborn import histplot

In [3]:
# Work from the user's home directory so the relative paths used by later
# cells (e.g. "Data/Stocks") resolve against it.
os.chdir(os.path.expanduser("~"))
# Add the project directory (relative to the new working directory) to the
# import search path. Presumably this is where the `classification` module
# imported in the next cell lives — TODO confirm.
sys.path.append("Incoming/EagleKahnXylem")

In [4]:
from classification import encode_design_matrix

In [5]:
# Download and unpack the Kaggle price/volume dataset, but only when the
# "Data/Stocks" directory does not already exist under the working directory.
# NOTE(review): the `!` lines are IPython shell escapes; they presumably need
# the `kaggle` CLI (with credentials configured) and `unzip` on PATH — confirm
# for your environment.
if not os.path.exists("Data/Stocks"):
    !kaggle datasets download -d borismarjanovic/price-volume-data-for-all-us-stocks-etfs
    !unzip -q price-volume-data-for-all-us-stocks-etfs.zip

In [6]:
# Notebook-wide configuration.
#   files     : path of every file found under Data/Stocks, sorted so that the
#               order is reproducible (os.walk yields entries in arbitrary,
#               filesystem-dependent order, and row order feeds the seeded
#               k-means fit further down)
#   companies : one identifier per file -- the basename up to its first "." --
#               index-aligned with `files`
#   strata    : path of the cached macro-stratification CSV
settings = {}
settings['files'] = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('Data/Stocks')
    for filename in filenames)
settings['companies'] = [ os.path.basename(f).split(".")[0] for f in settings['files'] ]
settings['strata'] = "Data/macro_stratification.csv"

In [73]:
def volume(filename : str, log : bool = True) -> list:
    """Read the 'Volume' column of a price file as a list, optionally log-scaled.

    Args:
        filename: path to a CSV file containing a 'Volume' column.
        log: when True (default) return natural-log volumes; otherwise
            return the raw column values.

    Returns:
        One value per row of the file, in file order. With ``log=True``,
        rows whose volume is missing, non-numeric, zero or negative yield
        NaN, so the result stays the same length as the file's other columns.
        An empty list is returned when the file cannot be read, is empty, or
        has no 'Volume' column.
    """
    try:
        raw = pd.read_csv(filename)['Volume']
    except (OSError, KeyError, ValueError, TypeError):
        # best-effort: unreadable / empty / malformed files contribute nothing
        return []

    if not log:
        return raw.tolist()

    # A single zero-volume day used to raise inside math.log and, through the
    # blanket except, throw away the whole series. Mask non-positive values to
    # NaN instead so every other day survives.
    numeric = pd.to_numeric(raw, errors='coerce')
    return np.log(numeric.where(numeric > 0)).tolist()

def max_volume(filename : str) -> int:
    """Return the largest value in a price file's 'Volume' column.

    Any failure while reading the file or locating the column is swallowed
    and reported as ``None`` rather than raised.
    """
    try:
        frame = pd.read_csv(filename)
        peak = frame['Volume'].max()
    except Exception:
        return None
    return peak
    
def delta(filename : str) -> "pd.Series | int":
    """Daily fractional price change, ``(Close - Open) / Open``, per row.

    Args:
        filename: path to a CSV file with 'Open' and 'Close' columns.

    Returns:
        A Series with one value per row of the file. Rows whose open price
        is zero yield NaN rather than +/-inf, so a single bad row no longer
        poisons a mean taken over the series. When the file cannot be read
        or lacks the required columns, the scalar ``0`` is returned
        (unchanged legacy fallback).

    The intraday range ``(High - Low) / High`` is a possible alternative
    measure; it is not what this function computes.
    """
    try:
        df = pd.read_csv(filename)
        # mask zero opens so the division cannot produce infinities
        opening = df['Open'].where(df['Open'] != 0)
        return (df['Close'] - opening) / opening
    except (OSError, KeyError, ValueError, TypeError):
        return 0

def direction(filename : str) -> pd.DataFrame:
    """Label each trading day in a price file as a gain (+1) or not (-1).

    Args:
        filename: path to a CSV file with 'Date', 'Open' and 'Close' columns.

    Returns:
        A data frame with the file's 'Date' column and a 'Gain' column that is
        1 on days that closed above their open and -1 otherwise (flat days
        count as -1).

    Raises:
        Whatever ``pandas.read_csv`` or the column lookups raise; unlike the
        sibling readers this function does not swallow errors.
    """
    df = pd.read_csv(filename)
    # A gain means the close is ABOVE the open. The comparison used to be
    # Open > Close, which flagged losing days as gains and disagreed with the
    # sign of delta().
    gained = df['Close'] > df['Open']
    return pd.DataFrame({
        'Date': df['Date'],
        'Gain': np.where(gained, 1, -1),
    })

def z(x : np.array, scale : dict):
    """Standardise `x` with the 'mean' and 'sd' entries of `scale`."""
    centered = x - scale['mean']
    return centered / scale['sd']

def fit_kmeans_clusters(df: pd.DataFrame = None, k: int = 16, 
                        variables: list = [], inplace: bool = False):
    """Assign each row of `df` to one of `k` k-means clusters.

    The frame is passed through `encode_design_matrix` (normalisation off)
    and the 'x' entry of its result is used both to fit the model and to
    predict labels. NOTE(review): 'x' is presumably the numeric feature
    matrix built from `variables` — confirm against `classification`.

    Args:
        df: frame holding the columns named in `variables`.
        k: number of clusters.
        variables: column names forwarded to `encode_design_matrix`.
        inplace: when True, write the labels to ``df['stratum']`` and return
            None; when False, return the labels and leave `df` untouched.
    """
    design = encode_design_matrix(df, variables=variables, normalize=False)
    features = design['x']
    # fixed random_state so repeated fits on the same rows agree
    model = KMeans(n_clusters=k, random_state=0).fit(features)
    labels = model.predict(features)
    if not inplace:
        return labels
    df['stratum'] = labels

def build_company_daily_deltas(companies : list, files : list, z_transform : dict,
                               limit : int = 3):
    """Lazily yield one data frame of daily statistics per company.

    Args:
        companies: company identifiers, index-aligned with `files`.
        files: paths of the per-company price files.
        z_transform: dict with 'mean' and 'sd' used to z-score the log volume.
        limit: process only the first `limit` companies. Defaults to 3, the
            cap that was previously hard-coded; pass ``None`` to process
            every company.

    Yields:
        A data frame with columns 'date', 'company', 'delta' (fractional
        daily change), 'direction' (the 'Gain' label from `direction`) and
        'rank' (z-scored log volume rounded to two decimals).
    """
    for company, f in zip(companies[:limit], files[:limit]):
        # read the direction table once; it supplies both date and direction
        moves = direction(f)
        yield pd.DataFrame({
            'date' : moves['Date'],
            'company' : company,
            'delta' : delta(f),
            'direction' : moves['Gain'],
            'rank': z(np.array(volume(f)), z_transform).round(2) })

def _mean_or_nan(values) -> float:
    """Mean of `values` ignoring NaNs; NaN when nothing usable remains."""
    arr = np.atleast_1d(np.asarray(values, dtype=float))
    arr = arr[~np.isnan(arr)]
    return float(arr.mean()) if arr.size else float('nan')


def stratify_company_macros(companies : list, files : list, 
                         quiet : bool = False, **kwargs):
    """Group companies into strata from their long-run price change and volume.

    For every file the mean of `delta` and the mean of `volume` (log scale)
    are computed, gaps are filled, the volume means are z-scored, and the two
    resulting variables are clustered with `fit_kmeans_clusters`.

    Args:
        companies: company identifiers, index-aligned with `files`.
        files: paths of the per-company price files.
        quiet: when False, draw histograms of the two variables and of the
            resulting strata.
        **kwargs: ``k`` -- number of clusters (default 16: 2*2 variables
            times 4 quantiles per cluster).

    Returns:
        A data frame with one row per company and the columns 'company',
        'delta', 'vol_rank' and 'stratum'.
    """
    k: int = kwargs.get("k", 2*2*4)

    # per-company mean change and mean log-volume, used to post-stratify
    macro_delta: list = []
    macro_volume: list = []
    for f in tqdm(files, desc="Processing macros across all companies / years"):
        macro_delta.append(_mean_or_nan(delta(f)))
        macro_volume.append(_mean_or_nan(volume(f)))

    # an open price of 0 can turn a company's mean into +/-inf: treat those as
    # missing BEFORE interpolating so infinities are never used as fill values
    macro_delta = (
        pd.Series(macro_delta, dtype=float)
        .replace([np.inf, -np.inf], np.nan)
        .interpolate(method="zero"))
    # log-scale volume is easier to work with
    macro_volume = pd.Series(macro_volume, dtype=float).interpolate(method="zero")

    # z-score the volume (population sd, NaNs skipped) to rank companies
    transform = {
        'mean': macro_volume.mean(),
        'sd': macro_volume.std(ddof=0) }
    macro_volume = z(macro_volume, transform).round(2)

    # one row per company: the two variables fed to k-means
    macro_clusters = pd.DataFrame({
        'company': companies,
        'delta': macro_delta,
        'vol_rank': macro_volume})

    # forward k so a caller-supplied cluster count is actually honoured
    fit_kmeans_clusters(macro_clusters, k=k,
        variables=['delta','vol_rank'], inplace=True)

    if not quiet:
        # NOTE(review): no `ax` is passed, so the three histograms presumably
        # land on the same current axes -- confirm and split if needed
        histplot({
            'z(daily volume)': macro_volume}, kde=True)
        histplot({
            'mean delta': macro_delta},
            binrange=(-0.1,0.1), kde=True)
        histplot(macro_clusters['stratum'])

    return macro_clusters




In [75]:
# Reuse the cached stratification when it exists; otherwise compute it from
# the price files and cache it for the next run.
if os.path.exists(settings['strata']):
    macro_clusters = pd.read_csv(settings['strata'])
else:
    macro_clusters = stratify_company_macros(
        settings['companies'], settings['files'], quiet=True)
    macro_clusters.to_csv(
        settings['strata'], 
        index=False)

# Last expression of the cell, so the preview is displayed whichever branch
# ran (a .head() call inside the else branch has its result discarded).
macro_clusters.head()

In [8]:
# Builds per-company data frames of daily deltas and volume ranks for later
# hierarchical modeling. `build_company_daily_deltas` is a generator, so no
# file is read until `ranked_daily_deltas` is iterated, and as defined above
# it only walks the first three companies.
# NOTE(review): `companies`, `files` and `transform` are not assigned in any
# cell visible here (the lists live in `settings`; the volume z-score scale is
# a local inside `stratify_company_macros`), so this cell presumably raises
# NameError on a fresh kernel -- confirm and pass explicit values.
ranked_daily_deltas = build_company_daily_deltas(
    companies, files, transform)

In [None]:
# For each company, fit a parametric model of closing price and daily
# percent-change, driven by annual and inter-annual variation in time
# (year + Julian day). We partition out the effect of company by pooling
# arbitrary companies into the strata we derived from the macro-factor
# clusters (delta + volume).