In [1]:
%cd ..

import math
import json
import utils
import os
import torch
import pandas as pd
import yfinance as yf
import numpy as np
from itables import show
from multiprocessing import Pool 

# Load the S&P 500 component list, using ./sp500.json as an on-disk cache.
# Cache hit: read the JSON back. Cache miss: ask utils.spy_components() for
# the list and persist it so later runs skip that call.
# NOTE(review): `sp500` is later iterated as ticker symbols and passed to
# yf.download, so it is presumably a list of ticker strings -- confirm
# against utils.spy_components().
if os.path.exists('sp500.json'):
    with open("./sp500.json", 'r') as cache_file:
        sp500 = json.load(cache_file)
else:
    sp500 = utils.spy_components()
    with open('./sp500.json', 'w') as cache_file:
        json.dump(sp500, cache_file, indent=2)


/home/gangzhi/play_ground/stock_screener


Date
2016-06-07    42.217999
2016-06-09    42.083000
2016-07-01    41.147999
2016-07-28    40.647499
2016-08-17    42.300999
                ...    
2025-03-24          NaN
2025-03-25          NaN
2025-03-26          NaN
2025-03-27          NaN
2025-03-28          NaN
Name: Close, Length: 1618, dtype: float64

In [3]:
import os
SAVE_DIR='./data'
file = 'sp500_daily_prices.pkl'
file_path = os.path.join(SAVE_DIR, file)

# Download daily prices for every ticker in `sp500`, or reuse the pickled
# copy from a previous run. When a cached file exists the user is asked
# whether to refresh it; with no cache the download always happens.
download = True
if os.path.exists(file_path):
    answer = input("Re-download SP500? [yes/no]")
    # Anything starting with "y"/"Y" (ignoring surrounding whitespace) means
    # yes. An empty answer (just pressing Enter) falls back to the cache
    # instead of raising IndexError on answer[0].
    download = answer.strip().lower().startswith('y')
if download:
    # to_pickle does not create missing parent directories, so make sure the
    # cache directory exists before the (slow) download is written out.
    os.makedirs(SAVE_DIR, exist_ok=True)
    # swaplevel(axis=1) swaps the two column levels so that
    # `daily_df[ticker]` selects one ticker's sub-frame further down.
    daily_df = yf.download(sp500).swaplevel(axis=1)
    daily_df.to_pickle(file_path)
else:
    # NOTE: read_pickle can execute arbitrary code; only load cache files
    # that this notebook wrote itself.
    daily_df = pd.read_pickle(file_path)


Re-download SP500? [yes/no] no


In [4]:
# Keep only rows dated 1990 or later.
from_1990 = daily_df.index.year >= 1990
daily_df = daily_df.loc[from_1990]

# Drop every date on which at least one column holds a value below 1.
# NaN compares as False here, so missing values alone do not drop a row.
# NOTE(review): the mask spans all tickers' columns, so a single sub-1 value
# in one ticker removes that date for every ticker -- confirm this is intended.
has_sub_one_value = daily_df.lt(1).any(axis=1)
daily_df = daily_df.loc[~has_sub_one_value]

# One sub-frame per ticker, in the same order as `sp500`.
split_daily_df = [daily_df[symbol] for symbol in sp500]


In [27]:
def feature_augment(df):
    """Build log-ratio features from one ticker's daily price/volume frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``Open``, ``High``, ``Low``, ``Close`` and ``Volume``
        columns, one row per trading day in chronological order.

    Returns
    -------
    pandas.DataFrame
        Natural log of the following ratios, in this column order:
        ``forward_roc5`` (Close / Close five rows ahead), ``high``, ``low``,
        ``open``, ``close`` (each divided by the previous row's Close),
        ``vol`` (Volume / previous Volume) and ``vol_x_high``, ``vol_x_low``,
        ``vol_x_open``, ``vol_x_close`` (``vol`` times the matching price
        ratio). Rows where any ratio is missing, non-positive or infinite
        are dropped, so the first row and the last five rows never appear.
    """
    prev_close = df.Close.shift(1)
    prev_vol = df.Volume.shift(1)

    ratios = pd.DataFrame({
        # ret['forward_roc20'] = df.Close / df.Close.shift(-20)
        # NOTE(review): this is Close[t] / Close[t+5], i.e. the reciprocal of
        # the usual forward rate of change, so its log is the *negative*
        # 5-row forward log return. Kept as-is; confirm downstream code
        # expects this sign.
        'forward_roc5': df.Close / df.Close.shift(-5),
        'high': df.High / prev_close,
        'low': df.Low / prev_close,
        'open': df.Open / prev_close,
        'close': df.Close / prev_close,
        'vol': df.Volume / prev_vol,
    })
    # cross features
    for price in ('high', 'low', 'open', 'close'):
        ratios[f'vol_x_{price}'] = ratios['vol'] * ratios[price]

    # A zero volume gives a ratio of 0 (no real log) on that row and an
    # infinite ratio on the next one. Mask both so dropna() removes those
    # rows instead of log raising or emitting +/-inf.
    ratios = ratios.where((ratios > 0) & np.isfinite(ratios))

    # Feature names: hloc, v, v crossed h,l.o,c
    # Vectorised log; every remaining value is positive and finite.
    return np.log(ratios.dropna())


In [28]:
%%time
# Run feature_augment over the per-ticker frames on 4 worker processes;
# `data` receives one feature frame per entry of split_daily_df, in order.
with Pool(processes=4) as workers:
    data = workers.map(feature_augment, split_daily_df)


CPU times: user 136 ms, sys: 290 ms, total: 426 ms
Wall time: 1.31 s


In [29]:
# Stack every ticker's feature rows into one tensor and compute the
# per-column standard deviation and mean across all of them.
per_ticker = [torch.from_numpy(frame.values) for frame in data]
stacked = torch.vstack(per_ticker)
stddev, mean = torch.std_mean(stacked, dim=0)

# Column names first, so the two statistic vectors below can be read off.
for shown in (data[0].columns, stddev, mean):
    print(shown)

Index(['forward_roc5', 'high', 'low', 'open', 'close', 'vol', 'vol_x_high',
       'vol_x_low', 'vol_x_open', 'vol_x_close'],
      dtype='object')
tensor([0.0536, 0.0204, 0.0219, 0.0164, 0.0244, 0.4009, 0.4023, 0.3994, 0.4009,
        0.4007], dtype=torch.float64)
tensor([-0.0030,  0.0130, -0.0122,  0.0006,  0.0006,  0.0006,  0.0136, -0.0115,
         0.0013,  0.0013], dtype=torch.float64)


In [18]:
# Same per-column std/mean computation as the `stacked` cell, kept under the
# names `tensors` and `v`. Prints the (std, mean) pair as one tuple.
tensors = list(map(torch.from_numpy, (d.values for d in data)))
v = torch.vstack(tensors)
std_and_mean = torch.std_mean(v, dim=0)
print(std_and_mean)

(tensor([0.1059, 0.0204, 0.0220, 0.0164, 0.0245, 0.4010, 0.4024, 0.3995, 0.4010,
        0.4009], dtype=torch.float64), tensor([-0.0113,  0.0130, -0.0122,  0.0007,  0.0006,  0.0004,  0.0134, -0.0118,
         0.0010,  0.0010], dtype=torch.float64))


tensor([[-0.4071,  0.0096, -0.0058,  ...,  0.1657,  0.1661,  0.1777],
        [-0.4838, -0.0324, -0.0443,  ..., -0.0662, -0.0645, -0.0603],
        [-0.4137,  0.0855,  0.0698,  ...,  0.4963,  0.4964,  0.5110],
        ...,
        [-0.0986,  0.0070, -0.0097,  ..., -0.1687, -0.1528, -0.1601],
        [-0.1229,  0.0081, -0.0140,  ..., -0.1494, -0.1364, -0.1486],
        [-0.1344,  0.0045, -0.0250,  ...,  0.4500,  0.4673,  0.4535]],
       dtype=torch.float64)
