# Creating a Dataset Formatted for RNNs

## Imports & Settings

In [22]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's global random number generator.
np.random.seed(42)

# Shorthand used below to build MultiIndex slices inside .loc[...].
idx = pd.IndexSlice

## Build daily dataset

In [23]:
# Directory holding the HDF5 asset store, relative to this notebook.
DATA_DIR = Path('..') / '00_data'

In [24]:
# Load the daily price panel and keep only the two fields needed below
# (adjusted close for returns, volume for the dollar-volume ranking).
# NOTE(review): the recorded output ends at 2022-12-30 although the same store
# holds 2023 rows; if the date level holds strings, the upper bound '2023'
# sorts before '2023-01-03' and silently excludes 2023 -- confirm intended.
price_store = DATA_DIR / 'assets_v1.h5'
fields = ['adjusted_close', 'volume']
period = idx[:, '2013':'2023']
prices = pd.read_hdf(price_store, key='stocks/prices/daily').loc[period, fields]
prices.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1234632 entries, ('AAPL', '2013-01-02') to ('NWS', '2022-12-30')
Data columns (total 2 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   adjusted_close  1234632 non-null  float64
 1   volume          1234632 non-null  int64  
dtypes: float64(1), int64(1)
memory usage: 24.2+ MB


### Select most traded stocks

In [25]:
# Name the two index levels so later steps can refer to them by label.
prices.index = prices.index.set_names(['ticker', 'date'])

In [28]:
# Rank tickers by dollar volume on each date (rank 1 = highest dollar volume),
# considering only tickers with observations on at least 95% of the dates.
n_dates = prices.index.unique(level='date').size
min_obs = int(.95 * n_dates)

dollar_vol_wide = (prices['adjusted_close'] * prices['volume']).unstack('ticker')
daily_rank = (dollar_vol_wide
              .dropna(thresh=min_obs, axis=1)
              .rank(ascending=False, axis=1))
dollar_vol = daily_rank.stack('ticker')

# The lowest average rank marks the most heavily traded tickers.
avg_rank = dollar_vol.groupby(level='ticker').mean()
most_traded = avg_rank.nsmallest(500).index

# Daily returns in wide format (dates x tickers), newest date first.
close = prices.loc[idx[most_traded, :], 'adjusted_close']
returns = close.unstack('ticker').pct_change().sort_index(ascending=False)
returns.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2518 entries, 2022-12-30 to 2013-01-02
Columns: 473 entries, AAPL to ROL
dtypes: float64(473)
memory usage: 9.1+ MB


### Stack 21-day time series

In [30]:
# Window parameters for the stacked daily sequences.
T = 21 # days
n = returns.shape[0]      # number of dates available
tcols = [*range(T)]       # integer column labels 0..T-1
tickers = returns.columns

In [31]:
# Build one (ticker x lag) frame per rolling window of T+1 consecutive rows.
# `returns` is sorted newest-first, so row 0 of each window (column 0 after
# transposing) is the most recent return and becomes the label; the remaining
# columns hold the returns that precede it.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration.
windows = []
for i in range(n-T-1):
    df = returns.iloc[i:i+T+1]
    date = df.index.max()  # latest date in the window, i.e. the label's date
    windows.append(df.reset_index(drop=True).T
                   .assign(date=date, ticker=tickers)
                   .set_index(['ticker', 'date']))
# With no windows the result is an empty frame, matching the previous behavior.
data = pd.concat(windows) if windows else pd.DataFrame()
data = data.rename(columns={0: 'label'}).sort_index().dropna()
# Winsorize each selected feature column at its own 1st/99th percentile;
# the label column is left unclipped.
# NOTE(review): tcols[1:] selects columns 1..T-1, so the oldest lag (column T)
# is not winsorized -- confirm this is intended.
data.loc[:, tcols[1:]] = (data.loc[:, tcols[1:]].apply(lambda x: x.clip(lower=x.quantile(.01),
                                                  upper=x.quantile(.99))))
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1180133 entries, ('A', '2013-02-04') to ('ZTS', '2022-12-30')
Data columns (total 22 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   label   1180133 non-null  float64
 1   1       1180133 non-null  float64
 2   2       1180133 non-null  float64
 3   3       1180133 non-null  float64
 4   4       1180133 non-null  float64
 5   5       1180133 non-null  float64
 6   6       1180133 non-null  float64
 7   7       1180133 non-null  float64
 8   8       1180133 non-null  float64
 9   9       1180133 non-null  float64
 10  10      1180133 non-null  float64
 11  11      1180133 non-null  float64
 12  12      1180133 non-null  float64
 13  13      1180133 non-null  float64
 14  14      1180133 non-null  float64
 15  15      1180133 non-null  float64
 16  16      1180133 non-null  float64
 17  17      1180133 non-null  float64
 18  18      1180133 non-null  float64
 19  19      1180133 non-null  float

In [32]:
# Show the (rows, columns) shape of the stacked daily dataset.
data.shape

(1180133, 22)

In [35]:
# Persist the stacked daily sequences under their own key in the HDF5 file.
data.to_hdf(path_or_buf='lstm_data.h5', key='returns_daily')

In [36]:
# Print a summary of what the HDF5 file contains after the write.
with pd.HDFStore('lstm_data.h5') as hdf:
    summary = hdf.info()
    print(summary)

<class 'pandas.io.pytables.HDFStore'>
File path: lstm_data.h5
/returns_daily            frame        (shape->[1180133,1])


## Build weekly dataset

We load the Quandl adjusted stock price data:

In [58]:
# Reload adjusted close prices and pivot to wide format (dates x tickers),
# keeping rows from the label '2005' onwards.
store_path = DATA_DIR / 'assets_v1.h5'
prices = (pd.read_hdf(store_path, key='stocks/prices/daily')['adjusted_close']
          .swaplevel()
          .unstack()
          .loc['2005':])
prices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4566 entries, 2005-01-03 to 2023-02-22
Columns: 503 entries, AAPL to NWS
dtypes: float64(503)
memory usage: 17.6+ MB


### Resample to weekly frequency

We start by generating weekly returns for the 421 stocks that have no missing data over the 2006-2023 period, as follows:

This could present a problem because it leaves out influential stocks with incomplete data, such as Tesla, but we will try it for now.

In [60]:
# Resampling needs a DatetimeIndex, so parse the date labels first.
prices.index = pd.to_datetime(prices.index)

# Take the last price of each week, convert to weekly returns, restrict to
# 2006-2023, and keep only tickers with a complete return history.
weekly_close = prices.resample('W').last()
weekly_returns = weekly_close.pct_change().loc['2006': '2023']
returns = weekly_returns.dropna(axis=1).sort_index(ascending=False)  # newest week first
returns.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 896 entries, 2023-02-26 to 2006-01-01
Freq: -1W-SUN
Columns: 421 entries, AAPL to DISH
dtypes: float64(421)
memory usage: 2.9 MB


In [61]:
# Preview the first and last five rows together.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same frame on every pandas version.
pd.concat([returns.head(), returns.tail()])

Unnamed: 0_level_0,AAPL,MSFT,AMZN,GOOGL,BRK-B,NVDA,UNH,XOM,JNJ,JPM,...,AIZ,DXC,ALK,MHK,NWL,RL,LNC,DVA,LUMN,DISH
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-02-26,-0.023861,-0.025382,-0.014506,-0.028617,-0.015702,-0.029643,-0.019376,-0.013749,-0.016273,-0.025872,...,-0.014727,-0.024339,-0.032186,-0.054507,-0.048322,-0.029812,-0.044352,0.017157,-0.058524,-0.075618
2023-02-19,0.010198,-0.0167,-0.0042,-0.002326,-0.005324,0.005784,0.009772,-0.059022,-0.00374,0.008508,...,-0.007496,-0.004844,-0.009883,-0.015894,0.008802,0.038575,0.000296,0.010231,-0.007576,0.035113
2023-02-12,-0.021097,0.018386,-0.055905,-0.097442,0.004473,0.00782,0.047095,0.064778,-0.014944,-0.000354,...,-0.001646,0.000346,-0.072751,-0.075762,-0.083747,-0.061006,-0.041395,-0.028077,-0.247148,-0.093501
2023-02-05,0.058727,0.041062,0.011248,0.054443,-0.002135,0.036091,-0.028865,-0.031917,-0.021518,0.005487,...,0.025163,0.021931,0.04802,0.061885,0.017677,0.024996,0.02828,0.053358,-0.005671,0.047222
2023-01-29,0.058461,0.033053,0.051311,0.013773,-0.002259,0.1416,-0.001377,0.019939,-0.003022,0.038792,...,0.011171,-0.004577,0.024087,0.029319,0.052492,0.014622,0.080315,0.010334,0.01341,0.035971
2006-01-29,-0.053353,0.052253,0.029599,0.085182,-0.00136,0.055698,-0.015805,0.012558,-0.034375,0.044675,...,0.029892,0.004939,0.013307,0.000354,-0.018376,0.07341,0.003223,0.000729,0.002382,-0.004296
2006-01-22,-0.111004,-0.028687,-0.010811,-0.143244,-0.003388,0.030577,-0.023958,-0.007217,-0.016499,-0.046841,...,-0.00563,-0.059883,-0.05324,-0.027235,0.002092,-0.054088,-0.028715,0.011431,0.015991,0.000714
2006-01-15,0.121761,0.010402,-0.072488,0.001261,-0.006061,0.040889,-0.031161,0.02591,-0.01246,-0.002499,...,0.00339,-0.025376,-0.099836,-0.017389,-0.004582,-0.007969,0.000553,-0.020762,-0.010742,-0.036257
2006-01-08,0.061319,0.029065,0.01527,0.122458,0.011753,0.117068,0.01223,0.058039,0.041598,0.016918,...,0.017705,0.089462,-0.012876,0.018165,0.009675,0.005878,0.031189,0.093799,0.010248,0.065492
2006-01-01,-0.019869,-0.018392,-0.042056,-0.037297,-0.005926,-0.026091,-0.023724,-0.016286,-0.016529,-0.012689,...,0.006481,0.021383,0.010749,-0.010917,-0.009169,-0.003891,-0.015959,-0.005303,-0.00867,-0.007669


### Create & stack 52-week sequences

We'll use 52-week sequences, which we'll create in a stacked format:

In [62]:
# Window parameters for the stacked weekly sequences.
T = 52 # weeks
n = returns.shape[0]      # number of weeks available
tcols = [*range(T)]       # integer column labels 0..T-1
tickers = returns.columns

In [63]:
# Build one (ticker x lag) frame per rolling window of T+1 consecutive weeks.
# `returns` is sorted newest-first, so column 0 of each transposed window is
# the most recent weekly return and the later columns are the weeks before it.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies every accumulated row on each iteration.
windows = []
for i in range(n-T-1):
    df = returns.iloc[i:i+T+1]
    date = df.index.max()  # latest week in the window
    windows.append(df.reset_index(drop=True).T
                   .assign(date=date, ticker=tickers)
                   .set_index(['ticker', 'date']))
# With no windows the result is an empty frame, matching the previous behavior.
data = pd.concat(windows) if windows else pd.DataFrame()
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 354903 entries, ('AAPL', Timestamp('2023-02-26 00:00:00')) to ('DISH', Timestamp('2007-01-07 00:00:00'))
Data columns (total 53 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       354903 non-null  float64
 1   1       354903 non-null  float64
 2   2       354903 non-null  float64
 3   3       354903 non-null  float64
 4   4       354903 non-null  float64
 5   5       354903 non-null  float64
 6   6       354903 non-null  float64
 7   7       354903 non-null  float64
 8   8       354903 non-null  float64
 9   9       354903 non-null  float64
 10  10      354903 non-null  float64
 11  11      354903 non-null  float64
 12  12      354903 non-null  float64
 13  13      354903 non-null  float64
 14  14      354903 non-null  float64
 15  15      354903 non-null  float64
 16  16      354903 non-null  float64
 17  17      354903 non-null  float64
 18  18      354903 non-null  float64
 19  19      3

In [64]:
def _winsorize(col):
    """Clip a column to its own 1st and 99th percentiles."""
    lo = col.quantile(.01)
    hi = col.quantile(.99)
    return col.clip(lower=lo, upper=hi)


# Limit the influence of outliers, one column at a time.
data[tcols] = data[tcols].apply(_winsorize)

In [65]:
# Column 0 holds the most recent return of each window; give it a descriptive name.
data = data.rename({0: 'fwd_returns'}, axis='columns')

In [66]:
# Binary target: 1 when the forward return is strictly positive, else 0.
data['label'] = data['fwd_returns'].gt(0).astype(int)

In [67]:
# Show the (rows, columns) shape of the stacked weekly dataset.
data.shape

(354903, 54)

In [68]:
# Persist the weekly sequences, sorted by (ticker, date), under their own key.
data.sort_index().to_hdf(path_or_buf='lstm_data.h5', key='returns_weekly')