# Creating a Dataset Formatted for RNN Examples

### Loading Libraries

In [24]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warning
import warnings

# Path
from pathlib import Path

In [26]:
# Silence library warnings so that cell output stays readable.
warnings.filterwarnings('ignore')

# Shorthand for building label-based slices on a MultiIndex.
idx = pd.IndexSlice

# Fix NumPy's global seed so that any random draws are repeatable.
np.random.seed(42)

### Building Daily Dataset

In [29]:
# Data directory, located one level above the working directory.
DATA_DIR = Path('..') / 'data'

In [33]:
# Read the full price table from the HDF5 store.
prices = pd.read_hdf(DATA_DIR / 'assets.h5', key='quandl/wiki/prices')

# Keep rows whose first index level falls in 2010-2017 (any value on the
# second level) and only the adjusted close and adjusted volume columns.
prices = prices.loc[idx['2010':'2017', :], ['adj_close', 'adj_volume']]

prices.info()

#### Selecting the Most Traded Stocks

In [35]:
# Number of distinct dates in the price index.
n_dates = prices.index.unique(level='date').size

# Dollar volume = adjusted close x adjusted volume, one column per ticker.
dollar_vol = (prices['adj_close'] * prices['adj_volume']).unstack(level='ticker')

# Keep only tickers with at least int(.95 * n_dates) non-missing observations.
dollar_vol = dollar_vol.dropna(axis='columns', thresh=int(.95 * n_dates))

# Rank the tickers within each date (rank 1 = highest dollar volume),
# then go back to one row per (date, ticker).
dollar_vol = dollar_vol.rank(axis='columns', ascending=False).stack(level='ticker')

In [38]:
# Average each ticker's per-date dollar-volume rank over all dates ...
most_traded = dollar_vol.groupby(level='ticker').mean()
# ... and keep the 500 tickers with the smallest average rank.
most_traded = most_traded.nsmallest(n=500).index

In [40]:
# Adjusted close of the selected tickers, one column per ticker.
returns = prices.loc[idx[:, most_traded], 'adj_close'].unstack(level='ticker')

# Row-over-row percentage change, then order the rows newest-first.
returns = returns.pct_change().sort_index(ascending=False)

returns.info()

#### Stacking 21-Day Time Series

In [43]:
# Window length: number of rows of `returns` that follow the first row
# of each window.
T = 21

# Integer labels 0..T-1.
tcols = [*range(T)]

# Column labels and row count of the return matrix.
tickers = returns.columns
n = len(returns)

In [45]:
def _window_to_rows(window: pd.DataFrame, tickers: pd.Index) -> pd.DataFrame:
    """Turn one block of consecutive return rows into one row per ticker.

    The rows of `window` become integer columns 0, 1, ... in their existing
    order, and the result is indexed by (ticker, date), where `date` is the
    latest date found in the window.
    """
    return (window.reset_index(drop=True).T
            .assign(date=window.index.max(), ticker=tickers)
            .set_index(['ticker', 'date']))


# Build every (T + 1)-row window first and concatenate once. Growing `data`
# with pd.concat inside the loop re-copies all accumulated rows on every
# iteration, which is quadratic in the number of windows.
frames = [_window_to_rows(returns.iloc[i:i + T + 1], tickers)
          for i in range(n - T - 1)]
data = pd.concat(frames) if frames else pd.DataFrame()

# Column 0 (first row of each window) becomes the label; rows with any
# missing value are discarded.
data = data.rename(columns={0: 'label'}).sort_index().dropna()

# Winsorize each lagged-return column at its own 1st/99th percentile.
# A window has T + 1 rows, so the lag columns are 1..T; the previous
# selection `tcols[1:]` stopped at T - 1 and left column T unclipped.
feature_cols = list(range(1, T + 1))
data.loc[:, feature_cols] = data.loc[:, feature_cols].apply(
    lambda x: x.clip(lower=x.quantile(.01), upper=x.quantile(.99)))

data.info()

In [47]:
# Display (rows, columns) of the stacked daily dataset.
data.shape

In [49]:
# Persist the daily dataset. `key` is passed by keyword because pandas 2.x
# deprecates positional arguments to `to_hdf` other than the path.
data.to_hdf('data.h5', key='returns_daily')

### Building Weekly Dataset

In [None]:
# Reload the price table and keep only the adjusted close.
prices = pd.read_hdf(DATA_DIR / 'assets.h5', key='quandl/wiki/prices')['adj_close']

# Pivot the innermost index level into columns and keep rows from 2007 on.
prices = prices.unstack().loc['2007':]

prices.info()

#### Resampling to Weekly Frequency

In [60]:
# Last observed price in each weekly ('W') bin, then week-over-week change.
returns = prices.resample('W').last().pct_change()

# Restrict to 2008-2017 and drop every column with a missing value there.
returns = returns.loc['2008': '2017'].dropna(axis='columns')

# Order the rows newest-first.
returns = returns.sort_index(ascending=False)

returns.info()

In [62]:
# Show the first and last five rows together. `DataFrame.append` was removed
# in pandas 2.0, so the two pieces are joined with `pd.concat` instead.
pd.concat([returns.head(), returns.tail()])

#### Creating & Stacking 52-Week Sequences

In [65]:
# Window length: number of rows of `returns` that follow the first row
# of each window.
T = 52

# Integer labels 0..T-1.
tcols = [*range(T)]

# Column labels and row count of the return matrix.
tickers = returns.columns
n = len(returns)

In [67]:
def _window_to_rows(window: pd.DataFrame, tickers: pd.Index) -> pd.DataFrame:
    """Turn one block of consecutive return rows into one row per ticker.

    The rows of `window` become integer columns 0, 1, ... in their existing
    order, and the result is indexed by (ticker, date), where `date` is the
    latest date found in the window.
    """
    return (window.reset_index(drop=True).T
            .assign(date=window.index.max(), ticker=tickers)
            .set_index(['ticker', 'date']))


# Build every (T + 1)-row window first and concatenate once. Growing `data`
# with pd.concat inside the loop re-copies all accumulated rows on every
# iteration, which is quadratic in the number of windows.
frames = [_window_to_rows(returns.iloc[i:i + T + 1], tickers)
          for i in range(n - T - 1)]
data = pd.concat(frames) if frames else pd.DataFrame()

data.info()

In [69]:
# Winsorize each return column at its own 1st/99th percentile.
# Every stacked window spans T + 1 rows, so the columns run from 0 to T;
# selecting `tcols` (0..T-1) left the oldest lag, column T, unclipped.
window_cols = list(range(T + 1))
data[window_cols] = data[window_cols].apply(
    lambda x: x.clip(lower=x.quantile(.01), upper=x.quantile(.99)))

In [71]:
# Column 0 holds the first row of each window; give it a descriptive name.
data = data.rename({0: 'fwd_returns'}, axis='columns')

In [76]:
# Binary target: 1 where the forward return is strictly positive, else 0.
data['label'] = data['fwd_returns'].gt(0).astype(int)

In [78]:
# Display (rows, columns) of the stacked weekly dataset.
data.shape

In [80]:
# Persist the weekly dataset sorted by its index. `key` is passed by keyword
# because pandas 2.x deprecates positional arguments to `to_hdf` other than
# the path.
data.sort_index().to_hdf('data.h5', key='returns_weekly')