# CNN for Trading - Part 1: Feature Engineering

### Loading Libraries

In [95]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Warnings
import warnings

# Path 
import sys
from time import time
from pathlib import Path
from random import randint

# StatsModel
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS


# Technical Analysis
from talib import (RSI, BBANDS, MACD,
                   NATR, WILLR, WMA,
                   EMA, SMA, CCI, CMO,
                   MACD, PPO, ROC,
                   ADOSC, ADX, MOM)

In [81]:
%matplotlib inline

In [83]:
# Shorthand used to build MultiIndex slicers inside `.loc[...]` calls.
idx = pd.IndexSlice

# Notebook-wide presentation settings: ignore all warnings and use
# seaborn's white grid theme for plots.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

In [85]:
# Path of the HDF5 store the price and metadata tables are read from.
DATA_STORE: str = "../data/assets.h5"

In [87]:
# Period lengths expressed in rows of daily data
# (21 per month — presumably trading days; twelve months per year).
MONTH = 21
YEAR = MONTH * 12

In [89]:
# First and last calendar date of the price sample (ISO date strings).
START, END = '2000-01-01', '2017-12-31'

In [91]:
# Look-back lengths in days. Note that `T` is re-assigned further down,
# before the technical indicators are generated.
T = [
    1,
    5,
    10,
    21,
    42,
    63,
]

In [97]:
# Output directory for this notebook's results.
results_path = Path('results', 'cnn_for_trading')

# `exist_ok=True` makes creation idempotent without a separate existence
# check (no check-then-create race), and still raises if the path exists
# but is not a directory.
results_path.mkdir(parents=True, exist_ok=True)

### Loading Quandl Wiki Stock Prices & Meta Data

In [100]:
# Adjusted price/volume columns to read from the price table, in the
# order they are selected.
adj_ohlcv = [f'adj_{field}' for field in ('open', 'close', 'low', 'high', 'volume')]

In [102]:
# Read both tables while the store is open; all further processing works
# on the in-memory copies.
with pd.HDFStore(DATA_STORE) as store:
    # The first index level is sliced by START:END, so it is presumably the
    # date level — confirm against the stored table.
    prices = store['quandl/wiki/prices'].loc[idx[START:END, :], adj_ohlcv]
    metadata = store['us_equities/stocks'].loc[:, ['marketcap', 'sector']]

# Strip the 'adj_' prefix so columns read open/close/low/high/volume.
prices = prices.rename(columns=lambda name: name.replace('adj_', ''))

# Swap the two index levels, sort, and discard rows with any missing value.
prices = prices.swaplevel().sort_index().dropna()

# Names of the raw price/volume columns, kept so they can be dropped later.
ohlcv = list(prices.columns)

In [104]:
# Label the index levels of both tables.
metadata.index.name = 'symbol'
prices.index.names = ['symbol', 'date']

# Rescale volume by a factor of 1/1000.
prices['volume'] = prices['volume'].div(1e3)

### Rolling Universe: Picking 500 Most-Traded Stocks

In [107]:
# Close price times (rescaled) volume, pivoted to one column per symbol
# with rows ordered by date.
dollar_vol = (
    (prices['close'] * prices['volume'])
    .unstack('symbol')
    .sort_index()
)

In [109]:
# Distinct calendar years present in the price index, ascending
# (np.unique already returns its result sorted).
years = list(np.unique([stamp.year
                        for stamp in prices.index.get_level_values('date').unique()]))

In [111]:
# Universe selection settings: look-back length in years and the number of
# symbols kept per selection.
train_window, universe_size = 5, 500

In [113]:
# Build the rolling universe: for every year that has `train_window` earlier
# years available, rank symbols by median dollar volume over the window and
# keep the price rows of the `universe_size` largest.
#
# The look-back is driven by `train_window` (previously a hard-coded 5 that
# ignored the setting declared above). Both slice bounds are year labels,
# so each window runs from years[i - train_window] through years[i].
yearly_slices = []

for i in range(train_window, len(years)):
    start = str(years[i - train_window])
    end = str(years[i])
    most_traded = (dollar_vol.loc[start:end, :]
                   # keep symbols with at least 1000 non-missing observations
                   .dropna(thresh=1000, axis=1)
                   .median()
                   .nlargest(universe_size)
                   .index)
    yearly_slices.append(prices.loc[idx[most_traded, start:end], :])

# Windows overlap, so the concatenation can contain repeated
# (symbol, date) rows at this point.
universe = pd.concat(yearly_slices)

In [115]:
# Keep only the first occurrence of every index entry.
is_repeat = universe.index.duplicated()
universe = universe[~is_repeat]

In [117]:
# Print a column summary including non-null counts.
# `null_counts=` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts=` is its replacement.
universe.info(show_counts=True)

In [119]:
# Distribution of the number of rows available per symbol.
universe.groupby(level='symbol').size().describe()

In [121]:
# Persist the selected universe. `key` is passed by keyword because newer
# pandas versions deprecate passing it positionally.
universe.to_hdf('data.h5', key='universe')

### Generating Technical Indicators Factors

In [124]:
# Indicator periods 6 through 20; every indicator below is computed once
# per period.
T = [*range(6, 21)]

#### Relative Strength Index

In [127]:
# RSI of the close price, computed separately for each symbol and period.
close_by_symbol = universe.groupby(level='symbol')['close']

for t in T:
    universe[f'{t:02}_RSI'] = close_by_symbol.apply(RSI, timeperiod=t)

#### Williams %R

In [130]:
# Williams %R from high/low/close, per symbol and period.
for t in T:
    def _willr(group, period=t):
        """Williams %R of a single symbol's rows for the current period."""
        return WILLR(group.high, group.low, group.close, timeperiod=period)

    universe[f'{t:02}_WILLR'] = (
        universe.groupby(level='symbol', group_keys=False).apply(_willr)
    )

#### Computing Bollinger Bands

In [133]:
def compute_bb(close, timeperiod):
    """Return the upper and lower Bollinger Bands of `close`.

    Calls `BBANDS` with the given `timeperiod` and discards the middle band.
    The result is a DataFrame on `close.index` with two columns named
    '<timeperiod:02>_BBH' (upper) and '<timeperiod:02>_BBL' (lower).
    """
    upper, _, lower = BBANDS(close, timeperiod=timeperiod)
    prefix = f'{timeperiod:02}'
    bands = {f'{prefix}_BBH': upper, f'{prefix}_BBL': lower}
    return pd.DataFrame(bands, index=close.index)

In [135]:
# Attach both Bollinger Bands for every period, then replace each band by
# log1p of its relative distance to the close:
#   upper: (band - close) / band
#   lower: (close - band) / close
for t in T:
    bbh, bbl = f'{t:02}_BBH', f'{t:02}_BBL'
    bands = universe.groupby(level='symbol').close.apply(compute_bb, timeperiod=t)
    universe = universe.join(bands)
    universe[bbh] = np.log1p(universe[bbh].sub(universe.close).div(universe[bbh]))
    universe[bbl] = np.log1p(universe.close.sub(universe[bbl]).div(universe.close))

#### Normalized Average True Range

In [138]:
# Normalized average true range from high/low/close, per symbol and period.
for t in T:
    by_symbol = universe.groupby(level='symbol', group_keys=False)
    universe[f'{t:02}_NATR'] = by_symbol.apply(
        lambda x: NATR(x.high, x.low, x.close, timeperiod=t)
    )

#### Percentage Price Oscillator

In [141]:
# Percentage price oscillator of the close: the period sets `fastperiod`,
# `matype` is fixed at 1 and the slow period is left at the library default.
for t in T:
    universe[f'{t:02}_PPO'] = (
        universe
        .groupby(level='symbol')['close']
        .apply(PPO, fastperiod=t, matype=1)
    )

#### Moving Average Convergence/Divergence

In [144]:
def compute_macd(close, signalperiod):
    """Return the first `MACD` output for `close`, standardized.

    The series is centered on its mean and divided by its standard
    deviation, both taken over the whole input.

    NOTE(review): only element [0] of the MACD result is kept, while the
    argument being varied is `signalperiod` — confirm that this output
    actually depends on the signal period.
    """
    outputs = MACD(close, signalperiod=signalperiod)
    macd = outputs[0]
    centered = macd - np.mean(macd)
    return centered / np.std(macd)

In [146]:
# Standardized MACD of the close, per symbol, with the period used as the
# signal period.
for t in T:
    close_groups = universe.groupby('symbol', group_keys=False)['close']
    universe[f'{t:02}_MACD'] = close_groups.apply(compute_macd, signalperiod=t)

#### Momentum

In [149]:
# Momentum of the close, per symbol and period.
for t in T:
    universe[f'{t:02}_MOM'] = (
        universe
        .groupby(level='symbol')['close']
        .apply(MOM, timeperiod=t)
    )

#### Weighted Moving Average

In [152]:
# Weighted moving average of the close, per symbol and period.
close_by_symbol = universe.groupby(level='symbol')['close']

for t in T:
    universe[f'{t:02}_WMA'] = close_by_symbol.apply(WMA, timeperiod=t)

#### Exponential Moving Average

In [155]:
# Exponential moving average of the close, per symbol and period.
for t in T:
    ema_column = f'{t:02}_EMA'
    universe[ema_column] = universe.groupby(level='symbol')['close'].apply(EMA, timeperiod=t)

#### Commodity Channel Index

In [158]:
# Commodity channel index from high/low/close, per symbol and period.
for t in T:
    def _cci(group, period=t):
        """CCI of a single symbol's rows for the current period."""
        return CCI(group.high, group.low, group.close, timeperiod=period)

    universe[f'{t:02}_CCI'] = (
        universe.groupby(level='symbol', group_keys=False).apply(_cci)
    )

#### Chande Momentum Oscillator

In [161]:
# Chande momentum oscillator of the close, per symbol and period.
close_by_symbol = universe.groupby(level='symbol')['close']

for t in T:
    universe[f'{t:02}_CMO'] = close_by_symbol.apply(CMO, timeperiod=t)

#### Rate of Change

In [164]:
# Rate of change of the close, per symbol and period.
for t in T:
    universe[f'{t:02}_ROC'] = (
        universe
        .groupby(level='symbol')['close']
        .apply(ROC, timeperiod=t)
    )

#### Chaikin A/D Oscillator

In [167]:
# Chaikin A/D oscillator from high/low/close/volume, per symbol; the fast
# and slow periods are derived from the period as t - 3 and t + 4.
for t in T:
    by_symbol = universe.groupby(level='symbol', group_keys=False)
    universe[f'{t:02}_ADOSC'] = by_symbol.apply(
        lambda x: ADOSC(x.high, x.low, x.close, x.volume,
                        fastperiod=t-3, slowperiod=4+t)
    )

#### Average Directional Movement Index

In [170]:
# Average directional movement index from high/low/close, per symbol and
# period.
for t in T:
    def _adx(group, period=t):
        """ADX of a single symbol's rows for the current period."""
        return ADX(group.high, group.low, group.close, timeperiod=period)

    universe[f'{t:02}_ADX'] = (
        universe.groupby(level='symbol', group_keys=False).apply(_adx)
    )

In [172]:
# Checkpoint the indicator columns (raw price/volume columns removed).
# `key` is passed by keyword because newer pandas versions deprecate
# passing it positionally.
universe.drop(ohlcv, axis=1).to_hdf('data.h5', key='features')

### Computing Historical Returns

#### Historical Returns

In [175]:
# 1- and 5-row percentage changes of the close within each symbol,
# stored as r01 and r05.
by_sym = universe.groupby(level='symbol')['close']

for t in (1, 5):
    return_col = f'r{t:02}'
    universe[return_col] = by_sym.pct_change(periods=t)

#### Removing Outliers

In [178]:
# Summary statistics of the two return columns.
universe[['r01', 'r05']].describe()

In [180]:
# Symbols with at least one 1-day return above 1 (i.e. more than +100%).
extreme_rows = universe.loc[universe['r01'] > 1]
outliers = extreme_rows.index.get_level_values('symbol').unique()
len(outliers)

In [182]:
# Remove every row of the flagged symbols, not only the extreme
# observations.
universe = universe.drop(index=outliers, level='symbol')

#### Historical Return Quantiles

In [185]:
def _to_decile(returns):
    """Bin one date's returns into up to ten quantile buckets (integer codes)."""
    return pd.qcut(returns, q=10, labels=False, duplicates='drop')


# Cross-sectional bucket of each return, computed date by date and stored
# as r01dec / r05dec.
for t in [1, 5]:
    by_date = universe[f'r{t:02}'].groupby(level='date')
    universe[f'r{t:02}dec'] = by_date.apply(_to_decile)

### Rolling Factor Betas

In [188]:
# Daily Fama-French five-factor data (first table of the download), with
# the market excess return column renamed to 'Market'.
#
# The data library publishes the factor returns and the risk-free rate RF
# in percent, whereas the stock returns in this notebook come from
# `pct_change` and are decimal fractions. Dividing by 100 puts both on the
# same scale, so that subtracting RF from the returns and regressing on the
# factors mixes no units.
factor_data = (web.DataReader('F-F_Research_Data_5_Factors_2x3_daily', 'famafrench',
                              start=START)[0]
               .rename(columns={'Mkt-RF': 'Market'})
               .div(100))

# Name the index so the per-symbol frames can be merged on 'date'.
factor_data.index.names = ['date']

In [190]:
# Print a summary of the downloaded factor table (index, columns, dtypes).
factor_data.info()

In [192]:
# Rolling regression window lengths: 15, 20, ..., 85.
windows = [*range(15, 90, 5)]

len(windows)

In [194]:
# Rolling factor betas: for each window length, regress every symbol's
# excess 1-day return on the five factors and attach the slope
# coefficients to `universe` as '<window>_<factor>' columns.
t = 1
ret = f'r{t:02}'
factors = ['Market', 'SMB', 'HML', 'RMW', 'CMA']
windows = list(range(15, 90, 5))

for window in windows:
    print(window)  # progress indicator
    betas = []
    for symbol, data in universe.groupby(level='symbol'):
        # Align the symbol's returns with the factor rows by date and drop
        # incomplete observations.
        model_data = data[[ret]].merge(factor_data, on='date').dropna()
        # Turn the raw return into an excess return over RF.
        model_data[ret] = model_data[ret] - model_data['RF']

        exog = sm.add_constant(model_data[factors])
        rolling_ols = RollingOLS(endog=model_data[ret], exog=exog, window=window)
        params = rolling_ols.fit(params_only=True).params
        # Only the factor loadings are kept; the intercept is discarded.
        factor_model = params.drop(columns='const')

        # Tag the rows with the symbol so they can be joined back.
        betas.append(factor_model.assign(symbol=symbol).set_index('symbol', append=True))
    betas = pd.concat(betas).add_prefix(f'{window:02}_')
    universe = universe.join(betas)

#### Computing Forward Returns

In [197]:
# Forward-looking targets: within each symbol, shift the t-period return
# and its bucket back by t rows, so each row carries the value observed
# t rows later.
for t in [1, 5]:
    for source in (f'r{t:02}', f'r{t:02}dec'):
        universe[f'{source}_fwd'] = (
            universe.groupby(level='symbol')[source].shift(-t)
        )

### Storing Model Data

In [200]:
# The raw price/volume columns are no longer needed.
universe = universe.drop(columns=ohlcv)

In [202]:
# Print a column summary including non-null counts.
# `null_counts=` was deprecated in pandas 1.2 and removed in 2.0;
# `show_counts=` is its replacement.
universe.info(show_counts=True)

In [204]:
# Historical return and bucket columns that are excluded from the stored
# features.
drop_cols = [f'r{horizon:02}{suffix}'
             for horizon in (1, 5)
             for suffix in ('', 'dec')]

In [206]:
# Columns whose name contains '_fwd' are the prediction targets.
is_forward = universe.columns.str.contains('_fwd', regex=False)
outcomes = universe.columns[is_forward]

In [208]:
universe = universe.sort_index()

# Keep rows from 2001 onwards for every symbol.
since_2001 = idx[:, '2001':]

# Features: everything except the historical return columns and the targets.
features = universe.drop(columns=[*drop_cols, *outcomes]).loc[since_2001, :]
targets = universe.loc[since_2001, outcomes]

with pd.HDFStore('data.h5') as store:
    store.put('features', features)
    store.put('targets', targets)