# Long-Short Strategy, Part 1: Preparing Alpha Factors and Features

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Warnings
import warnings

# Path
from pathlib import Path

# OS & Time
import os
from datetime import datetime

# Data Visualization
import graphviz
import seaborn as sns
import matplotlib.pyplot as plt

# Technical Analysis
import talib
from talib import RSI, BBANDS, MACD, ATR

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Window lengths expressed in rows of daily data:
# 21 rows per trading month, 12 months per year.
MONTH = 21
YEAR = MONTH * 12

In [4]:
# Date bounds used to slice the price history when it is loaded.
START, END = '2010-01-01', '2017-12-31'

In [5]:
# Plot styling applied to the charts drawn below.
sns.set_style('darkgrid')

# Shorthand for building MultiIndex slices inside .loc[...].
idx = pd.IndexSlice

In [6]:
# Tail percentiles: list the lower tail first, then mirror it (1 - p, walking
# the lower tail backwards) so the upper-tail values are appended in
# ascending order.
percentiles = [.001, .01, .02, .03, .04, .05]
percentiles.extend([1 - p for p in reversed(percentiles)])

In [7]:
# Lag lengths (in rows per symbol) passed to pct_change() when the historical
# return columns r01, r05, ... are computed further down.
T = [1, 5, 10, 21, 42, 63]

### Loading Quandl Wiki Stock Prices & Meta Data

In [8]:
# HDF5 store that holds both the price history and the stock metadata.
DATA_STORE = '../data/assets.h5'

# Adjusted price/volume columns to read; the 'adj_' prefix is stripped below.
ohlcv = ['adj_open', 'adj_close', 'adj_low', 'adj_high', 'adj_volume']

with pd.HDFStore(DATA_STORE) as store:
    # Slice the START..END window, shorten the column names, then swap the
    # two index levels and sort on the new order.
    prices = store['quandl/wiki/prices'].loc[idx[START:END, :], ohlcv]
    prices = prices.rename(columns={col: col.replace('adj_', '') for col in ohlcv})
    prices = prices.swaplevel().sort_index()

    # Only market cap and sector are needed from the metadata table.
    metadata = store['us_equities/stocks'][['marketcap', 'sector']]

In [9]:
# Scale volume down by a factor of 1,000.
prices['volume'] = prices['volume'] / 1e3

# Name the index levels so later groupby/join calls can refer to them.
prices.index = prices.index.set_names(['symbol', 'date'])
metadata.index = metadata.index.rename('symbol')

### Removing Stocks with Insufficient Observations

In [10]:
# A symbol must have more than seven YEARs' worth of rows to be kept.
min_obs = 7 * YEAR

# Count rows per symbol and retain the symbols above the threshold.
nobs = prices.groupby(level='symbol').size()
keep = nobs.loc[nobs.gt(min_obs)].index
prices = prices.loc[idx[keep, :], :]

### Aligning Price & Meta Data

In [11]:
# Keep the first row for every symbol and discard rows without a sector.
is_first = ~metadata.index.duplicated()
has_sector = metadata['sector'].notnull()
metadata = metadata[is_first & has_sector]

# Normalise sector labels: lower case, spaces replaced by underscores.
metadata['sector'] = metadata['sector'].str.lower().str.replace(' ', '_')

In [12]:
# Symbols present in both the price data and the metadata.
price_symbols = prices.index.get_level_values('symbol').unique()
shared = price_symbols.intersection(metadata.index)

# Restrict both tables to that common set.
metadata = metadata.loc[shared, :]
prices = prices.loc[idx[shared, :], :]

### Limiting Universe to 1,000 Stocks with Highest Market Cap

In [13]:
# The 1,000 symbols with the largest market cap define the universe;
# both tables are cut down to it.
universe = metadata['marketcap'].nlargest(n=1000).index
metadata = metadata.loc[universe]
prices = prices.loc[idx[universe, :], :]

In [14]:
# Number of stocks per sector in the selected universe.
metadata['sector'].value_counts()

In [None]:
# Summarise the price frame (columns, dtypes, non-null counts) after the
# universe selection.
prices.info(show_counts=True)

In [None]:
# Summarise the metadata frame after the universe selection.
metadata.info()

### Ranking Assets by Rolling Average Dollar Volume

#### Computing Dollar Volume

In [15]:
# Row-wise product of close and volume, scaled down by a further factor of
# 1,000. prod() is kept (rather than a plain multiplication) because it
# skips missing values by default.
prices['dollar_vol'] = prices.loc[:, ['close', 'volume']].prod(axis=1) / 1e3

#### 21-Day Moving Average

In [16]:
# Rolling mean of dollar volume per symbol.
dollar_vol_ma = (
    prices['dollar_vol']
    .unstack(level='symbol')        # dates as rows, one column per symbol
    .rolling(21, min_periods=1)     # 1 trading month; partial windows allowed
    .mean()
)

#### Ranking Stocks by Moving Average

In [17]:
# Cross-sectional rank on each date (rank 1 = largest average dollar volume),
# reshaped back to long form and re-ordered to match the (symbol, date) index.
prices['dollar_vol_rank'] = (
    dollar_vol_ma
    .rank(axis='columns', ascending=False)
    .stack(level='symbol')
    .swaplevel()
)

In [18]:
# Summarise the price frame after the dollar-volume columns were added.
prices.info(show_counts=True)

### Adding some Basic Factors

#### Computing The Relative Strength Index

In [19]:
# RSI of the close price, computed per symbol with TA-Lib's default settings.
# group_keys=False keeps the result on the original (symbol, date) index:
# without it pandas >= 2.0 prepends the group key as an extra index level and
# the column assignment no longer aligns. This matches the NATR/ATR/MACD cells.
prices['rsi'] = (prices
                 .groupby(level='symbol', group_keys=False)
                 .close
                 .apply(RSI))

In [20]:
# Distribution of all available RSI values with dashed markers at 30 and 70.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
rsi_values = prices['rsi'].dropna()
ax = sns.distplot(rsi_values)
for threshold in (30, 70):
    ax.axvline(threshold, ls='--', lw=1, c='k')
ax.set_title('RSI Distribution with Signal Threshold')
sns.despine()
plt.tight_layout()
plt.show()

#### Computing Bollinger Bands

In [21]:
def compute_bb(close):
    """Return the upper and lower 20-period Bollinger Bands for one price series.

    The middle band returned by BBANDS is discarded. The result is a
    DataFrame with columns 'bb_high' and 'bb_low' on the index of ``close``.
    """
    upper, _middle, lower = BBANDS(close, timeperiod=20)
    bands = {'bb_high': upper, 'bb_low': lower}
    return pd.DataFrame(bands, index=close.index)

In [22]:
# Compute the bands per symbol and attach them as new columns.
# group_keys=False keeps the per-symbol results on the original
# (symbol, date) index so the join lines up; without it pandas >= 2.0 adds the
# group key as an extra index level. The NATR/ATR/MACD cells already pass it.
prices = prices.join(
    prices
    .groupby(level='symbol', group_keys=False)
    .close
    .apply(compute_bb)
)

In [23]:
# Replace each band by log1p of its relative distance from the close:
#   bb_high -> log1p((bb_high - close) / bb_high)
#   bb_low  -> log1p((close - bb_low) / close)
prices['bb_high'] = (
    (prices['bb_high'] - prices['close']) / prices['bb_high']
).apply(np.log1p)

prices['bb_low'] = (
    (prices['close'] - prices['bb_low']) / prices['close']
).apply(np.log1p)

In [24]:
# Side-by-side distributions of the two band distances, restricted to rows
# whose dollar-volume rank is below 100.
fig, axes = plt.subplots(ncols=2, figsize=(15, 5))

liquid = prices['dollar_vol_rank'] < 100
for panel, band in zip(axes, ('bb_low', 'bb_high')):
    sns.distplot(prices.loc[liquid, band].dropna(), ax=panel)

sns.despine()
plt.tight_layout()
plt.show()

#### Computing Average True Range

In [25]:
# NATR per symbol from high/low/close, using TA-Lib's default settings.
prices['NATR'] = (
    prices
    .groupby(level='symbol', group_keys=False)
    .apply(lambda stock: talib.NATR(stock['high'], stock['low'], stock['close']))
)

In [26]:
def compute_atr(stock_data):
    """Return the 14-period ATR of one stock, standardised to a z-score.

    ``stock_data`` must expose ``high``, ``low`` and ``close``. The ATR is
    centred on its own mean and divided by its own ``.std()``.
    """
    atr = ATR(stock_data.high,
              stock_data.low,
              stock_data.close,
              timeperiod=14)
    return (atr - atr.mean()) / atr.std()

In [27]:
# Standardised ATR, computed independently for every symbol.
frames_by_symbol = prices.groupby('symbol', group_keys=False)
prices['ATR'] = frames_by_symbol.apply(compute_atr)

#### Computing Moving Average Convergence/Divergence

In [28]:
# PPO of the close price per symbol, using TA-Lib's default settings.
# group_keys=False keeps the result on the original (symbol, date) index so
# the column assignment aligns on pandas >= 2.0, matching the MACD cell below.
prices['PPO'] = (prices
                 .groupby(level='symbol', group_keys=False)
                 .close
                 .apply(talib.PPO))

In [29]:
def compute_macd(close):
    """Return the standardised MACD line of one price series.

    Only the first of the outputs returned by MACD is used. It is centred
    with ``np.mean`` and scaled with ``np.std``.
    """
    macd_line = MACD(close)[0]
    centered = macd_line - np.mean(macd_line)
    return centered / np.std(macd_line)

In [30]:
# Standardised MACD of the close price, computed independently per symbol.
close_by_symbol = prices.groupby('symbol', group_keys=False)['close']
prices['MACD'] = close_by_symbol.apply(compute_macd)

#### Combining Price & Meta Data

In [31]:
# Replace the sector labels by integer codes (order of first appearance).
sector_codes = pd.factorize(metadata['sector'])[0]
metadata['sector'] = sector_codes.astype(int)

# Attach the encoded sector to every price row of the matching symbol.
prices = prices.join(metadata.loc[:, ['sector']])

### Computing Returns

#### Historical Returns

In [32]:
# Percentage change of the close over each lag in T, computed within a symbol
# so that returns never span two different stocks.
by_sym = prices.groupby(level='symbol')['close']

for lag in T:
    column = f'r{lag:02}'
    prices[column] = by_sym.pct_change(periods=lag)

#### Daily Historical Return Deciles

In [33]:
# For every lag, bucket each date's cross-section of returns into (up to) ten
# quantile bins labelled 0..9; duplicate bin edges are dropped.
# group_keys=False keeps the result on the original (symbol, date) index:
# without it pandas >= 2.0 prepends the date as an extra index level and the
# column assignment no longer aligns with `prices`.
for t in T:
    prices[f'r{t:02}dec'] = (prices[f'r{t:02}']
                             .groupby(level='date', group_keys=False)
                             .apply(lambda x: pd.qcut(x,
                                                      q=10,
                                                      labels=False,
                                                      duplicates='drop')))

#### Daily Sector Return Quintiles

In [34]:
for t in T:
    prices[f'r{t:02}q_sector'] = (prices
                                  .groupby(['date', 'sector'])[f'r{t:02}']
                                  .transform(lambda x: pd.qcut(x, 
                                                               q=5, 
                                                               labels=False, 
                                                               duplicates='drop')))

#### Computing Forward Returns

In [36]:
# Forward return = the t-period historical return observed t rows later,
# shifted back within each symbol so it lines up with the current row.
for t in (1, 5, 21):
    source_col, target_col = f'r{t:02}', f'r{t:02}_fwd'
    returns_by_symbol = prices.groupby(level='symbol')[source_col]
    prices[target_col] = returns_by_symbol.shift(-t)

### Removing Outliers

In [37]:
# Summary statistics of the historical return columns, one per lag in T.
return_columns = [f'r{t:02}' for t in T]
prices[return_columns].describe()

In [38]:
# Symbols that show a one-period return above 100% on at least one row.
extreme_rows = prices.loc[prices['r01'].gt(1)]
outliers = extreme_rows.index.get_level_values('symbol').unique()

In [39]:
# Remove every row belonging to a flagged symbol.
prices = prices.drop(index=outliers, level='symbol')

### Creating Time Variables (Year, Month, Weekday)

In [40]:
# Calendar attributes derived from the 'date' index level.
dates = prices.index.get_level_values('date')
for field in ('year', 'month', 'weekday'):
    prices[field] = getattr(dates, field)

### Storing Model Data

In [41]:
# Final check of columns, dtypes and non-null counts before saving.
prices.info(show_counts=True)

In [42]:
# Persist the feature set under the 'model_data' key; the raw price and
# volume columns are dropped first.
# `key` is passed by name: pandas >= 2.1 deprecates positional arguments to
# to_hdf() other than the path.
(prices
 .drop(['open', 'close', 'low', 'high', 'volume'], axis=1)
 .to_hdf('data.h5', key='model_data'))