# How to Transform Data into Factors

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Scikit-Learn
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression

# statsmodels' RollingOLS (imported below) replaces pyfinance.ols.PandasRollingOLS,
# which is no longer maintained. The previous import was:
# from pyfinance.ols import PandasRollingOLS

# StatsModels
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

# Technical Analysis
from talib import RSI, BBANDS, MACD, NATR, ATR

In [2]:
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Seaborn style applied to the figures drawn below.
sns.set_style('whitegrid')

# Shorthand used for slicing MultiIndex frames with .loc.
idx = pd.IndexSlice

### Loading US Equity OHLCV Data

In [3]:
# Path of the HDF5 store that the price and stock-metadata tables are read from.
DATA_STORE = '../data/assets.h5'

In [4]:
# NOTE(review): not referenced anywhere in the visible notebook; presumably the
# number of months per year — confirm it is still needed.
YEAR = 12

In [5]:
# First year of the sample: lower bound of the price date slice and the
# `start` argument of the factor download.
START = 1995

# Last year of the sample: upper bound of the price date slice.
END = 2017

In [7]:
with pd.HDFStore(DATA_STORE) as store:
    # Sector label per stock; joined onto the price rows below.
    sector_by_ticker = store['us_equities/stocks'][['sector']]

    # Slice the first index level to START..END, keep only the 'adj_' columns,
    # drop incomplete rows, swap the two index levels, strip the 'adj_' prefix,
    # attach the sector and drop rows that found no sector.
    prices = (store['quandl/wiki/prices']
              .loc[idx[f'{START}':f'{END}', :], :]
              .filter(like='adj_')
              .dropna()
              .swaplevel()
              .rename(columns=lambda name: name.replace('adj_', ''))
              .join(sector_by_ticker)
              .dropna())

In [8]:
# Column dtypes and non-null counts. `show_counts` is the current keyword;
# `null_counts` was deprecated and later removed from DataFrame.info.
prices.info(show_counts=True)

In [9]:
# Number of distinct tickers in `prices` after loading.
len(prices.index.unique('ticker'))

### Remove Stocks with Less than Ten Years of Data

In [10]:
# Minimum number of rows a ticker must have: ten times 252
# (252 presumably approximates the trading days in a year).
min_obs = 252 * 10

# Row count per ticker, and the tickers that fall short of the minimum.
nobs = prices.groupby(level='ticker').size()
to_drop = nobs.loc[nobs.lt(min_obs)].index

# Remove every row belonging to a too-short ticker.
prices = prices.drop(index=to_drop, level='ticker')

In [11]:
# Re-inspect the frame after dropping short histories. `show_counts` is the
# current keyword; `null_counts` was deprecated and later removed.
prices.info(show_counts=True)

In [12]:
# Number of distinct tickers left after the minimum-history filter.
len(prices.index.unique('ticker'))

### Adding some Basic Factors

#### Compute the Relative Strength Index

In [13]:
# Relative Strength Index per ticker, using TA-Lib's RSI with its defaults.
# group_keys=False (as used for `by_ticker` further down) keeps the original
# index on the result; without it, newer pandas prepends the group key and the
# assignment no longer aligns with `prices`.
prices['rsi'] = (prices
                 .groupby(level='ticker', group_keys=False)
                 .close
                 .apply(RSI))

In [14]:
# Distribution of the RSI values across all tickers and dates.
sns.distplot(prices['rsi'])
plt.show()

#### Computing Bollinger Bands

In [15]:
def compute_bb(close):
    """Bollinger Bands of one close-price series on a log1p scale.

    Applies BBANDS with a 20-period window to ``np.log1p(close)`` and returns
    a DataFrame indexed like ``close`` with columns ``bb_high``, ``bb_mid``
    and ``bb_low`` holding the three outputs in the order BBANDS returns them.
    """
    upper, middle, lower = BBANDS(np.log1p(close), timeperiod=20)
    bands = {'bb_high': upper, 'bb_mid': middle, 'bb_low': lower}
    return pd.DataFrame(bands, index=close.index)

In [16]:
# Compute the Bollinger Bands per ticker and attach the three columns.
# group_keys=False keeps the original index on the per-ticker frames, so the
# join lines up with `prices`; without it, newer pandas prepends the group key
# and the join no longer matches.
prices = (prices.join(prices
                      .groupby(level='ticker', group_keys=False)
                      .close
                      .apply(compute_bb)))

In [17]:
# Inspect the frame after adding the Bollinger Band columns. `show_counts` is
# the current keyword; `null_counts` was deprecated and later removed.
prices.info(show_counts=True)

In [18]:
# Summary statistics of every column whose name contains 'bb_'.
prices.filter(like='bb_').describe()

In [19]:
# One distribution plot per Bollinger Band. The original listed 'bb_low' twice,
# so the upper band was never shown; the third panel now plots 'bb_high'.
fig, axes = plt.subplots(ncols=3, figsize=(15,4))

for i, col in enumerate(['bb_low', 'bb_mid', 'bb_high']):
    sns.distplot(prices[col], ax=axes[i])
    axes[i].set_title(col)
fig.tight_layout()
plt.show()

In [20]:
# The bands were computed on log1p(close), so compare them on the same scale.
log_close = np.log1p(prices.close)

# Distance from the price up to the upper band, and down to the lower band.
prices['bb_up'] = prices.bb_high - log_close
prices['bb_down'] = log_close - prices.bb_low

In [21]:
# Side-by-side boxen plots of the two band-distance features.
fig, axes = plt.subplots(ncols=2, figsize=(10, 4))

for ax, col in zip(axes, ('bb_down', 'bb_up')):
    sns.boxenplot(prices[col], ax=ax)
    ax.set_title(col)

fig.tight_layout()
plt.show()

#### Computing Average True Range

In [23]:
# Per-ticker grouping reused by the ATR and NATR cells; group_keys=False keeps
# the group key out of the index of the apply() results.
by_ticker = prices.groupby('ticker', group_keys=False)

In [24]:
def compute_atr(stock_data):
    """Standardized 14-period Average True Range for one stock.

    Calls ATR on the ``high``, ``low`` and ``close`` columns of ``stock_data``
    and returns the result minus its own mean, divided by its own standard
    deviation.
    """
    raw_atr = ATR(stock_data.high,
                  stock_data.low,
                  stock_data.close,
                  timeperiod=14)
    centered = raw_atr - raw_atr.mean()
    return centered / raw_atr.std()

In [25]:
# Standardized ATR computed separately for every ticker.
prices['atr'] = by_ticker.apply(compute_atr)

In [26]:
# Distribution of the standardized ATR values.
sns.distplot(prices['atr'])
plt.show()

In [27]:
def compute_natr(stock_data):
    """Return NATR for one stock from its high, low and close columns."""
    return NATR(high=stock_data.high,
                low=stock_data.low,
                close=stock_data.close)


prices['natr'] = by_ticker.apply(compute_natr)

In [28]:
# Distribution of NATR, restricted to values below 10.
sns.distplot(prices.loc[prices.natr < 10, 'natr'])
plt.show()

#### Computing Moving Average Convergence/Divergence

In [30]:
def compute_macd(close):
    """Standardized MACD for one close-price series.

    Takes the first element returned by MACD (called with its defaults) and
    returns it minus its own mean, divided by its own standard deviation.
    """
    macd_line = MACD(close)[0]
    centered = macd_line - macd_line.mean()
    return centered / macd_line.std()

In [31]:
# Standardized MACD per ticker. group_keys=False (as used for `by_ticker`)
# keeps the original index on the result; without it, newer pandas prepends the
# group key and the assignment no longer aligns with `prices`.
prices['macd'] = (prices
                  .groupby(level='ticker', group_keys=False)
                  .close
                  .apply(compute_macd))

In [32]:
# Distribution of the standardized MACD values.
sns.distplot(prices['macd'])
plt.show()

#### Computing Dollar Volume to Determine Universe

In [33]:
# Dollar volume = close * volume, expressed in millions.
prices['dollar_volume'] = prices['close'].mul(prices['volume']).div(1e6)

In [34]:
# Persist the price frame with all computed indicators under the key
# 'us/equities/prices' in the local file 'data.h5'.
prices.to_hdf('data.h5', 'us/equities/prices')

In [35]:
# Reload the stored frame so the following steps can start from this checkpoint.
prices = pd.read_hdf('data.h5', 'us/equities/prices')

# `show_counts` is the current keyword; `null_counts` was deprecated and later
# removed from DataFrame.info.
prices.info(show_counts=True)

### Resample OHLCV Prices to Monthly Frequency

In [36]:
# Columns resampled with the month's last value: every column except
# dollar_volume (averaged separately) and the remaining raw OHLCV fields.
last_cols = [c for c in prices.columns.unique(0)
             if c not in {'dollar_volume', 'volume', 'open', 'high', 'low'}]

In [37]:
# Move the 'ticker' index level into the columns so the remaining index can be
# resampled. NOTE(review): this rebinds `prices`, so the cell cannot be run twice.
prices = prices.unstack('ticker')

In [38]:
# Monthly mean of dollar volume, back in long form with one row per (date, ticker).
monthly_dollar_volume = (prices['dollar_volume']
                         .resample('M')
                         .mean()
                         .stack('ticker')
                         .to_frame('dollar_volume'))

# Month-end (last observed) value of all remaining columns, also in long form.
monthly_last = prices[last_cols].resample('M').last().stack('ticker')

# Combine both, swap the two index levels and drop incomplete rows.
data = (pd.concat([monthly_dollar_volume, monthly_last], axis=1)
        .swaplevel()
        .dropna())

In [39]:
# Inspect the monthly frame.
data.info()

### Selecting 500 Most-Traded Equities

In [40]:
# Rolling mean of monthly dollar volume per ticker: window of 5*12 rows, with a
# value produced once at least 12 observations are available.
rolling_dollar_volume = (data['dollar_volume']
                         .unstack('ticker')
                         .rolling(window=5*12, min_periods=12)
                         .mean())

# Back to long form, with the index levels swapped to match `data`.
data['dollar_volume'] = rolling_dollar_volume.stack().swaplevel()

In [41]:
# Rank tickers within each date by rolling dollar volume; rank 1 is the largest.
data['dollar_vol_rank'] = (data
                           .groupby('date')
                           .dollar_volume
                           .rank(ascending=False))

# Keep ranks 1 through 500 inclusive. The original `< 500` kept only 499 names
# per date, which does not match the "500 most-traded" selection.
# The helper columns are dropped once the filter has been applied.
data = data[data.dollar_vol_rank <= 500].drop(['dollar_volume', 'dollar_vol_rank'], axis=1)

In [42]:
# Number of distinct tickers that pass the dollar-volume filter on any date.
len(data.index.unique('ticker'))

### Creating Monthly Return Series

In [43]:
# Return horizons, expressed in rows of the monthly frame.
lags = [1, 3, 6, 12]

# Quantile used on both tails when clipping the returns.
outlier_cutoff = 0.01

# Container for the per-horizon return frames.
returns = []

In [44]:
# Wide close-price table (one column per ticker), built once instead of once
# per lag because it does not depend on the lag.
close_by_ticker = data.close.unstack('ticker').sort_index()

# Collect the per-lag frames in a list local to this cell. The original appended
# to `returns` and then rebound `returns` to the concatenated frame, so the cell
# could not be executed a second time.
lagged_returns = []
for lag in lags:
    lagged_returns.append(close_by_ticker
                          .pct_change(lag)
                          .stack('ticker')
                          # clip both tails at the outlier_cutoff quantiles
                          .pipe(lambda x: x.clip(lower=x.quantile(outlier_cutoff),
                                                 upper=x.quantile(1-outlier_cutoff)))
                          # convert the lag-period return to a per-period geometric average
                          .add(1)
                          .pow(1/lag)
                          .sub(1)
                          .to_frame(f'return_{lag}m')
                          )

returns = pd.concat(lagged_returns, axis=1).swaplevel()

# `show_counts` is the current keyword; `null_counts` was deprecated and removed.
returns.info(show_counts=True)

In [45]:
# Summary statistics of the clipped, per-period returns for each horizon.
returns.describe()

In [46]:
# Clustered heat map of the Spearman rank correlations between return horizons.
cmap = sns.diverging_palette(10, 220, as_cmap=True)
return_corr = returns.corr(method='spearman')
sns.clustermap(return_corr, cmap=cmap, center=0, annot=True)
plt.show()

In [47]:
# Attach the return columns, drop the raw close price and any incomplete rows.
data = data.join(returns).drop('close', axis=1).dropna()

# `show_counts` is the current keyword; `null_counts` was deprecated and removed.
data.info(show_counts=True)

In [48]:
# Minimum number of rows per ticker in the monthly frame (5*12 = 60 rows).
min_obs = 12 * 5

# Row count per ticker, and the tickers that fall short of the minimum.
nobs = data.groupby(level='ticker').size()
to_drop = nobs.loc[nobs.lt(min_obs)].index

# Remove every row belonging to a too-short ticker.
data = data.drop(index=to_drop, level='ticker')

In [49]:
# Number of distinct tickers left after the monthly minimum-history filter.
len(data.index.unique('ticker'))

### Rolling Factor Betas

In [50]:
# Names of the five factors. NOTE(review): this list is not referenced in the
# visible notebook.
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

# Download the five-factor dataset from START onwards and take the entry under
# key 0 (presumably the monthly table — confirm), without the 'RF' column.
ff_tables = web.DataReader('F-F_Research_Data_5_Factors_2x3', 'famafrench', start=START)
factor_data = ff_tables[0].drop(columns='RF')

# Convert the index to timestamps, align on 'M' bins and scale the values by 1/100.
factor_data.index = factor_data.index.to_timestamp()
factor_data = factor_data.resample('M').last().div(100)
factor_data.index.name = 'date'
factor_data.info()

In [51]:
# Attach each stock's 1-month return to the factor rows, drop incomplete rows
# and sort the resulting index.
factor_data = factor_data.join(data['return_1m']).dropna().sort_index()
# NOTE(review): this subtracts the 'Mkt-RF' factor from the stock return. An
# excess return is normally the return minus the risk-free rate ('RF', which was
# dropped when the factors were loaded) — confirm this is intended, since it
# shifts the estimated market beta.
factor_data['return_1m'] -= factor_data['Mkt-RF']
factor_data.info()

In [52]:
# Summary statistics of the factor columns and the adjusted 1-month return.
factor_data.describe()

In [53]:
# Upper bound on the rolling-regression window, in rows.
T = 60


def _rolling_factor_betas(ticker_data):
    """Rolling OLS coefficients of ``return_1m`` on the factor columns for one ticker.

    Uses statsmodels' RollingOLS (the replacement for pyfinance's
    PandasRollingOLS) with an added constant and a window of
    ``min(T, number of rows - 1)``. Returns the fitted parameters with the
    'Mkt-RF' column renamed to 'beta' and the constant removed.
    """
    model = RollingOLS(endog=ticker_data.return_1m,
                       exog=sm.add_constant(ticker_data.drop('return_1m', axis=1)),
                       window=min(T, ticker_data.shape[0] - 1))
    params = model.fit(params_only=True).params
    return params.rename(columns={'Mkt-RF': 'beta'}).drop('const', axis=1)


betas = (factor_data
         .groupby(level='ticker', group_keys=False)
         .apply(_rolling_factor_betas))

In [54]:
# Summary statistics of each beta column, plus 'total': the row-wise sum of all betas.
betas.describe().join(betas.sum(1).describe().to_frame('total'))

In [55]:
# NOTE(review): this repeats the preceding cell verbatim; one of the two can
# probably be removed.
betas.describe().join(betas.sum(1).describe().to_frame('total'))

In [56]:
# Clustered heat map of the correlations between the estimated betas.
cmap = sns.diverging_palette(10, 220, as_cmap=True)
beta_corr = betas.corr()
sns.clustermap(beta_corr, center=0, cmap=cmap, annot=True)
plt.show()

In [57]:
# Shift the betas by one row within each ticker so a row only carries
# coefficients estimated up to the previous row.
lagged_betas = betas.groupby(level='ticker').shift()

# Attach them, drop incomplete rows and sort the index.
data = data.join(lagged_betas).dropna().sort_index()

In [58]:
# Inspect the frame after adding the lagged betas.
data.info()

### Momentum Factors

In [59]:
# Momentum = longer-horizon return minus the 1-month return; for the 6- and
# 12-month horizons also the difference to the 3-month return.
for lag in (3, 6, 12):
    horizon_return = data[f'return_{lag}m']
    data[f'momentum_{lag}'] = horizon_return - data['return_1m']
    if lag != 3:
        data[f'momentum_3_{lag}'] = horizon_return - data['return_3m']

### Date Indicators

In [60]:
# Calendar year and month of every row, taken from the 'date' index level.
dates = data.index.get_level_values('date')

for part in ('year', 'month'):
    data[part] = getattr(dates, part)

### Target: Holding Period Returns

In [61]:
# Target: the same ticker's 1-month return from the following row (shift(-1)
# within each ticker). The column name is a plain string; the original used an
# f-string with no placeholder.
data['target'] = data.groupby(level='ticker')['return_1m'].shift(-1)

In [62]:
# Drop rows with any missing value, including the last row of each ticker,
# which has no following return to use as target.
data = data.dropna()

In [63]:
# Inspect the final feature frame. `show_counts` is the current keyword;
# `null_counts` was deprecated and later removed from DataFrame.info.
data.sort_index().info(show_counts=True)

### Sector Breakdown

In [64]:
# Number of distinct tickers per sector, smallest first.
tickers_per_sector = (data
                      .reset_index()
                      .groupby('sector')
                      .ticker
                      .nunique()
                      .sort_values())

ax = tickers_per_sector.plot.barh(title='Sector Breakdown')
ax.set_ylabel('')
ax.set_xlabel('# Tickers')
sns.despine()
plt.tight_layout()
plt.show()

### Store Data

In [65]:
# Save the monthly feature frame under 'us/equities/monthly' in 'data.h5';
# the context manager closes the store afterwards.
with pd.HDFStore('data.h5') as store:
    store.put('us/equities/monthly', data)

### Evaluate Mutual Information

In [66]:
# Feature matrix: every column except the target.
X = data.drop(columns='target')

# Replace the sector labels with the integer codes produced by pd.factorize.
X['sector'] = pd.factorize(X['sector'])[0]

In [67]:
# Mutual information between each feature and the continuous target.
# The estimator is stochastic, so a fixed random_state makes the scores
# reproducible across runs.
mi = mutual_info_regression(X=X, y=data.target, random_state=42)

In [68]:
# Label the regression scores with the feature names and show the ten largest.
mi_reg = pd.Series(data=mi, index=X.columns)

mi_reg.nlargest(n=10)

In [69]:
# Mutual information between each feature and the binary target
# (1 when the target return is positive, else 0).
# The estimator is stochastic, so a fixed random_state makes the scores
# reproducible across runs.
mi = mutual_info_classif(X=X, y=(data.target>0).astype(int), random_state=42)

In [70]:
# Label the classification scores with the feature names and show the ten largest.
mi_class = pd.Series(data=mi, index=X.columns)

mi_class.nlargest(n=10)

In [71]:
# Combine both score series into one frame with one column per task.
# The original line lacked the closing parenthesis of join(...) and did not parse.
mi = mi_reg.to_frame('Regression').join(mi_class.to_frame('Classification'))

In [72]:
# Plot-friendly labels: upper case, with underscores shown as spaces.
mi.index = [name.replace('_', ' ').upper() for name in mi.index]

In [73]:
# One horizontal bar chart per task showing the 20 highest-scoring features,
# sorted so the largest bar ends up at the top.
fig, axes = plt.subplots(ncols=2, figsize=(12, 4))

for ax, t in zip(axes, ('Regression', 'Classification')):
    top_features = mi[t].nlargest(20).sort_values()
    top_features.plot.barh(title=t, ax=ax)
    ax.set_xlabel('Mutual Information')

fig.suptitle('Mutual Information', fontsize=14)
sns.despine()
fig.tight_layout()
# Leave room for the figure-level title above the panels.
fig.subplots_adjust(top=.9)
plt.show()