# How to Transform Data into Factors

### Loading Libraries

In [1]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd
import pandas_datareader.data as web

# Statsmodels
import statsmodels.api as sm
from statsmodels.regression.rolling import RollingOLS

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: RollingOLS (imported from statsmodels above) replaces
# pyfinance.ols.PandasRollingOLS, which is no longer maintained

# Date & Time
from datetime import datetime

# Warnings
import warnings

In [2]:
# Suppress all warnings for the rest of the session to keep cell output clean
warnings.filterwarnings('ignore')

In [3]:
# Shorthand for building multi-level slices inside .loc[...]
idx = pd.IndexSlice

# Use seaborn's white-grid theme for subsequent plots
sns.set_style('whitegrid')

### Getting Data

In [4]:
# Path (relative to the working directory) of the HDF5 store that this file
# opens with pd.HDFStore, both for reading inputs and for writing results
DATA_STORE = '../data/assets.h5'

In [5]:
# First and last calendar year of the price sample; converted to strings
# when used as slice bounds on the price data
START, END = 2000, 2018

In [7]:
# Selectors are built up front so the store is only held open for the reads
date_window = idx[str(START):str(END), :]
stock_fields = ['marketcap', 'ipoyear', 'sector']

with pd.HDFStore(DATA_STORE) as store:
    # Adjusted closes between the START and END years, one column per ticker
    prices = store['quandl/wiki/prices'].loc[date_window, 'adj_close'].unstack('ticker')
    # Only the metadata columns used further down
    stocks = store['us_equities/stocks'].loc[:, stock_fields]

In [10]:
# Summarize the wide price frame (one column per ticker after the unstack)
prices.info()

In [11]:
# Summarize the stock metadata (marketcap, ipoyear and sector columns)
stocks.info()

### Keeping Data with Stock Info

In [12]:
# Keep only the first row for every repeated index label, then name the
# index 'ticker' so it matches the ticker axis of the price data
is_repeat = stocks.index.duplicated()
stocks = stocks.loc[~is_repeat].rename_axis('ticker')

In [13]:
# Tickers that appear both as price columns and as rows of the stock metadata
shared = prices.columns.intersection(stocks.index)

In [14]:
# Restrict the metadata to tickers that also have price history
stocks = stocks.loc[shared, :]

stocks.info()

In [15]:
# Restrict the price columns to tickers that also have metadata
prices = prices.loc[:, shared]

prices.info()

In [16]:
# Sanity check: after aligning both on `shared` there must be exactly one
# price column per metadata row
assert prices.shape[1] == stocks.shape[0]

### Setting Monthly Return Series

In [17]:
# Downsample to month-end frequency, keeping the last available price of
# each month for every ticker
monthly_prices = prices.resample('M').last()

In [18]:
# Summarize the month-end price frame
monthly_prices.info()

In [19]:
def _compounded_monthly_return(price_frame, horizon, tail):
    """Return the per-month compounded return over `horizon` periods.

    The raw `horizon`-period percentage change is stacked to long form,
    clipped at its `tail` and `1 - tail` quantiles (taken over all stacked
    observations), and then converted to a per-period rate via
    (1 + r) ** (1 / horizon) - 1.
    """
    raw = price_frame.pct_change(horizon).stack()
    floor, ceiling = raw.quantile(tail), raw.quantile(1 - tail)
    return raw.clip(lower=floor, upper=ceiling).add(1).pow(1 / horizon).sub(1)


outlier_cutoff = 0.01
lags = [1, 2, 3, 6, 9, 12]

# The first assigned column fixes the frame's index; later horizons are
# aligned to it and leave NaN where they have no value
data = pd.DataFrame()
for lag in lags:
    data[f'return_{lag}m'] = _compounded_monthly_return(monthly_prices,
                                                        lag,
                                                        outlier_cutoff)

# Put 'ticker' first in the index and keep only rows with every horizon
data = data.swaplevel().dropna()
data.info()

### Dropping Stocks with Fewer than 10 Years of Returns

In [20]:
# Minimum number of monthly rows a ticker needs to stay in the panel
# (120 months = 10 years)
min_obs = 120

# Number of rows per ticker
nobs = data.groupby(level='ticker').size()

# Inclusive bound: a ticker with exactly `min_obs` rows meets the minimum and
# is kept; only tickers with fewer rows are dropped
keep = nobs[nobs >= min_obs].index

data = data.loc[idx[keep, :], :]
data.info()

In [21]:
# Descriptive statistics for every return horizon
data.describe()

In [22]:
# Unused alternative: a diverging palette instead of the sequential 'Blues' map
# cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Clustered heatmap of the Spearman rank correlations between the columns of
# `data`; the trailing semicolon suppresses the notebook's echo of the
# returned object
sns.clustermap(data.corr('spearman'), annot=True, center=0, cmap='Blues');

In [23]:
# Number of distinct tickers remaining in the panel
data.index.get_level_values('ticker').nunique()

### Rolling Factor Betas

In [24]:
# Columns kept from the Fama-French five-factor dataset (everything but 'RF')
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

# Download starting in the same year as the price sample; using START instead
# of a literal keeps the two windows from drifting apart if START changes.
# Element [0] of the reader's result is the table used here.
factor_data = (web.DataReader('F-F_Research_Data_5_Factors_2x3',
                              'famafrench',
                              start=str(START))[0]
               .drop('RF', axis=1))

# Convert the index to timestamps, then relabel every row to its month end so
# the dates line up with the month-end price data
factor_data.index = factor_data.index.to_timestamp()
# Rescale by 1/100 (presumably percent -> decimal returns; confirm against
# the dataset description)
factor_data = factor_data.resample('M').last().div(100)
factor_data.index.name = 'date'

factor_data.info()

In [25]:
# Attach the 1-month returns to the factor rows by joining on the shared
# 'date' index level (a left join, the pandas default), then order the result
# by its index
factor_data = factor_data.join(data['return_1m']).sort_index()

factor_data.info()

In [26]:
# Rolling window length, in rows (months) of the per-ticker panel
T = 24


def _rolling_factor_betas(panel):
    """Estimate rolling factor loadings for one ticker's rows.

    Regresses `return_1m` on a constant plus every other column of `panel`
    over a moving window of at most `T` rows (capped at one less than the
    number of rows), and returns the slope coefficients without the
    intercept.
    """
    regressors = sm.add_constant(panel.drop('return_1m', axis=1))
    window = min(T, panel.shape[0] - 1)
    fitted = RollingOLS(endog=panel.return_1m,
                        exog=regressors,
                        window=window).fit(params_only=True)
    return fitted.params.drop('const', axis=1)


# group_keys=False keeps the original index instead of prepending the ticker
betas = (factor_data
         .groupby(level='ticker', group_keys=False)
         .apply(_rolling_factor_betas))

In [27]:
# Summary statistics per factor beta, plus a 'total' column describing the
# row-wise sum of all betas
betas.describe().join(betas.sum(1).describe().to_frame('total'))

In [28]:
# Diverging palette, used with center=0 so that positive and negative
# correlations get contrasting colours
cmap = sns.diverging_palette(10, 220, as_cmap=True)

# Clustered heatmap of the pairwise correlations between factor betas
# (DataFrame.corr defaults to Pearson)
sns.clustermap(betas.corr(), annot=True, cmap=cmap, center=0);

In [29]:
# Shift every ticker's betas down by one row so that each row carries the
# estimate from the previous period, then attach them to the panel
lagged_betas = betas.groupby(level='ticker').shift()
data = data.join(lagged_betas)
data.info()

### Imputing Mean for Missing Factor Betas

In [30]:
# Fill each ticker's missing betas with that ticker's own mean beta.
# `transform` always returns a result indexed exactly like `data`. With
# `apply`, pandas >= 2.0 prepends the group key as an extra index level
# (group_keys=True), so the assignment below would no longer align.
data.loc[:, factors] = (data
                        .groupby(level='ticker')[factors]
                        .transform(lambda x: x.fillna(x.mean())))

data.info()

### Momentum Factors

In [31]:
# Momentum over `lag` months: the `lag`-month return column minus the
# 1-month return column
for lag in [2, 3, 6, 9, 12]:
    data[f'momentum_{lag}'] = data[f'return_{lag}m'] - data['return_1m']

# 12-month return column minus 3-month return column
data['momentum_3_12'] = data['return_12m'] - data['return_3m']

### Date Indicators

In [33]:
# Calendar year and month of every row, taken from the 'date' index level
dates = data.index.get_level_values('date')
data['year'], data['month'] = dates.year, dates.month

### Lagged Returns

In [34]:
# 1-month returns grouped per ticker, so shifting never crosses tickers
monthly_by_ticker = data['return_1m'].groupby(level='ticker')

# return_1m_t-k holds the 1-month return observed k rows earlier (k = 1..6)
for t in range(1, 7):
    data[f'return_1m_t-{t}'] = monthly_by_ticker.shift(t)

data.info()

### `Target`: Holding Period Returns

In [35]:
# target_{t}m is the t-month return column pulled back by t rows within each
# ticker, i.e. the return observed t periods after the current row
for t in (1, 2, 3, 6, 12):
    forward = data[f'return_{t}m'].groupby(level='ticker').shift(-t)
    data[f'target_{t}m'] = forward

In [37]:
# Columns to preview side by side: forward targets, current returns and
# lagged 1-month returns, each for horizons/lags 1 to 3
cols = ([f'target_{h}m' for h in (1, 2, 3)]
        + [f'return_{h}m' for h in (1, 2, 3)]
        + [f'return_1m_t-{h}' for h in (1, 2, 3)])

# Show the first ten fully populated rows to check how targets, current
# returns and lagged returns line up
data[cols].dropna().sort_index().head(10)

In [38]:
# data.info()

### Creating Age Proxy

In [39]:
# Bucket IPO years into quintiles labelled 1 (earliest years) to 5 (latest)
ipo_quintile = pd.qcut(stocks.ipoyear, q=5, labels=list(range(1, 6)))

# Stocks without an IPO year fall into bucket 0; going through float first
# lets the missing entries be filled before casting to int
age = ipo_quintile.astype(float).fillna(0).astype(int).to_frame('age')

data = data.join(age)

# Rows that found no match in the join get the sentinel -1
data['age'] = data['age'].fillna(-1)

#### Creating Dynamic Size Proxy

In [40]:
# Re-inspect the stock metadata before using its marketcap column
stocks.info()

In [9]:
# Restrict the month-end prices to the dates and tickers present in `data`
panel_dates = data.index.get_level_values('date').unique()
panel_tickers = data.index.get_level_values('ticker').unique()
panel_prices = monthly_prices.loc[panel_dates, panel_tickers]

# With the newest row first, the cumulative product of (1 + pct_change)
# expresses each month's price as a multiple of the newest month's price.
# Missing changes are treated as 0, i.e. no change.
newest_first = panel_prices.sort_index(ascending=False)
size_factor = newest_first.pct_change().fillna(0).add(1).cumprod()

size_factor.info()

In [41]:
# Scale each ticker's price-ratio path by its marketcap value (matched on the
# column labels) and drop tickers for which the result is entirely missing
ticker_cap = stocks.loc[size_factor.columns, 'marketcap']
msize = size_factor.mul(ticker_cap).dropna(axis=1, how='all')

#### Creating Size Indicator as Deciles per Period

In [42]:
def _size_decile(cross_section):
    """Bucket one row of sizes into deciles labelled 1 (smallest) to 10."""
    return pd.qcut(cross_section, q=10, labels=list(range(1, 11))).astype(int)


# Rank within every row of `msize`, then stack to long form and swap the
# index levels so the result aligns with the index of `data`
data['msize'] = msize.apply(_size_decile, axis=1).stack().swaplevel()

# Rows without a decile get the sentinel -1
data['msize'] = data['msize'].fillna(-1)

### Combining Data

In [43]:
# Attach the sector label of every ticker; rows without one are labelled
# 'Unknown'
sector = stocks[['sector']]
data = data.join(sector)
data['sector'] = data['sector'].fillna('Unknown')

In [44]:
# Summarize the combined feature panel before it is stored
data.info()

### Storing Data

In [45]:
# Last date (inclusive) of the rows that are persisted
cutoff = datetime(2018, 3, 1)

with pd.HDFStore(DATA_STORE) as store:
    # Sort the index first so the label slice on the date level is valid
    engineered = data.sort_index().loc[idx[:, :cutoff], :]
    store.put('engineered_features', engineered)
    print(store.info())

### Creating Dummy Variables

In [46]:
# Columns to one-hot encode. All but the last keep their own name as the
# prefix; 'sector' gets an empty prefix and separator, so its dummy columns
# are named by the bare sector value.
categorical = ['year', 'month', 'msize', 'age', 'sector']
prefixed = categorical[:-1]

dummy_data = pd.get_dummies(data,
                            columns=categorical,
                            prefix=prefixed + [''],
                            prefix_sep=['_'] * len(prefixed) + [''])

# Remove every '.0' from the column names (e.g. the suffix produced by
# float-valued categories)
dummy_data = dummy_data.rename(columns=lambda label: label.replace('.0', ''))
dummy_data.info()