# 101 Formulaic Alphas

### Loading Libraries

In [75]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Warnings
import warnings

# Scikit-Learn
from sklearn.feature_selection import mutual_info_regression

# SciPy
from scipy.stats import spearmanr

# Technical Analysis
from talib import WMA

In [76]:
# Shorthand for pandas' MultiIndex slicer (pd.IndexSlice).
idx= pd.IndexSlice

# Use seaborn's 'whitegrid' theme for every plot drawn afterwards.
sns.set_style('whitegrid')

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/numpy.
warnings.filterwarnings('ignore')

### Functions

#### Cross-section

In [80]:
def rank(df):
    """Rank each row's values across columns, expressed as a percentile.

    :param df: a pandas DataFrame; ranking is done along axis 1 (across
        the columns of every row).
    :return: a pandas DataFrame of the same shape holding the percentile
        ranks (``pct=True``).
    """
    cross_sectional_rank = df.rank(pct=True, axis=1)
    return cross_sectional_rank

In [82]:
def scale(df, k=1):
    """Rescale every row so that its absolute values sum to ``k``.

    :param df: a pandas DataFrame; each row is rescaled independently.
    :param k: scaling factor, i.e. the target value of ``sum(abs(row))``.
        Defaults to 1, which reproduces the previous behaviour.
    :return: a pandas DataFrame rescaled such that sum(abs(df)) = k per row.
        A row whose absolute sum is zero is divided by zero and therefore
        yields NaN/inf values.
    """
    # Row-wise L1 norm used as the divisor for every element of that row.
    row_abs_sum = df.abs().sum(axis=1)
    return df.mul(k).div(row_abs_sum, axis=0)

#### Operators

In [85]:
def log(df):
    """Return ``log(1 + df)`` element-wise.

    Note that this is ``numpy.log1p``, not the plain natural logarithm.

    :param df: a pandas DataFrame (or any array-like numpy accepts).
    :return: the element-wise ``log1p`` of the input.
    """
    log_plus_one = np.log1p(df)
    return log_plus_one

In [87]:
def sign(df):
    """Return the element-wise sign (-1, 0 or 1) of the input.

    :param df: a pandas DataFrame (or any array-like numpy accepts).
    :return: the element-wise result of ``numpy.sign``.
    """
    signs = np.sign(df)
    return signs

In [89]:
def power(df, exp):
    """Raise every element of ``df`` to the power ``exp``.

    :param df: a pandas DataFrame (any object exposing ``.pow``).
    :param exp: the exponent.
    :return: the element-wise power.
    """
    raised = df.pow(other=exp)
    return raised

### Time Series

#### Pandas Implementation

In [93]:
def ts_lag(df: pd.DataFrame, t: int = 1) -> pd.DataFrame:
    """Shift the rows of ``df`` by ``t`` periods.

    :param df: a pandas DataFrame.
    :param t: number of periods to shift by.
    :return: a pandas DataFrame whose row i holds the values of row i - t;
        positions without a source row are NaN.
    """
    lagged = df.shift(periods=t)
    return lagged

In [95]:
def ts_delta(df, period=1):
    """Difference each value against the value ``period`` rows earlier.

    :param df: a pandas DataFrame.
    :param period: how many rows back the subtrahend lies.
    :return: a pandas DataFrame holding ``df - df.shift(period)``; the first
        ``period`` rows are NaN.
    """
    change = df.diff(periods=period)
    return change

In [97]:
def ts_sum(df: pd.DataFrame, window: int = 10) -> pd.DataFrame:
    """Rolling sum over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: size of the rolling window, in rows.
    :return: a pandas DataFrame with the sum over the last 'window' rows.
    """
    roller = df.rolling(window=window)
    return roller.sum()

In [99]:
def ts_mean(df, window=10):
    """Rolling arithmetic mean over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: size of the rolling window, in rows.
    :return: a pandas DataFrame with the mean over the last 'window' rows.
    """
    roller = df.rolling(window=window)
    return roller.mean()

In [101]:
def ts_weighted_mean(df, period=10):
    """Apply TA-Lib's ``WMA`` to every column of ``df``.

    :param df: a pandas DataFrame.
    :param period: the ``timeperiod`` forwarded to ``WMA``.
    :return: a pandas DataFrame with the weighted moving average per column.
    """
    def _wma(column):
        # Each column is handed to TA-Lib as an independent series.
        return WMA(column, timeperiod=period)

    return df.apply(_wma)

In [103]:
def ts_std(df, window=10):
    """Rolling standard deviation over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the time-series standard deviation
        over the past 'window' rows (pandas' default ``std`` settings).
    """
    roller = df.rolling(window=window)
    return roller.std()

In [105]:
def ts_rank(df, window=10):
    """Rank the latest value within its trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame where each entry is the rank of the window's
        last observation among all observations of that window.
    """
    def _rank_of_latest(values):
        # Rank every observation of the window, keep the newest one's rank.
        return values.rank().iloc[-1]

    return df.rolling(window=window).apply(_rank_of_latest)

In [107]:
def ts_product(df, window=10):
    """Rolling product over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the product of the values in each
        window, computed by applying ``numpy.prod``.
    """
    roller = df.rolling(window=window)
    return roller.apply(np.prod)

In [109]:
def ts_min(df, window=10):
    """Rolling minimum over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the smallest value of each window.
    """
    roller = df.rolling(window=window)
    return roller.min()

In [111]:
def ts_max(df, window=10):
    """Rolling maximum over the trailing ``window`` rows.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the largest value of each window.
    """
    roller = df.rolling(window=window)
    return roller.max()

In [113]:
def ts_argmax(df, window=10):
    """Locate the maximum inside each trailing window.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the result of ``numpy.argmax`` on each
        window plus one, i.e. a 1-based position counted from the oldest
        row of the window.
    """
    zero_based = df.rolling(window=window).apply(np.argmax)
    # Shift from numpy's 0-based position to a 1-based day count.
    return zero_based.add(1)

In [115]:
def ts_argmin(df, window=10):
    """Locate the minimum inside each trailing window.

    :param df: a pandas DataFrame.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the result of ``numpy.argmin`` on each
        window plus one, i.e. a 1-based position counted from the oldest
        row of the window.
    """
    zero_based = df.rolling(window=window).apply(np.argmin)
    # Shift from numpy's 0-based position to a 1-based day count.
    return zero_based.add(1)

In [117]:
def ts_corr(x, y, window=10):
    """Rolling correlation between ``x`` and ``y``.

    :param x: a pandas DataFrame; the rolling window is built on it.
    :param y: a pandas DataFrame passed to the rolling ``corr`` call.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the correlation of x and y over the
        past 'window' rows.
    """
    roller = x.rolling(window=window)
    return roller.corr(other=y)

In [119]:
def ts_cov(x, y, window=10):
    """Rolling covariance between ``x`` and ``y``.

    :param x: a pandas DataFrame; the rolling window is built on it.
    :param y: a pandas DataFrame passed to the rolling ``cov`` call.
    :param window: the rolling window, in rows.
    :return: a pandas DataFrame with the covariance of x and y over the
        past 'window' rows.
    """
    roller = x.rolling(window=window)
    return roller.cov(other=y)

### Loading Data

#### 500 Most-Traded Stocks

In [122]:
# ohlcv = ['open', 'high', 'low', 'close', 'volume']

# data = (pd.read_hdf('data.h5', 'data/top500')
#         .loc[:, ohlcv + ['ret_01', 'sector', 'ret_fwd']]
#         .rename(columns={'ret_01': 'returns'})
#         .sort_index())

In [124]:
# adv20 = data.groupby('ticker').rolling(20).volume.mean().reset_index(0, drop=True)

In [126]:
# data = data.assign(adv20=adv20)

In [128]:
# data = data.join(data.groupby('date')[ohlcv].rank(axis=1, pct=True), rsuffix='_rank')

In [130]:
# data.info(null_counts=True)

In [132]:
# data.to_hdf('factors.h5', 'data')

### Input Data

In [135]:
# o = data.open.unstack('ticker')
# h = data.high.unstack('ticker')
# l = data.low.unstack('ticker')
# c = data.close.unstack('ticker')
# v = data.volume.unstack('ticker')

# vwap = o.add(h).add(l).add(c).div(4)
# adv20 = v.rolling(20).mean()

# r = data.returns.unstack('ticker')

### Evaluate Alphas

In [142]:
# alphas = data[['returns', 'ret_fwd']].copy()
# mi,ic = {}, {}

In [144]:
def get_mutual_info_score(returns, alpha, n=100000):
    """Estimate the mutual information between an alpha and returns.

    The two inputs are combined into one frame, rows with a missing value in
    either column are dropped, and a random sample of the remaining rows is
    passed to ``mutual_info_regression``.

    :param returns: target values (used as ``y``).
    :param alpha: alpha values (used as the single feature ``X``).
    :param n: maximum number of rows to sample. If fewer complete rows are
        available, all of them are used instead of raising.
    :return: the mutual information estimate for the single 'alpha' feature.
    """
    df = pd.DataFrame({'y': returns, 'alpha': alpha}).dropna()
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so cap the request at the frame length.
    df = df.sample(n=min(n, len(df)))
    return mutual_info_regression(y=df.y, X=df[['alpha']])[0]

### Alpha 001

In [147]:
def alpha001(c, r):
    """(rank(ts_argmax(power(((returns < 0)
        ? ts_std(returns, 20)
        : close), 2.), 5)) -0.5)

    :param c: close prices, a pandas DataFrame. It is NOT modified.
    :param r: returns, a pandas DataFrame shaped like ``c``.
    :return: the stacked alpha values with the two index levels swapped.
    """
    # Where returns are negative use the 20-period rolling std of returns,
    # otherwise keep the close. DataFrame.mask returns a new frame, so the
    # caller's `c` is left untouched (in-place assignment would corrupt the
    # close prices for every later computation that reuses them).
    base = c.mask(r < 0, ts_std(r, 20))
    # The formula subtracts 0.5 from the cross-sectional rank (centering it),
    # it does not multiply by -0.5.
    return (rank(ts_argmax(power(base, 2), 5)).sub(.5)
            .stack().swaplevel())

In [149]:
# Number of the alpha under evaluation; the commented-out cells below format it as f'{alpha:03}' to build column and HDF keys.
alpha = 1

In [153]:
# %%time
# alphas[f'{alpha:03}'] = alpha001(c, r)

In [155]:
# alphas.info()

In [157]:
# alphas[f'{alpha:03}'].to_hdf('alphas.h5', f'alphas/{alpha:03}')

In [159]:
# sns.distplot(alphas[f'{alpha:03}']);
# plt.show()

In [161]:
# g = sns.jointplot(x=f'{alpha:03}', y='ret_fwd', data=alphas)

In [163]:
# mi[1] = get_mutual_info_score(alphas.ret_fwd, alphas[f'{alpha:03}'])
# mi[1]

### Alpha 002

In [166]:
def alpha002(o, c, v):
    """(-1 * ts_corr(rank(ts_delta(log(volume), 2)), rank(((close - open) / open)), 6))

    :param o: open prices, a pandas DataFrame.
    :param c: close prices, a pandas DataFrame.
    :param v: volume, a pandas DataFrame.
    :return: the stacked alpha values (stacked on 'ticker', index levels
        swapped) with infinities replaced by NaN.
    """
    volume_change_rank = rank(ts_delta(log(v), 2))
    # close / open - 1 is the same quantity as (close - open) / open.
    intraday_return_rank = rank((c / o) - 1)
    negated_corr = -ts_corr(volume_change_rank, intraday_return_rank, 6)
    stacked = negated_corr.stack('ticker').swaplevel()
    return stacked.replace([-np.inf, np.inf], np.nan)

In [168]:
# Number of the alpha under evaluation; the commented-out cells below format it as f'{alpha:03}' to build column and HDF keys.
alpha = 2

In [170]:
# %%time
# alphas[f'{alpha:03}'] = alpha002(o, c, v)

In [172]:
# alphas[f'{alpha:03}'].to_hdf('alphas.h5', f'alphas/{alpha:03}')

In [174]:
# sns.distplot(alphas[f'{alpha:03}']);

In [176]:
# g = sns.jointplot(x=f'{alpha:03}', y='ret_fwd', data=alphas)

In [178]:
# mi[alpha] = get_mutual_info_score(alphas.ret_fwd, alphas[f'{alpha:03}'])
# mi[2]

### Alpha 003

In [181]:
def alpha003(o, v):
    """(-1 * ts_corr(rank(open), rank(volume), 10))

    :param o: open prices, a pandas DataFrame.
    :param v: volume, a pandas DataFrame.
    :return: the stacked alpha values (stacked on 'ticker', index levels
        swapped) with infinities replaced by NaN.
    """
    rank_corr = ts_corr(rank(o), rank(v), 10)
    cleaned = (rank_corr
               .stack('ticker')
               .swaplevel()
               .replace([-np.inf, np.inf], np.nan))
    # The negation is applied last, to the stacked and cleaned series.
    return -cleaned

In [183]:
# Number of the alpha under evaluation; the commented-out cells below format it as f'{alpha:03}' to build column and HDF keys.
alpha = 3

In [185]:
# %%time
# alphas[f'{alpha:03}'] = alpha003(o, v)

In [187]:
# alphas[f'{alpha:03}'].to_hdf('alphas.h5', f'alphas/{alpha:03}')

In [189]:
# sns.distplot(alphas[f'{alpha:03}'].clip(lower=-1));

In [191]:
# g = sns.jointplot(x=f'{alpha:03}', y='ret_fwd', data=alphas);

In [193]:
# mi[alpha] = get_mutual_info_score(alphas.ret_fwd, alphas[f'{alpha:03}'])
# mi[alpha]

### Alpha 004

In [196]:
def alpha004(l):
    """(-1 * Ts_Rank(rank(low), 9))

    :param l: low prices, a pandas DataFrame.
    :return: the stacked alpha values (stacked on 'ticker', index levels
        swapped).
    """
    rolling_rank = ts_rank(rank(l), 9)
    stacked = rolling_rank.stack('ticker').swaplevel()
    # The negation is applied last, to the stacked series.
    return -stacked

In [198]:
# Number of the alpha under evaluation; the commented-out cells below format it as f'{alpha:03}' to build column and HDF keys.
alpha = 4

In [253]:
# %%time
# alphas[f'{alpha:03}'] = alpha004(l)

In [255]:
# alphas[f'{alpha:03}'].to_hdf('alphas.h5', f'alphas/{alpha:03}')

In [257]:
# sns.distplot(alphas[f'{alpha:03}']);

In [259]:
# g = sns.jointplot(x=f'{alpha:03}', y='ret_fwd', data=alphas);

In [261]:
# mi[alpha] = get_mutual_info_score(alphas.ret_fwd, alphas[f'{alpha:03}'])
# mi[alpha]

### Alpha 005

In [263]:
def alpha005(o, vwap, c):
    """(rank((open - ts_mean(vwap, 10))) * (-1 * abs(rank((close - vwap)))))

    :param o: open prices, a pandas DataFrame.
    :param vwap: volume-weighted average prices, a pandas DataFrame.
    :param c: close prices, a pandas DataFrame.
    :return: the stacked alpha values (stacked on 'ticker', index levels
        swapped).
    """
    open_vs_mean_vwap = rank(o.sub(ts_mean(vwap, 10)))
    # Take the absolute value first and negate afterwards: the formula is
    # -1 * abs(rank(...)). Negating before abs() would cancel the sign.
    neg_abs_close_vs_vwap = rank(c.sub(vwap)).abs().mul(-1)
    return (open_vs_mean_vwap
            .mul(neg_abs_close_vs_vwap)
            .stack('ticker')
            .swaplevel())

In [265]:
# Number of the alpha under evaluation; the commented-out cells below format it as f'{alpha:03}' to build column and HDF keys.
alpha = 5

In [267]:
# %%time
# alphas[f'{alpha:03}'] = alpha005(o, vwap, c)

In [269]:
# alphas[f'{alpha:03}'].to_hdf('alphas.h5', f'alphas/{alpha:03}')

In [271]:
# sns.distplot(alphas[f'{alpha:03}']);

In [273]:
# g = sns.jointplot(x=f'{alpha:03}', y='ret_fwd', data=alphas);