In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from talib import RSI, BBANDS, MACD, ATR
# ATR指标：https://zhuanlan.zhihu.com/p/369351721
# TR=max(|最高价-最低价|, |最高价-昨收|, |昨收-最低价|); 真实波幅ATR=TR的N(14)日简单移动平均
# 使用示例：当前价格比之前的价格高N个ATR的涨幅则买入；低N个ATR的涨幅则卖出

MONTH = 21
YEAR = 12 * MONTH
min_obs = 2 * YEAR

START = '2013-01-01'
END = '2017-12-31'

sns.set_style('whitegrid')
idx = pd.IndexSlice

In [2]:
ohlcv = ['adj_open', 'adj_close', 'adj_low', 'adj_high', 'adj_volume']
DATA_STORE = '../data/assets.h5'
with pd.HDFStore(DATA_STORE) as store:
    prices = (store['quandl/wiki/prices']
              .loc[idx[START:END, :], ohlcv]
              .rename(columns=lambda x: x.replace('adj_', ''))
              .assign(volume=lambda x: x.volume.div(1000))
              .swaplevel()
              .sort_index())
    stocks = (store['us_equities/stocks'].loc[:, ['marketcap', 'ipoyear', 'sector']])

nobs = prices.groupby(level='ticker').size()
keep = nobs[nobs > min_obs].index
prices = prices.loc[idx[keep, :], :]

In [3]:
stocks = stocks[~stocks.index.duplicated() & stocks.sector.notnull()]
stocks.sector = stocks.sector.str.lower().str.replace(' ', '_')
stocks.index.name = 'ticker'

shared = (prices.index.get_level_values('ticker').unique().intersection(stocks.index))
stocks = stocks.loc[shared, :]
prices = prices.loc[idx[shared, :], :]
#prices.info(show_counts=True)
#stocks.info(show_counts=True)
#stocks.sector.value_counts()

In [4]:
# compute dollar volume to determine universe
prices['dollar_vol'] = prices[['close', 'volume']].prod(axis=1)
prices['dollar_vol_1m'] = (prices.dollar_vol.groupby('ticker').rolling(window=21).mean()).values
#prices.info(show_counts=True)

In [5]:
prices['dollar_vol_rank'] = (prices.groupby('date').dollar_vol_1m.rank(ascending=False))
#prices.info(show_counts=True)

In [6]:
#prices['rsi'] = prices.groupby(level='ticker').close.apply(RSI)
#prices['rsi'] = prices.close.groupby(level='ticker').apply(lambda x: RSI(x.swaplevel().unstack().iloc[:,0]))
prices['rsi'] = prices.close.groupby('ticker', group_keys=False).apply(RSI)

#ax = sns.distplot(prices.rsi.dropna())
#ax.axvline(30, ls='--', lw=1, c='k')
#ax.axvline(70, ls='--', lw=1, c='k')
#ax.set_title('RSI Distribution with Signal Threshold')
#plt.tight_layout();

In [7]:
def compute_bb(close):
    high, mid, low = BBANDS(close, timeperiod=20)
    return pd.DataFrame({'bb_high': high, 'bb_low': low}, index=close.index)
prices = (prices.join(prices.groupby(level='ticker', group_keys=False).close.apply(compute_bb)))
prices['bb_high'] = prices.bb_high.sub(prices.close).div(prices.bb_high).apply(np.log1p)
prices['bb_low'] = prices.close.sub(prices.bb_low).div(prices.close).apply(np.log1p)

#fig, axes = plt.subplots(ncols=2, figsize=(15, 5))
#sns.distplot(prices.loc[prices.dollar_vol_rank<100, 'bb_low'].dropna(), ax=axes[0])
#sns.distplot(prices.loc[prices.dollar_vol_rank<100, 'bb_high'].dropna(), ax=axes[1])
#plt.tight_layout();

In [8]:
def compute_atr(stock_data):
    df = ATR(stock_data.high, stock_data.low, stock_data.close, timeperiod=14)
    return df.sub(df.mean()).div(df.std())
prices['atr'] = (prices.groupby('ticker', group_keys=False).apply(compute_atr))
#sns.distplot(prices[prices.dollar_vol_rank<50].atr.dropna());

In [9]:
def compute_macd(close):
    macd = MACD(close)[0]
    return (macd - np.mean(macd))/np.std(macd)
prices['macd'] = (prices.groupby('ticker', group_keys=False).close.apply(compute_macd))
#prices.macd.describe(percentiles=[.001, .01, .02, .03, .04, .05, .95, .96, .97, .98, .99, .999]).apply(lambda x: f'{x:,.1f}')
#sns.distplot(prices[prices.dollar_vol_rank<100].macd.dropna());

In [10]:
lags = [1, 5, 10, 21, 42, 63]
returns = prices.groupby(level='ticker').close.pct_change()
percentiles=[.0001, .001, .01]
percentiles+= [1-p for p in percentiles]
#returns.describe(percentiles=percentiles).iloc[2:].to_frame('percentiles').style.format(lambda x: f'{x:,.2%}')
q = 0.0001
for lag in lags:
    prices[f'return_{lag}d'] = (prices.groupby(level='ticker').close
                                .pct_change(lag)
                                .pipe(lambda x: x.clip(lower=x.quantile(q), upper=x.quantile(1 - q)))
                                .add(1).pow(1 / lag).sub(1))
for t in [1, 2, 3, 4, 5]:
    for lag in [1, 5, 10, 21]:
        prices[f'return_{lag}d_lag{t}'] = (prices.groupby(level='ticker')[f'return_{lag}d'].shift(t * lag))
        
for t in [1, 5, 10, 21]:
    prices[f'target_{t}d'] = prices.groupby(level='ticker')[f'return_{t}d'].shift(-t) #当做label

In [11]:
prices = prices.join(stocks[['sector']])
prices['year'] = prices.index.get_level_values('date').year
prices['month'] = prices.index.get_level_values('date').month
#prices.info(show_counts=True)
prices.assign(sector=pd.factorize(prices.sector, sort=True)[0]).to_hdf('data.h5', 'model_data/no_dummies')
prices = pd.get_dummies(prices,
                        columns=['year', 'month', 'sector'],
                        prefix=['year', 'month', ''],
                        prefix_sep=['_', '_', ''],
                        drop_first=True)
#prices.info(show_counts=True)
prices.to_hdf('data.h5', 'model_data')

In [33]:
target = 'target_5d'
top100 = prices[prices.dollar_vol_rank<100].copy()
top100.loc[:, 'rsi_signal'] = pd.cut(top100.rsi, bins=[0, 30, 70, 100])
#top100.groupby('rsi_signal')['target_5d'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
rsi_signal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
"(0, 30]",4209.0,0.001126,0.010457,-0.067138,-0.003606,0.001051,0.006156,0.061889
"(30, 70]",107244.0,0.000446,0.007711,-0.170571,-0.003054,0.00065,0.004246,0.075653
"(70, 100]",10634.0,1.8e-05,0.006354,-0.087857,-0.002818,0.000145,0.003121,0.05857


In [35]:
metric = 'bb_low'
#j=sns.jointplot(x=metric, y=target, data=top100)

df = top100[[metric, target]].dropna()
r, p = spearmanr(df[metric], df[target])
print(f'{r:,.2%} ({p:.2%})')

-2.68% (0.00%)


In [None]:
metric = 'bb_high'
j=sns.jointplot(x=metric, y=target, data=top100)

df = top100[[metric, target]].dropna()
r, p = spearmanr(df[metric], df[target])
print(f'{r:,.2%} ({p:.2%})')

In [None]:
metric = 'atr'
j=sns.jointplot(x=metric, y=target, data=top100)

df = top100[[metric, target]].dropna()
r, p = spearmanr(df[metric], df[target])
print(f'{r:,.2%} ({p:.2%})')

In [None]:
metric = 'macd'
j=sns.jointplot(x=metric, y=target, data=top100)

df = top100[[metric, target]].dropna()
r, p = spearmanr(df[metric], df[target])
print(f'{r:,.2%} ({p:.2%})')