# 第11章 金融和经济数据应用

## 数据规整化方面的话题

### 时间序列以及截面对齐

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.random.seed(12345)
import matplotlib.pyplot as plt
plt.rc('figure', figsize=(10, 6))
np.set_printoptions(precision=4, suppress=True)

In [69]:
prices = pd.read_csv('stock_px.csv', header=None, index_col = 0,
                     names=[' ','AA','AAPL','GE','IBM','JNJ','MSFT','PEP','SPX','XOM'],
                     skiprows = 5444, nrows = 7, usecols = [0,2,5,8,9])
prices

Unnamed: 0,AAPL,JNJ,SPX,XOM
,,,,
2011-09-06 00:00:00,379.74,64.64,1165.24,71.15
2011-09-07 00:00:00,383.93,65.43,1198.62,73.65
2011-09-08 00:00:00,384.14,64.95,1185.9,72.82
2011-09-09 00:00:00,377.48,63.64,1154.23,71.01
2011-09-12 00:00:00,379.94,63.59,1162.27,71.84
2011-09-13 00:00:00,384.62,63.61,1172.87,71.65
2011-09-14 00:00:00,389.3,63.73,1188.68,72.64


In [70]:
volume = pd.read_csv('volume.csv', header=None, index_col = 0,
                     names=[' ','AA','AAPL','GE','IBM','JNJ','MSFT','PEP','SPX','XOM'],
                     skiprows = 5444, nrows = 5, usecols = [0,2,5,9])
volume

Unnamed: 0,AAPL,JNJ,XOM
,,,
2011-09-06 00:00:00,18173500.0,15848300.0,25416300.0
2011-09-07 00:00:00,12492000.0,10759700.0,23108400.0
2011-09-08 00:00:00,14839800.0,15551500.0,22434800.0
2011-09-09 00:00:00,20171900.0,17008200.0,27969100.0
2011-09-12 00:00:00,16697300.0,13448200.0,26205800.0


In [71]:
prices * volume

Unnamed: 0,AAPL,JNJ,SPX,XOM
,,,,
2011-09-06 00:00:00,6901205000.0,1024434000.0,,1808370000.0
2011-09-07 00:00:00,4796054000.0,704007200.0,,1701934000.0
2011-09-08 00:00:00,5700561000.0,1010070000.0,,1633702000.0
2011-09-09 00:00:00,7614489000.0,1082402000.0,,1986086000.0
2011-09-12 00:00:00,6343972000.0,855171000.0,,1882625000.0
2011-09-13 00:00:00,,,,
2011-09-14 00:00:00,,,,


In [72]:
vwap = (prices * volume).sum() / volume.sum()

In [73]:
vwap

AAPL    380.655181
JNJ      64.394769
SPX            NaN
XOM      72.024288
dtype: float64

In [74]:
vwap.dropna()

AAPL    380.655181
JNJ      64.394769
XOM      72.024288
dtype: float64

In [75]:
prices.align(volume, join='inner')

(                       AAPL    JNJ    XOM
                                          
 2011-09-06 00:00:00  379.74  64.64  71.15
 2011-09-07 00:00:00  383.93  65.43  73.65
 2011-09-08 00:00:00  384.14  64.95  72.82
 2011-09-09 00:00:00  377.48  63.64  71.01
 2011-09-12 00:00:00  379.94  63.59  71.84,
                            AAPL         JNJ         XOM
                                                        
 2011-09-06 00:00:00  18173500.0  15848300.0  25416300.0
 2011-09-07 00:00:00  12492000.0  10759700.0  23108400.0
 2011-09-08 00:00:00  14839800.0  15551500.0  22434800.0
 2011-09-09 00:00:00  20171900.0  17008200.0  27969100.0
 2011-09-12 00:00:00  16697300.0  13448200.0  26205800.0)

In [76]:
s1 = Series(range(3), index=['a', 'b', 'c'])
s2 = Series(range(4), index=['d', 'b', 'c', 'e'])
s3 = Series(range(3), index=['f', 'a', 'c'])

In [77]:
DataFrame({'one': s1, 'two': s2, 'three': s3})

Unnamed: 0,one,three,two
a,0.0,1.0,
b,1.0,,1.0
c,2.0,2.0,2.0
d,,,0.0
e,,,3.0
f,,0.0,


In [78]:
DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))

Unnamed: 0,one,three,two
f,,0.0,
a,0.0,1.0,
c,2.0,2.0,2.0
e,,,3.0


### 频率不同的时间序列的运算

In [79]:
ts1 = Series(np.random.randn(3),
                index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))
ts1

2012-06-13   -0.204708
2012-06-20    0.478943
2012-06-27   -0.519439
Freq: W-WED, dtype: float64

In [80]:
ts1.resample('B')

DatetimeIndexResampler [freq=<BusinessDay>, axis=0, closed=left, label=left, convention=start, base=0]

In [81]:
ts1.resample('B', fill_method='ffill')

the new syntax is .resample(...).ffill()
  """Entry point for launching an IPython kernel.


2012-06-13   -0.204708
2012-06-14   -0.204708
2012-06-15   -0.204708
2012-06-18   -0.204708
2012-06-19   -0.204708
2012-06-20    0.478943
2012-06-21    0.478943
2012-06-22    0.478943
2012-06-25    0.478943
2012-06-26    0.478943
2012-06-27   -0.519439
Freq: B, dtype: float64

In [82]:
dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18', '2012-6-21', '2012-6-22', '2012-6-29'])
ts2 = Series(np.random.randn(6), index=dates)
ts2

2012-06-12   -0.555730
2012-06-17    1.965781
2012-06-18    1.393406
2012-06-21    0.092908
2012-06-22    0.281746
2012-06-29    0.769023
dtype: float64

In [83]:
ts1.reindex(ts2.index, method='ffill')

2012-06-12         NaN
2012-06-17   -0.204708
2012-06-18   -0.204708
2012-06-21    0.478943
2012-06-22    0.478943
2012-06-29   -0.519439
dtype: float64

In [84]:
ts2 + ts1.reindex(ts2.index, method='ffill')

2012-06-12         NaN
2012-06-17    1.761073
2012-06-18    1.188698
2012-06-21    0.571851
2012-06-22    0.760689
2012-06-29    0.249584
dtype: float64

#### 使用Period

In [85]:
gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46], index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))

In [86]:
infl = Series([0.025, 0.045, 0.037, 0.04], index=pd.period_range('1982', periods=4, freq='A-DEC'))

In [87]:
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [88]:
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-DEC, dtype: float64

In [89]:
infl_q = infl.asfreq('Q-SEP', how='end')
infl_q

1983Q1    0.025
1984Q1    0.045
1985Q1    0.037
1986Q1    0.040
Freq: Q-SEP, dtype: float64

In [90]:
infl_q.reindex(gdp.index, method='ffill')

1984Q2    0.045
1984Q3    0.045
1984Q4    0.045
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.037
Freq: Q-SEP, dtype: float64

### 时间和“最当前”数据选取

In [92]:
rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1, 4)])

In [93]:
ts = Series(np.arange(len(rng), dtype=float), index=rng)
ts

2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
2012-06-01 09:35:00       5.0
2012-06-01 09:36:00       6.0
2012-06-01 09:37:00       7.0
2012-06-01 09:38:00       8.0
2012-06-01 09:39:00       9.0
                        ...  
2012-06-06 15:50:00    1550.0
2012-06-06 15:51:00    1551.0
2012-06-06 15:52:00    1552.0
2012-06-06 15:53:00    1553.0
2012-06-06 15:54:00    1554.0
2012-06-06 15:55:00    1555.0
2012-06-06 15:56:00    1556.0
2012-06-06 15:57:00    1557.0
2012-06-06 15:58:00    1558.0
2012-06-06 15:59:00    1559.0
Length: 1560, dtype: float64

In [94]:
from datetime import time
ts[time(10, 0)]

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [95]:
ts.at_time(time(10, 0))

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [96]:
ts.between_time(time(10, 0), time(10, 1))

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64

In [97]:
indexer = np.sort(np.random.permutation(len(ts))[700:])

In [98]:
irr_ts = ts.copy()
irr_ts[indexer] = np.nan
irr_ts['2012-06-01 09:50':'2012-06-01 10:00']

2012-06-01 09:50:00    20.0
2012-06-01 09:51:00     NaN
2012-06-01 09:52:00    22.0
2012-06-01 09:53:00     NaN
2012-06-01 09:54:00    24.0
2012-06-01 09:55:00    25.0
2012-06-01 09:56:00    26.0
2012-06-01 09:57:00     NaN
2012-06-01 09:58:00     NaN
2012-06-01 09:59:00    29.0
2012-06-01 10:00:00    30.0
dtype: float64

In [99]:
selection = pd.date_range('2012-06-01 10:00', periods=4, freq='B')
irr_ts.asof(selection)

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
Freq: B, dtype: float64

### 拼接多个数据源

In [100]:
data1 = DataFrame(np.ones((6, 3), dtype=float),
    columns=['a', 'b', 'c'],
    index=pd.date_range('6/12/2012', periods=6))

In [101]:
data2 = DataFrame(np.ones((6, 3), dtype=float) * 2, columns=['a', 'b', 'c'],
index=pd.date_range('6/13/2012', periods=6))

In [102]:
spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])
spliced

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c
2012-06-12,1.0,1.0,1.0
2012-06-13,1.0,1.0,1.0
2012-06-14,1.0,1.0,1.0
2012-06-15,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0


In [103]:
data2 = DataFrame(np.ones((6, 4), dtype=float) * 2, columns=['a', 'b', 'c', 'd'],
index=pd.date_range('6/13/2012', periods=6))

In [104]:
spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])
spliced

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,
2012-06-14,1.0,1.0,1.0,
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [105]:
spliced_filled = spliced.combine_first(data2)
spliced_filled

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [106]:
spliced.update(data2, overwrite=False)
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [107]:
cp_spliced = spliced.copy()
cp_spliced[['a', 'c']] = data1[['a', 'c']]
cp_spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,1.0,2.0,1.0,2.0
2012-06-16,1.0,2.0,1.0,2.0
2012-06-17,1.0,2.0,1.0,2.0
2012-06-18,,2.0,,2.0


### 收益指数和累积收益

In [113]:
# pip install fix_yahoo_finance
import pandas_datareader.data as web
import fix_yahoo_finance as yf  
yf.pdr_override()  
#import pandas.io.data as web

In [114]:
price = web.get_data_yahoo('AAPL', '2011-01-01')['Adj Close']
price[-5:]

[*********************100%***********************]  1 of 1 downloaded


Date
2018-08-22    215.050003
2018-08-23    215.490005
2018-08-24    216.160004
2018-08-27    217.940002
2018-08-28    219.699997
Name: Adj Close, dtype: float64

In [115]:
price['2011-10-03'] / price['2011-3-01'] - 1

0.07239996862860165

In [116]:
returns = price.pct_change()
ret_index = (1 + returns).cumprod()
ret_index[0] = 1
ret_index

Date
2010-12-31    1.000000
2011-01-03    1.021732
2011-01-04    1.027065
2011-01-05    1.035466
2011-01-06    1.034629
2011-01-07    1.042039
2011-01-10    1.061663
2011-01-11    1.059152
2011-01-12    1.067770
2011-01-13    1.071677
                ...   
2018-08-15    6.789440
2018-08-16    6.888905
2018-08-17    7.026476
2018-08-20    6.958014
2018-08-21    6.944450
2018-08-22    6.944773
2018-08-23    6.958983
2018-08-24    6.980619
2018-08-27    7.038102
2018-08-28    7.094939
Name: Adj Close, Length: 1928, dtype: float64

In [117]:
m_returns = ret_index.resample('BM', how='last').pct_change()
m_returns['2012']

the new syntax is .resample(...).last()
  """Entry point for launching an IPython kernel.


Date
2012-01-31    0.127111
2012-02-29    0.188311
2012-03-30    0.105284
2012-04-30   -0.025970
2012-05-31   -0.010702
2012-06-29    0.010852
2012-07-31    0.045822
2012-08-31    0.122802
2012-09-28    0.002796
2012-10-31   -0.107600
2012-11-30    0.015453
2012-12-31   -0.090743
Freq: BM, Name: Adj Close, dtype: float64

In [118]:
m_rets = (1 + returns).resample('M', how='prod', kind='period') - 1
m_rets['2012']

the new syntax is .resample(...).prod()
  """Entry point for launching an IPython kernel.


Date
2012-01    0.127111
2012-02    0.188311
2012-03    0.105284
2012-04   -0.025970
2012-05   -0.010702
2012-06    0.010852
2012-07    0.045822
2012-08    0.122802
2012-09    0.002796
2012-10   -0.107600
2012-11    0.015453
2012-12   -0.090743
Freq: M, Name: Adj Close, dtype: float64

In [120]:
#returns[dividend_dates] += dividend_pcts

## 分组变换和分析

In [123]:
import random; random.seed(0) 
import string
N = 1000
def rands(n):
    choices = string.ascii_uppercase
    return ''.join([random.choice(choices)  for _ in xrange(n)]) 
tickers = np.array([rands(5) for _ in xrange(N)])

In [124]:
M = 500
df = DataFrame({'Momentum' : np.random.randn(M) / 200 + 0.03,
    'Value' : np.random.randn(M) / 200 + 0.08, 'ShortInterest' : np.random.randn(M) / 200 - 0.02}, index=tickers[:M])

In [125]:
ind_names = np.array(['FINANCIAL', 'TECH'])
sampler = np.random.randint(0, len(ind_names), N) 
industries = Series(ind_names[sampler], index=tickers,
                        name='industry')

In [126]:
by_industry = df.groupby(industries)
by_industry.mean()

Unnamed: 0_level_0,Momentum,ShortInterest,Value
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FINANCIAL,0.029747,-0.019472,0.079945
TECH,0.029781,-0.020253,0.08038


In [127]:
by_industry.describe()

Unnamed: 0_level_0,Momentum,Momentum,Momentum,Momentum,Momentum,Momentum,Momentum,Momentum,ShortInterest,ShortInterest,ShortInterest,ShortInterest,ShortInterest,Value,Value,Value,Value,Value,Value,Value,Value
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
FINANCIAL,245.0,0.029747,0.005197,0.017282,0.026414,0.029308,0.033093,0.047588,245.0,-0.019472,...,-0.016036,-0.006443,245.0,0.079945,0.004852,0.066168,0.076265,0.080032,0.083147,0.091375
TECH,255.0,0.029781,0.005137,0.016069,0.026437,0.029606,0.033333,0.043895,255.0,-0.020253,...,-0.017017,-0.005357,255.0,0.08038,0.005056,0.065367,0.077087,0.080554,0.08346,0.095978


In [128]:
def zscore(group):
    return (group - group.mean()) / group.std() 
df_stand = by_industry.apply(zscore)

In [129]:
df_stand.groupby(industries).agg(['mean', 'std'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,mean,std,mean,std,mean,std
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,-5.877385e-16,1.0,-3.566308e-16,1.0,8.286569e-15,1.0
TECH,2.103328e-15,1.0,-3.793044e-15,1.0,2.351278e-15,1.0


In [130]:
ind_rank = by_industry.rank(ascending=False)
ind_rank.groupby(industries).agg(['min', 'max'])

Unnamed: 0_level_0,Momentum,Momentum,ShortInterest,ShortInterest,Value,Value
Unnamed: 0_level_1,min,max,min,max,min,max
industry,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
FINANCIAL,1.0,245.0,1.0,245.0,1.0,245.0
TECH,1.0,255.0,1.0,255.0,1.0,255.0


In [131]:
by_industry.apply(lambda x: zscore(x.rank()))

Unnamed: 0,Momentum,ShortInterest,Value
VTKGN,1.288025,-1.328699,-1.233792
KUHMP,-1.255828,0.973619,0.070552
XNHTQ,1.030061,1.425153,0.790184
GXZVX,0.155215,-0.733742,-0.084663
ISXRM,-1.368711,1.142944,0.225767
CLPXZ,0.732140,-1.613420,1.057536
MWGUO,1.532071,-1.410048,-0.799931
ASKVR,-1.552147,0.945398,-0.366871
AMWGI,-1.125327,-0.094907,0.921954
WEOGZ,0.352512,1.328699,-1.586304


### 分组因子暴露

In [132]:
from numpy.random import rand
fac1, fac2, fac3 = np.random.rand(3, 1000)
ticker_subset = tickers.take(np.random.permutation(N)[:1000])
# Weighted sum of factors plus noise
port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand(1000),
index=ticker_subset)
factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},
                            index=ticker_subset)

In [133]:
factors.corrwith(port)

f1    0.416542
f2   -0.673998
f3    0.189592
dtype: float64

In [144]:
#pd.stats.api.OLS(y=port, x=factors).beta

AttributeError: 'module' object has no attribute 'stats'

In [145]:
#def beta_exposure(chunk, factors=None):
#    return pd.ols(y=chunk, x=factors).beta

In [146]:
by_ind = port.groupby(industries)

In [148]:
#exposures = by_ind.apply(beta_exposure, factors=factors)
#exposures.unstack()

### 十分位和四分位分析

In [149]:
#import pandas.io.data as web

In [150]:
data = web.get_data_yahoo('SPY', '2006-01-01')
data

[*********************100%***********************]  1 of 1 downloaded


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2006-01-03,125.190002,127.000000,124.389999,126.699997,98.264160,73256700
2006-01-04,126.860001,127.489998,126.699997,127.300003,98.729507,51899600
2006-01-05,127.150002,127.589996,126.879997,127.379997,98.791534,47307500
2006-01-06,128.020004,128.580002,127.360001,128.440002,99.613632,62885900
2006-01-09,128.419998,129.059998,128.380005,128.770004,99.869576,43527400
2006-01-10,128.389999,128.979996,128.259995,128.899994,99.970367,44960800
2006-01-11,129.020004,129.440002,128.729996,129.309998,100.288345,49598900
2006-01-12,129.080002,129.279999,128.440002,128.800003,99.892838,40509200
2006-01-13,128.570007,128.899994,128.199997,128.679993,99.799782,44856700
2006-01-17,128.199997,128.419998,127.809998,128.330002,99.528313,52066600


In [151]:
px = data['Adj Close'] 
returns = px.pct_change()

In [154]:
def to_index(rets):
    index = (1 + rets).cumprod()
    first_loc = max(index.notnull().argmax() - 1, 0) 
    index.values[first_loc] = 1
    return index

def trend_signal(rets, lookback, lag):
    #signal = pd.rolling(rets, lookback, min_periods=lookback - 5).sum()
    return signal.shift(lag)

In [158]:
#signal = trend_signal(returns, 100, 3)

In [157]:
#trade_friday = signal.resample('W-FRI').resample('B', fill_method='ffill')
#to_index(trade_rets).plot()

In [161]:
#vol = pd.rolling_std(returns, 250, min_periods=200) * np.sqrt(250)
def sharpe(rets, ann=250):
    return rets.mean() / rets.std() * np.sqrt(ann)

In [162]:
#trade_rets.groupby(pd.qcut(vol, 4)).agg(sharpe)

## 更多示例应用

### 信号前沿分析

In [164]:
names = ['AAPL', 'GOOG', 'MSFT', 'DELL', 'GS', 'MS', 'BAC', 'C'] 
def get_px(stock, start, end):
    return web.get_data_yahoo(stock, start, end)['Adj Close']
#px = DataFrame({n: get_px(n, '1/1/2009', '6/1/2012') for n in names})

In [165]:
#px = px.asfreq('B').fillna(method='pad')
#rets = px.pct_change()
#((1 + rets).cumprod() - 1).plot()

In [166]:
def calc_mom(price, lookback, lag):
    mom_ret = price.shift(lag).pct_change(lookback) 
    ranks = mom_ret.rank(axis=1, ascending=False) 
    demeaned = ranks - ranks.mean(axis=1)
    return demeaned / demeaned.std(axis=1)

In [167]:
compound = lambda x : (1 + x).prod() - 1 
daily_sr = lambda x: x.mean() / x.std()

In [168]:
def strat_sr(prices, lb, hold):
    # Compute portfolio weights
    freq = '%dB' % hold
    port = calc_mom(prices, lb, lag=1)
    daily_rets = prices.pct_change()
    # Compute portfolio returns
    port = port.shift(1).resample(freq, how='first') 
    returns = daily_rets.resample(freq, how=compound) 
    port_rets = (port * returns).sum(axis=1)
    return daily_sr(port_rets) * np.sqrt(252 / hold)

In [170]:
#strat_sr(px, 70, 30)

In [172]:
'''
from collections import defaultdict
lookbacks = range(20, 90, 5) 
holdings = range(20, 90, 5) 
dd = defaultdict(dict)
for lb in lookbacks:
    for hold in holdings:
        dd[lb][hold] = strat_sr(px, lb, hold)
ddf = DataFrame(dd)
ddf.index.name = 'Holding Period' 
ddf.columns.name = 'Lookback Period'
'''

'\nfrom collections import defaultdict\nlookbacks = range(20, 90, 5) \nholdings = range(20, 90, 5) \ndd = defaultdict(dict)\nfor lb in lookbacks:\n    for hold in holdings:\n        dd[lb][hold] = strat_sr(px, lb, hold)\n'

In [173]:
import matplotlib.pyplot as plt
def heatmap(df, cmap=plt.cm.gray_r):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    axim = ax.imshow(df.values, cmap=cmap, interpolation='nearest') 
    ax.set_xlabel(df.columns.name) 
    ax.set_xticks(np.arange(len(df.columns))) 
    ax.set_xticklabels(list(df.columns)) 
    ax.set_ylabel(df.index.name) 
    ax.set_yticks(np.arange(len(df.index))) 
    ax.set_yticklabels(list(df.index))
    plt.colorbar(axim)

In [175]:
#heatmap(ddf)

### 期货合约转仓

In [176]:
#import pandas.io.data as web

In [178]:
px = web.get_data_yahoo('SPY')['Adj Close'] * 10
px

[*********************100%***********************]  1 of 1 downloaded


Date
1993-01-29     272.34995
1993-02-01     274.28684
1993-02-02     274.86790
1993-02-03     277.77365
1993-02-04     278.93591
1993-02-05     278.74176
1993-02-08     278.74176
1993-02-09     276.80479
1993-02-10     277.19221
1993-02-11     278.54851
                 ...    
2018-08-15    2817.79999
2018-08-16    2840.59998
2018-08-17    2850.59998
2018-08-20    2856.70013
2018-08-21    2863.39996
2018-08-22    2861.70013
2018-08-23    2857.90009
2018-08-24    2875.10010
2018-08-27    2897.79999
2018-08-28    2899.20013
Name: Adj Close, Length: 6443, dtype: float64

In [180]:
from datetime import datetime
expiry = {'ESU2': datetime(2012, 9, 21),
    'ESZ2': datetime(2012, 12, 21)} 
#expiry = Series(expiry).order()
expiry

{'ESU2': datetime.datetime(2012, 9, 21, 0, 0),
 'ESZ2': datetime.datetime(2012, 12, 21, 0, 0)}

In [181]:
np.random.seed(12347)
N = 200
walk = (np.random.randint(0, 200, size=N) - 100) * 0.25 
perturb = (np.random.randint(0, 20, size=N) - 10) * 0.25 
walk = walk.cumsum()

rng = pd.date_range(px.index[0], periods=len(px) + N, freq='B') 
near = np.concatenate([px.values, px.values[-1] + walk])
far = np.concatenate([px.values, px.values[-1] + walk + perturb]) 
prices = DataFrame({'ESU2': near, 'ESZ2': far}, index=rng)

In [182]:
prices.tail()

Unnamed: 0,ESU2,ESZ2
2018-07-11,2928.45013,2930.20013
2018-07-12,2914.70013,2916.95013
2018-07-13,2922.70013,2924.45013
2018-07-16,2939.20013,2938.45013
2018-07-17,2919.20013,2916.95013


In [183]:
def get_roll_weights(start, expiry, items, roll_periods=5): 
    # start : first date to compute weighting DataFrame
    # expiry : Series of ticker -> expiration dates
    # items : sequence of contract names
    dates = pd.date_range(start, expiry[-1], freq='B') 
    weights = DataFrame(np.zeros((len(dates), len(items))),
        index=dates, columns=items)
    prev_date = weights.index[0]
    for i, (item, ex_date) in enumerate(expiry.iteritems()):
        if i < len(expiry) - 1:
            weights.ix[prev_date:ex_date - pd.offsets.BDay(), item] = 1 
            roll_rng = pd.date_range(end=ex_date - pd.offsets.BDay(),
                        periods=roll_periods + 1, freq='B')
            decay_weights = np.linspace(0, 1, roll_periods + 1) 
            weights.ix[roll_rng, item] = 1 - decay_weights 
            weights.ix[roll_rng, expiry.index[i + 1]] = decay_weights
        else:
            weights.ix[prev_date:, item] = 1
        prev_date = ex_date
    return weights

In [186]:
#weights = get_roll_weights('6/1/2012', expiry, prices.columns)
#weights.ix['2012-09-12':'2012-09-21']

### 移动相关系数与线性回归

In [187]:
aapl = web.get_data_yahoo('AAPL', '2000-01-01')['Adj Close'] 
msft = web.get_data_yahoo('MSFT', '2000-01-01')['Adj Close']
aapl_rets = aapl.pct_change() 
msft_rets = msft.pct_change()

[*********************100%***********************]  1 of 1 downloaded
[*********************100%***********************]  1 of 1 downloaded


In [189]:
#pd.rolling_corr(aapl_rets, msft_rets, 250).plot()

In [190]:
#model = pd.ols(y=aapl_rets, x={'MSFT': msft_rets}, window=250)
#model.beta

In [None]:
#model.beta['MSFT'].plot()