# 股票交易数据分析

In [201]:
%pwd

u'C:\\Users\\Joey-YongChang.Huang@cn.abb.com\\kamidox\\work\\stock-data'

In [202]:
import pandas as pd
import numpy as np

## 原始数据: 2000 年 - 2009 年 5 分钟历史成交数据

需要确保原始数据放在 raw 目录下，且每个年份单独一个目录。搜索 ghancn 可以免费下载 2009 年之前的数据。2009 年之后的需要购买。

目前的免费数据质量不高，里面有不少错误。具体参阅 README.md。

In [203]:
%ls raw

 Volume in drive C has no label.
 Volume Serial Number is D0E7-324C

 Directory of C:\Users\Joey-YongChang.Huang@cn.abb.com\kamidox\work\stock-data\raw

2015-11-13  17:03    <DIR>          .
2015-11-13  17:03    <DIR>          ..
2015-11-13  15:29    <DIR>          2007
2015-11-12  14:24    <DIR>          2008
               0 File(s)              0 bytes
               4 Dir(s)  62,529,323,008 bytes free


In [204]:
# Column layout of the header-less 5-minute CSV files.
names = [
    'date', 'time',
    'opening_price', 'ceiling_price', 'floor_price', 'closing_price',
    'volume', 'amount',
]
# Index the rows by the date column and let pandas parse it into datetimes.
raw = pd.read_csv(
    'raw/2008/SH600690.csv',
    header=None,
    names=names,
    index_col='date',
    parse_dates=True,
)
raw.head()

Unnamed: 0_level_0,time,opening_price,ceiling_price,floor_price,closing_price,volume,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2008-01-02,09:35,22.5,22.63,22.5,22.51,2042.5,4604723
2008-01-02,09:40,22.51,22.51,22.29,22.37,1545.17,3460503
2008-01-02,09:45,22.39,22.62,22.38,22.62,1744.76,3921443
2008-01-02,09:50,22.6,23.0,22.6,22.95,5339.0,12225939
2008-01-02,09:55,22.98,23.2,22.89,23.2,12577.73,28947824


### 转化为日交易数据

In [205]:
# 股票涨跌幅检查，不能超过 10% ，过滤掉一些不合法的数据
def _valid_price(g):
    return (((g.max() - g.min()) / g.min()) < 0.223).all()

# Collapse the 5-minute rows into one row per date (the date index is level 0).
# A price column that fails _valid_price for a date is reported as 0.
days = raw.groupby(level=0).agg({
    # .iloc is positional, so the first / last row of the day is picked
    # regardless of the date labels in the index (plain g[0] / g[-1] relies
    # on a deprecated integer-to-position fallback).
    'opening_price': lambda g: g.iloc[0] if _valid_price(g) else 0,
    'ceiling_price': lambda g: g.max() if _valid_price(g) else 0,
    'floor_price': lambda g: g.min() if _valid_price(g) else 0,
    'closing_price': lambda g: g.iloc[-1] if _valid_price(g) else 0,
    'volume': 'sum',
    'amount': 'sum',
})
days.head()

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-02,22.29,22.5,24.5,200809.34,476179680,24.03
2008-01-03,23.81,24.03,25.2,166037.98,406906304,24.54
2008-01-04,23.68,24.53,24.76,149078.64,358418560,24.17
2008-01-07,23.75,24.03,24.75,93950.43,227289136,24.38
2008-01-08,23.49,24.38,24.38,149056.24,355752416,23.53


### 合并数据

可以参阅 stock.py 里的 `minutes_to_days_batch()` 函数。把所有的数据转化为日交易数据，然后以股票代号为文件名保存在 `data` 目录下。

## 选股

什么股票是好股票？要回答这个问题，先要把最简单的问题说清楚。炒股就是低买高卖，实现获利。那么好股票的标准就是在你的持股周期内，**波动最大的股票**。这很好理解吧，波动最大，我们才有可能在相对低点买入，在相对高点卖出，获利最大。

在一定的时间周期内，**衡量股票波动的指标定义为 最高价/最低价**。以我们表格中的数据，就是 ceiling_price/floor_price。这个比率最大的股票就是好股票。

关于时间周期，这个和炒股策略有关。有些人喜欢做短线，可能就持股几天，或一两周。有些人习惯做长线，可能持股几个月甚至几年。

有了这个思路，我们就可以玩转已经转换为日交易数据的股票，选出近期波动最大的股票。假设我们的目标是**选出一个月内波动最大的股票**。我们看一下如何用 pandas 实现这个目标。

### 过滤数据

我们先要按照考查周期来过滤数据。为了简单起见，我们假设波动周期是 30 个自然日；如果某个股票停牌，那么它的价格就一直没有变化，即停牌期间没有波动。

这里，我们直接使用青岛海尔 600690 这个股票来作为示例。我们直接读取已经合并过的数据。

In [231]:
qdhr = pd.read_csv('test-data/SH600690.csv', index_col='date', parse_dates=True)
qdhr.head()

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-01-04,9.28,9.3,10.14,259264.75,254734000,9.8
2007-01-05,9.53,9.7,10.15,171169.97,170154432,9.9
2007-01-08,9.93,9.93,10.78,159340.58,164954896,10.6
2007-01-09,10.08,10.68,11.15,227163.31,246309216,10.55
2007-01-10,10.26,10.49,11.13,232858.18,246221520,11.1


In [232]:
# Build a gap-free calendar-day index spanning the first to the last row of
# the data, so that the missing (non-trading) days can be filled in afterwards.
# The first and last rows are taken in file order -- this assumes the CSV is
# sorted by date.
start = qdhr.index[0]
end = qdhr.index[-1]
idx = pd.date_range(start=start, end=end)
idx

DatetimeIndex(['2007-01-04', '2007-01-05', '2007-01-06', '2007-01-07',
               '2007-01-08', '2007-01-09', '2007-01-10', '2007-01-11',
               '2007-01-12', '2007-01-13',
               ...
               '2008-12-22', '2008-12-23', '2008-12-24', '2008-12-25',
               '2008-12-26', '2008-12-27', '2008-12-28', '2008-12-29',
               '2008-12-30', '2008-12-31'],
              dtype='datetime64[ns]', length=728, freq='D')

In [233]:
# Fill the calendar gaps: days without a trade inherit the previous row's
# prices, while their volume and amount are set to 0.
data = qdhr.reindex(idx)
# Rows without a positive volume, including the NaN rows created by reindex.
no_trade = ~(data.volume > 0)
zvalues = data.loc[no_trade, ['volume', 'amount']].fillna(0)
data.update(zvalues)
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
data = data.ffill()
data.head()

Unnamed: 0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
2007-01-04,9.28,9.3,10.14,259264.75,254734000,9.8
2007-01-05,9.53,9.7,10.15,171169.97,170154432,9.9
2007-01-06,9.53,9.7,10.15,0.0,0,9.9
2007-01-07,9.53,9.7,10.15,0.0,0,9.9
2007-01-08,9.93,9.93,10.78,159340.58,164954896,10.6


In [234]:
import numpy as np

# Build the group labels used to bucket rows into fixed-size periods:
# e.g. with a period of 20, every 20 consecutive rows share one label.
def gen_item_group_index(total, group_len):
    """Generate a group label for each of ``total`` consecutive items.

    Items are numbered 0 .. total - 1 and item ``k`` gets the label
    ``k // group_len``; the last group is shorter when ``total`` is not a
    multiple of ``group_len``.

    For example, total = 10 and group_len = 2 yields
    [0, 0, 1, 1, 2, 2, 3, 3, 4, 4].

    :param total: number of items to label.
    :param group_len: number of items per group, must be >= 1.
    :returns: list of ``total`` integer labels.
    :raises ValueError: if ``group_len`` is smaller than 1.
    """
    if group_len < 1:
        raise ValueError('group_len must be >= 1, got %r' % (group_len,))
    # Floor division yields the same labels as filling slice by slice, does
    # not depend on Python 2 / 3 division semantics, and also covers
    # total < group_len, where a slice-filling loop would never run.
    return (np.arange(total) // group_len).tolist()

gen_item_group_index(10, 3)

[0, 0, 0, 1, 1, 1, 2, 2, 2, 3]

In [235]:
period = 30

group_index = gen_item_group_index(len(data), period)
# Attach the group label of every row to the stock data.
data['group_index'] = group_index
# The function-call form prints the same under Python 2 and Python 3.
print(len(data))
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([data.head(), data.tail()])

728


Unnamed: 0,floor_price,opening_price,ceiling_price,volume,amount,closing_price,group_index
2007-01-04,9.28,9.3,10.14,259264.75,254734000,9.8,0
2007-01-05,9.53,9.7,10.15,171169.97,170154432,9.9,0
2007-01-06,9.53,9.7,10.15,0.0,0,9.9,0
2007-01-07,9.53,9.7,10.15,0.0,0,9.9,0
2007-01-08,9.93,9.93,10.78,159340.58,164954896,10.6,0
2008-12-27,8.97,9.15,9.23,0.0,0,9.08,24
2008-12-28,8.97,9.15,9.23,0.0,0,9.08,24
2008-12-29,8.73,9.04,9.15,38576.07,34625144,9.11,24
2008-12-30,8.95,9.14,9.14,62983.38,56876600,8.96,24
2008-12-31,8.95,9.0,9.11,32829.3,29620508,8.99,24


In [236]:
# 针对下跌的波动，我们把最高价设置为负数
def _ceiling_price(g):
    return g.idxmin() < g.idxmax() and np.max(g) or (-np.max(g))
    

# Aggregate each period (rows sharing a group_index): total volume, lowest
# floor price, and the ceiling price as returned by _ceiling_price.
group = data.groupby('group_index').agg({
                                        'volume': 'sum', 
                                        'floor_price': 'min', 
                                        'ceiling_price': _ceiling_price})
group

Unnamed: 0_level_0,volume,ceiling_price,floor_price
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4634226.68,-12.38,9.02
1,3499001.47,11.64,8.8
2,6061972.34,12.79,9.41
3,6086797.19,15.5,12.0
4,5687407.73,17.15,13.49
5,4817152.7,18.89,11.85
6,2828241.75,18.03,14.4
7,4550833.2,23.96,16.42
8,2388246.85,28.37,22.0
9,2134361.76,-25.8,19.04


In [237]:
# Attach the first calendar date of every group.
date_col = pd.DataFrame({"group_index": group_index, "date": idx})
group['date'] = date_col.groupby('group_index')['date'].first()
group.head()

Unnamed: 0_level_0,volume,ceiling_price,floor_price,date
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,4634226.68,-12.38,9.02,2007-01-04
1,3499001.47,11.64,8.8,2007-02-03
2,6061972.34,12.79,9.41,2007-03-05
3,6086797.19,15.5,12.0,2007-04-04
4,5687407.73,17.15,13.49,2007-05-04


In [238]:
# Add the ripple indicator: ripple ratio = highest price / lowest price.
# The sign of ceiling_price carries over into the ratio.
# NOTE(review): "radio" in the column name looks like a typo for "ratio"; the
# same column name shows up in the stock.py results, so it is left unchanged.
group['ripples_radio'] = group.ceiling_price / group.floor_price
group.head()

Unnamed: 0_level_0,volume,ceiling_price,floor_price,date,ripples_radio
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,4634226.68,-12.38,9.02,2007-01-04,-1.372506
1,3499001.47,11.64,8.8,2007-02-03,1.322727
2,6061972.34,12.79,9.41,2007-03-05,1.359192
3,6086797.19,15.5,12.0,2007-04-04,1.291667
4,5687407.73,17.15,13.49,2007-05-04,1.271312


In [239]:
# Sort by ripple ratio, largest first. The start date and total volume of each
# group are listed alongside, which also lets us look at how volume relates to
# the ripple ratio.
ripples = group.sort_values('ripples_radio', ascending=False)
ripples.head()

Unnamed: 0_level_0,volume,ceiling_price,floor_price,date,ripples_radio
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,4817152.7,18.89,11.85,2007-06-03,1.594093
7,4550833.2,23.96,16.42,2007-08-02,1.459196
11,3014588.11,23.44,16.4,2007-11-30,1.429268
2,6061972.34,12.79,9.41,2007-03-05,1.359192
18,2813493.25,11.31,8.46,2008-06-27,1.336879


In [241]:
# Average the 10 largest rising ripples and use that as this stock's ripple value.
# Comparing this value across all stocks is then how the best stock is picked.
# Rising periods are the rows with a positive ratio; filtering on the sign first
# keeps falling periods out of the mean when fewer than 10 periods rose.
ripples[ripples.ripples_radio > 0].head(10).ripples_radio.mean()

1.3657990069195818

In [242]:
# Mean of the 10 largest falling ripples. Falling periods carry a negative
# ratio and therefore sort to the end; filtering on the sign first keeps rising
# periods out of the mean when fewer than 10 periods fell.
ripples[ripples.ripples_radio < 0].tail(10).ripples_radio.mean()

-1.4124407127785106

### 计算涨跌幅

In [187]:
data = pd.read_csv('test-data/SZ000565.csv', index_col='date', parse_dates=True)
data.head()

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2007-01-04,4.16,4.22,4.27,17877.88,7477370.52,4.19
2007-01-05,4.15,4.16,4.27,10857.66,4588246.02,4.24
2007-01-08,4.27,4.27,4.45,30770.01,13467986.0,4.44
2007-01-09,4.42,4.48,4.54,26276.89,11726492.0,4.45
2007-01-10,4.36,4.45,4.9,80840.76,37866240.01,4.9


In [193]:
# Row-over-row change of the closing price; the first row has no predecessor
# and therefore comes out as NaN.
rise = data.closing_price.diff()
rise.head()

date
2007-01-04     NaN
2007-01-05    0.05
2007-01-08    0.20
2007-01-09    0.01
2007-01-10    0.45
Name: closing_price, dtype: float64

In [196]:
# The first row has nothing to compare against; set its change to 0.
rise.iloc[0] = 0
rise.head()

date
2007-01-04    0.00
2007-01-05    0.05
2007-01-08    0.20
2007-01-09    0.01
2007-01-10    0.45
Name: closing_price, dtype: float64

In [198]:
data['rise'] = rise
data.head()

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price,rise
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2007-01-04,4.16,4.22,4.27,17877.88,7477370.52,4.19,0.0
2007-01-05,4.15,4.16,4.27,10857.66,4588246.02,4.24,0.05
2007-01-08,4.27,4.27,4.45,30770.01,13467986.0,4.44,0.2
2007-01-09,4.42,4.48,4.54,26276.89,11726492.0,4.45,0.01
2007-01-10,4.36,4.45,4.9,80840.76,37866240.01,4.9,0.45


### 计算指定时间点之前的一段时间内波动最大的股票

比如，我们有时候想知道最近一个月内上涨最多的股票，或者最近一个月内下跌最多的股票

In [248]:
period = 30
# The window ends on end_date and reaches `period` calendar days back.
end_date = pd.Timestamp('2008-12-31')
start_date = end_date - pd.Timedelta(days=period)

# Load the daily data and keep only the rows inside the window.
data = pd.read_csv('test-data/SZ000565.csv', index_col='date', parse_dates=True).loc[start_date:end_date]
data

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-12-01,7.4,7.58,7.9,41747.12,32146100.0,7.88
2008-12-02,7.55,7.56,8.38,74552.15,60296610.0,8.32
2008-12-03,8.4,8.4,8.93,85361.64,74200820.0,8.82
2008-12-04,8.42,8.88,9.08,110410.46,97406100.0,8.5
2008-12-05,8.33,8.4,9.35,126479.91,113357200.0,9.35
2008-12-08,9.35,9.4,9.99,149491.39,143603800.0,9.69
2008-12-09,9.1,9.73,9.73,89871.9,84052300.0,9.15
2008-12-10,9.09,9.11,9.55,70036.94,65713890.0,9.46
2008-12-11,9.06,9.4,9.47,57735.24,53284680.0,9.06
2008-12-12,8.15,8.8,9.0,59210.49,50380260.0,8.29


In [251]:
# Ripple value of the selected window: highest ceiling price / lowest floor price.
def _ripple_radio(data):
    """Return max(ceiling_price) / min(floor_price) of ``data``."""
    return data.ceiling_price.max() / data.floor_price.min()

# The window counts as rising when the lowest floor price comes before the
# highest ceiling price; otherwise the ratio is reported as a negative number.
# A conditional expression replaces the fragile ``and ... or`` idiom.
_rising = data.floor_price.idxmin() < data.ceiling_price.idxmax()
ripple_radio = _ripple_radio(data) if _rising else -_ripple_radio(data)
ripple_radio

-1.4394812680115274

最后，遍历所有的股票，计算其指定日期之前的一段时间的波动值，选出波动最大的股票，即是我们关注的股票

## 使用 stock.py 来过滤数据

In [256]:
import stock as st
reload(st)

<module 'stock' from 'stock.pyc'>

### 获取指定股票的历史所有波动数据

In [253]:
ripples = st.stock_ripples('test-data/SZ000565.csv', period=30)

mean ripples range on top 10 in period of 30 for test-data/SZ000565.csv: 1.5747


In [257]:
# Show the first ten rows of the ripple table returned by st.stock_ripples.
ripples.head(10)

Unnamed: 0_level_0,volume,ceiling_price,floor_price,ripples_radio,date
group_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13,684527.13,53.4,31.41,1.700096,2008-01-29
22,1263764.46,9.1,5.43,1.675875,2008-10-25
5,1683845.12,15.0,9.23,1.625135,2007-06-03
11,750365.28,32.0,19.9,1.60804,2007-11-30
15,982090.48,43.62,27.7,1.574729,2008-03-29
6,1872200.46,19.1,12.21,1.564292,2007-07-03
1,697691.1,8.12,5.25,1.546667,2007-02-03
0,1014832.08,6.29,4.15,1.515663,2007-01-04
4,1679180.0,15.6,10.48,1.48855,2007-05-04
12,994532.42,42.99,29.7,1.447475,2007-12-30


### 获取某个波动数据日交易记录

In [258]:
# Sometimes we want to look at the underlying daily records of one particular
# ripple period of a stock.
stock_file = 'test-data/SZ000565.csv'
st.ripple_raw_data(stock_file, ripple_idx=0, days=30)

mean ripples range on top 10 in period of 30 for test-data/SZ000565.csv: 1.5747


Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-29,35.22,37.2,38.4,31080.85,114278600.0,36.8
2008-01-30,35.2,36.9,37.8,32685.97,119162400.0,36.17
2008-01-31,34.68,36.0,36.99,16936.99,61100520.0,34.9
2008-02-01,31.41,35.11,35.7,37800.12,125568300.0,33.96
2008-02-04,34.23,34.23,36.98,28833.04,103536100.0,36.45
2008-02-05,34.88,36.6,37.79,22899.41,83959020.0,37.25
2008-02-13,36.5,37.29,40.5,38630.03,152748400.0,39.68
2008-02-14,39.8,39.8,41.37,24816.46,101289300.0,40.89
2008-02-15,40.3,40.9,44.98,58228.07,248587900.0,43.92
2008-02-18,43.0,43.0,45.98,37877.2,169736200.0,44.64


### 获取所有股票在指定时间点之前的一定时间内的波动排行榜

In [264]:
# Ripple ranking of the stocks under test-data for the 30 days up to end_date
# (as reported by st.recent_ripples).
ripples = st.recent_ripples(basedir='test-data', end_date='2007-11-30', period=30)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([ripples.head(10), ripples.tail(10)])

head 5 recent ripples in period of 30 for all stocks in test-data till 2007-11-30 00:00:00:
    stock_id   ripples
6   SH600689  1.460396
10  SH600693  1.363095
18  SZ000565  1.339080
4   SH600687  1.327731
8   SH600691  1.315024
tail 5 recent ripples in period of 30 for all stocks in test-data till 2007-11-30 00:00:00:
    stock_id   ripples
5   SH600688 -1.393103
2   SH600685 -1.422372
16  SZ000563 -1.440594
12  SZ000001 -1.457738
15  SZ000562 -1.612127


Unnamed: 0,stock_id,ripples
6,SH600689,1.460396
10,SH600693,1.363095
18,SZ000565,1.33908
4,SH600687,1.327731
8,SH600691,1.315024
20,SZ000567,1.263158
19,SZ000566,1.187879
9,SH600692,1.16185
17,SZ000564,-1.130506
1,SH600273,-1.206849


### 获取指定股票在指定时间点之前的一定时间内的交易记录

In [323]:
# Re-import the stock module before calling it (presumably to pick up edits
# made to stock.py while this notebook was open).
reload(st)
# Daily records of one stock for the period leading up to end_date.
days = st.row_data('test-data/SZ000565.csv', end_date='2007-11-30', period=30)
days

Unnamed: 0_level_0,floor_price,opening_price,ceiling_price,volume,amount,closing_price,rise,rise_ratio
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007-10-31,17.4,17.68,18.85,68564.6,125930200.0,18.77,1.17,0.066477
2007-11-01,18.2,18.7,19.35,46222.86,86826500.0,18.2,-0.57,-0.030368
2007-11-02,17.65,17.95,20.02,99849.0,195159400.0,20.02,1.82,0.1
2007-11-05,19.53,20.1,21.18,75735.5,154037400.0,19.67,-0.35,-0.017483
2007-11-06,19.55,19.55,21.38,50352.08,103503500.0,20.85,1.18,0.05999
2007-11-07,20.34,20.51,21.09,26222.33,54403490.0,20.6,-0.25,-0.01199
2007-11-08,18.9,20.5,20.66,38444.63,75386720.0,19.03,-1.57,-0.076214
2007-11-09,18.02,18.7,19.1,27578.91,51353280.0,18.8,-0.23,-0.012086
2007-11-12,18.1,18.2,19.49,20073.82,37601000.0,19.29,0.49,0.026064
2007-11-13,19.2,19.2,20.49,32208.06,64240500.0,19.95,0.66,0.034215
