## 交易明显数据（分笔数据）重采样

In [24]:
import tushare as ts
import pandas as pd

In [25]:
df = ts.get_tick_data('600728', date='2018-11-09',retry_count=5, pause=0.01,src='tt')

In [27]:
df.tail(10)

Unnamed: 0,time,price,change,volume,amount,type
0,09:25:02,6.97,0.05,2493,1737621,卖盘
1,09:30:02,6.96,-0.01,1300,905216,卖盘
2,09:30:05,6.96,0.0,619,429454,买盘
3,09:30:08,6.93,-0.03,116,80148,卖盘
4,09:30:11,6.93,0.0,44,30471,买盘
5,09:30:20,6.96,0.03,155,107655,买盘
6,09:30:23,6.91,-0.05,3,2073,中性盘
7,09:30:29,6.91,0.0,8,5528,卖盘
8,09:30:30,6.91,0.0,99,68685,卖盘
9,09:30:35,6.91,0.0,1,691,卖盘


In [4]:
df['time'] = '2018-11-09 ' + df['time']   #时间前面加上日期，不用for，整列统一加上
df['time'] = pd.to_datetime(df['time'])   #将整列从文本变成时间格式

In [5]:
df.tail(10)

Unnamed: 0,time,price,change,volume,amount,type
2414,2018-11-09 14:56:34,6.92,0.01,136,93991,买盘
2415,2018-11-09 14:56:38,6.92,0.0,20,13840,买盘
2416,2018-11-09 14:56:41,6.92,0.0,272,188072,买盘
2417,2018-11-09 14:56:43,6.92,0.0,172,119010,买盘
2418,2018-11-09 14:56:47,6.91,-0.01,9,6219,卖盘
2419,2018-11-09 14:56:50,6.92,0.01,50,34600,买盘
2420,2018-11-09 14:56:53,6.91,-0.01,180,124519,卖盘
2421,2018-11-09 14:56:56,6.92,0.01,106,73343,买盘
2422,2018-11-09 14:56:59,6.92,0.0,168,116245,买盘
2423,2018-11-09 15:00:03,6.9,-0.02,2561,1767090,卖盘


## 将转换格式后的时间列设置为index

In [6]:
df = df.set_index('time') # 将转换格式后的时间列设置为index

In [7]:
df.tail(10)

Unnamed: 0_level_0,price,change,volume,amount,type
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-11-09 14:56:34,6.92,0.01,136,93991,买盘
2018-11-09 14:56:38,6.92,0.0,20,13840,买盘
2018-11-09 14:56:41,6.92,0.0,272,188072,买盘
2018-11-09 14:56:43,6.92,0.0,172,119010,买盘
2018-11-09 14:56:47,6.91,-0.01,9,6219,卖盘
2018-11-09 14:56:50,6.92,0.01,50,34600,买盘
2018-11-09 14:56:53,6.91,-0.01,180,124519,卖盘
2018-11-09 14:56:56,6.92,0.01,106,73343,买盘
2018-11-09 14:56:59,6.92,0.0,168,116245,买盘
2018-11-09 15:00:03,6.9,-0.02,2561,1767090,卖盘


In [8]:
##新建一个价格dataframe,然后自动将分笔数据变成1分钟数据，还加上了时间列
price_df = df['price'].resample('1min').ohlc() 

In [9]:
price_df = price_df.dropna() #删除空值 

In [10]:
price_df.head(10)

Unnamed: 0_level_0,open,high,low,close
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-11-09 09:25:00,6.97,6.97,6.97,6.97
2018-11-09 09:30:00,6.96,6.96,6.9,6.96
2018-11-09 09:31:00,6.92,6.96,6.92,6.93
2018-11-09 09:32:00,6.96,6.96,6.94,6.95
2018-11-09 09:33:00,6.96,6.96,6.95,6.96
2018-11-09 09:34:00,6.95,6.98,6.95,6.98
2018-11-09 09:35:00,6.98,6.98,6.97,6.98
2018-11-09 09:36:00,6.98,6.98,6.96,6.96
2018-11-09 09:37:00,6.97,6.98,6.96,6.98
2018-11-09 09:38:00,6.97,6.98,6.97,6.98


In [11]:
vols = df['volume'].resample('1min').sum() #从分笔数据计算分钟数据成交量

In [12]:
vols = vols.dropna() #去掉空值

In [13]:
vol_df = pd.DataFrame(vols, columns=['volume'])

In [14]:
amounts = df['amount'].resample('1min').sum()

In [15]:
amounts = amounts.dropna()

In [16]:
amount_df = pd.DataFrame(amounts, columns=['amount'])

# 根据index对几个dataframe进行合并，这里最重要。

In [17]:
newdf = price_df.merge(vol_df, left_index=True, 
                       right_index=True).merge(amount_df, 
                                               left_index=True, 
                                               right_index=True) 

In [18]:
newdf.tail(20)

Unnamed: 0_level_0,open,high,low,close,volume,amount
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-09 14:38:00,6.96,6.96,6.95,6.96,951,661524
2018-11-09 14:39:00,6.96,6.97,6.96,6.97,1008,701845
2018-11-09 14:40:00,6.96,6.96,6.94,6.95,1155,802882
2018-11-09 14:41:00,6.94,6.95,6.94,6.95,569,395049
2018-11-09 14:42:00,6.95,6.96,6.95,6.96,965,671167
2018-11-09 14:43:00,6.96,6.97,6.95,6.96,145,100821
2018-11-09 14:44:00,6.97,6.97,6.96,6.96,488,340005
2018-11-09 14:45:00,6.96,6.96,6.95,6.95,70,48704
2018-11-09 14:46:00,6.93,6.96,6.93,6.94,1545,1071975
2018-11-09 14:47:00,6.94,6.95,6.94,6.94,187,129808


In [19]:
d_dict = {'open':'first',
          'high':'max',
          'close':'last',
          'low':'min',
          'volume':'sum', 
          'amount':'sum'}

In [20]:
new = pd.DataFrame()
for col in newdf.columns:
    new[col] = newdf[col].resample('5min',how=d_dict[col])

the new syntax is .resample(...).first()
  This is separate from the ipykernel package so we can avoid doing imports until
the new syntax is .resample(...).max()
  This is separate from the ipykernel package so we can avoid doing imports until
the new syntax is .resample(...).min()
  This is separate from the ipykernel package so we can avoid doing imports until
the new syntax is .resample(...).last()
  This is separate from the ipykernel package so we can avoid doing imports until
the new syntax is .resample(...).sum()
  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
new.tail(20)

Unnamed: 0_level_0,open,high,low,close,volume,amount
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-11-09 13:25:00,6.97,6.99,6.97,6.99,1044,729008
2018-11-09 13:30:00,6.98,7.01,6.98,6.99,2753,1928380
2018-11-09 13:35:00,6.99,7.02,6.99,7.01,1630,1142716
2018-11-09 13:40:00,7.01,7.01,6.99,6.99,2432,1701934
2018-11-09 13:45:00,6.99,7.0,6.99,6.99,1161,811705
2018-11-09 13:50:00,6.99,6.99,6.97,6.98,1843,1286764
2018-11-09 13:55:00,6.98,6.98,6.96,6.97,1198,834859
2018-11-09 14:00:00,6.97,6.97,6.94,6.97,2955,2056229
2018-11-09 14:05:00,6.97,6.97,6.96,6.96,1294,900686
2018-11-09 14:10:00,6.95,6.96,6.93,6.93,3478,2414090
