In [1]:
import pandas as pd
import numpy as np

## 时间日期

* 时间戳 timestamp：固定的时刻 -> pd.Timestamp
* 固定时期 period：比如 2016年3月份，再如2015年销售额 -> pd.Period
* 时间间隔 interval：由起始时间和结束时间来表示，固定时期是时间间隔的一个特例

**时间日期在 Pandas 里的作用**

* 分析金融数据，如股票交易数据
* 分析服务器日志


### Python datetime

Python 标准库的 datetime 模块提供了时间日期的处理功能，它是 Pandas 时间日期处理的基础。

In [2]:
from datetime import datetime
from datetime import timedelta

In [3]:
# Capture the current local date and time as a datetime object and display it.
now = datetime.now()
now

datetime.datetime(2016, 4, 26, 0, 14, 30, 282504)

In [4]:
# The calendar fields are plain attributes; listing them yields a (year, month, day) tuple.
now.year, now.month, now.day

(2016, 4, 26)

### 时间差

In [5]:
# Subtracting one datetime from another yields a timedelta describing the gap.
date1, date2 = datetime(2016, 3, 20), datetime(2016, 3, 16)
delta = date1 - date2
delta

datetime.timedelta(4)

In [6]:
# Whole-day component of the timedelta.
delta.days

4

In [7]:
# The full duration expressed in seconds (4 days * 86400 seconds per day).
delta.total_seconds()

345600.0

In [8]:
# Adding a timedelta to a datetime shifts it; since delta = date1 - date2 this gives date1 back.
date2 + delta

datetime.datetime(2016, 3, 20, 0, 0)

In [9]:
# timedelta's first positional argument is days and may be fractional:
# 4.5 days moves the date forward four days and twelve hours.
date2 + timedelta(4.5)

datetime.datetime(2016, 3, 20, 12, 0)

### 字符串和 datetime 转换

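关于 datetime 格式化字符串（如 `%Y`、`%m`、`%d`）的定义，可以参阅 Python 官方文档：https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior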

In [10]:
# A datetime with an explicit time of day (08:30) to use in the formatting examples.
date = datetime(2016, 3, 20, 8, 30)
date

datetime.datetime(2016, 3, 20, 8, 30)

In [11]:
# str() renders the datetime in its default 'YYYY-MM-DD HH:MM:SS' text form.
str(date)

'2016-03-20 08:30:00'

In [12]:
# strftime formats the datetime according to an explicit format string.
date.strftime('%Y-%m-%d %H:%M:%S')

'2016-03-20 08:30:00'

In [13]:
# strptime is the inverse: parse text into a datetime using a matching format string.
datetime.strptime('2016-03-20 09:30', '%Y-%m-%d %H:%M')

datetime.datetime(2016, 3, 20, 9, 30)

## Pandas 里的时间序列

Pandas 里使用 Timestamp 来表达时间

In [14]:
# Four consecutive days in March 2016 serve as the index of a random series.
dates = [datetime(2016, 3, day) for day in range(1, 5)]
s = pd.Series(data=np.random.randn(4), index=dates)
s

2016-03-01    2.145321
2016-03-02    1.073716
2016-03-03   -0.647886
2016-03-04    1.590132
dtype: float64

In [15]:
# An index built from datetime objects is stored as a DatetimeIndex.
type(s.index)

pandas.tseries.index.DatetimeIndex

In [16]:
# Each element of that index is a pandas Timestamp rather than a plain datetime.
type(s.index[0])

pandas.tslib.Timestamp

## 日期范围

### 生成日期范围

In [17]:
# With a start and an end, date_range yields one entry per day, both endpoints included.
pd.date_range('20160320', '20160331')

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29', '2016-03-30', '2016-03-31'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Alternatively give a start and the number of entries to generate.
pd.date_range(start='20160320', periods=10)

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29'],
              dtype='datetime64[ns]', freq='D')

In [19]:
# Normalize timestamps: normalize=True drops the time of day so every entry falls on midnight.
pd.date_range(start='2016-03-20 16:23:32', periods=10, normalize=True)

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29'],
              dtype='datetime64[ns]', freq='D')

### 时间频率

In [20]:
# Weekly frequency: one entry per week ('W' is anchored on Sunday, as the result's freq shows).
pd.date_range(start='20160320', periods=10, freq='W')

DatetimeIndex(['2016-03-20', '2016-03-27', '2016-04-03', '2016-04-10',
               '2016-04-17', '2016-04-24', '2016-05-01', '2016-05-08',
               '2016-05-15', '2016-05-22'],
              dtype='datetime64[ns]', freq='W-SUN')

In [21]:
# Monthly frequency: each entry is the last calendar day of a month.
pd.date_range(start='20160320', periods=10, freq='M')

DatetimeIndex(['2016-03-31', '2016-04-30', '2016-05-31', '2016-06-30',
               '2016-07-31', '2016-08-31', '2016-09-30', '2016-10-31',
               '2016-11-30', '2016-12-31'],
              dtype='datetime64[ns]', freq='M')

In [22]:
# Index made of the last business day of each month.
pd.date_range(start='20160320', periods=10, freq='BM')

DatetimeIndex(['2016-03-31', '2016-04-29', '2016-05-31', '2016-06-30',
               '2016-07-29', '2016-08-31', '2016-09-30', '2016-10-31',
               '2016-11-30', '2016-12-30'],
              dtype='datetime64[ns]', freq='BM')

In [23]:
# Hourly frequency with a multiplier: one entry every four hours.
pd.date_range(start='20160320', periods=10, freq='4H')

DatetimeIndex(['2016-03-20 00:00:00', '2016-03-20 04:00:00',
               '2016-03-20 08:00:00', '2016-03-20 12:00:00',
               '2016-03-20 16:00:00', '2016-03-20 20:00:00',
               '2016-03-21 00:00:00', '2016-03-21 04:00:00',
               '2016-03-21 08:00:00', '2016-03-21 12:00:00'],
              dtype='datetime64[ns]', freq='4H')

## 时期及算术运算

pd.Period 表示时期，比如几天、一个月或几个月等。比如用来统计每个月的销售额，就可以用时期作为单位。

In [24]:
# A bare year with no frequency produces an annual period.
p1 = pd.Period(2010)
p1

Period('2010', 'A-DEC')

In [25]:
# Adding an integer shifts the period by that many units of its own frequency (here: years).
p2 = p1 + 2
p2

Period('2012', 'A-DEC')

In [26]:
# Subtracting two periods of the same frequency gives the number of units between them.
p2 - p1

2

In [27]:
# With a monthly frequency, a bare year resolves to the first month of that year.
p1 = pd.Period(2016, freq='M')
p1

Period('2016-01', 'M')

In [28]:
# Integer arithmetic now moves in months because p1 has monthly frequency.
p1 + 3

Period('2016-04', 'M')

### 时期序列

In [29]:
# Twelve consecutive monthly periods starting at January 2016.
pd.period_range(start='2016-01', periods=12, freq='M')

PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
             '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12'],
            dtype='int64', freq='M')

In [30]:
# Monthly periods from a start to an end, both included.
pd.period_range(start='2016-01', end='2016-10', freq='M')

PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
             '2016-07', '2016-08', '2016-09', '2016-10'],
            dtype='int64', freq='M')

In [31]:
# Build a PeriodIndex directly from quarter strings.
index = pd.PeriodIndex(['2016Q1', '2016Q2', '2016Q3'], freq='Q-DEC')
index

PeriodIndex(['2016Q1', '2016Q2', '2016Q3'], dtype='int64', freq='Q-DEC')

### 时期的频率转换

使用 `asfreq` 方法可以在不同频率的时期之间转换。常见的年度和季度频率别名：

* A-DEC: 以 12 月份作为结束的年时期
* A-NOV: 以 11 月份作为结束的年时期
* Q-DEC: 以 12 月份作为结束的季度时期

![time freq](time_freq.png)

In [32]:
# An annual period whose year ends in December.
p = pd.Period('2016', freq='A-DEC')
p

Period('2016', 'A-DEC')

In [33]:
# Convert the year to monthly frequency, taking the first month it contains.
p.asfreq('M', how='start')

Period('2016-01', 'M')

In [34]:
# Convert the year to monthly frequency, taking the last month it contains.
p.asfreq('M', how='end')

Period('2016-12', 'M')

In [35]:
# A monthly period, used below to convert from a finer to a coarser frequency.
p = pd.Period('2016-04', freq='M')
p

Period('2016-04', 'M')

In [36]:
# Convert the month to the December-ending annual period that contains it.
p.asfreq('A-DEC')

Period('2016', 'A-DEC')

In [37]:
# Annual frequency whose year ends in March (a fiscal year): April 2016 therefore
# belongs to the year labelled 2017.
p.asfreq('A-MAR')

Period('2017', 'A-MAR')

### 季度时间频率

Pandas 支持 12 种季度型频率，从 Q-JAN 到 Q-DEC

In [38]:
# Fourth quarter of 2016 under a quarterly frequency whose fiscal year ends in January.
p = pd.Period('2016Q4', 'Q-JAN')
p

Period('2016Q4', 'Q-JAN')

In [39]:
# In a fiscal year ending in January, 2016Q4 spans 2015-11-01 through 2016-01-31.
p.asfreq('D', how='start'), p.asfreq('D', how='end')

(Period('2015-11-01', 'D'), Period('2016-01-31', 'D'))

In [40]:
# 4 p.m. on the second-to-last business day of the quarter:
#   1. take the last business day of the quarter,
#   2. step back one business day,
#   3. convert to minute resolution at the start of that day,
#   4. add 16 hours expressed in minutes.
# The minute alias 'min' is used instead of 'T': 'T' is deprecated in recent
# pandas releases, while 'min' is accepted by old and new versions alike.
p4pm = (p.asfreq('B', how='end') - 1).asfreq('min', 'start') + 16 * 60
p4pm

Period('2016-01-28 16:00', 'T')

In [41]:
# Convert the period to a Timestamp.
p4pm.to_timestamp()

Timestamp('2016-01-28 16:00:00')

### Timestamp 和 Period 相互转换

In [42]:
# Five random values indexed by consecutive month-end timestamps.
ts = pd.Series(
    data=np.random.randn(5),
    index=pd.date_range(start='2016-01-01', periods=5, freq='M'),
)
ts

2016-01-31   -0.447504
2016-02-29   -1.393422
2016-03-31   -0.632505
2016-04-30    0.011181
2016-05-31    0.941598
Freq: M, dtype: float64

In [43]:
# Convert the timestamp index to periods; with no argument the period frequency
# is taken from the index's own frequency (monthly here).
ts.to_period()

2016-01   -0.447504
2016-02   -1.393422
2016-03   -0.632505
2016-04    0.011181
2016-05    0.941598
Freq: M, dtype: float64

In [44]:
# Five random values on consecutive days that straddle the 2016/2017 year boundary.
ts = pd.Series(
    data=np.random.randn(5),
    index=pd.date_range(start='2016-12-29', periods=5, freq='D'),
)
ts

2016-12-29    1.125565
2016-12-30    0.249613
2016-12-31    0.270745
2017-01-01   -0.131325
2017-01-02    0.236078
Freq: D, dtype: float64

In [45]:
# Convert daily timestamps to monthly periods. Several days fall in the same month,
# so the resulting index contains repeated labels.
pts = ts.to_period(freq='M')
pts

2016-12    1.125565
2016-12    0.249613
2016-12    0.270745
2017-01   -0.131325
2017-01    0.236078
Freq: M, dtype: float64

In [46]:
# Collapse the repeated period labels by grouping on the index and summing each group.
pts.groupby(level=0).sum()

2016-12    1.645924
2017-01    0.104753
Freq: M, dtype: float64

In [47]:
# Converting back to timestamps does not recover the original finer-grained dates:
# every value is placed at the end of its month.
pts.to_timestamp(how='end')

2016-12-31    1.125565
2016-12-31    0.249613
2016-12-31    0.270745
2017-01-31   -0.131325
2017-01-31    0.236078
dtype: float64

### 重采样

* 高频率 -> 低频率 -> 降采样：5 分钟股票交易数据转换为日交易数据
* 低频率 -> 高频率 -> 升采样
* 其他重采样：每周三 (W-WED) 转换为每周五 (W-FRI)


In [48]:
# Sixty random integers in [0, 50) sampled once per minute starting at 09:30,
# standing in for minute-level trading data.
# The frequency is spelled 'min' rather than 'T': 'T' is deprecated in recent
# pandas releases, while 'min' is accepted by old and new versions alike.
ts = pd.Series(
    np.random.randint(0, 50, 60),
    index=pd.date_range('2016-04-25 09:30', periods=60, freq='min'),
)
ts

2016-04-25 09:30:00    39
2016-04-25 09:31:00    31
2016-04-25 09:32:00     4
2016-04-25 09:33:00    22
2016-04-25 09:34:00    22
2016-04-25 09:35:00    35
2016-04-25 09:36:00    16
2016-04-25 09:37:00    16
2016-04-25 09:38:00    26
2016-04-25 09:39:00     2
2016-04-25 09:40:00    47
2016-04-25 09:41:00    18
2016-04-25 09:42:00    17
2016-04-25 09:43:00    34
2016-04-25 09:44:00    23
2016-04-25 09:45:00    18
2016-04-25 09:46:00    32
2016-04-25 09:47:00    36
2016-04-25 09:48:00    18
2016-04-25 09:49:00    18
2016-04-25 09:50:00    42
2016-04-25 09:51:00    13
2016-04-25 09:52:00    14
2016-04-25 09:53:00    37
2016-04-25 09:54:00    11
2016-04-25 09:55:00    31
2016-04-25 09:56:00    41
2016-04-25 09:57:00    40
2016-04-25 09:58:00    19
2016-04-25 09:59:00    33
2016-04-25 10:00:00    36
2016-04-25 10:01:00    49
2016-04-25 10:02:00     1
2016-04-25 10:03:00     3
2016-04-25 10:04:00     1
2016-04-25 10:05:00    35
2016-04-25 10:06:00     2
2016-04-25 10:07:00    25
2016-04-25 1

In [49]:
# Downsample to 5-minute bins and sum each bin. Minutes 0-4 form the first group
# and each bin is labelled with its left edge (09:30-09:34 -> 09:30).
# The aggregation is called on the Resampler object: the old `how=` keyword
# is no longer accepted by resample().
ts.resample('5min').sum()

2016-04-25 09:30:00    118
2016-04-25 09:35:00     95
2016-04-25 09:40:00    139
2016-04-25 09:45:00    122
2016-04-25 09:50:00    117
2016-04-25 09:55:00    164
2016-04-25 10:00:00     90
2016-04-25 10:05:00    114
2016-04-25 10:10:00    113
2016-04-25 10:15:00    140
2016-04-25 10:20:00    155
2016-04-25 10:25:00    162
Freq: 5T, dtype: int64

In [50]:
# Same 5-minute sums (minutes 0-4 are still the first group), but label='right'
# names each bin by its right edge: the 09:30-09:34 group is reported as 09:35.
# The aggregation is called on the Resampler object: the old `how=` keyword
# is no longer accepted by resample().
ts.resample('5min', label='right').sum()

2016-04-25 09:35:00    118
2016-04-25 09:40:00     95
2016-04-25 09:45:00    139
2016-04-25 09:50:00    122
2016-04-25 09:55:00    117
2016-04-25 10:00:00    164
2016-04-25 10:05:00     90
2016-04-25 10:10:00    114
2016-04-25 10:15:00    113
2016-04-25 10:20:00    140
2016-04-25 10:25:00    155
2016-04-25 10:30:00    162
Freq: 5T, dtype: int64

### OHLC 重采样

金融数据专用：Open/High/Low/Close

In [51]:
# Open/high/low/close of every 5-minute bin. The aggregation is called on the
# Resampler object: the old `how=` keyword is no longer accepted by resample().
ts.resample('5min').ohlc()

Unnamed: 0,open,high,low,close
2016-04-25 09:30:00,39,39,4,22
2016-04-25 09:35:00,35,35,2,2
2016-04-25 09:40:00,47,47,17,23
2016-04-25 09:45:00,18,36,18,18
2016-04-25 09:50:00,42,42,11,11
2016-04-25 09:55:00,31,41,19,33
2016-04-25 10:00:00,36,49,1,1
2016-04-25 10:05:00,35,35,2,22
2016-04-25 10:10:00,13,44,0,0
2016-04-25 10:15:00,44,45,12,14


In [52]:
# Resampling via groupby: start from 100 daily random integers in [0, 50)
# beginning on 2016-03-01.
ts = pd.Series(
    data=np.random.randint(0, 50, size=100),
    index=pd.date_range(start='2016-03-01', periods=100, freq='D'),
)
ts

2016-03-01    37
2016-03-02    34
2016-03-03    48
2016-03-04    25
2016-03-05    16
2016-03-06    47
2016-03-07    38
2016-03-08    35
2016-03-09    42
2016-03-10    29
2016-03-11     3
2016-03-12    43
2016-03-13    42
2016-03-14    44
2016-03-15     3
2016-03-16    22
2016-03-17    31
2016-03-18    20
2016-03-19    47
2016-03-20    45
2016-03-21    32
2016-03-22     1
2016-03-23     2
2016-03-24    24
2016-03-25    28
2016-03-26     4
2016-03-27    15
2016-03-28    33
2016-03-29    16
2016-03-30    37
              ..
2016-05-10    30
2016-05-11    34
2016-05-12    32
2016-05-13    30
2016-05-14    40
2016-05-15    15
2016-05-16     5
2016-05-17    38
2016-05-18    14
2016-05-19    28
2016-05-20    44
2016-05-21    32
2016-05-22     5
2016-05-23    44
2016-05-24    49
2016-05-25    22
2016-05-26    13
2016-05-27    49
2016-05-28    30
2016-05-29    36
2016-05-30    28
2016-05-31    48
2016-06-01    46
2016-06-02    31
2016-06-03    17
2016-06-04    35
2016-06-05    39
2016-06-06    

In [53]:
# Group by the month number of each index timestamp (the lambda receives the index
# labels) and sum each group. The key is the month alone, without the year.
ts.groupby(lambda x: x.month).sum()

3    888
4    683
5    841
6    237
dtype: int64

In [54]:
# Same monthly totals, keyed on monthly periods so the group labels keep the year.
ts.groupby(ts.index.to_period('M')).sum()

2016-03    888
2016-04    683
2016-05    841
2016-06    237
Freq: M, dtype: int64

### 升采样和插值

In [55]:
# Weekly data sampled every Friday: two random integers in [1, 50).
df = pd.DataFrame(
    data=np.random.randint(1, 50, size=2),
    index=pd.date_range(start='2016-04-22', periods=2, freq='W-FRI'),
)
df

Unnamed: 0,0
2016-04-22,32
2016-04-29,36


In [56]:
# Upsample the weekly data to daily frequency without filling: the new days are NaN.
# resample() returns a lazy Resampler, so .asfreq() is needed to materialise
# the upsampled frame.
df.resample('D').asfreq()

Unnamed: 0,0
2016-04-22,32.0
2016-04-23,
2016-04-24,
2016-04-25,
2016-04-26,
2016-04-27,
2016-04-28,
2016-04-29,36.0


In [57]:
# Upsample to daily frequency and forward-fill, carrying each value forward at most
# three days; later gaps stay NaN. The fill is called on the Resampler object:
# the old `fill_method=` / `limit=` keywords are no longer accepted by resample().
df.resample('D').ffill(limit=3)

Unnamed: 0,0
2016-04-22,32.0
2016-04-23,32.0
2016-04-24,32.0
2016-04-25,32.0
2016-04-26,
2016-04-27,
2016-04-28,
2016-04-29,36.0


In [58]:
# Re-anchor the weekly data from Fridays to Mondays, forward-filling each Monday
# from the preceding Friday's value. The fill is called on the Resampler object:
# the old `fill_method=` keyword is no longer accepted by resample().
df.resample('W-MON').ffill()

Unnamed: 0,0
2016-04-25,32
2016-05-02,36


### 时期重采样

In [59]:
# 24 monthly periods (2015-01 .. 2016-12) by four columns of random integers in [2, 30).
df = pd.DataFrame(
    data=np.random.randint(2, 30, size=(24, 4)),
    index=pd.period_range(start='2015-01', end='2016-12', freq='M'),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2015-01,3,22,22,11
2015-02,7,3,21,26
2015-03,6,12,12,11
2015-04,14,24,21,14
2015-05,29,19,27,8
2015-06,14,10,29,12
2015-07,20,2,6,21
2015-08,11,25,26,21
2015-09,10,24,2,3
2015-10,21,24,7,8


In [60]:
# Downsample the monthly periods to calendar years (year ending in December),
# averaging each column within a year. The aggregation is called on the Resampler
# object: the old `how=` keyword is no longer accepted by resample().
adf = df.resample('A-DEC').mean()
adf

Unnamed: 0,A,B,C,D
2015,11.916667,18.166667,16.333333,14.083333
2016,16.5,15.583333,11.083333,16.166667


In [61]:
# Same yearly mean, but for years ending in May: the 24 months then spread over
# three such years. The aggregation is called on the Resampler object: the old
# `how=` keyword is no longer accepted by resample().
df.resample('A-MAY').mean()

Unnamed: 0,A,B,C,D
2015,11.8,16.0,20.6,14.0
2016,12.166667,17.0,12.25,15.333333
2017,19.428571,17.285714,11.285714,15.571429


In [62]:
# Upsample the annual figures to quarters without filling: each year's value sits in
# its first quarter and the remaining quarters are NaN.
# resample() returns a lazy Resampler, so .asfreq() is needed to materialise
# the upsampled frame.
adf.resample('Q-DEC').asfreq()

Unnamed: 0,A,B,C,D
2015Q1,11.916667,18.166667,16.333333,14.083333
2015Q2,,,,
2015Q3,,,,
2015Q4,,,,
2016Q1,16.5,15.583333,11.083333,16.166667
2016Q2,,,,
2016Q3,,,,
2016Q4,,,,


In [63]:
# Upsample to quarters and forward-fill so every quarter carries its year's value.
# The fill is called on the Resampler object: the old `fill_method=` keyword
# is no longer accepted by resample().
adf.resample('Q-DEC').ffill()

Unnamed: 0,A,B,C,D
2015Q1,11.916667,18.166667,16.333333,14.083333
2015Q2,11.916667,18.166667,16.333333,14.083333
2015Q3,11.916667,18.166667,16.333333,14.083333
2015Q4,11.916667,18.166667,16.333333,14.083333
2016Q1,16.5,15.583333,11.083333,16.166667
2016Q2,16.5,15.583333,11.083333,16.166667
2016Q3,16.5,15.583333,11.083333,16.166667
2016Q4,16.5,15.583333,11.083333,16.166667


### 性能

In [64]:
# Build a large series for the timing experiment below: n random samples spaced
# 10 ms apart, starting at 2000-01-01.
# NOTE(review): with n = 100 million this allocates a very large values array plus
# an equally long index — lower n if the kernel runs short of memory.
n = 100000000
ts = pd.Series(np.random.randn(n), 
               index=pd.date_range('2000-01-01', periods=n, freq='10ms'))
len(ts)

100000000

In [65]:
%timeit ts.resample('10min', how='ohlc')

1 loops, best of 3: 1.25 s per loop


In [66]:
# Daily open/high/low/close of the large series. The aggregation is called on the
# Resampler object: the old `how=` keyword is no longer accepted by resample().
ts.resample('D').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01,-0.061024,5.427398,-5.26259,0.723502
2000-01-02,0.101625,5.002179,-5.896551,-0.339946
2000-01-03,0.622688,5.603006,-5.260461,-1.608818
2000-01-04,-0.908498,5.319902,-5.472226,1.005223
2000-01-05,-2.126586,5.428941,-5.228624,-0.0337
2000-01-06,1.300359,5.223219,-5.392481,-0.760485
2000-01-07,-0.450962,5.250595,-5.148237,-1.630172
2000-01-08,0.934403,5.230623,-5.157113,0.239452
2000-01-09,-1.552805,5.097451,-5.288262,-0.611569
2000-01-10,0.448738,5.236686,-5.390805,-0.793015
