In [1]:
import pandas as pd
import numpy as np

## 时间日期

* 时间戳 timestamp：固定的时刻 -> pd.Timestamp
* 固定时期 period：比如 2016年3月份，再如2015年销售额 -> pd.Period
* 时间间隔 interval：由起始时间和结束时间来表示，固定时期是时间间隔的一个特例

**时间日期在 Pandas 里的作用**

* 分析金融数据，如股票交易数据
* 分析服务器日志


### Python datetime

python 标准库里提供了时间日期的处理。这个是时间日期的基础。

In [2]:
from datetime import datetime
from datetime import timedelta

In [3]:
# Capture the current date and time; the bare name on the last line displays it.
now = datetime.now()
now

datetime.datetime(2016, 4, 28, 14, 21, 43, 949000)

In [4]:
# Read the calendar fields of `now` as a (year, month, day) tuple.
now.year, now.month, now.day

(2016, 4, 28)

### 时间差

In [5]:
# Subtracting one datetime from another yields a timedelta (4 days here).
date1, date2 = datetime(2016, 3, 20), datetime(2016, 3, 16)
delta = date1 - date2
delta

datetime.timedelta(4)

In [6]:
# Whole-day component of the difference.
delta.days

4

In [7]:
# The entire duration expressed in seconds, as a float.
delta.total_seconds()

345600.0

In [8]:
# Adding the timedelta back to the earlier date gives date1 again.
date2 + delta

datetime.datetime(2016, 3, 20, 0, 0)

In [9]:
# timedelta's first positional argument is days and may be fractional:
# 4.5 days moves the date forward by 4 days and 12 hours.
date2 + timedelta(4.5)

datetime.datetime(2016, 3, 20, 12, 0)

### 字符串和 datetime 转换

关于 datetime 格式定义，可以参阅 python 官方文档

In [10]:
# A datetime with an explicit hour and minute (08:30).
date = datetime(2016, 3, 20, 8, 30)
date

datetime.datetime(2016, 3, 20, 8, 30)

In [11]:
# str() renders the datetime in its default 'YYYY-MM-DD HH:MM:SS' text form.
str(date)

'2016-03-20 08:30:00'

In [12]:
# Format explicitly with strftime; this pattern yields the same text as str(date).
date.strftime('%Y-%m-%d %H:%M:%S')

'2016-03-20 08:30:00'

In [13]:
# Parse text into a datetime; the format string describes the layout of the input.
datetime.strptime('2016-03-20 09:30', '%Y-%m-%d %H:%M')

datetime.datetime(2016, 3, 20, 9, 30)

## Pandas 里的时间序列

Pandas 里使用 Timestamp 来表达时间

In [14]:
# Four consecutive days (1-4 March 2016) used as the index of a random Series.
dates = [datetime(2016, 3, day) for day in range(1, 5)]
s = pd.Series(np.random.randn(4), index=dates)
s

2016-03-01   -0.371286
2016-03-02   -0.517337
2016-03-03    0.419003
2016-03-04    0.266401
dtype: float64

In [15]:
# A Series indexed by datetime objects gets a DatetimeIndex.
type(s.index)

pandas.tseries.index.DatetimeIndex

In [16]:
# Individual index entries are pandas Timestamp objects.
type(s.index[0])

pandas.tslib.Timestamp

## 日期范围

### 生成日期范围

In [17]:
# Daily range between two dates; both endpoints are included.
pd.date_range('20160320', '20160331')

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29', '2016-03-30', '2016-03-31'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Alternatively give a start and the number of entries to generate (daily by default).
pd.date_range(start='20160320', periods=10)

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29'],
              dtype='datetime64[ns]', freq='D')

In [19]:
# Normalize the timestamps: the time-of-day part of `start` is dropped,
# so every entry falls on midnight.
pd.date_range(start='2016-03-20 16:23:32', periods=10, normalize=True)

DatetimeIndex(['2016-03-20', '2016-03-21', '2016-03-22', '2016-03-23',
               '2016-03-24', '2016-03-25', '2016-03-26', '2016-03-27',
               '2016-03-28', '2016-03-29'],
              dtype='datetime64[ns]', freq='D')

### 时间频率

In [20]:
# Weekly frequency; plain 'W' is displayed as 'W-SUN', i.e. one entry per Sunday.
pd.date_range(start='20160320', periods=10, freq='W')

DatetimeIndex(['2016-03-20', '2016-03-27', '2016-04-03', '2016-04-10',
               '2016-04-17', '2016-04-24', '2016-05-01', '2016-05-08',
               '2016-05-15', '2016-05-22'],
              dtype='datetime64[ns]', freq='W-SUN')

In [21]:
# Monthly frequency: each entry is the last calendar day of a month.
pd.date_range(start='20160320', periods=10, freq='M')

DatetimeIndex(['2016-03-31', '2016-04-30', '2016-05-31', '2016-06-30',
               '2016-07-31', '2016-08-31', '2016-09-30', '2016-10-31',
               '2016-11-30', '2016-12-31'],
              dtype='datetime64[ns]', freq='M')

In [22]:
# Index made of the last business day of each month.
pd.date_range(start='20160320', periods=10, freq='BM')

DatetimeIndex(['2016-03-31', '2016-04-29', '2016-05-31', '2016-06-30',
               '2016-07-29', '2016-08-31', '2016-09-30', '2016-10-31',
               '2016-11-30', '2016-12-30'],
              dtype='datetime64[ns]', freq='BM')

In [23]:
# Hourly frequency with a multiplier: one entry every 4 hours.
pd.date_range(start='20160320', periods=10, freq='4H')

DatetimeIndex(['2016-03-20 00:00:00', '2016-03-20 04:00:00',
               '2016-03-20 08:00:00', '2016-03-20 12:00:00',
               '2016-03-20 16:00:00', '2016-03-20 20:00:00',
               '2016-03-21 00:00:00', '2016-03-21 04:00:00',
               '2016-03-21 08:00:00', '2016-03-21 12:00:00'],
              dtype='datetime64[ns]', freq='4H')

## 时期及算术运算

pd.Period 表示时期，比如几日，月或几个月等。比如用来统计每个月的销售额，就可以用时期作为单位。

In [24]:
# A bare integer year builds an annual period; its repr shows freq 'A-DEC'
# (a year that ends in December).
p1 = pd.Period(2010)
p1

Period('2010', 'A-DEC')

In [25]:
# Integer arithmetic shifts a period by that many units of its frequency (years here).
p2 = p1 + 2
p2

Period('2012', 'A-DEC')

In [26]:
# Subtracting two periods of the same frequency tells how far apart they are (2 years here).
p2 - p1

2L

In [27]:
# Rebind p1: with a monthly frequency, the bare year 2016 resolves to its first month (2016-01).
p1 = pd.Period(2016, freq='M')
p1

Period('2016-01', 'M')

In [28]:
# Adding 3 to a monthly period moves it forward three months.
p1 + 3

Period('2016-04', 'M')

### 时期序列

In [29]:
# Twelve consecutive monthly periods starting at 2016-01.
pd.period_range(start='2016-01', periods=12, freq='M')

PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
             '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12'],
            dtype='int64', freq='M')

In [30]:
# Monthly periods from start to end; both endpoints are included.
pd.period_range(start='2016-01', end='2016-10', freq='M')

PeriodIndex(['2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
             '2016-07', '2016-08', '2016-09', '2016-10'],
            dtype='int64', freq='M')

In [31]:
# Build a PeriodIndex directly from strings (quarters of a year that ends in December).
index = pd.PeriodIndex(['2016Q1', '2016Q2', '2016Q3'], freq='Q-DEC')
index

PeriodIndex(['2016Q1', '2016Q2', '2016Q3'], dtype='int64', freq='Q-DEC')

### 时期的频率转换

asfreq

* A-DEC: 以 12 月份作为结束的年时期
* A-NOV: 以 11 月份作为结束的年时期
* Q-DEC: 以 12 月份作为结束的季度时期

![time freq](imgs/time_freq.png)

In [32]:
# An annual period for 2016 whose year ends in December.
p = pd.Period('2016', freq='A-DEC')
p

Period('2016', 'A-DEC')

In [33]:
# Convert to monthly frequency, taking the first month of the year.
p.asfreq('M', how='start')

Period('2016-01', 'M')

In [34]:
# Convert to monthly frequency, taking the last month of the year.
p.asfreq('M', how='end')

Period('2016-12', 'M')

In [35]:
# Rebind p to a monthly period: April 2016.
p = pd.Period('2016-04', freq='M')
p

Period('2016-04', 'M')

In [36]:
# Convert to the December-ending year that contains this month.
p.asfreq('A-DEC')

Period('2016', 'A-DEC')

In [37]:
# Annual period whose year ends in March (a fiscal year):
# April 2016 therefore belongs to fiscal year 2017.
p.asfreq('A-MAR')

Period('2017', 'A-MAR')

### 季度时间频率

Pandas 支持 12 种季度型频率，从 Q-JAN 到 Q-DEC

In [38]:
# Fourth quarter of 2016 under a quarterly frequency whose year ends in January.
p = pd.Period('2016Q4', 'Q-JAN')
p

Period('2016Q4', 'Q-JAN')

In [39]:
# In a fiscal year ending in January, 2016Q4 spans 2015-11-01 through 2016-01-31.
p.asfreq('D', how='start'), p.asfreq('D', how='end')

(Period('2015-11-01', 'D'), Period('2016-01-31', 'D'))

In [40]:
# Period for 4 pm on the second-to-last business day of the quarter:
#   1. asfreq('B', how='end')  -> last business day of the quarter
#   2. - 1                     -> step back one business day
#   3. asfreq('min', 'start')  -> first minute (00:00) of that day
#   4. + 16 * 60               -> advance 16 hours, counted in minutes
# 'min' is the minute alias this notebook already uses in resample('5min');
# it replaces the single-letter alias 'T', which newer pandas releases deprecate.
p4pm = (p.asfreq('B', how='end') - 1).asfreq('min', 'start') + 16 * 60
p4pm

Period('2016-01-28 16:00', 'T')

In [41]:
# Convert the period to a Timestamp.
p4pm.to_timestamp()

Timestamp('2016-01-28 16:00:00')

### Timestamp 和 Period 相互转换

In [42]:
# Five random values indexed by month-end timestamps.
ts = pd.Series(np.random.randn(5), index = pd.date_range('2016-01-01', periods=5, freq='M'))
ts

2016-01-31    1.176978
2016-02-29    0.467512
2016-03-31   -1.001717
2016-04-30   -2.103578
2016-05-31   -0.314030
Freq: M, dtype: float64

In [43]:
# Convert the timestamp index to periods; with no argument the index's own (monthly) frequency is used.
ts.to_period()

2016-01    1.176978
2016-02    0.467512
2016-03   -1.001717
2016-04   -2.103578
2016-05   -0.314030
Freq: M, dtype: float64

In [44]:
# Five daily timestamps that straddle the 2016/2017 year boundary.
ts = pd.Series(np.random.randn(5), index = pd.date_range('2016-12-29', periods=5, freq='D'))
ts

2016-12-29   -2.780962
2016-12-30    0.468131
2016-12-31   -0.527857
2017-01-01    0.393113
2017-01-02   -1.446709
Freq: D, dtype: float64

In [45]:
# Convert to monthly periods; several days map to the same period, so index values repeat.
pts = ts.to_period(freq='M')
pts

2016-12   -2.780962
2016-12    0.468131
2016-12   -0.527857
2017-01    0.393113
2017-01   -1.446709
Freq: M, dtype: float64

In [46]:
# Sum the rows that share the same monthly period; level=0 groups by the index values.
pts.groupby(level=0).sum()

2016-12   -2.840688
2017-01   -1.053596
Freq: M, dtype: float64

In [47]:
# Converting back to timestamps loses the finer-grained time: how='end' maps each
# period to its end, so the original day of each row is not recovered.
pts.to_timestamp(how='end')

2016-12-31   -2.780962
2016-12-31    0.468131
2016-12-31   -0.527857
2017-01-31    0.393113
2017-01-31   -1.446709
dtype: float64

### 重采样

* 高频率 -> 低频率 -> 降采样：5 分钟股票交易数据转换为日交易数据
* 低频率 -> 高频率 -> 升采样
* 其他重采样：每周三 (W-WED) 转换为每周五 (W-FRI)


In [48]:
# One hour of minute-by-minute random integers in [0, 50), starting at 09:30.
# 'min' is the minute alias this notebook already uses in resample('5min');
# it replaces the single-letter alias 'T', which newer pandas releases deprecate.
ts = pd.Series(np.random.randint(0, 50, 60), index=pd.date_range('2016-04-25 09:30', periods=60, freq='min'))
ts

2016-04-25 09:30:00    14
2016-04-25 09:31:00    25
2016-04-25 09:32:00    43
2016-04-25 09:33:00     5
2016-04-25 09:34:00    27
2016-04-25 09:35:00    40
2016-04-25 09:36:00    41
2016-04-25 09:37:00     6
2016-04-25 09:38:00    45
2016-04-25 09:39:00    13
2016-04-25 09:40:00     3
2016-04-25 09:41:00    35
2016-04-25 09:42:00     0
2016-04-25 09:43:00     4
2016-04-25 09:44:00    24
2016-04-25 09:45:00    34
2016-04-25 09:46:00    32
2016-04-25 09:47:00     0
2016-04-25 09:48:00    32
2016-04-25 09:49:00     0
2016-04-25 09:50:00    24
2016-04-25 09:51:00     7
2016-04-25 09:52:00    36
2016-04-25 09:53:00    39
2016-04-25 09:54:00     6
2016-04-25 09:55:00    11
2016-04-25 09:56:00     3
2016-04-25 09:57:00    46
2016-04-25 09:58:00    27
2016-04-25 09:59:00    32
2016-04-25 10:00:00    20
2016-04-25 10:01:00    22
2016-04-25 10:02:00    23
2016-04-25 10:03:00    46
2016-04-25 10:04:00    46
2016-04-25 10:05:00     9
2016-04-25 10:06:00     9
2016-04-25 10:07:00    20
2016-04-25 1

In [49]:
# Downsample to 5-minute bins and sum each bin. Bins are labelled by their left
# edge, so minutes 0-4 (09:30-09:34) form the first group, labelled 09:30.
# The aggregation is a method on the resampler object; the `how=` keyword form
# is deprecated and was removed from later pandas releases.
ts.resample('5min').sum()

2016-04-25 09:30:00    114
2016-04-25 09:35:00    145
2016-04-25 09:40:00     66
2016-04-25 09:45:00     98
2016-04-25 09:50:00    112
2016-04-25 09:55:00    119
2016-04-25 10:00:00    157
2016-04-25 10:05:00     86
2016-04-25 10:10:00    109
2016-04-25 10:15:00    108
2016-04-25 10:20:00    147
2016-04-25 10:25:00    106
Freq: 5T, dtype: int32

In [50]:
# Same 5-minute bins (minutes 0-4 are still the first group), but label='right'
# names each bin after its right edge, so the first group is labelled 09:35.
# The aggregation is a method on the resampler object; the `how=` keyword form
# is deprecated and was removed from later pandas releases.
ts.resample('5min', label='right').sum()

2016-04-25 09:35:00    114
2016-04-25 09:40:00    145
2016-04-25 09:45:00     66
2016-04-25 09:50:00     98
2016-04-25 09:55:00    112
2016-04-25 10:00:00    119
2016-04-25 10:05:00    157
2016-04-25 10:10:00     86
2016-04-25 10:15:00    109
2016-04-25 10:20:00    108
2016-04-25 10:25:00    147
2016-04-25 10:30:00    106
Freq: 5T, dtype: int32

### OHLC 重采样

金融数据专用：Open/High/Low/Close

In [51]:
# Open/high/low/close per 5-minute bin: the first, largest, smallest and last
# value of each bin. Uses the resampler method instead of the removed `how=` keyword.
ts.resample('5min').ohlc()

Unnamed: 0,open,high,low,close
2016-04-25 09:30:00,14,43,5,27
2016-04-25 09:35:00,40,45,6,13
2016-04-25 09:40:00,3,35,0,24
2016-04-25 09:45:00,34,34,0,0
2016-04-25 09:50:00,24,39,6,6
2016-04-25 09:55:00,11,46,3,32
2016-04-25 10:00:00,20,46,20,46
2016-04-25 10:05:00,9,26,9,26
2016-04-25 10:10:00,12,40,1,1
2016-04-25 10:15:00,1,45,1,45


In [52]:
# Resampling via groupby: 100 days of random integers in [0, 50), starting 2016-03-01.
ts = pd.Series(np.random.randint(0, 50, 100), index=pd.date_range('2016-03-01', periods=100, freq='D'))
ts

2016-03-01    30
2016-03-02     2
2016-03-03    11
2016-03-04    11
2016-03-05    13
2016-03-06    10
2016-03-07    38
2016-03-08    49
2016-03-09    36
2016-03-10    32
2016-03-11    21
2016-03-12    42
2016-03-13    42
2016-03-14    31
2016-03-15    36
2016-03-16    43
2016-03-17    36
2016-03-18    34
2016-03-19    19
2016-03-20    31
2016-03-21    19
2016-03-22    35
2016-03-23    30
2016-03-24    16
2016-03-25    39
2016-03-26     8
2016-03-27    48
2016-03-28    16
2016-03-29    28
2016-03-30    22
              ..
2016-05-10    43
2016-05-11     3
2016-05-12    38
2016-05-13    49
2016-05-14    27
2016-05-15    36
2016-05-16     9
2016-05-17    45
2016-05-18    23
2016-05-19     0
2016-05-20    48
2016-05-21     7
2016-05-22    24
2016-05-23    21
2016-05-24    27
2016-05-25     8
2016-05-26    46
2016-05-27    38
2016-05-28    21
2016-05-29    16
2016-05-30    47
2016-05-31    47
2016-06-01    46
2016-06-02     5
2016-06-03     5
2016-06-04    38
2016-06-05    14
2016-06-06    

In [53]:
# Group by calendar month number; the callable is applied to each index value.
# The key is the month alone, so it carries no year information.
ts.groupby(lambda x: x.month).sum()

3    828
4    920
5    815
6    163
dtype: int32

In [54]:
# Group by monthly period instead, which keeps the year in the group key.
ts.groupby(ts.index.to_period('M')).sum()

2016-03    828
2016-04    920
2016-05    815
2016-06    163
Freq: M, dtype: int32

### 升采样和插值

In [55]:
# Weekly samples taken on Fridays: two random integers in [1, 50).
friday_index = pd.date_range('2016-04-22', periods=2, freq='W-FRI')
df = pd.DataFrame(np.random.randint(1, 50, 2), index=friday_index)
df

Unnamed: 0,0
2016-04-22,9
2016-04-29,5


In [56]:
# Upsample the weekly data to daily frequency without filling: the newly created
# days hold NaN. In current pandas a bare resample() only returns a lazy resampler
# object, so .asfreq() is needed to materialize the upsampled frame.
df.resample('D').asfreq()

Unnamed: 0,0
2016-04-22,9.0
2016-04-23,
2016-04-24,
2016-04-25,
2016-04-26,
2016-04-27,
2016-04-28,
2016-04-29,5.0


In [57]:
# Upsample to daily frequency and forward-fill the gaps, but carry each value
# forward over at most 3 consecutive new rows; the rest stay NaN.
# .ffill(limit=...) on the resampler replaces the removed `fill_method=`/`limit=` keywords.
df.resample('D').ffill(limit=3)

Unnamed: 0,0
2016-04-22,9.0
2016-04-23,9.0
2016-04-24,9.0
2016-04-25,9.0
2016-04-26,
2016-04-27,
2016-04-28,
2016-04-29,5.0


In [58]:
# Re-sample on a weekly basis anchored on Mondays, forward-filling from the
# Friday observations. .ffill() on the resampler replaces the removed
# `fill_method=` keyword.
df.resample('W-MON').ffill()

Unnamed: 0,0
2016-04-25,9
2016-05-02,5


### 时期重采样

In [59]:
# Two years of monthly periods (2015-01 .. 2016-12) with four columns of
# random integers in [2, 30).
monthly = pd.period_range('2015-01', '2016-12', freq='M')
df = pd.DataFrame(np.random.randint(2, 30, size=(24, 4)), index=monthly, columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2015-01,19,2,26,12
2015-02,12,24,23,28
2015-03,20,18,13,4
2015-04,12,14,20,20
2015-05,29,17,11,22
2015-06,29,7,20,19
2015-07,4,25,20,18
2015-08,17,18,29,15
2015-09,19,26,27,10
2015-10,8,28,21,11


In [60]:
# Downsample the monthly periods to December-ending years, averaging each column.
# Uses the resampler method instead of the removed `how=` keyword.
adf = df.resample('A-DEC').mean()
adf

Unnamed: 0,A,B,C,D
2015,16.0,15.833333,21.666667,15.333333
2016,12.5,15.083333,18.083333,13.916667


In [61]:
# Same averaging, but with years that end in May, so the 24 months are spread
# over three such years. Uses the resampler method instead of the removed `how=` keyword.
df.resample('A-MAY').mean()

Unnamed: 0,A,B,C,D
2015,18.4,15.0,18.6,17.2
2016,12.5,16.333333,24.333333,15.0
2017,14.285714,14.285714,13.142857,12.142857


In [62]:
# Upsampling: annual periods to quarterly periods without filling, so only the
# first quarter of each year carries the annual value and the others are NaN.
# In current pandas a bare resample() only returns a lazy resampler object,
# so .asfreq() is needed to materialize the upsampled frame.
adf.resample('Q-DEC').asfreq()

Unnamed: 0,A,B,C,D
2015Q1,16.0,15.833333,21.666667,15.333333
2015Q2,,,,
2015Q3,,,,
2015Q4,,,,
2016Q1,12.5,15.083333,18.083333,13.916667
2016Q2,,,,
2016Q3,,,,
2016Q4,,,,


In [63]:
# Upsample annual periods to quarters and forward-fill, so every quarter repeats
# its year's value. .ffill() on the resampler replaces the removed `fill_method=` keyword.
adf.resample('Q-DEC').ffill()

Unnamed: 0,A,B,C,D
2015Q1,16.0,15.833333,21.666667,15.333333
2015Q2,16.0,15.833333,21.666667,15.333333
2015Q3,16.0,15.833333,21.666667,15.333333
2015Q4,16.0,15.833333,21.666667,15.333333
2016Q1,12.5,15.083333,18.083333,13.916667
2016Q2,12.5,15.083333,18.083333,13.916667
2016Q3,12.5,15.083333,18.083333,13.916667
2016Q4,12.5,15.083333,18.083333,13.916667


### 性能

In [65]:
# One million random samples spaced 10 ms apart, starting 2000-01-01.
n = 10 ** 6
stamps = pd.date_range('2000-01-01', periods=n, freq='10ms')
ts = pd.Series(np.random.randn(n), index=stamps)
len(ts)

1000000

In [66]:
%timeit ts.resample('10min', how='ohlc')

10 loops, best of 3: 19.5 ms per loop


In [67]:
# Daily OHLC of the same series. Uses the resampler method instead of the
# removed `how=` keyword.
ts.resample('D').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01,-0.939358,5.503891,-4.814675,0.973239
