In [53]:
import pandas as pd
import numpy as np

## 时间序列

### 时间点、时间段、时间间隔

#### Timestamp对象表示时间戳

In [54]:
now = pd.Timestamp.now() # current local time (no timezone attached)
now_shanghai = now.tz_localize("Asia/Shanghai") # attach timezone information to the naive timestamp
now_tokyo = now_shanghai.tz_convert("Asia/Tokyo") # express the same instant in another timezone
print u"本地时间:", now
print u"上海时区:", now_shanghai
print u"东京时区:", now_tokyo

本地时间: 2018-07-10 14:54:36.499000
上海时区: 2018-07-10 14:54:36.499000+08:00
东京时区: 2018-07-10 15:54:36.499000+09:00


In [55]:
# The two timestamps show different wall-clock times but denote the same instant,
# so the comparison evaluates to True.
now_shanghai == now_tokyo

True

In [56]:
# now == now_tokyo
# # Timestamps in different timezones can be compared with each other, but a
# # local (timezone-naive) timestamp cannot be compared with a timezone-aware one.

 pytz.common_timezones 是一个列表, 保存了表示常用时区的字符串

In [57]:
import pytz
%omit pytz.common_timezones

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
...


#### Period 对象表示一个标准的时间段

In [58]:
now_day = pd.Period.now(freq="D") # period with daily frequency
now_hour = pd.Period.now(freq="H") # period with hourly frequency
%C now_day; now_hour

         now_day                       now_hour           
-------------------------  -------------------------------
Period('2018-07-10', 'D')  Period('2018-07-10 14:00', 'H')


freq 属性是一个 描述时间段的字符串

In [59]:
from pandas.tseries import frequencies
# Peek at five of the known frequency strings. _period_code_map is an
# underscore-prefixed (private by convention) attribute, and slicing keys()
# directly only works when keys() returns a list.
frequencies._period_code_map.keys()[:5]
# frequencies._period_alias_dictionary()

['A-JAN', 'A-DEC', 'Q-AUG', 'Q-SEP', 'Q-MAY']

星期时间段

In [60]:
# now_week_sun = pd.Period.now(freq="W") 
# # Weekly period anchored on Sunday.
# # NOTE(review): per the pandas anchored-offset documentation the anchor day is
# # the LAST day of the period, so this week runs Monday..Sunday (Sunday
# # included) rather than "starting on Sunday but excluding it" -- please confirm.
# # For reference: Sunday is 7/8, Monday is 7/9, and today (Tuesday) is 7/10.

# now_week_mon = pd.Period.now(freq="W-MON")
# # Weekly period anchored on Monday; by the same rule it would run Tuesday..Monday.

# %C now_week_sun; now_week_mon

时间段的起点和终点

In [61]:
%C now_day.start_time; now_day.end_time

       now_day.start_time                      now_day.end_time             
--------------------------------  ------------------------------------------
Timestamp('2018-07-10 00:00:00')  Timestamp('2018-07-10 23:59:59.999999999')


Timestamp对象的to_period()方法可以把时间点转换为包含时间点的时间段

In [62]:
now_shanghai.to_period("H") # the whole hourly period 14:00-15:00 that contains the time point

Period('2018-07-10 14:00', 'H')

Timestamp和Period对象可以通过其属性获得年月日等信息

In [63]:
now

Timestamp('2018-07-10 14:54:36.499000')

In [64]:
%C now.year; now.month; now.day; now.hour

now.year  now.month  now.day  now.hour
--------  ---------  -------  --------
2018      7          10       14      


In [65]:
%C now.dayofweek; now.dayofyear 
# dayofweek: position of the day within its week; dayofyear: position of the day within its year

now.dayofweek  now.dayofyear
-------------  -------------
1              191          


#### Timedelta对象表示时间间隔

计算离国庆节还有多少时间

In [66]:
national_day = pd.Timestamp("2018-10-1") # the date of National Day
td = national_day - pd.Timestamp.now() # National Day minus the current time gives the remaining interval
td

Timedelta('82 days 09:05:22.644000')

pd.Timedelta(" ") 定义时间间隔对象 

In [67]:
national_day + pd.Timedelta("20 days 10:20:30") 

Timestamp('2018-10-21 10:20:30')

In [68]:
%C td.days; td.seconds; td.microseconds
# Day, second and microsecond components of the time interval.
# seconds is only the remainder left after the whole days, not the total number
# of seconds: for 82 days 09:05:22.644 it is 9*60*60 + 5*60 + 22 = 32722 s.

td.days  td.seconds  td.microseconds
-------  ----------  ---------------
82L      32722L      644000L        


关键字参数直接指定时间间隔

In [69]:
# Fractional values are accepted: seconds=10.5 is shown as 10.500000 seconds.
print pd.Timedelta(days=10, hours=1, minutes=2, seconds=10.5)
# 100000 seconds are displayed normalised as 1 day 03:46:40.
print pd.Timedelta(seconds=100000)

10 days 01:02:10.500000
1 days 03:46:40


### 时间序列

Timestamp对象, Period对象, Timedelta对象 可以表示索引标签

In [76]:
# Hourly time points from 2015-01-01 to 2015-01-02; both end points are included.
pd.date_range("2015-01-01", "2015-01-2", freq="H")

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00',
               '2015-01-01 10:00:00', '2015-01-01 11:00:00',
               '2015-01-01 12:00:00', '2015-01-01 13:00:00',
               '2015-01-01 14:00:00', '2015-01-01 15:00:00',
               '2015-01-01 16:00:00', '2015-01-01 17:00:00',
               '2015-01-01 18:00:00', '2015-01-01 19:00:00',
               '2015-01-01 20:00:00', '2015-01-01 21:00:00',
               '2015-01-01 22:00:00', '2015-01-01 23:00:00',
               '2015-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='H', tz=None)

创建时间索引对象

In [80]:
def random_timestamps(start, end, freq, count, seed=42):
    """Return ``count`` distinct random time points as a sorted DatetimeIndex.

    The candidates are all time points generated by
    ``pd.date_range(start, end, freq=freq)``. ``count`` of them are drawn
    without replacement and returned in chronological order.

    Parameters
    ----------
    start, end
        First and last candidate time point (forwarded to ``pd.date_range``).
    freq
        Resolution of the candidate time points, e.g. ``"Min"``.
    count : int
        Number of time points to draw. It must not exceed the number of
        candidates, otherwise the draw without replacement raises ValueError.
    seed : int or None, optional
        Seed of the private random generator. The default (42) reproduces the
        result of the earlier hard-coded ``np.random.seed(42)``; ``None``
        gives a different draw on every call.

    Returns
    -------
    pandas.DatetimeIndex
        The selected time points in ascending order.
    """
    # Start time, end time and time resolution define the candidate points.
    index = pd.date_range(start, end, freq=freq)

    # A private generator yields the same draws as reseeding the global one,
    # but leaves the notebook-wide NumPy random state untouched.
    rng = np.random.RandomState(seed)

    # Draw `count` distinct positions out of the len(index) candidates, then
    # sort them so the returned time points are in chronological order.
    locations = rng.choice(np.arange(len(index)), size=count, replace=False)
    locations.sort()
    return index[locations]

# Five random time points at minute resolution between 2015-01-01 and 2015-10-01.
ts_index = random_timestamps("2015-01-01", "2015-10-01", freq="Min", count=5)

# Map every time point to the monthly period that contains it.
pd_index = ts_index.to_period("M")

# Differences between consecutive time points (one element fewer than ts_index).
td_index = pd.TimedeltaIndex(np.diff(ts_index))

print ts_index, "\n"
print pd_index, "\n"
print td_index, "\n"

DatetimeIndex(['2015-01-15 16:12:00', '2015-02-15 08:04:00',
               '2015-02-28 12:30:00', '2015-08-06 02:40:00',
               '2015-08-18 13:13:00'],
              dtype='datetime64[ns]', freq=None, tz=None) 

PeriodIndex(['2015-01', '2015-02', '2015-02', '2015-08', '2015-08'], dtype='int64', freq='M') 

TimedeltaIndex(['30 days 15:52:00', '13 days 04:26:00', '158 days 14:10:00',
                '12 days 10:33:00'],
               dtype='timedelta64[ns]', freq=None) 



三种索引对象的dtype属性

In [78]:
%C ts_index.dtype; pd_index.dtype; td_index.dtype

 ts_index.dtype   pd_index.dtype   td_index.dtype 
----------------  --------------  ----------------
dtype('<M8[ns]')  dtype('int64')  dtype('<m8[ns]')


三种索引对象与时间相关的属性

In [82]:
%C ts_index.weekday; pd_index.month; td_index.seconds

ts_index.weekday   pd_index.month        td_index.seconds      
----------------  ---------------  ----------------------------
[3, 6, 5, 3, 1]   [1, 2, 2, 8, 8]  [57120, 15960, 51000, 37980]


shift()方法可以移动时间点

In [86]:
ts_index.shift(1, "H") # add one hour to every time point

DatetimeIndex(['2015-01-15 17:12:00', '2015-02-15 09:04:00',
               '2015-02-28 13:30:00', '2015-08-06 03:40:00',
               '2015-08-18 14:13:00'],
              dtype='datetime64[ns]', freq=None, tz=None)

In [88]:
ts_index.shift(1, "M") # move every time point forward to a month end, keeping the time of day
# (a point that already lies on a month end, e.g. 2015-02-28, moves to the next month end)

DatetimeIndex(['2015-01-31 16:12:00', '2015-02-28 08:04:00',
               '2015-03-31 12:30:00', '2015-08-31 02:40:00',
               '2015-08-31 13:13:00'],
              dtype='datetime64[ns]', freq=None, tz=None)

按日期取整(将时刻修改为当天的凌晨)

In [90]:
ts_index.normalize() 

DatetimeIndex(['2015-01-15', '2015-02-15', '2015-02-28', '2015-08-06',
               '2015-08-18'],
              dtype='datetime64[ns]', freq=None, tz=None)

任意周期取整

In [96]:
ts_index.to_period("H") # hourly periods containing each time point (i.e. on the hour)

PeriodIndex(['2015-01-15 16:00', '2015-02-15 08:00', '2015-02-28 12:00',
             '2015-08-06 02:00', '2015-08-18 13:00'],
            dtype='int64', freq='H')

In [97]:
# Convert the hourly periods back to time points; each one lands on the start of its hour.
ts_index.to_period("H").to_timestamp()

DatetimeIndex(['2015-01-15 16:00:00', '2015-02-15 08:00:00',
               '2015-02-28 12:00:00', '2015-08-06 02:00:00',
               '2015-08-18 13:00:00'],
              dtype='datetime64[ns]', freq=None, tz=None)

时间序列: 如果Series对象的索引为DatetimeIndex对象, 那么这个Series对象称为时间序列

In [102]:
ts_series = pd.Series(range(5), index=ts_index)
ts_series

2015-01-15 16:12:00    0
2015-02-15 08:04:00    1
2015-02-28 12:30:00    2
2015-08-06 02:40:00    3
2015-08-18 13:13:00    4
dtype: int64

时间序列提供了一些专门处理时间的方法

In [103]:
ts_series.between_time("9:00", "18:00")
# between_time() returns all the data whose time of day lies within the given range

2015-01-15 16:12:00    0
2015-02-28 12:30:00    2
2015-08-18 13:13:00    4
dtype: int64

In [105]:
ts_series.tshift(1, freq="D") 
# move the index forward by one day; the values stay the same

2015-01-16 16:12:00    0
2015-02-16 08:04:00    1
2015-03-01 12:30:00    2
2015-08-07 02:40:00    3
2015-08-19 13:13:00    4
dtype: int64

PeriodIndex和TimedeltaIndex为索引的时间序列

In [106]:
# Series indexed by periods and by time intervals respectively.
pd_series = pd.Series(range(5), index=pd_index)
td_series = pd.Series(range(4), index=td_index)
# tshift(1) moves the period index by one period (one month here);
# tshift(10, freq="H") adds ten hours to every time interval in the index.
%C pd_series.tshift(1); td_series.tshift(10, freq="H")

 pd_series.tshift(1)   td_series.tshift(10, freq="H")
---------------------  ------------------------------
2015-02    0           31 days 01:52:00     0        
2015-03    1           13 days 14:26:00     1        
2015-03    2           159 days 00:10:00    2        
2015-09    3           12 days 20:33:00     3        
2015-09    4           dtype: int64                  
Freq: M, dtype: int64                                


时间信息不仅可以作为索引, 还可以作为Series对象的值或DataFrame对象的列数据

In [107]:
ts_data = pd.Series(ts_index)
pd_data = pd.Series(pd_index)
td_data = pd.Series(td_index)
%C ts_data; pd_data; td_data

        ts_data             pd_data            td_data        
-----------------------  -------------  ----------------------
0   2015-01-15 16:12:00  0    2015-01   0    30 days 15:52:00 
1   2015-02-15 08:04:00  1    2015-02   1    13 days 04:26:00 
2   2015-02-28 12:30:00  2    2015-02   2   158 days 14:10:00 
3   2015-08-06 02:40:00  3    2015-08   3    12 days 10:33:00 
4   2015-08-18 13:13:00  4    2015-08   dtype: timedelta64[ns]
dtype: datetime64[ns]    dtype: object                        


序列的值为时间数据时, 可以通过名字空间对象dt调用相关的属性和方法

In [108]:
%C ts_data.dt.hour; pd_data.dt.month; td_data.dt.days

ts_data.dt.hour  pd_data.dt.month  td_data.dt.days
---------------  ----------------  ---------------
0    16          0    1            0     30       
1     8          1    2            1     13       
2    12          2    2            2    158       
3     2          3    8            3     12       
4    13          4    8            dtype: int64   
dtype: int64     dtype: int64                     
