In [1]:
from datetime import datetime,timedelta
import pandas as pd
import numpy as np

In [2]:
now=datetime.now()
now

datetime.datetime(2019, 9, 1, 0, 40, 58, 478867)

In [3]:
# https://www.cnblogs.com/fwl8888/p/9635505.html
now.strftime('%Y-%m-%d')

'2019-09-01'

In [4]:
now.year

2019

In [5]:
delta=datetime(2019,8,30)-datetime(2018,11,24,8,15)
delta

datetime.timedelta(days=278, seconds=56700)

In [6]:
delta.days

278

In [7]:
startTime=datetime(2019,8,30,12,30)
deltaTime=timedelta(days=1,hours=-10,minutes=20)
endTime=startTime+deltaTime
endTime

datetime.datetime(2019, 8, 31, 2, 50)

In [8]:
# strptime
datestrs=['20190820','20170322']
[datetime.strptime(x,'%Y%m%d') for x in datestrs]

[datetime.datetime(2019, 8, 20, 0, 0), datetime.datetime(2017, 3, 22, 0, 0)]

In [9]:
# parse可以解析常见的日期格式
from dateutil.parser import parse

parse('20190801')
parse('12/9/2011',dayfirst=True)
parse('2018-3-12')
parse('090810',yearfirst=True)

datetime.datetime(2009, 8, 10, 0, 0)

In [10]:
pd.to_datetime(datestrs)

DatetimeIndex(['2019-08-20', '2017-03-22'], dtype='datetime64[ns]', freq=None)

In [11]:
dtstr=pd.to_datetime(datestrs+[None])
dtstr

DatetimeIndex(['2019-08-20', '2017-03-22', 'NaT'], dtype='datetime64[ns]', freq=None)

In [12]:
dtstr[1]

Timestamp('2017-03-22 00:00:00')

In [13]:
# 时间序列基础
dt=datestrs+['20180822','20190323','20110125','20140624']
dates=[datetime.strptime(x,'%Y%m%d') for x in dt]
ts=pd.Series(np.arange(len(dt)),index=dates)
type(ts)
# 时间序列
ts

2019-08-20    0
2017-03-22    1
2018-08-22    2
2019-03-23    3
2011-01-25    4
2014-06-24    5
dtype: int32

In [14]:
ts.index

DatetimeIndex(['2019-08-20', '2017-03-22', '2018-08-22', '2019-03-23',
               '2011-01-25', '2014-06-24'],
              dtype='datetime64[ns]', freq=None)

In [15]:
ts[::2]
# 时间序列的索引形式可以是多样的，以下几种方式均可以获取相应的元素
ts['22/8/2018']
ts['20180822']

2018-08-22    2
dtype: int32

In [16]:
# 时间序列切片
# date_range可以设定时间的取值范围
long_ts=pd.Series(np.random.randn(1000),index=pd.date_range('1/1/2015',periods=1000))
# 通过索引对Series进行切片,索引的形式可以是year,year-month,year-month-day等
# e.g
long_ts['2016-05'][:5]

#### 通过日期进行切片的方式只对规则Series有效
long_ts[datetime(2015,3,27):datetime(2015,3,31)]

# 时间序列数据均是按照时间顺序先后排列，则可以通过传入时间范围对时间序列进行切片
# 注意索引2014-12-29-2014-12-31在该序列中不存在
long_ts['2014-12-29':'2015-1-2']

# 以上方式在dataframe中也适用

2015-01-01   -0.964583
2015-01-02    0.049861
Freq: D, dtype: float64

In [17]:
# 时间序列，如果索引有重复项，则可以聚合处理
# e.g
dates=pd.DatetimeIndex(['20150102','20150312','20160624','20150310','20150102'])
dup_ts=pd.Series(np.arange(5),index=dates)
dup_ts

2015-01-02    0
2015-03-12    1
2016-06-24    2
2015-03-10    3
2015-01-02    4
dtype: int32

In [18]:
# 判断索引是否唯一
dup_ts.index.is_unique
# level=0
dup_ts.groupby(level=0).mean()

2015-01-02    2
2015-03-10    3
2015-03-12    1
2016-06-24    2
dtype: int32

In [19]:
# 日期的范围，重采样resample
# truncate过滤
# https://www.cnblogs.com/tianqizhi/p/9277376.html
# 过滤掉某一时间之前before或者之后after的时间序列
# ts.truncate(before='20190520
dup_ts=dup_ts.sort_index()
dup_ts
dup_ts.truncate(before='20150301')

2015-03-10    3
2015-03-12    1
2016-06-24    2
dtype: int32

In [20]:
# 时间的生成
# 适用date_range生成指定范围的时间
pd.date_range('20180810',periods=5)
# 传入频率
# BM表示每个月的最后一日的索引
pd.date_range('20180101','20190101',freq='BM')

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-30', '2018-04-30',
               '2018-05-31', '2018-06-29', '2018-07-31', '2018-08-31',
               '2018-09-28', '2018-10-31', '2018-11-30', '2018-12-31'],
              dtype='datetime64[ns]', freq='BM')

In [21]:
# 适用date_range 默认会保留起始时间戳的时间信息（如果有的话）
pd.date_range('12/21/2018 12:56:31',periods=5)

DatetimeIndex(['2018-12-21 12:56:31', '2018-12-22 12:56:31',
               '2018-12-23 12:56:31', '2018-12-24 12:56:31',
               '2018-12-25 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [22]:
#适用normalize参数可以规范化时间戳参数至午夜的时间戳
pd.date_range('12/21/2018 12:56:31',periods=5,normalize=True,freq='5s')

DatetimeIndex(['2018-12-21 00:00:00', '2018-12-21 00:00:05',
               '2018-12-21 00:00:10', '2018-12-21 00:00:15',
               '2018-12-21 00:00:20'],
              dtype='datetime64[ns]', freq='5S')

In [23]:
#### 频率和日期偏移
from pandas.tseries.offsets import Hour,Minute
hour=Hour(4)
hour

<4 * Hours>

In [24]:
# freq参数：可以适用如: '4h','5s'等
# 开区间
pd.date_range('1/1/2000','1/3/2000 23:59',freq='4h',normalize=True)
# 可以传入字符串，如‘1h30min’等
pd.date_range('1/10/2019','1/12/2019',freq='2h30min')


DatetimeIndex(['2019-01-10 00:00:00', '2019-01-10 02:30:00',
               '2019-01-10 05:00:00', '2019-01-10 07:30:00',
               '2019-01-10 10:00:00', '2019-01-10 12:30:00',
               '2019-01-10 15:00:00', '2019-01-10 17:30:00',
               '2019-01-10 20:00:00', '2019-01-10 22:30:00',
               '2019-01-11 01:00:00', '2019-01-11 03:30:00',
               '2019-01-11 06:00:00', '2019-01-11 08:30:00',
               '2019-01-11 11:00:00', '2019-01-11 13:30:00',
               '2019-01-11 16:00:00', '2019-01-11 18:30:00',
               '2019-01-11 21:00:00', '2019-01-11 23:30:00'],
              dtype='datetime64[ns]', freq='150T')

In [25]:
# WOM(week of month)频率：以WOM开头，获得诸如每个月的第三个周五 之类的日期
pd.date_range('20190601','20190830',freq='WOM-3Fri')

DatetimeIndex(['2019-06-21', '2019-07-19', '2019-08-16'], dtype='datetime64[ns]', freq='WOM-3FRI')

In [26]:
# shift函数：shift操作可以将数据沿着时间轴向前或者向后移动，而时间轴不变，
# shift操作的重要用途是计算时间序列中的百分比的变化
# e.g
ts2=pd.DataFrame(np.arange(5),index=pd.date_range('20190101',periods=5,freq='M'))
ts2

Unnamed: 0,0
2019-01-31,0
2019-02-28,1
2019-03-31,2
2019-04-30,3
2019-05-31,4


In [27]:
# ts2.shift(-2)
ts2.shift(2)

Unnamed: 0,0
2019-01-31,
2019-02-28,
2019-03-31,0.0
2019-04-30,1.0
2019-05-31,2.0


In [28]:
# 计算百分比变化
ts2.shift(1)/ts2 -1

Unnamed: 0,0
2019-01-31,
2019-02-28,-1.0
2019-03-31,-0.5
2019-04-30,-0.333333
2019-05-31,-0.25


In [30]:
# 单纯的位移操作会导致数据的丢失，
# #可以向shift函数传入freq参数，实现对时间戳的位移
ts2.shift(2,freq='M')

Unnamed: 0,0
2019-03-31,0
2019-04-30,1
2019-05-31,2
2019-06-30,3
2019-07-31,4


In [34]:
# shift函数中periods,freq可以结合使用
# 以下两种方式等效
ts2.shift(3,freq='D')
ts2.shift(1,freq='3D')

Unnamed: 0,0
2019-02-03,0
2019-03-03,1
2019-04-03,2
2019-05-03,3
2019-06-03,4


In [36]:
# 在datetime或者Timestamp上使用shift进行时间偏移
from pandas.tseries.offsets import Day,MonthEnd
now=datetime(2019,9,1)
now=now+3*Day()
now

Timestamp('2019-09-04 00:00:00')

In [38]:
# 瞄点偏移量(MonthEnd)
# 位移量可能小于一个月的长度，在当月
now+MonthEnd()

Timestamp('2019-09-30 00:00:00')

In [41]:
# 可以通过rollforward，rollback方法，将日期向前或者向后滚动
offset=MonthEnd()
offset.rollforward(now)#Timestamp('2019-09-30 00:00:00')
offset.rollback(now)#Timestamp('2019-08-31 00:00:00')

Timestamp('2019-08-31 00:00:00')

In [42]:
# 结合groupby方法使用滚动方法
ts3=pd.Series(np.random.randn(20),index=pd.date_range('20190101',periods=20,freq='4d'))
ts3.groupby(offset.rollforward).mean()

2019-01-31    0.463447
2019-02-28   -0.535225
2019-03-31    0.423802
dtype: float64

In [45]:
# 更加简单的实现方法
ts3.resample('M').mean()

2019-01-31    0.463447
2019-02-28   -0.535225
2019-03-31    0.423802
Freq: M, dtype: float64

In [48]:
# #######################################
##################时区处理###############
import pytz #时区信息的第三方库
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [52]:
# timezone
tz=pytz.timezone('Asia/Shanghai')
tz

<DstTzInfo 'Asia/Shanghai' LMT+8:06:00 STD>

In [55]:
# 给时间序列添加时区
ts4=pd.date_range('3/9/2019',periods=10,freq='D',tz='utc')

In [58]:
# 本地化转换tz_localize
# 可以为时间序列添加时区
# 使用tz_convert进行时区的转换
ts_utc=ts4.tz_convert('Asia/Shanghai')
ts_utc

DatetimeIndex(['2019-03-09 08:00:00+08:00', '2019-03-10 08:00:00+08:00',
               '2019-03-11 08:00:00+08:00', '2019-03-12 08:00:00+08:00',
               '2019-03-13 08:00:00+08:00', '2019-03-14 08:00:00+08:00',
               '2019-03-15 08:00:00+08:00', '2019-03-16 08:00:00+08:00',
               '2019-03-17 08:00:00+08:00', '2019-03-18 08:00:00+08:00'],
              dtype='datetime64[ns, Asia/Shanghai]', freq='D')

In [64]:
# 时区意识型的Timestamp对象
stamp=pd.Timestamp('2019-03-12 12:30')
stamp_utc=stamp.tz_localize('utc')
stamp_utc.tz_convert('US/Eastern')
# 以上操作和以下操作等价
stamp_shanghai=pd.Timestamp('2019-05-20 13:14',tz='Asia/Shanghai')
stamp_shanghai

Timestamp('2019-05-20 13:14:00+0800', tz='Asia/Shanghai')

In [70]:
# 不同时区间的运算
# 不同时区的时间序列进行运算，则在进行合并时，会选择UTC时区进行存储
rng=pd.date_range('3/7/2018',periods=10,freq='B')
tz=pd.Series(np.random.randn(len(rng)),index=rng)
tz1=tz[:7].tz_localize('Europe/London')
tz2=tz1[2:].tz_convert('Europe/Moscow')
# tz1+tz2
tz1*tz2

2018-03-07 00:00:00+00:00         NaN
2018-03-08 00:00:00+00:00         NaN
2018-03-09 00:00:00+00:00    0.935121
2018-03-12 00:00:00+00:00    0.356462
2018-03-13 00:00:00+00:00    0.169773
2018-03-14 00:00:00+00:00    0.938351
2018-03-15 00:00:00+00:00    3.619571
Freq: B, dtype: float64

In [73]:
# 时期及算数运算
# 时期表示的是时间区间
p=pd.Period(2019,freq='A-DEC')
p

Period('2019', 'A-DEC')

In [75]:
p+2

Period('2021', 'A-DEC')

In [78]:
pd.Period(2014,freq='A-DEC')-p

<-5 * YearEnds: month=12>

In [80]:
pd.period_range('20190101','20190630',freq='M')

PeriodIndex(['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06'], dtype='period[M]', freq='M')

In [84]:
# periodIndex的构造
values=['2019Q1','2019Q2','2019Q3','2019Q4']
index=pd.PeriodIndex(values,freq='Q-DEC')
index

PeriodIndex(['2019Q1', '2019Q2', '2019Q3', '2019Q4'], dtype='period[Q-DEC]', freq='Q-DEC')

In [88]:
pd.DataFrame(np.random.randn(len(values)),index=index,columns=['test'])

Unnamed: 0,test
2019Q1,-0.034648
2019Q2,-1.986832
2019Q3,-0.617482
2019Q4,-0.304477


In [92]:
data=pd.read_csv('examples/macrodata.csv')
data.year[:5]

0    1959.0
1    1959.0
2    1959.0
3    1959.0
4    1960.0
Name: year, dtype: float64

In [94]:
data[:3]

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09


In [96]:
index=pd.PeriodIndex(year=data.year,quarter=data.quarter,freq='Q-DEC')
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')

In [100]:
data.index=index
data.infl[20:25]
data[:3]

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959Q1,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1959Q2,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
1959Q3,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09


In [None]:
# 重采样：将时间序列从一个频率转换到拎一个频率的处理过程
# 重采样分为降采样、升采样
# 降采样：频率由高到低
# 升采样：频率由低到高
