<a href="https://colab.research.google.com/github/jiangenhe/insc-486-fall-2021/blob/main/week7/week7_lecture_time_series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Time Series

In [None]:
import numpy as np
import pandas as pd

## Date and Time Data Types and Tools

In [None]:
from datetime import datetime
# Capture the current local date and time (no time zone is attached).
now = datetime.now()
now

In [None]:
# Calendar components of the datetime, shown together as a (year, month, day) tuple.
now.year, now.month, now.day

In [None]:
# Subtracting one datetime from another yields a timedelta; .days and .seconds
# expose its whole-day part and the leftover seconds within the final day.
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
# A cell renders only its final expression, so collect all three values in one
# tuple rather than leaving the first two as silent no-ops.
delta, delta.days, delta.seconds

### Converting Between String and Datetime

In [None]:
# str() renders a datetime as 'YYYY-MM-DD HH:MM:SS'.
stamp = datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [None]:
# strftime formats with an explicit pattern: %Y = 4-digit year, %m = month, %d = day.
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [None]:
# strptime goes the other way: parse text into a datetime using a known format.
value = '2011-01-03'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [None]:
from dateutil.parser import parse
# dateutil's parse() works out the layout itself, so no format string is passed.
parse('2011-01-03')

In [None]:
# Free-form text with a month name and an AM/PM time is accepted as well.
parse('Jan 31, 1997 10:45 PM')

In [None]:
# dayfirst=True reads the leading number as the day: 6 December 2011, not June 12.
parse('6/12/2011', dayfirst=True)

In [None]:
# pd.to_datetime converts a whole list of date strings in one call.
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)

## Time Series Basics

In [None]:
from datetime import datetime

# Six irregularly spaced days in January 2011 form the index.
january_days = (2, 5, 7, 8, 10, 12)
dates = [datetime(2011, 1, day) for day in january_days]
# The values are unseeded random draws, so they differ on every run.
ts = pd.Series(np.random.randn(len(dates)), index=dates)
ts

2011-01-02    0.526228
2011-01-05   -0.205805
2011-01-07   -1.117844
2011-01-08   -0.818618
2011-01-10   -1.588775
2011-01-12   -0.172666
dtype: float64

In [None]:
# The list of datetime objects passed as the index is stored as a DatetimeIndex.
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

### Indexing, Selection, Subsetting

In [None]:
# Take the third label from the index and use it to look up its value.
stamp = ts.index[2]
ts[stamp]

-1.1178441320140846

In [None]:
# A string that can be read as a date works as a label too (here 10 January 2011).
ts['1/10/2011']


-1.5887751101543368

In [None]:
# Same lookup with the date written in compact YYYYMMDD form.
ts['20110110']

-1.5887751101543368

In [None]:
# 1,000 consecutive daily timestamps starting 2000-01-01, paired with
# unseeded random values.
daily_index = pd.date_range('1/1/2000', periods=1000)
longer_ts = pd.Series(np.random.randn(1000), index=daily_index)
longer_ts


2000-01-01   -0.704540
2000-01-02    0.898968
2000-01-03   -0.053463
2000-01-04    0.435670
2000-01-05    1.099736
                ...   
2002-09-22   -0.242121
2002-09-23    0.188298
2002-09-24    1.147944
2002-09-25   -0.255061
2002-09-26   -0.353712
Freq: D, Length: 1000, dtype: float64

In [None]:
# A bare year string selects every observation that falls in that year.
longer_ts['2001']

In [None]:
# A year-month string narrows the selection to that single month.
longer_ts['2001-05']

In [None]:
# Slice with a datetime object: everything from 7 January 2011 onward.
ts[datetime(2011, 1, 7):]

In [None]:
# Slice bounds do not have to be labels in the index: neither 6 Jan nor 11 Jan
# is a label of ts, yet the slice returns the observations lying between them.
ts['1/6/2011':'1/11/2011']

In [None]:
# truncate(after=...) drops every observation later than the given date.
ts.truncate(after='1/9/2011')

In [None]:
# 100 weekly timestamps anchored on Wednesdays, one column of unseeded random
# values per state.
dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')
state_names = ['Colorado', 'Texas', 'New York', 'Ohio']
long_df = pd.DataFrame(np.random.randn(len(dates), len(state_names)),
                       index=dates, columns=state_names)
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,0.972350,1.578841,-0.284988,0.844146
2000-01-12,-0.807381,-0.187178,-0.133185,0.666385
2000-01-19,0.191429,-1.399120,0.072694,0.627470
2000-01-26,-0.558658,2.031532,0.360274,-1.261360
2000-02-02,-1.658576,-0.452781,-0.762814,-0.505018
...,...,...,...,...
2001-10-31,0.367397,0.557422,0.775671,-0.059675
2001-11-07,1.911188,-1.004954,-0.526420,-1.002453
2001-11-14,-0.188118,-1.068738,0.842537,0.376363
2001-11-21,0.178820,-1.013820,-0.339818,-1.208330


In [None]:
# The same partial-date strings work on DataFrame rows through .loc (here May 2001).
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.333753,1.406123,0.565658,-0.887611
2001-05-09,1.874151,-0.827729,-1.975361,-1.938828
2001-05-16,0.415272,0.56985,-0.516224,0.626977
2001-05-23,-0.202786,-0.823508,0.616185,-0.47418
2001-05-30,-0.347943,-0.488419,0.307763,-1.107985


### Time Series with Duplicate Indices

In [None]:
# Three of the five observations share the 2000-01-02 timestamp.
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])
dup_ts = pd.Series(np.arange(len(dates)), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [None]:
# is_unique reports whether any label occurs more than once in the index.
dup_ts.index.is_unique

False

In [None]:
# Collapse repeated timestamps: level=0 groups on the index itself, then sums each group.
dup_ts.groupby(level=0).sum()


2000-01-01    0
2000-01-02    6
2000-01-03    4
dtype: int64

### Frequencies and Date Offsets

In [None]:
from pandas.tseries.offsets import Hour, Minute

In [None]:
# An offset object standing for a span of four hours.
four_hours = Hour(4)
four_hours

<4 * Hours>

In [None]:
# One timestamp every four hours between the two endpoints.
# NOTE: this rebinds `ts`, which earlier in the notebook named a Series, to a
# DatetimeIndex.
ts = pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h')
ts

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [None]:
# Adding an offset shifts every timestamp in the index by that amount.
ts+four_hours

DatetimeIndex(['2000-01-01 04:00:00', '2000-01-01 08:00:00',
               '2000-01-01 12:00:00', '2000-01-01 16:00:00',
               '2000-01-01 20:00:00', '2000-01-02 00:00:00',
               '2000-01-02 04:00:00', '2000-01-02 08:00:00',
               '2000-01-02 12:00:00', '2000-01-02 16:00:00',
               '2000-01-02 20:00:00', '2000-01-03 00:00:00',
               '2000-01-03 04:00:00', '2000-01-03 08:00:00',
               '2000-01-03 12:00:00', '2000-01-03 16:00:00',
               '2000-01-03 20:00:00', '2000-01-04 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [None]:
# Offsets can be chained: shift by four hours and then a further three minutes.
ts + four_hours + Minute(3)

## Time Zone Handling

In [None]:
import pytz
# Peek at the first five entries of pytz's list of common time zone names.
pytz.common_timezones[:5]

['Africa/Abidjan',
 'Africa/Accra',
 'Africa/Addis_Ababa',
 'Africa/Algiers',
 'Africa/Asmara']

In [None]:
# Look up a time zone object by its name.
tz = pytz.timezone('America/New_York')
tz

<DstTzInfo 'America/New_York' LMT-1 day, 19:04:00 STD>

### Time Zone Localization and Conversion

In [None]:
# Six daily timestamps at 09:30 with no time zone attached, paired with
# unseeded random values.
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts

2012-03-09 09:30:00    0.619238
2012-03-10 09:30:00    1.282431
2012-03-11 09:30:00    0.915469
2012-03-12 09:30:00   -0.673361
2012-03-13 09:30:00    1.297960
2012-03-14 09:30:00   -0.626374
Freq: D, dtype: float64

In [None]:
# The index carries no time zone, so tz is None; print() is needed because a
# bare None at the end of a cell renders nothing.
print(ts.index.tz)

None


In [None]:
# Passing tz= makes the generated timestamps time zone-aware from the start.
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [None]:
# tz_localize attaches a zone to naive timestamps; the clock readings stay the
# same and are now interpreted as UTC.
ts_utc = ts.tz_localize('UTC')
# A cell renders only its final expression, so end on the result of interest.
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [None]:
# tz_convert re-expresses aware timestamps in another zone; the UTC offset
# switches from -05:00 to -04:00 on 2012-03-11, when New York enters DST.
ts_utc.tz_convert('America/New_York')

2012-03-09 04:30:00-05:00    0.619238
2012-03-10 04:30:00-05:00    1.282431
2012-03-11 05:30:00-04:00    0.915469
2012-03-12 05:30:00-04:00   -0.673361
2012-03-13 05:30:00-04:00    1.297960
2012-03-14 05:30:00-04:00   -0.626374
Freq: D, dtype: float64

### Operations with Time Zone-Aware Timestamp Objects

In [None]:
# The same two steps work on a single Timestamp: localize the naive value to
# UTC, then convert it to New York time.
stamp = pd.Timestamp('2011-03-12 04:00')
stamp_utc = stamp.tz_localize('utc')
stamp_utc.tz_convert('America/New_York')

In [None]:
# A time zone can also be supplied directly when the Timestamp is created.
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')
stamp_moscow

In [None]:
# .value is the integer nanosecond count since the Unix epoch, measured in UTC.
# Show it before and after conversion in one tuple (a cell renders only its
# final expression) so it is visible that changing the zone leaves it untouched.
stamp_utc.value, stamp_utc.tz_convert('America/New_York').value

In [None]:
from pandas.tseries.offsets import Hour
# 30 minutes before US/Eastern switches to daylight saving time, which in 2012
# happened on 11 March at 02:00 local time.
stamp = pd.Timestamp('2012-03-11 01:30', tz='US/Eastern')
# Show before and after together (a cell renders only its final expression):
# one elapsed hour moves the wall clock from 01:30 -05:00 to 03:30 -04:00.
stamp, stamp + Hour()

In [None]:
# 90 minutes before US/Eastern leaves daylight saving time (4 November 2012).
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')
# Show before and after together (a cell renders only its final expression):
# two elapsed hours move the wall clock only from 00:30 -04:00 to 01:30 -05:00.
stamp, stamp + 2 * Hour()

## Periods and Period Arithmetic

In [None]:
# period_range yields one Period per month between the two dates (freq='M').
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [None]:
# A PeriodIndex can index a Series just as a DatetimeIndex does.
pd.Series(np.random.randn(6), index=rng)

2000-01   -0.695768
2000-02    0.648353
2000-03   -0.024062
2000-04    0.474302
2000-05    0.049622
2000-06   -0.990864
Freq: M, dtype: float64

In [None]:
# Build a quarterly PeriodIndex from strings; 'Q-DEC' means quarters of a
# fiscal year that ends in December.
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

### Converting Timestamps to Periods (and Back)

In [None]:
# Six consecutive days straddling the January/February boundary, paired with
# unseeded random values.
rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = pd.Series(data=np.random.randn(rng.size), index=rng)
ts2


2000-01-29    0.818630
2000-01-30   -0.478903
2000-01-31    1.474002
2000-02-01   -1.010244
2000-02-02   -1.380709
2000-02-03    0.561331
Freq: D, dtype: float64

In [None]:
# to_period('M') maps each timestamp to the month containing it, so several
# rows can end up with the same period label.
pts = ts2.to_period('M')
pts


2000-01    0.818630
2000-01   -0.478903
2000-01    1.474002
2000-02   -1.010244
2000-02   -1.380709
2000-02    0.561331
Freq: M, dtype: float64

In [None]:
# Convert back to timestamps; how='end' picks the last instant of each period.
pts.to_timestamp(how='end')

2000-01-31 23:59:59.999999999    0.818630
2000-01-31 23:59:59.999999999   -0.478903
2000-01-31 23:59:59.999999999    1.474002
2000-02-29 23:59:59.999999999   -1.010244
2000-02-29 23:59:59.999999999   -1.380709
2000-02-29 23:59:59.999999999    0.561331
dtype: float64