# Time Series

In [5]:
# Core numeric / tabular libraries used throughout the notebook.
import numpy as np
import pandas as pd
# Seed NumPy's global RNG so the np.random.randn(...) samples below are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default figure size applied to every plot produced by the notebook.
plt.rc('figure', figsize=(10, 6))
#PREVIOUS_MAX_ROWS = pd.options.display.max_rows
# Limit how many rows pandas prints for a Series/DataFrame (longer ones are elided with "...").
pd.options.display.max_rows = 20
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)

## Date and Time Data Types and Tools

In [1]:
from datetime import datetime  # stdlib type holding a calendar date plus a time of day
now = datetime.now()  # current date and time, down to microseconds (see the repr below)
now

datetime.datetime(2023, 3, 29, 9, 56, 3, 655972)

In [2]:
# The calendar fields are plain attributes; the tuple is shown as the cell result.
now.year, now.month, now.day

(2023, 3, 29)

In [3]:
# Subtracting one datetime from another gives a timedelta, which stores the
# gap as whole days plus the leftover seconds.
delta = (
    datetime(year=2021, month=1, day=7)
    - datetime(year=2018, month=6, day=24, hour=8, minute=15)
)
delta
# The two components can be read individually via delta.days and delta.seconds.

datetime.timedelta(days=927, seconds=56700)

In [None]:
from datetime import timedelta  # a timedelta models a duration, i.e. the gap between two datetimes
start = datetime(year=2021, month=1, day=7)
# Adding a duration moves the datetime forward by that amount.
start + timedelta(days=12)
# Durations can be scaled too: start - 2 * timedelta(days=12) lands 24 days before `start`.

### Converting Between String and Datetime

In [None]:
# datetime -> string: str() gives the default 'YYYY-MM-DD HH:MM:SS' rendering.
stamp = datetime(year=2021, month=1, day=3)
str(stamp)
# For a custom layout use strftime, e.g. stamp.strftime('%Y-%m-%d').

In [None]:
# String -> datetime: strptime parses `value` according to an explicit format
# ('%Y-%m-%d' = four-digit year, month and day separated by dashes).
value = '2021-01-03'   
datetime.strptime(value, '%Y-%m-%d')

In [None]:
# Build a datetime object from each string in the list.
# The format '%m/%d/%Y' reads the first number as the month, so '7/6/2021' is July 6.
datestrs = ['7/6/2021', '8/6/2021']
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

In [None]:
# dateutil's parser handles most common date and/or time notations
# without being told the format explicitly.
from dateutil.parser import parse
parse('2021-01-03')

In [None]:
# Same parser, this time on a month-name date with a 12-hour clock time; still no format string.
parse('Jan 31, 1997 10:45 PM')

In [None]:
# dayfirst=True tells the parser to read the leading number of an ambiguous
# date as the day, i.e. 6 December 2021 rather than June 12.
parse('6/12/2021', dayfirst=True)

In [6]:
# pandas can also do the conversion: to_datetime turns a list of date strings
# into a DatetimeIndex. Note that this rebinds `datestrs` to new, ISO-style strings.
datestrs = ['2021-07-06 12:00:00', '2021-08-06 00:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2021-07-06 12:00:00', '2021-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [7]:
# Example with a missing date: the None entry comes back as NaT in the DatetimeIndex.
idx = pd.to_datetime(datestrs + [None])
idx
# Also worth inspecting: idx[2] (the missing entry) and pd.isnull(idx).

DatetimeIndex(['2021-07-06 12:00:00', '2021-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

## Time Series Basics

In [8]:
from datetime import datetime
# Six irregularly spaced days in January 2021 act as the index labels.
dates = [datetime(2021, 1, day) for day in (2, 5, 7, 8, 10, 12)]
# A Series keyed by datetime objects ends up with a DatetimeIndex.
ts = pd.Series(data=np.random.randn(6), index=dates)
ts

2021-01-02   -0.204708
2021-01-05    0.478943
2021-01-07   -0.519439
2021-01-08   -0.555730
2021-01-10    1.965781
2021-01-12    1.393406
dtype: float64

In [9]:
# The datetime labels are held in a DatetimeIndex (datetime64[ns]) with no fixed frequency.
ts.index

DatetimeIndex(['2021-01-02', '2021-01-05', '2021-01-07', '2021-01-08',
               '2021-01-10', '2021-01-12'],
              dtype='datetime64[ns]', freq=None)

In [10]:
# A single element taken from a DatetimeIndex is a pandas Timestamp.
stamp = ts.index[0]
stamp

Timestamp('2021-01-02 00:00:00')

### Indexing, Selection, Subsetting

In [11]:
# Look a value up by its label: the third index entry (2021-01-07) is used as the key.
stamp = ts.index[2]
ts[stamp]

-0.5194387150567381

In [None]:
# The series can be queried with a date string in most common formats; pandas parses it to the matching timestamp.

ts['1/10/2021']  
# Another spelling of the same date: ts['20210110']

In [14]:
# 1000 consecutive daily observations beginning on 2000-01-01.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='1/1/2000', periods=1000),
)
longer_ts
# A coarser label also works as a selector, e.g. longer_ts['2001'] for one year.

2000-01-01    0.488675
2000-01-02   -0.178098
2000-01-03    2.122315
2000-01-04    0.061192
2000-01-05    0.884111
                ...   
2002-09-22    1.847446
2002-09-23    0.453739
2002-09-24   -0.302450
2002-09-25    1.402558
2002-09-26    1.212354
Freq: D, Length: 1000, dtype: float64

In [13]:
# A 'YYYY-MM' string selects every observation in that month (31 rows for May 2001).
longer_ts['2001-05']

2001-05-01   -0.622547
2001-05-02    0.936289
2001-05-03    0.750018
2001-05-04   -0.056715
2001-05-05    2.300675
                ...   
2001-05-27    0.235477
2001-05-28    0.111835
2001-05-29   -1.251504
2001-05-30   -2.949343
2001-05-31    0.634634
Freq: D, Length: 31, dtype: float64

In [15]:
# Back to the small six-observation series built earlier.
ts  

2021-01-02   -0.204708
2021-01-05    0.478943
2021-01-07   -0.519439
2021-01-08   -0.555730
2021-01-10    1.965781
2021-01-12    1.393406
dtype: float64

In [16]:
# Slice from a datetime onward; the starting label itself is included in the result.
ts[datetime(2021, 1, 7):]

2021-01-07   -0.519439
2021-01-08   -0.555730
2021-01-10    1.965781
2021-01-12    1.393406
dtype: float64

In [17]:
# The slice bounds need not be present in the index: every row falling between them is returned.
ts['1/6/2021':'1/11/2021']

2021-01-07   -0.519439
2021-01-08   -0.555730
2021-01-10    1.965781
dtype: float64

In [18]:
# truncate(after=...) drops every observation later than the given date.
ts.truncate(after='1/9/2021')

2021-01-02   -0.204708
2021-01-05    0.478943
2021-01-07   -0.519439
2021-01-08   -0.555730
dtype: float64

In [19]:
# New test dataset: 100 rows of random values indexed by consecutive Wednesdays.
dates = pd.date_range(start='1/1/2000', periods=100, freq='W-WED')
long_df = pd.DataFrame(
    data=np.random.randn(100, 4),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
    index=dates,
)
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,-1.613474,-0.573966,0.424894,1.257544
2000-01-12,-1.065343,0.276356,1.680135,-0.679335
2000-01-19,-0.129697,-1.930931,-0.607638,-0.195258
2000-01-26,-2.077403,0.422648,2.001614,1.385914
2000-02-02,-1.104685,0.677098,0.560161,-0.815368
...,...,...,...,...
2001-10-31,-0.295860,0.611270,0.425756,-0.211722
2001-11-07,1.512358,2.036981,0.259326,0.626242
2001-11-14,1.529507,-0.153254,-0.089238,-0.628635
2001-11-21,2.160955,0.797459,0.715103,1.510537


In [20]:
# .loc with a month-year string returns all rows that fall in that month (May 2001 here).
long_df.loc['5-2001']
# Alternative month to try: long_df.loc['2001-06']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.924368,-0.020641,-0.826849,-0.439442
2001-05-09,0.28919,0.317978,0.035787,1.034372
2001-05-16,-0.506926,1.228659,-1.07965,-1.503665
2001-05-23,-0.16736,-0.183889,-1.003073,-0.292718
2001-05-30,-0.830605,-0.271502,0.825583,1.121559


### Time Series with Duplicate Indices

In [21]:
# An index in which 2000-01-02 occurs three times.
dates = pd.DatetimeIndex(
    ['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000']
)
dup_ts = pd.Series(data=np.arange(5), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int64

In [22]:
# Quick check for duplicate timestamps: is_unique is False when any label repeats.
dup_ts.index.is_unique

False

In [26]:
# Indexing a duplicated timestamp returns every matching row as a Series.
# For comparison, dup_ts['1/3/2000'] uses a label that is not duplicated.
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int64

In [27]:
# Collapse the duplicates: group rows by their timestamp (index level 0) and aggregate each group.
grouped = dup_ts.groupby(level=0)
grouped.mean()
# Another aggregation to try on the same groups: grouped.count()

2000-01-01    0.0
2000-01-02    2.0
2000-01-03    4.0
dtype: float64

## Date Ranges, Frequencies, and Shifting

### Generating Date Ranges

In [28]:
# With only the two endpoints given, date_range yields one timestamp per
# calendar day, both endpoints included.
index = pd.date_range(start='2012-04-01', end='2012-06-01')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [29]:
# Give one endpoint plus a number of periods; the default daily frequency fills in the rest.
pd.date_range(start='2012-04-01', periods=20)
# Anchoring on the other end works the same way: pd.date_range(end='2012-06-01', periods=20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [30]:
# Month-end timestamps that fall between the two bounds (2000-01-31 ... 2000-11-30).
# The MonthEnd offset object is passed instead of the 'M' string alias: pandas 2.2
# deprecated that alias (it was renamed 'ME'), while the offset object is accepted
# by both older and newer releases.
pd.date_range('2000-01-01', '2000-12-01', freq=pd.offsets.MonthEnd())

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-30',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-30', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='M')

In [31]:
# The time of day on the start timestamp is carried over to every generated date.
pd.date_range('2012-05-02 12:56:31', periods=5)


DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [32]:
# normalize=True resets the generated timestamps to midnight, dropping the 12:56:31 time part.
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

## Time Zone Handling

### Time Zone Localization and Conversion

In [33]:
# Six daily timestamps at 09:30 with no time zone attached, carrying random values.
rng = pd.date_range(start='3/9/2012 9:30', periods=6, freq='D')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts

2012-03-09 09:30:00   -1.677765
2012-03-10 09:30:00   -0.034771
2012-03-11 09:30:00   -1.943194
2012-03-12 09:30:00    0.405850
2012-03-13 09:30:00    0.680251
2012-03-14 09:30:00    1.357221
Freq: D, dtype: float64

In [34]:
# Time zone of the index: None here, because the index was built without one.
# print() is used since a bare None would not be displayed as a cell result.
print(ts.index.tz)

None


In [None]:
# A time zone can be set explicitly at creation time through the tz argument.
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')

In [None]:
# Redisplay the time-zone-naive series before localizing it in the next cell.
ts

In [35]:
# tz_localize attaches a zone to naive timestamps: the wall-clock values stay
# the same and gain a +00:00 offset (see the output).
ts_utc = ts.tz_localize('UTC')
ts_utc

2012-03-09 09:30:00+00:00   -1.677765
2012-03-10 09:30:00+00:00   -0.034771
2012-03-11 09:30:00+00:00   -1.943194
2012-03-12 09:30:00+00:00    0.405850
2012-03-13 09:30:00+00:00    0.680251
2012-03-14 09:30:00+00:00    1.357221
Freq: D, dtype: float64

In [36]:
# The index dtype now records the zone: datetime64[ns, UTC].
ts_utc.index

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [37]:
# tz_convert re-expresses the zone-aware timestamps in another zone. In the output the
# New York offset changes from -05:00 to -04:00 on 2012-03-11 (daylight-saving switch).
ts_utc.tz_convert('America/New_York')

2012-03-09 04:30:00-05:00   -1.677765
2012-03-10 04:30:00-05:00   -0.034771
2012-03-11 05:30:00-04:00   -1.943194
2012-03-12 05:30:00-04:00    0.405850
2012-03-13 05:30:00-04:00    0.680251
2012-03-14 05:30:00-04:00    1.357221
Freq: D, dtype: float64

In [38]:
# Attach New York time to the naive timestamps, then express the result in Berlin time.
ts_eastern = ts.tz_localize('America/New_York')
# Other views to inspect: ts_eastern itself, or ts_eastern.tz_convert('UTC')
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00   -1.677765
2012-03-10 15:30:00+01:00   -0.034771
2012-03-11 14:30:00+01:00   -1.943194
2012-03-12 14:30:00+01:00    0.405850
2012-03-13 14:30:00+01:00    0.680251
2012-03-14 14:30:00+01:00    1.357221
dtype: float64

## Resampling and Frequency Conversion

In [39]:
# resample() behaves like a group-by in which the groups are time intervals.
# The input for the examples is 100 consecutive daily observations.

rng = pd.date_range(start='2000-01-01', periods=100, freq='D')
ts = pd.Series(data=np.random.randn(rng.size), index=rng)
ts

2000-01-01   -1.386823
2000-01-02   -0.181085
2000-01-03   -0.658086
2000-01-04    0.476751
2000-01-05   -0.160366
                ...   
2000-04-05    0.424914
2000-04-06    0.221165
2000-04-07    2.109351
2000-04-08    0.312983
2000-04-09   -1.100591
Freq: D, Length: 100, dtype: float64

In [40]:
# Average the daily values within each calendar month; every result row is labelled
# with the month-end date. The MonthEnd offset object replaces the 'M' string alias,
# which pandas 2.2 deprecated (renamed 'ME'); the offset works on old and new releases.
ts.resample(pd.offsets.MonthEnd()).mean()


2000-01-31    0.106451
2000-02-29   -0.053441
2000-03-31    0.107898
2000-04-30    0.010690
Freq: M, dtype: float64

In [41]:
# Same monthly means, labelled by month periods (2000-01, 2000-02, ...) instead of
# month-end timestamps - a bit better, since each month is treated as a "period" of time.
# The conversion is done explicitly with to_period('M') because resample's `kind`
# argument is deprecated as of pandas 2.2; the MonthEnd offset likewise stands in for
# the deprecated 'M' resampling alias.
ts.resample(pd.offsets.MonthEnd()).mean().to_period('M')

2000-01    0.106451
2000-02   -0.053441
2000-03    0.107898
2000-04    0.010690
Freq: M, dtype: float64

### Downsampling

In [42]:
# Twelve one-minute observations (values 0..11) used as input for the downsampling examples.
# The frequency is spelled 'min', matching the '5min' rule used by the resample calls
# that follow; the one-letter alias 'T' is deprecated in pandas 2.2 and later.
rng = pd.date_range('2000-01-01', periods=12, freq='min')
ts = pd.Series(np.arange(12), index=rng)
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int64

In [43]:
# Downsample to 5-minute bins and sum each bin. closed='left' means a bin includes its
# left edge and excludes its right one, so the 00:00 bin holds minutes 0-4 (sum 10).
ts.resample('5min', closed='left').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int64

In [44]:
# closed='right' makes each bin include its right edge instead: the 00:00 observation falls
# into the bin ending at 00:00, which is labelled by its left edge (1999-12-31 23:55).
ts.resample('5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int64

In [45]:
# Same right-closed bins, but label='right' names each bin by its right edge,
# so the sums are unchanged while the labels move forward by five minutes.
ts.resample('5min', closed='right', label='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int64

### Upsampling and Interpolation

In [48]:
# Two weekly (Wednesday) rows of random values: the starting point for upsampling.
frame = pd.DataFrame(
    data=np.random.randn(2, 4),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
    index=pd.date_range(start='1/1/2000', periods=2, freq='W-WED'),
)
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.146453,-0.580437,2.010696,-0.670604
2000-01-12,-0.978622,-0.047741,-1.408045,-0.470403


In [47]:
# Upsample the weekly frame to daily frequency. asfreq() inserts the new days
# without filling them, so the added rows contain missing values.
df_daily = frame.resample('D').asfreq()
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.146453,-0.580437,2.010696,-0.670604
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.978622,-0.047741,-1.408045,-0.470403


In [49]:
# Forward-fill while upsampling: each new day repeats the most recent weekly row.
frame.resample('D').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.146453,-0.580437,2.010696,-0.670604
2000-01-06,1.146453,-0.580437,2.010696,-0.670604
2000-01-07,1.146453,-0.580437,2.010696,-0.670604
2000-01-08,1.146453,-0.580437,2.010696,-0.670604
2000-01-09,1.146453,-0.580437,2.010696,-0.670604
2000-01-10,1.146453,-0.580437,2.010696,-0.670604
2000-01-11,1.146453,-0.580437,2.010696,-0.670604
2000-01-12,-0.978622,-0.047741,-1.408045,-0.470403


In [50]:
# limit=2 carries an observation forward for at most two new days; later days stay missing.
frame.resample('D').ffill(limit=2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.146453,-0.580437,2.010696,-0.670604
2000-01-06,1.146453,-0.580437,2.010696,-0.670604
2000-01-07,1.146453,-0.580437,2.010696,-0.670604
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-0.978622,-0.047741,-1.408045,-0.470403


## Moving Window Functions

In [None]:
# Read stock_px_2.csv, using the first column as the index and parsing it as dates.
close_px_all = pd.read_csv(
    'stock_px_2.csv',
    index_col=0,
    parse_dates=True,
)
close_px_all

In [None]:
# Keep only the AAPL, MSFT and XOM columns for the examples that follow.
close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]
close_px


In [None]:
# Resample to business-day frequency ('B'), forward-filling any business days that
# have no row of their own. Note: this rebinds `close_px` to the resampled frame.
close_px = close_px.resample('B').ffill()
close_px

In [None]:
# Plot the AAPL column, then its rolling mean. rolling(...) groups the data over a
# sliding window of 100 observations (business days after the resample above).
close_px['AAPL'].plot()
close_px['AAPL'].rolling(window=100).mean().plot()

In [None]:
# Rolling standard deviation of AAPL over a 250-observation window. min_periods=10 lets
# the statistic be reported once 10 observations are available instead of waiting for
# a full window (compare with close_px.AAPL.rolling(250).std()).
appl_std250 = close_px['AAPL'].rolling(window=250, min_periods=10).std()
appl_std250.plot()

In [None]:
# Apply a 60-observation rolling mean to every column at once and plot the result
# with a logarithmic y axis.
close_px.rolling(60).mean().plot(logy=True)