# Chapter 10: Time Series Analysis
## Recipes

* [Understanding the difference between Python and pandas date tools](#Understanding-the-difference-between-Python-and-pandas-date-tools)
* [Slicing time series intelligently](#Slicing-time-series-intelligently)
* [Using methods that only work with a DatetimeIndex](#Using-methods-that-only-work-with-a-DatetimeIndex)
* [Counting the number of weekly crimes](#Counting-the-number-of-weekly-crimes)
* [Aggregating weekly crime and traffic accidents separately](#Aggregating-weekly-crime-and-traffic-separately)
* [Measuring crime by weekday and year](#Measuring-crime-by-weekday-and-year)
* [Grouping with anonymous functions with a DatetimeIndex](#Grouping-with-anonymous-functions-with-a-DatetimeIndex)
* [Grouping by a Timestamp and another column](#Grouping-by-a-Timestamp-and-another-column)
* [Finding the last time crime was 20% lower with merge_asof](#Finding-the-last-time-crime-was-20%-lower-with-merge_asof)

In [None]:
import pandas as pd
import numpy as np
import datetime

%matplotlib inline

# Understanding the difference between Python and pandas date tools

In [None]:
# Build the three stdlib temporal types for the same moment: a calendar
# date, a wall-clock time, and a combined datetime.
date = datetime.date(year=2013, month=6, day=7)
time = datetime.time(hour=12, minute=30, second=19, microsecond=463198)
dt = datetime.datetime(year=2013, month=6, day=7,
                       hour=12, minute=30, second=19, microsecond=463198)

# print() already separates its arguments with a space, so the labels carry
# no trailing space of their own.
print("date is", date)
print("time is", time)
print("datetime is", dt)

In [None]:
# Collect the duration components by name, then hand them to timedelta in
# one call.
duration_parts = dict(weeks=2, days=5, hours=10, minutes=20,
                      seconds=6.73, milliseconds=99, microseconds=8)
td = datetime.timedelta(**duration_parts)
print(td)

In [None]:
print('new date is', date + td)
print('new datetime is', dt + td)

In [None]:
time + td

In [None]:
pd.Timestamp(year=2012, month=12, day=21, hour=5, minute=10, second=8, microsecond=99)

In [None]:
pd.Timestamp('2016/1/10')

In [None]:
pd.Timestamp('2014-5/10')

In [None]:
pd.Timestamp('Jan 3, 2019 20:45.56')

In [None]:
pd.Timestamp('2016-01-05T05:34:43.123456789')

In [None]:
pd.Timestamp(500)

In [None]:
pd.Timestamp(5000, unit='D')

In [None]:
pd.to_datetime('2015-5-13')

In [None]:
pd.to_datetime('2015-13-5', dayfirst=True)

In [None]:
pd.Timestamp('Saturday September 30th, 2017')

In [None]:
pd.to_datetime('Start Date: Sep 30, 2017 Start Time: 1:30 pm', format='Start Date: %b %d, %Y Start Time: %I:%M %p')

In [None]:
pd.to_datetime(100, unit='D', origin='2013-1-1')

In [None]:
s = pd.Series([10, 100, 1000, 10000])
pd.to_datetime(s, unit='D')

In [None]:
s = pd.Series(['12-5-2015', '14-1-2013', '20/12/2017', '40/23/2017'])
pd.to_datetime(s, dayfirst=True, errors='coerce')

In [None]:
pd.to_datetime(['Aug 3 1999 3:45:56', '10/31/2017'])

In [None]:
pd.Timedelta('12 days 5 hours 3 minutes 123456789 nanoseconds')

In [None]:
pd.Timedelta(days=5, minutes=7.34)

In [None]:
pd.Timedelta(100, unit='W')

In [None]:
pd.to_timedelta('5 dayz', errors='ignore')

In [None]:
pd.to_timedelta('67:15:45.454')

In [None]:
s = pd.Series([10, 100])
pd.to_timedelta(s, unit='s')

In [None]:
time_strings = ['2 days 24 minutes 89.67 seconds', '00:45:23.6']
pd.to_timedelta(time_strings)

In [None]:
pd.Timedelta('12 days 5 hours 3 minutes') * 2

In [None]:
pd.Timestamp('1/1/2017') + pd.Timedelta('12 days 5 hours 3 minutes') * 2

In [None]:
td1 = pd.to_timedelta([10, 100], unit='s')
td2 = pd.to_timedelta(['3 hours', '4 hours'])
td1 + td2

In [None]:
pd.Timedelta('12 days') / pd.Timedelta('3 days')

In [None]:
ts = pd.Timestamp('2016-10-1 4:23:23.9')

In [None]:
ts.ceil('h')

In [None]:
ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second

In [None]:
ts.dayofweek, ts.dayofyear, ts.daysinmonth

In [None]:
ts.to_pydatetime()

In [None]:
td = pd.Timedelta(125.8723, unit='h')
td

In [None]:
td.round('min')

In [None]:
td.components

In [None]:
td.total_seconds()

## There's more...

In [None]:
date_string_list = ['Sep 30 1984'] * 10000

In [None]:
%timeit pd.to_datetime(date_string_list, format='%b %d %Y')

In [None]:
%timeit pd.to_datetime(date_string_list)

# Slicing time series intelligently

In [None]:
crime = pd.read_hdf('data/crime.h5', 'crime')
crime.dtypes

In [None]:
crime = crime.set_index('REPORTED_DATE')
crime.head()

In [None]:
pd.options.display.max_rows = 4

In [None]:
crime.loc['2016-05-12 16:45:00']

In [None]:
crime.loc['2016-05-12']

In [None]:
crime.loc['2016-05'].shape

In [None]:
crime.loc['2016'].shape

In [None]:
crime.loc['2016-05-12 03'].shape

In [None]:
crime.loc['Dec 2015'].sort_index()

In [None]:
crime.loc['2016 Sep, 15'].shape

In [None]:
crime.loc['21st October 2014 05'].shape

In [None]:
crime.loc['2015-3-4':'2016-1-1'].sort_index()

In [None]:
crime.loc['2015-3-4 22':'2016-1-1 23:45:00'].sort_index()

## How it works...

In [None]:
mem_cat = crime.memory_usage().sum()
mem_obj = crime.astype({'OFFENSE_TYPE_ID':'object', 
                        'OFFENSE_CATEGORY_ID':'object', 
                        'NEIGHBORHOOD_ID':'object'}).memory_usage(deep=True)\
                                                    .sum()
mb = 2 ** 20
round(mem_cat / mb, 1), round(mem_obj / mb, 1)

In [None]:
crime.index[:2]

## There's more...

In [None]:
%timeit crime.loc['2015-3-4':'2016-1-1']

In [None]:
crime_sort = crime.sort_index()

In [None]:
%timeit crime_sort.loc['2015-3-4':'2016-1-1']

In [None]:
pd.options.display.max_rows = 60

# Using methods that only work with a DatetimeIndex

In [None]:
crime = pd.read_hdf('data/crime.h5', 'crime').set_index('REPORTED_DATE')
print(type(crime.index))

In [None]:
# Keep rows whose time of day falls in the half-open window [02:00, 05:00).
# `inclusive='left'` replaces the removed `include_end=False` keyword.
crime.between_time('2:00', '5:00', inclusive='left').head()

In [None]:
crime.at_time('5:47').head()

In [None]:
crime_sort = crime.sort_index()

In [None]:
pd.options.display.max_rows = 6

In [None]:
crime_sort.first(pd.offsets.MonthBegin(6))

In [None]:
crime_sort.first(pd.offsets.MonthEnd(6))

In [None]:
crime_sort.first(pd.offsets.MonthBegin(6, normalize=True))

In [None]:
crime_sort.loc[:'2012-06']

In [None]:
crime_sort.first('5D')

In [None]:
crime_sort.first('5B')

In [None]:
crime_sort.first('7W')

In [None]:
crime_sort.first('3QS')

## How it works...

In [None]:
import datetime
# Half-open window [02:00, 05:00) given as datetime.time objects.
# `inclusive='left'` replaces the removed `include_end=False` keyword.
crime.between_time(datetime.time(2, 0), datetime.time(5, 0), inclusive='left')

In [None]:
first_date = crime_sort.index[0]
first_date

In [None]:
first_date + pd.offsets.MonthBegin(6)

In [None]:
first_date + pd.offsets.MonthEnd(6)

## There's more...

In [None]:
dt = pd.Timestamp('2012-1-16 13:40')
dt + pd.DateOffset(months=1)

In [None]:
do = pd.DateOffset(years=2, months=5, days=3, hours=8, seconds=10)
pd.Timestamp('2012-1-22 03:22') + do

In [None]:
pd.options.display.max_rows=60

# Counting the number of weekly crimes

In [None]:
crime_sort = pd.read_hdf('data/crime.h5', 'crime') \
               .set_index('REPORTED_DATE') \
               .sort_index()

In [None]:
crime_sort.resample('W')

In [None]:
weekly_crimes = crime_sort.resample('W').size()
weekly_crimes.head()

In [None]:
len(crime_sort.loc[:'2012-1-8'])

In [None]:
len(crime_sort.loc['2012-1-9':'2012-1-15'])

In [None]:
crime_sort.resample('W-THU').size().head()

In [None]:
weekly_crimes_gby = crime_sort.groupby(pd.Grouper(freq='W')).size()
weekly_crimes_gby.head()

In [None]:
weekly_crimes.equals(weekly_crimes_gby)

## How it works...

In [None]:
r = crime_sort.resample('W')
resample_methods = [attr for attr in dir(r) if attr[0].islower()]
print(resample_methods)

## There's more...

In [None]:
crime = pd.read_hdf('data/crime.h5', 'crime')
weekly_crimes2 = crime.resample('W', on='REPORTED_DATE').size()
weekly_crimes2.equals(weekly_crimes)

In [None]:
weekly_crimes_gby2 = crime.groupby(pd.Grouper(key='REPORTED_DATE', freq='W')).size()
weekly_crimes_gby2.equals(weekly_crimes_gby)

In [None]:
weekly_crimes.plot(figsize=(16,4), title='All Denver Crimes')

# Aggregating weekly crime and traffic separately

In [None]:
crime_sort = pd.read_hdf('data/crime.h5', 'crime') \
               .set_index('REPORTED_DATE') \
               .sort_index()

In [None]:
# Sum both indicator columns per calendar quarter. The columns are selected
# with a list: tuple-style multi-key selection on a resampler is no longer
# accepted by pandas.
crime_quarterly = crime_sort.resample('Q')[['IS_CRIME', 'IS_TRAFFIC']].sum()
crime_quarterly.head()

In [None]:
# Same quarterly sums, labelled by quarter start ('QS'); columns are
# selected with a list rather than a bare tuple of keys.
crime_sort.resample('QS')[['IS_CRIME', 'IS_TRAFFIC']].sum().head()

In [None]:
crime_sort.loc['2012-4-1':'2012-6-30', ['IS_CRIME', 'IS_TRAFFIC']].sum()

In [None]:
# Reproduce the quarterly sums through groupby + Grouper and compare with
# the resample result. Columns are selected with a list, since tuple-style
# multi-key selection on a groupby is no longer accepted by pandas.
crime_quarterly2 = crime_sort.groupby(pd.Grouper(freq='Q'))[['IS_CRIME', 'IS_TRAFFIC']].sum()
crime_quarterly2.equals(crime_quarterly)

In [None]:
plot_kwargs = dict(figsize=(16,4), 
                   color=['black', 'lightgrey'], 
                   title='Denver Crimes and Traffic Accidents')
crime_quarterly.plot(**plot_kwargs)

## How it works...

In [None]:
crime_sort.resample('Q').sum().head()

In [None]:
# Quarterly sums using the 'QS-MAR' offset alias; columns are selected with
# a list rather than a bare tuple of keys.
crime_sort.resample('QS-MAR')[['IS_CRIME', 'IS_TRAFFIC']].sum().head()

## There's more...

In [None]:
crime_begin = crime_quarterly.iloc[0]
crime_begin

In [None]:
crime_quarterly.div(crime_begin) \
               .sub(1) \
               .round(2) \
               .plot(**plot_kwargs)

# Measuring crime by weekday and year

In [None]:
crime = pd.read_hdf('data/crime.h5', 'crime')
crime.head()

In [None]:
# Count reports per weekday name. `.dt.day_name()` replaces the removed
# `.dt.weekday_name` attribute.
wd_counts = crime['REPORTED_DATE'].dt.day_name().value_counts()
wd_counts

In [None]:
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 
        'Friday', 'Saturday', 'Sunday']
title = 'Denver Crimes and Traffic Accidents per Weekday'
wd_counts.reindex(days).plot(kind='barh', title=title)

In [None]:
title = 'Denver Crimes and Traffic Accidents per Year' 
crime['REPORTED_DATE'].dt.year.value_counts() \
                              .sort_index() \
                              .plot(kind='barh', title=title)

In [None]:
# Derive the two grouping keys from the report timestamp. `.dt.day_name()`
# replaces the removed `.dt.weekday_name` attribute.
weekday = crime['REPORTED_DATE'].dt.day_name()
year = crime['REPORTED_DATE'].dt.year

# Number of rows for every (year, weekday name) pair.
crime_wd_y = crime.groupby([year, weekday]).size()
crime_wd_y.head(10)

In [None]:
crime_table = crime_wd_y.rename_axis(['Year', 'Weekday']).unstack('Weekday')
crime_table

In [None]:
criteria = crime['REPORTED_DATE'].dt.year == 2017
crime.loc[criteria, 'REPORTED_DATE'].dt.dayofyear.max()

In [None]:
round(272 / 365, 3)

In [None]:
crime_pct = crime['REPORTED_DATE'].dt.dayofyear.le(272) \
                                  .groupby(year) \
                                  .mean() \
                                  .round(3)
crime_pct

In [None]:
crime_pct.loc[2012:2016].median()

In [None]:
crime_table.loc[2017] = crime_table.loc[2017].div(.748).astype('int')
crime_table = crime_table.reindex(columns=days)
crime_table

In [None]:
import seaborn as sns
sns.heatmap(crime_table, cmap='Greys')

In [None]:
denver_pop = pd.read_csv('data/denver_pop.csv', index_col='Year')
denver_pop

In [None]:
den_100k = denver_pop.div(100000).squeeze()
crime_table2 = crime_table.div(den_100k, axis='index').astype('int')
crime_table2

In [None]:
sns.heatmap(crime_table2, cmap='Greys')

## How it works...

In [None]:
wd_counts.loc[days]

In [None]:
crime_table / den_100k

## There's more...

In [None]:
ADJ_2017 = .748

def count_crime(df, offense_cat):
    """Tabulate and plot per-capita counts of one offense category.

    Rows of ``df`` whose ``OFFENSE_CATEGORY_ID`` equals ``offense_cat`` are
    counted by year and weekday name of ``REPORTED_DATE``. The 2017 row (when
    present) is divided by ``ADJ_2017``, every row is divided by that year's
    population in units of 100,000 (read from ``data/denver_pop.csv``), and
    the resulting table is drawn as a seaborn heatmap.

    Parameters
    ----------
    df : DataFrame
        Must contain an ``OFFENSE_CATEGORY_ID`` column and a datetime-like
        ``REPORTED_DATE`` column.
    offense_cat
        Value compared against ``OFFENSE_CATEGORY_ID`` (e.g. ``'auto-theft'``).

    Returns
    -------
    DataFrame
        Integer table indexed by year, with one column per entry of the
        module-level ``days`` list.
    """
    subset = df[df['OFFENSE_CATEGORY_ID'] == offense_cat]
    # day_name() replaces the removed `.dt.weekday_name` attribute.
    weekday = subset['REPORTED_DATE'].dt.day_name()
    year = subset['REPORTED_DATE'].dt.year

    # fill_value=0 keeps the table integer-valued when a year/weekday pair
    # has no incidents; a NaN there would make the astype('int') calls raise.
    ct = subset.groupby([year, weekday]).size().unstack(fill_value=0)

    # Scale the 2017 counts by the module-level adjustment factor; skip the
    # step when the selected category has no 2017 rows at all.
    if 2017 in ct.index:
        ct.loc[2017] = ct.loc[2017].div(ADJ_2017).astype('int')

    pop = pd.read_csv('data/denver_pop.csv', index_col='Year')
    pop = pop.squeeze().div(100000)

    # NOTE(review): assumes the population file covers exactly the years in
    # `ct`; a mismatch yields NaN rows and astype('int') raises -- confirm.
    ct = ct.div(pop, axis=0).astype('int')
    ct = ct.reindex(columns=days)
    sns.heatmap(ct, cmap='Greys')
    return ct

In [None]:
count_crime(crime, 'auto-theft')

# Grouping with anonymous functions with a DatetimeIndex

In [None]:
crime_sort = pd.read_hdf('data/crime.h5', 'crime') \
               .set_index('REPORTED_DATE') \
               .sort_index()

In [None]:
common_attrs = set(dir(crime_sort.index)) & set(dir(pd.Timestamp))
print([attr for attr in common_attrs if attr[0] != '_'])

In [None]:
# Count rows per weekday name straight from the DatetimeIndex; `day_name()`
# replaces the removed `weekday_name` attribute.
crime_sort.index.day_name().value_counts()

In [None]:
# The lambda receives each index Timestamp; `day_name()` replaces the removed
# `weekday_name` attribute, and the columns are selected with a list rather
# than a bare tuple of keys.
crime_sort.groupby(lambda x: x.day_name())[['IS_CRIME', 'IS_TRAFFIC']].sum()

In [None]:
# Group each index Timestamp by its hour after rounding to a 2-hour
# resolution, and by its year.
funcs = [lambda x: x.round('2h').hour, lambda x: x.year]
# Columns are selected with a list; tuple-style multi-key selection on a
# groupby is no longer accepted by pandas.
cr_group = crime_sort.groupby(funcs)[['IS_CRIME', 'IS_TRAFFIC']].sum()
cr_final = cr_group.unstack()
cr_final.style.highlight_max(color='lightgrey')

## There's more...

In [None]:
cr_final.xs('IS_TRAFFIC', axis='columns', level=0).head()

In [None]:
cr_final.xs(2016, axis='columns', level=1).head()

# Grouping by a Timestamp and another column

In [None]:
employee = pd.read_csv('data/employee.csv', 
                       parse_dates=['JOB_DATE', 'HIRE_DATE'], 
                       index_col='HIRE_DATE')
employee.head()

In [None]:
employee.groupby('GENDER')['BASE_SALARY'].mean().round(-2)

In [None]:
employee.resample('10AS')['BASE_SALARY'].mean().round(-2)

In [None]:
sal_avg = employee.groupby('GENDER').resample('10AS')['BASE_SALARY'].mean().round(-2)
sal_avg

In [None]:
sal_avg.unstack('GENDER')

In [None]:
employee[employee['GENDER'] == 'Male'].index.min()

In [None]:
employee[employee['GENDER'] == 'Female'].index.min()

In [None]:
sal_avg2 = employee.groupby(['GENDER', pd.Grouper(freq='10AS')])['BASE_SALARY'].mean().round(-2)
sal_avg2

In [None]:
sal_final = sal_avg2.unstack('GENDER')
sal_final

## How it works...

In [None]:
'resample' in dir(employee.groupby('GENDER'))

In [None]:
'groupby' in dir(employee.resample('10AS'))

## There's more...

In [None]:
years = sal_final.index.year
years_right = years + 9
sal_final.index = years.astype(str) + '-' + years_right.astype(str)
sal_final

In [None]:
cuts = pd.cut(employee.index.year, bins=5, precision=0)
cuts.categories.values

In [None]:
employee.groupby([cuts, 'GENDER'])['BASE_SALARY'].mean().unstack('GENDER').round(-2)

# Finding the last time crime was 20% lower with merge_asof

In [None]:
crime_sort = pd.read_hdf('data/crime.h5', 'crime') \
               .set_index('REPORTED_DATE') \
               .sort_index()

In [None]:
crime_sort.index.max()

In [None]:
# Keep everything up to and including August 2017. Use .loc for the
# label-based slice, matching the other date slices in this notebook.
crime_sort = crime_sort.loc[:'2017-8']
crime_sort.index.max()

In [None]:
all_data = crime_sort.groupby([pd.Grouper(freq='M'), 'OFFENSE_CATEGORY_ID']).size()
all_data.head()

In [None]:
all_data = all_data.sort_values().reset_index(name='Total')
all_data.head()

In [None]:
goal = all_data[all_data['REPORTED_DATE'] == '2017-8-31'].reset_index(drop=True)
goal['Total_Goal'] = goal['Total'].mul(.8).astype(int)
goal.head()

In [None]:
pd.merge_asof(goal, all_data, left_on='Total_Goal', right_on='Total', 
              by='OFFENSE_CATEGORY_ID', suffixes=('_Current', '_Last'))

## There's more...

In [None]:
pd.Period(year=2012, month=5, day=17, hour=14, minute=20, freq='T')

In [None]:
crime_sort.index.to_period('M')

In [None]:
ad_period = crime_sort.groupby([lambda x: x.to_period('M'), 
                                'OFFENSE_CATEGORY_ID']).size()
ad_period = ad_period.sort_values() \
                     .reset_index(name='Total') \
                     .rename(columns={'level_0':'REPORTED_DATE'})
ad_period.head()

In [None]:
cols = ['OFFENSE_CATEGORY_ID', 'Total']
all_data[cols].equals(ad_period[cols])

In [None]:
# The target month is August 2017, so the Period is named accordingly.
aug_2017 = pd.Period('2017-8', freq='M')
goal_period = ad_period[ad_period['REPORTED_DATE'] == aug_2017].reset_index(drop=True)
# Goal is 80% of the current monthly total, i.e. a 20% reduction.
goal_period['Total_Goal'] = goal_period['Total'].mul(.8).astype(int)

# For each offense category, find the row of ad_period whose Total is the
# closest value not exceeding Total_Goal.
pd.merge_asof(goal_period, ad_period, left_on='Total_Goal', right_on='Total',
              by='OFFENSE_CATEGORY_ID', suffixes=('_Current', '_Last')).head()