# Section 10: Working with Dates and Times in Datasets

### Intro to the Working with Dates and Times Module

In [4]:
#import pandas library
import pandas as pd
#import vanilla python datetime module
import datetime as dt

### Review of Python's datetime module

In [5]:
#vanilla python datetime example
#datetime comes with the Python Standard Library so we don't have to install it.

In [7]:
#the date object (includes year, month, and date)
dt.date(2016, 4, 12)
#creates a date object

datetime.date(2016, 4, 12)

In [9]:
# Store a date object in the `someday` variable so the next cells can
# inspect its attributes (year, month, day).
someday = dt.date(year=2010, month=1, day=20)

In [12]:
someday.year, type(someday.year)

(2010, int)

In [13]:
someday.month, type(someday.month)

(1, int)

In [14]:
someday.day, type(someday.day)

(20, int)

In [22]:
#can pass the date object into the string method to make it more readable
str(someday)

'2010-01-20'

In [15]:
#the datetime object (includes year, month, date, and time)
dt.datetime(2010,1,20)
#only year, month, and day are required.
#if no time parameter is given it will default to midnight.

datetime.datetime(2010, 1, 20, 0, 0)

In [17]:
dt.datetime(2010,1,20,8)
#8am on jan 20, 2010

datetime.datetime(2010, 1, 20, 8, 0)

In [19]:
dt.datetime(2010,1,20,8,13,57)
#8:13:57 on jan 20,2010

datetime.datetime(2010, 1, 20, 8, 13, 57)

In [20]:
#works in military time
dt.datetime(2010,1,20,17,13,57)
#5:13:57 on jan 20,2010

datetime.datetime(2010, 1, 20, 17, 13, 57)

In [21]:
#if you want it in a readable format you can pass it to the string method
str(dt.datetime(2010,1,20,17,13,57))

'2010-01-20 17:13:57'

In [31]:
#the same attributes exist on the datetime object, along with the time-of-day attributes.
#a cell only echoes the value of its last expression, so the attributes are gathered into a
#single tuple (year, month, day, hour, minute, second, microsecond) to display all of them at once.
sometime = dt.datetime(2010,1,20,17,13,57)
sometime_parts = (
    sometime.year,
    sometime.month,
    sometime.day,
    sometime.hour,
    sometime.minute,
    sometime.second,
    sometime.microsecond,  #defaults to 0 because no microsecond was passed in
)
sometime_parts

(2010, 1, 20, 17, 13, 57, 0)

In [None]:
#the attributes are useful if you want to extract all the values in a dataframe that occurred during a specific month.
#you need to be able to pull out the month from the date/datetime object

### The Pandas Timestamp Object

In [33]:
#pandas version of python's datetime object
#will also default to midnight if no hour/minute/second is given

In [34]:
#the pd.Timestamp object can accept a wide variety of inputs
#(strings, datetime objects, etc)
pd.Timestamp('2015-03-31')

Timestamp('2015-03-31 00:00:00')

In [35]:
pd.Timestamp('2015/03/31')

Timestamp('2015-03-31 00:00:00')

In [36]:
#NOTE(review): a comma-separated date string is not parsed reliably - the output recorded for this cell
#is 2013-04-01, not November 4, 2013 (presumably the intended date). Prefer an unambiguous
#form such as '2013-11-04'.
pd.Timestamp('2013,11,04')

Timestamp('2013-04-01 00:00:00')

In [37]:
pd.Timestamp('1/1/2015')

Timestamp('2015-01-01 00:00:00')

In [38]:
#day/month/year
pd.Timestamp('19/12/2015') 
#pandas will read this in correctly since there is no month of 19

Timestamp('2015-12-19 00:00:00')

In [39]:
pd.Timestamp('12/19/2015')

Timestamp('2015-12-19 00:00:00')

In [40]:
#march 4, 2000
pd.Timestamp('4/3/2000')
#pandas actually reads this as April 3rd because it defaults to the month first

Timestamp('2000-04-03 00:00:00')

In [42]:
pd.Timestamp('2021-03-08 08:35:15')
pd.Timestamp('2021-03-08 06:35:15 PM')

Timestamp('2021-03-08 18:35:15')

In [43]:
#can also feed direct python datetime objects
pd.Timestamp(dt.date(2015,1,1))

Timestamp('2015-01-01 00:00:00')

In [44]:
pd.Timestamp(dt.datetime(2000,2,3, 21,35,22))

Timestamp('2000-02-03 21:35:22')

In [45]:
#why the need for a pandas timestamp object when we have a python datetime object??
# a lot more functionality with the timestamp object 
# timestamp object just works much better

### The Pandas DateTimeIndex Object

In [49]:
# A DatetimeIndex is basically just a collection of pandas Timestamps.
# Every date string in the list is turned into a Timestamp, and those
# Timestamps are stored together as the DatetimeIndex object.
dates = [
    '2016/01/02',
    '2016/04/12',
    '2009/09/07',
]
pd.DatetimeIndex(dates)

DatetimeIndex(['2016-01-02', '2016-04-12', '2009-09-07'], dtype='datetime64[ns]', freq=None)

In [52]:
# Python's datetime objects can be passed in as well: each one is converted
# to a Timestamp and then saved in the DatetimeIndex object.
dates = [
    dt.datetime(2006, 1, 10),
    dt.datetime(1994, 6, 13),
    dt.datetime(2003, 12, 29),
]
pd.DatetimeIndex(dates)

DatetimeIndex(['2006-01-10', '1994-06-13', '2003-12-29'], dtype='datetime64[ns]', freq=None)

In [54]:
# The main idea of the DatetimeIndex object is to serve as the index of one
# of the other pandas objects (a Series or a DataFrame).
# Here a Series is created with the DatetimeIndex as its index: the
# Timestamps become the index labels and the numbers become the values.
values = [100, 200, 300]
dates = [
    dt.datetime(2006, 1, 10),
    dt.datetime(1994, 6, 13),
    dt.datetime(2003, 12, 29),
]
dtIndex = pd.DatetimeIndex(dates)
pd.Series(values, index=dtIndex)

2006-01-10    100
1994-06-13    200
2003-12-29    300
dtype: int64

In [55]:
#the DateTimeIndex is really just a container for storing a collection of pandas Timestamp objects

### The pd.to_datetime() method

In [56]:
#called directly on the pandas library.
#basically a convenience method to convert existing object into a pandas time related object
#can accept a string, or a list of strings, or python date object or python datetime object.
#and will turn it into a pandas timestamp object or a datetimeindex object

In [60]:
pd.to_datetime('2001-03-19')
pd.to_datetime(dt.date(2015,1,1))
pd.to_datetime(dt.datetime(2015,1,1,14,35,20))
pd.to_datetime(['2015-01-03','2014/02/08','2016','July 4th, 1996'])
#can accept a wide variety of input options.

DatetimeIndex(['2015-01-03', '2014-02-08', '2016-01-01', '1996-07-04'], dtype='datetime64[ns]', freq=None)

In [62]:
#most common thing to do with the to_datetime() method is to convert an already existing series into timestamps
#that can be used for extracting data within given times, etc
#here we'll create a series from this list of dates.
times = pd.Series(['2015-01-03','2014/02/08','2016','July 4th, 1996'])
#initially pandas reads them in as strings so there will be no advanced datetime functionality on it.
#therefore we'll need to convert those strings to timestamps using the .to_datetime() method.
pd.to_datetime(times)
#the new series has datetime64 as the dtype of its values.

0   2015-01-03
1   2014-02-08
2   2016-01-01
3   1996-07-04
dtype: datetime64[ns]

In [71]:
#we'll see what happens when we pass in bad data
wacky = pd.Series(['July 4th, 1996', '10/04/1991', 'HELLO','2015-02-31'])
print(wacky)
pd.to_datetime(wacky)
#the 'HELLO' value cannot be parsed as a date, so this raises ParserError: Unknown string format: HELLO
#the nonexistent day Feb 31, 2015 is invalid as well (ValueError: day is out of range for month)

0    July 4th, 1996
1        10/04/1991
2             HELLO
3        2015-02-31
dtype: object


ParserError: Unknown string format: HELLO

In [73]:
#we can prevent error from being raised by setting the errors = parameter to 'coerce'
#this will convert the correctly formatted data and put NaT (Not a Time) in place of the incorrectly formatted data
wacky = pd.Series(['July 4th, 1996', '10/04/1991', 'HELLO','2015-02-31'])
print(wacky)
pd.to_datetime(wacky, errors = 'coerce')
#lines associated with 'HELLO' and Feb 31,2015 show up as NaT

0    July 4th, 1996
1        10/04/1991
2             HELLO
3        2015-02-31
dtype: object


0   1996-07-04
1   1991-10-04
2          NaT
3          NaT
dtype: datetime64[ns]

In [76]:
#Can also pass in unix time (number of seconds since Jan 1, 1970) but need to add the unit = 's' parameter for seconds.
pd.to_datetime([1349720105,1349806505,1349892905,1349979305,1350065705], unit = 's')

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05',
               '2012-10-10 18:15:05', '2012-10-11 18:15:05',
               '2012-10-12 18:15:05'],
              dtype='datetime64[ns]', freq=None)

### Create Range of Dates with the pd.date_range() Method, Part 1

In [78]:
#called directly on the pandas library
#needs 2 of the 3 arguments (start/end/periods)
pd.date_range(start = '2016-01-01', end = '2016-01-10', freq = 'D')
#returns a datetimeIndex object starting at jan1,2016 ending at jan10,2016 increasing by a single day each time (freq = 'D')

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10'],
              dtype='datetime64[ns]', freq='D')

In [81]:
# pd.date_range() creates a DatetimeIndex object, as the printed type shows.
# Passing in an index position (just like a list index) extracts the element
# at that position from the DatetimeIndex as a single Timestamp.
times = pd.date_range(
    start='2016-01-01',
    end='2016-01-10',
    freq='D',
)
print(type(times), times[0], sep='\n')

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
2016-01-01 00:00:00


In [82]:
pd.date_range(start = '2016-01-01', end = '2016-01-10', freq = '2D')
#freq parameter specifies the step of the range
#can set freq = '2D' to increment by 2 days

DatetimeIndex(['2016-01-01', '2016-01-03', '2016-01-05', '2016-01-07',
               '2016-01-09'],
              dtype='datetime64[ns]', freq='2D')

In [85]:
# a lot of other options for freq parameter
pd.date_range(start = '2016-01-01', end = '2016-01-10', freq = 'B') #business days
#here you can see that 2016-01-01 must be a friday since it skips jan2 and jan3 and picks back up on jan4

DatetimeIndex(['2016-01-01', '2016-01-04', '2016-01-05', '2016-01-06',
               '2016-01-07', '2016-01-08'],
              dtype='datetime64[ns]', freq='B')

In [87]:
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq = 'W')  #week starting on sunday
#the 3rd is the first sunday in the date range so its the first value returned.

DatetimeIndex(['2016-01-03', '2016-01-10'], dtype='datetime64[ns]', freq='W-SUN')

In [91]:
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq = 'W-FRI')  #weekly frequency starting on the first friday in the range
#the 1st is a friday, the 8th is a friday, and the 15th is a friday as well so they're all included in the output.

DatetimeIndex(['2016-01-01', '2016-01-08', '2016-01-15'], dtype='datetime64[ns]', freq='W-FRI')

In [92]:
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq = 'H') #hourly
pd.date_range(start = '2016-01-01', end = '2016-01-15', freq = '6H') #6 hour frquency

DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
               '2016-01-01 02:00:00', '2016-01-01 03:00:00',
               '2016-01-01 04:00:00', '2016-01-01 05:00:00',
               '2016-01-01 06:00:00', '2016-01-01 07:00:00',
               '2016-01-01 08:00:00', '2016-01-01 09:00:00',
               ...
               '2016-01-14 15:00:00', '2016-01-14 16:00:00',
               '2016-01-14 17:00:00', '2016-01-14 18:00:00',
               '2016-01-14 19:00:00', '2016-01-14 20:00:00',
               '2016-01-14 21:00:00', '2016-01-14 22:00:00',
               '2016-01-14 23:00:00', '2016-01-15 00:00:00'],
              dtype='datetime64[ns]', length=337, freq='H')

In [94]:
pd.date_range(start = '2016-01-01', end = '2016-12-31', freq = 'M')  #month end values that fall within the date range

DatetimeIndex(['2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30',
               '2016-05-31', '2016-06-30', '2016-07-31', '2016-08-31',
               '2016-09-30', '2016-10-31', '2016-11-30', '2016-12-31'],
              dtype='datetime64[ns]', freq='M')

In [95]:
pd.date_range(start = '2016-01-01', end = '2016-12-31', freq = 'MS')  #month start values that fall wtithin date range

DatetimeIndex(['2016-01-01', '2016-02-01', '2016-03-01', '2016-04-01',
               '2016-05-01', '2016-06-01', '2016-07-01', '2016-08-01',
               '2016-09-01', '2016-10-01', '2016-11-01', '2016-12-01'],
              dtype='datetime64[ns]', freq='MS')

In [96]:
pd.date_range(start = '2016-01-01', end = '2050-01-01', freq = 'A')  #year end values

DatetimeIndex(['2016-12-31', '2017-12-31', '2018-12-31', '2019-12-31',
               '2020-12-31', '2021-12-31', '2022-12-31', '2023-12-31',
               '2024-12-31', '2025-12-31', '2026-12-31', '2027-12-31',
               '2028-12-31', '2029-12-31', '2030-12-31', '2031-12-31',
               '2032-12-31', '2033-12-31', '2034-12-31', '2035-12-31',
               '2036-12-31', '2037-12-31', '2038-12-31', '2039-12-31',
               '2040-12-31', '2041-12-31', '2042-12-31', '2043-12-31',
               '2044-12-31', '2045-12-31', '2046-12-31', '2047-12-31',
               '2048-12-31', '2049-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

In [97]:
#main idea is to set a start date, an end date (inclusive), and the frequency/step.

### Create Range of Dates with the pd.date_range() Method, Part 2

In [101]:
#this section we'll talk about the start and periods parameters
#periods represents the number of results we want to get.
pd.date_range(start = '2012-09-09', periods=25, freq= 'D')
#returns a datetimeindex with 25 days starting on 2012-09-09 and going to 2012-10-03

DatetimeIndex(['2012-09-09', '2012-09-10', '2012-09-11', '2012-09-12',
               '2012-09-13', '2012-09-14', '2012-09-15', '2012-09-16',
               '2012-09-17', '2012-09-18', '2012-09-19', '2012-09-20',
               '2012-09-21', '2012-09-22', '2012-09-23', '2012-09-24',
               '2012-09-25', '2012-09-26', '2012-09-27', '2012-09-28',
               '2012-09-29', '2012-09-30', '2012-10-01', '2012-10-02',
               '2012-10-03'],
              dtype='datetime64[ns]', freq='D')

In [104]:
#a full year (365) timestamps are returned which account for an entire year 2012-09-09 to 2013-09-08
pd.date_range(start = '2012-09-09', periods=365, freq= 'D')

DatetimeIndex(['2012-09-09', '2012-09-10', '2012-09-11', '2012-09-12',
               '2012-09-13', '2012-09-14', '2012-09-15', '2012-09-16',
               '2012-09-17', '2012-09-18',
               ...
               '2013-08-30', '2013-08-31', '2013-09-01', '2013-09-02',
               '2013-09-03', '2013-09-04', '2013-09-05', '2013-09-06',
               '2013-09-07', '2013-09-08'],
              dtype='datetime64[ns]', length=365, freq='D')

In [106]:
pd.date_range(start = '2012-09-09', periods=50, freq= 'B') #50 business days from 2012-09-09 (ends nov16,2012)

DatetimeIndex(['2012-09-10', '2012-09-11', '2012-09-12', '2012-09-13',
               '2012-09-14', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-24', '2012-09-25',
               '2012-09-26', '2012-09-27', '2012-09-28', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-08', '2012-10-09', '2012-10-10', '2012-10-11',
               '2012-10-12', '2012-10-15', '2012-10-16', '2012-10-17',
               '2012-10-18', '2012-10-19', '2012-10-22', '2012-10-23',
               '2012-10-24', '2012-10-25', '2012-10-26', '2012-10-29',
               '2012-10-30', '2012-10-31', '2012-11-01', '2012-11-02',
               '2012-11-05', '2012-11-06', '2012-11-07', '2012-11-08',
               '2012-11-09', '2012-11-12', '2012-11-13', '2012-11-14',
               '2012-11-15', '2012-11-16'],
              dtype='datetime64[ns]', freq='B')

In [107]:
len(pd.date_range(start = '2012-09-09', periods=50, freq= 'B')) #50 business days from 2012-09-09 (ends nov16,2012)

50

In [112]:
pd.date_range(start = '2012-09-09', periods=50, freq= 'W') #50 weeks (starting sunday) from sept 9 2012
pd.date_range(start = '2012-09-09', periods=50, freq= 'W-TUE') #50 weeks (starting Tuesday) from sept 9 2012
pd.date_range(start = '2012-09-09', periods=50, freq= 'MS') #50 month starts from that date
pd.date_range(start = '2012-09-09', periods=50, freq= 'H') #every hour for 50 hours
pd.date_range(start = '2012-09-09', periods=50, freq= '6H')  #50 6hour time gaps from sept 9,2012 at midnight 

DatetimeIndex(['2012-09-09 00:00:00', '2012-09-09 06:00:00',
               '2012-09-09 12:00:00', '2012-09-09 18:00:00',
               '2012-09-10 00:00:00', '2012-09-10 06:00:00',
               '2012-09-10 12:00:00', '2012-09-10 18:00:00',
               '2012-09-11 00:00:00', '2012-09-11 06:00:00',
               '2012-09-11 12:00:00', '2012-09-11 18:00:00',
               '2012-09-12 00:00:00', '2012-09-12 06:00:00',
               '2012-09-12 12:00:00', '2012-09-12 18:00:00',
               '2012-09-13 00:00:00', '2012-09-13 06:00:00',
               '2012-09-13 12:00:00', '2012-09-13 18:00:00',
               '2012-09-14 00:00:00', '2012-09-14 06:00:00',
               '2012-09-14 12:00:00', '2012-09-14 18:00:00',
               '2012-09-15 00:00:00', '2012-09-15 06:00:00',
               '2012-09-15 12:00:00', '2012-09-15 18:00:00',
               '2012-09-16 00:00:00', '2012-09-16 06:00:00',
               '2012-09-16 12:00:00', '2012-09-16 18:00:00',
               '2012-09-

In [113]:
#periods param allows us to set how many timestamps will be added to the datetimeindex object.

### Create Range of Dates with the pd.date_range() Method, Part 3

In [114]:
#the last combination of params that are required
#end and periods params.
#allows us to set the end date and count backward a certain number of periods at a given frequency.
pd.date_range(end = '1999-12-31', periods = 20, freq = 'D')

DatetimeIndex(['1999-12-12', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-18', '1999-12-19',
               '1999-12-20', '1999-12-21', '1999-12-22', '1999-12-23',
               '1999-12-24', '1999-12-25', '1999-12-26', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='D')

In [117]:
pd.date_range(end = '1999-12-31', periods = 40, freq = 'D') #counts backward 40 days from the end date

DatetimeIndex(['1999-11-22', '1999-11-23', '1999-11-24', '1999-11-25',
               '1999-11-26', '1999-11-27', '1999-11-28', '1999-11-29',
               '1999-11-30', '1999-12-01', '1999-12-02', '1999-12-03',
               '1999-12-04', '1999-12-05', '1999-12-06', '1999-12-07',
               '1999-12-08', '1999-12-09', '1999-12-10', '1999-12-11',
               '1999-12-12', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-18', '1999-12-19',
               '1999-12-20', '1999-12-21', '1999-12-22', '1999-12-23',
               '1999-12-24', '1999-12-25', '1999-12-26', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='D')

In [118]:
pd.date_range(end = '1999-12-31', periods = 20, freq = 'B') #starts 20 business days before 1999-12-31 and ends on the given date

DatetimeIndex(['1999-12-06', '1999-12-07', '1999-12-08', '1999-12-09',
               '1999-12-10', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-20', '1999-12-21',
               '1999-12-22', '1999-12-23', '1999-12-24', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='B')

In [119]:
pd.date_range(end = '1999-12-31', periods = 40, freq = 'W-SUN') #see 40 sundays prior to the defined end date

DatetimeIndex(['1999-03-28', '1999-04-04', '1999-04-11', '1999-04-18',
               '1999-04-25', '1999-05-02', '1999-05-09', '1999-05-16',
               '1999-05-23', '1999-05-30', '1999-06-06', '1999-06-13',
               '1999-06-20', '1999-06-27', '1999-07-04', '1999-07-11',
               '1999-07-18', '1999-07-25', '1999-08-01', '1999-08-08',
               '1999-08-15', '1999-08-22', '1999-08-29', '1999-09-05',
               '1999-09-12', '1999-09-19', '1999-09-26', '1999-10-03',
               '1999-10-10', '1999-10-17', '1999-10-24', '1999-10-31',
               '1999-11-07', '1999-11-14', '1999-11-21', '1999-11-28',
               '1999-12-05', '1999-12-12', '1999-12-19', '1999-12-26'],
              dtype='datetime64[ns]', freq='W-SUN')

In [120]:
pd.date_range(end = '1999-12-31', periods = 40, freq = 'W-FRI') #see 40 fridays prior to the defined end date

DatetimeIndex(['1999-04-02', '1999-04-09', '1999-04-16', '1999-04-23',
               '1999-04-30', '1999-05-07', '1999-05-14', '1999-05-21',
               '1999-05-28', '1999-06-04', '1999-06-11', '1999-06-18',
               '1999-06-25', '1999-07-02', '1999-07-09', '1999-07-16',
               '1999-07-23', '1999-07-30', '1999-08-06', '1999-08-13',
               '1999-08-20', '1999-08-27', '1999-09-03', '1999-09-10',
               '1999-09-17', '1999-09-24', '1999-10-01', '1999-10-08',
               '1999-10-15', '1999-10-22', '1999-10-29', '1999-11-05',
               '1999-11-12', '1999-11-19', '1999-11-26', '1999-12-03',
               '1999-12-10', '1999-12-17', '1999-12-24', '1999-12-31'],
              dtype='datetime64[ns]', freq='W-FRI')

In [121]:
#other frequency options are available here too.
pd.date_range(end = '1999-12-31', periods = 40, freq = 'M') #see 40 month ends prior to the defined end date

DatetimeIndex(['1996-09-30', '1996-10-31', '1996-11-30', '1996-12-31',
               '1997-01-31', '1997-02-28', '1997-03-31', '1997-04-30',
               '1997-05-31', '1997-06-30', '1997-07-31', '1997-08-31',
               '1997-09-30', '1997-10-31', '1997-11-30', '1997-12-31',
               '1998-01-31', '1998-02-28', '1998-03-31', '1998-04-30',
               '1998-05-31', '1998-06-30', '1998-07-31', '1998-08-31',
               '1998-09-30', '1998-10-31', '1998-11-30', '1998-12-31',
               '1999-01-31', '1999-02-28', '1999-03-31', '1999-04-30',
               '1999-05-31', '1999-06-30', '1999-07-31', '1999-08-31',
               '1999-09-30', '1999-10-31', '1999-11-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='M')

In [123]:
pd.date_range(end = '1999-12-31', periods = 53, freq = 'MS') #see 53 month starts prior to the defined end date

DatetimeIndex(['1995-08-01', '1995-09-01', '1995-10-01', '1995-11-01',
               '1995-12-01', '1996-01-01', '1996-02-01', '1996-03-01',
               '1996-04-01', '1996-05-01', '1996-06-01', '1996-07-01',
               '1996-08-01', '1996-09-01', '1996-10-01', '1996-11-01',
               '1996-12-01', '1997-01-01', '1997-02-01', '1997-03-01',
               '1997-04-01', '1997-05-01', '1997-06-01', '1997-07-01',
               '1997-08-01', '1997-09-01', '1997-10-01', '1997-11-01',
               '1997-12-01', '1998-01-01', '1998-02-01', '1998-03-01',
               '1998-04-01', '1998-05-01', '1998-06-01', '1998-07-01',
               '1998-08-01', '1998-09-01', '1998-10-01', '1998-11-01',
               '1998-12-01', '1999-01-01', '1999-02-01', '1999-03-01',
               '1999-04-01', '1999-05-01', '1999-06-01', '1999-07-01',
               '1999-08-01', '1999-09-01', '1999-10-01', '1999-11-01',
               '1999-12-01'],
              dtype='datetime64[ns]', freq='MS'

In [125]:
pd.date_range(end = '1999-12-31', periods = 100, freq = '7H') #100 timestamps spaced 7 hours apart, ending at the defined end date

DatetimeIndex(['1999-12-02 03:00:00', '1999-12-02 10:00:00',
               '1999-12-02 17:00:00', '1999-12-03 00:00:00',
               '1999-12-03 07:00:00', '1999-12-03 14:00:00',
               '1999-12-03 21:00:00', '1999-12-04 04:00:00',
               '1999-12-04 11:00:00', '1999-12-04 18:00:00',
               '1999-12-05 01:00:00', '1999-12-05 08:00:00',
               '1999-12-05 15:00:00', '1999-12-05 22:00:00',
               '1999-12-06 05:00:00', '1999-12-06 12:00:00',
               '1999-12-06 19:00:00', '1999-12-07 02:00:00',
               '1999-12-07 09:00:00', '1999-12-07 16:00:00',
               '1999-12-07 23:00:00', '1999-12-08 06:00:00',
               '1999-12-08 13:00:00', '1999-12-08 20:00:00',
               '1999-12-09 03:00:00', '1999-12-09 10:00:00',
               '1999-12-09 17:00:00', '1999-12-10 00:00:00',
               '1999-12-10 07:00:00', '1999-12-10 14:00:00',
               '1999-12-10 21:00:00', '1999-12-11 04:00:00',
               '1999-12-

### The .dt Accessor

In [131]:
# The .dt accessor is similar to the .str accessor used when working with
# string data in pandas.
# First store a bunch of Timestamps (one every 24 days from 2000 through
# 2010) in a Series to try it on.
bunch_of_dates = pd.date_range(
    start='2000-01-01',
    end='2010-12-31',
    freq='24D',
)
s = pd.Series(bunch_of_dates)
s.iloc[:3]

0   2000-01-01
1   2000-01-25
2   2000-02-18
dtype: datetime64[ns]

In [132]:
#want to extract the day for each one of these values in a series.
#can't just call s.day. need to include the .dt accessor
s.dt.day

0       1
1      25
2      18
3      13
4       6
       ..
163    17
164    11
165     4
166    28
167    22
Length: 168, dtype: int64

In [135]:
s.dt.month

0       1
1       1
2       2
3       3
4       4
       ..
163     9
164    10
165    11
166    11
167    12
Length: 168, dtype: int64

In [156]:
#find the day name of each date
s.dt.day_name()

0       Saturday
1        Tuesday
2         Friday
3         Monday
4       Thursday
         ...    
163       Friday
164       Monday
165     Thursday
166       Sunday
167    Wednesday
Length: 168, dtype: object

In [158]:
#want to know if the dates in s fall on the start of a quarter
s.dt.is_quarter_start 
#returns a boolean series which can get passed back into s[] to extract those dates that are the start of a quarter
s[s.dt.is_quarter_start]
#here are the dates that start a quarter from the s series.

0     2000-01-01
19    2001-04-01
38    2002-07-01
137   2009-01-01
dtype: datetime64[ns]

In [164]:
s.dt.is_month_end
#can pass this boolean series into the s series 
#s[s.dt.is_month_end]

0      False
1      False
2      False
3      False
4      False
       ...  
163    False
164    False
165    False
166    False
167    False
Length: 168, dtype: bool

In [162]:
s.dt.month_name()

0        January
1        January
2       February
3          March
4          April
         ...    
163    September
164      October
165     November
166     November
167     December
Length: 168, dtype: object

### Install pandas-datareader Library

In [167]:
#conda install -c anaconda pandas-datareader
#this library reaches out to a server and fetches a bunch of data for us and converts it into a dataframe

### Import Financial Data Set with pandas_datareader Library

In [179]:
import pandas as pd
import datetime as dt
from pandas_datareader import data

In [180]:
data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
#we have a datetimeindex as the index of the df.
#also have high, low, open, close, volume, and adj close in the columns

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.100000,30.590000,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.100000,30.639999,30.850000,30.959999,49749600.0,24.234720
2010-01-06,31.080000,30.520000,30.879999,30.770000,58182400.0,24.085989
2010-01-07,30.700001,30.190001,30.629999,30.450001,50559700.0,23.835503
2010-01-08,30.879999,30.240000,30.280001,30.660000,51197400.0,23.999893
...,...,...,...,...,...,...
2020-07-09,216.380005,211.470001,216.330002,214.320007,33121700.0,214.320007
2020-07-10,214.080002,211.080002,213.619995,213.669998,26177600.0,213.669998
2020-07-13,215.800003,206.500000,214.479996,207.070007,38135600.0,207.070007
2020-07-14,208.850006,202.029999,206.130005,208.350006,37534700.0,208.350006


In [181]:
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [182]:
#attributes we can access on this dataframe
stocks.values

array([[3.11000004e+01, 3.05900002e+01, 3.06200008e+01, 3.09500008e+01,
        3.84091000e+07, 2.42268944e+01],
       [3.11000004e+01, 3.06399994e+01, 3.08500004e+01, 3.09599991e+01,
        4.97496000e+07, 2.42347202e+01],
       [3.10799999e+01, 3.05200005e+01, 3.08799992e+01, 3.07700005e+01,
        5.81824000e+07, 2.40859890e+01],
       ...,
       [2.15800003e+02, 2.06500000e+02, 2.14479996e+02, 2.07070007e+02,
        3.81356000e+07, 2.07070007e+02],
       [2.08850006e+02, 2.02029999e+02, 2.06130005e+02, 2.08350006e+02,
        3.75347000e+07, 2.08350006e+02],
       [2.11330002e+02, 2.06860001e+02, 2.09559998e+02, 2.08070007e+02,
        1.19443640e+07, 2.08070007e+02]])

In [183]:
stocks.columns

Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')

In [184]:
stocks.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2020-07-01', '2020-07-02', '2020-07-06', '2020-07-07',
               '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-13',
               '2020-07-14', '2020-07-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [185]:
stocks.axes

[DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2020-07-01', '2020-07-02', '2020-07-06', '2020-07-07',
                '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-13',
                '2020-07-14', '2020-07-15'],
               dtype='datetime64[ns]', name='Date', length=2651, freq=None),
 Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')]

### Selecting Rows from a DataFrame with a DateTimeIndex

In [186]:
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [187]:
#can use index label .loc[]
stocks.loc[pd.Timestamp('2010-01-04')]
#generally want to pass it a timestamp object, rather than just a string.

High         3.110000e+01
Low          3.059000e+01
Open         3.062000e+01
Close        3.095000e+01
Volume       3.840910e+07
Adj Close    2.422689e+01
Name: 2010-01-04 00:00:00, dtype: float64

In [189]:
#or use index position with .iloc[]
stocks.iloc[0]
#returns the same series as above
stocks.iloc[-1]
#returns new series with the most recent row, etc.

High         2.113300e+02
Low          2.068600e+02
Open         2.095600e+02
Close        2.080700e+02
Volume       1.216174e+07
Adj Close    2.080700e+02
Name: 2020-07-15 00:00:00, dtype: float64

In [192]:
#pulling multiple values at the same time
#using a list.
stocks.loc[['2010-01-04','2010-01-05']]
#the code above will throw an error. we need to wrap those values in pd.Timestamp()

KeyError: "None of [Index(['2010-01-04', '2010-01-05'], dtype='object', name='Date')] are in the [index]"

In [194]:
stocks.loc[[pd.Timestamp('2010-01-04'),pd.Timestamp('2010-01-05')]]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472


In [195]:
stocks.iloc[[10,15,30]]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-19,31.24,30.68,30.75,31.1,46575700.0,24.344313
2010-01-26,29.85,29.09,29.200001,29.5,66639900.0,23.091867
2010-02-17,28.65,28.360001,28.530001,28.59,45882900.0,22.484196


In [199]:
#can also slice with loc and iloc
stocks.loc[pd.Timestamp('2013-10-01'): pd.Timestamp('2013-10-07')]
#with .loc[] the upper bound of the slice is included in the result (the 2013-10-07 row is returned)
#with .iloc[] the upper bound is not included

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-10-01,33.610001,33.299999,33.349998,33.580002,36718700.0,28.962389
2013-10-02,34.029999,33.290001,33.360001,33.919998,46946800.0,29.255632
2013-10-03,34.0,33.419998,33.880001,33.860001,38703800.0,29.203878
2013-10-04,33.990002,33.619999,33.689999,33.880001,33008100.0,29.22113
2013-10-07,33.709999,33.200001,33.599998,33.299999,35069300.0,28.720888


In [200]:
#.truncate() drops every row before `before` and after `after`,
#which gives the same rows as the .loc[] slice over the same two dates.
window_start = pd.Timestamp('2013-10-01')
window_end = pd.Timestamp('2013-10-07')
stocks.truncate(before=window_start, after=window_end)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-10-01,33.610001,33.299999,33.349998,33.580002,36718700.0,28.962389
2013-10-02,34.029999,33.290001,33.360001,33.919998,46946800.0,29.255632
2013-10-03,34.0,33.419998,33.880001,33.860001,38703800.0,29.203878
2013-10-04,33.990002,33.619999,33.689999,33.880001,33008100.0,29.22113
2013-10-07,33.709999,33.200001,33.599998,33.299999,35069300.0,28.720888


In [202]:
#with .iloc[] the upper bound of the slice is not included
stocks.iloc[1000:1005]
#pulls out the rows at integer positions 1000, 1001, 1002, 1003, and 1004

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-12-23,36.889999,36.549999,36.810001,36.619999,25128700.0,31.823889
2013-12-24,37.169998,36.639999,36.720001,37.080002,14243000.0,32.223644
2013-12-26,37.490002,37.169998,37.200001,37.439999,17612800.0,32.536484
2013-12-27,37.619999,37.169998,37.580002,37.290001,14563000.0,32.406143
2013-12-30,37.380001,36.900002,37.220001,37.290001,16290500.0,32.406143


In [214]:
#find microsoft stock values on every single one of my birthdays.
#first we create a DatetimeIndex with all of our birthdays using the date_range function
bdays = pd.date_range(start = '1988-01-25', end = '2020-07-15', freq = pd.DateOffset(years = 1))
#next we create a boolean array that compares the index of the stocks with the bdays index.
#an element is True when that stock date is one of the birthdays
mask = stocks.index.isin(bdays)
#we can then pass this boolean array back into the original dataframe to extract only those rows that are True
stocks[mask]
#returns all the stock prices on each of my birthdays. There are gaps since those dates are on the weekend.
#.loc[] accepts the same boolean mask and selects the same rows. only this last expression is displayed.
stocks.loc[mask]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-25,29.66,29.1,29.24,29.32,63373000.0,22.950977
2011-01-25,28.450001,28.120001,28.139999,28.450001,42436600.0,22.733784
2012-01-25,29.65,29.07,29.07,29.559999,59231700.0,24.248291
2013-01-25,28.23,27.389999,27.58,27.879999,81847700.0,23.516964
2016-01-25,52.650002,51.650002,51.939999,51.790001,34707700.0,47.498287
2017-01-25,64.099998,63.450001,63.950001,63.68,23672700.0,60.005581
2018-01-25,93.239998,91.93,92.470001,92.330002,26383200.0,88.945114
2019-01-25,107.879997,106.199997,107.239998,107.169998,31225600.0,105.028282


### Timestamp Object Attributes and Methods

In [250]:
#(re)load the MSFT daily prices for this section and preview the first 3 rows.
#`data` is defined earlier in the notebook (presumably pandas_datareader's data module - confirm).
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [236]:
#access all the index values (a DatetimeIndex with one Timestamp per row)
stocks.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2020-07-01', '2020-07-02', '2020-07-06', '2020-07-07',
               '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-13',
               '2020-07-14', '2020-07-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [237]:
#access a single index value by position; the value is a Timestamp object
stocks.index[500]

Timestamp('2011-12-27 00:00:00')

In [238]:
#keep that Timestamp in a variable so its attributes and methods can be inspected below.
#note: this rebinds `someday`, which held a datetime.date object earlier in the notebook.
someday = stocks.index[500]

In [239]:
someday.month
#returns the numeric month (12 for December)

12

In [240]:
someday.week
#returns the numeric week of the year for that timestamp

52

In [241]:
#boolean flags: is this timestamp the first / the last day of its month?
someday.is_month_start, someday.is_month_end

(False, False)

In [242]:
#boolean flags: is this timestamp the first / the last day of its quarter?
someday.is_quarter_start, someday.is_quarter_end

(False, False)

In [243]:
#returns the numeric day of the month
someday.day

27

In [244]:
#there are also a lot of helpful methods that can be run on a Timestamp object
#month_name() returns the name of the month as a string
someday.month_name()

'December'

In [245]:
#day_name() returns the name of the weekday as a string
someday.day_name()

'Tuesday'

In [246]:
#we can access any of these attributes or methods on a DatetimeIndex
#it will run through every Timestamp in the DatetimeIndex object and run the attribute/method
stocks.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2020-07-01', '2020-07-02', '2020-07-06', '2020-07-07',
               '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-13',
               '2020-07-14', '2020-07-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [247]:
stocks.index.day_name()
#returns a brand new pandas Index object holding the day name string for every Timestamp.

Index(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Monday',
       'Tuesday', 'Wednesday', 'Thursday', 'Friday',
       ...
       'Wednesday', 'Thursday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
       'Friday', 'Monday', 'Tuesday', 'Wednesday'],
      dtype='object', name='Date', length=2651)

In [256]:
#can use this new index as a new column at the end of my stocks dataframe.
#plain column assignment is used instead of .insert() so the cell can be re-run safely:
#.insert() raises a ValueError once the 'Day of Week' column already exists, while assignment
#appends the column the first time and simply overwrites it on later runs.
stocks['Day of Week'] = stocks.index.day_name()
stocks

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Day of Week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,31.100000,30.590000,30.620001,30.950001,38409100.0,24.226894,Monday
2010-01-05,31.100000,30.639999,30.850000,30.959999,49749600.0,24.234720,Tuesday
2010-01-06,31.080000,30.520000,30.879999,30.770000,58182400.0,24.085989,Wednesday
2010-01-07,30.700001,30.190001,30.629999,30.450001,50559700.0,23.835503,Thursday
2010-01-08,30.879999,30.240000,30.280001,30.660000,51197400.0,23.999893,Friday
...,...,...,...,...,...,...,...
2020-07-09,216.380005,211.470001,216.330002,214.320007,33121700.0,214.320007,Thursday
2020-07-10,214.080002,211.080002,213.619995,213.669998,26177600.0,213.669998,Friday
2020-07-13,215.800003,206.500000,214.479996,207.070007,38135600.0,207.070007,Monday
2020-07-14,208.850006,202.029999,206.130005,208.350006,37534700.0,208.350006,Tuesday


In [259]:
#lets extract all the rows that fall on the months start
stocks[stocks.index.is_month_start]
#stocks.index.is_month_start returns a boolean array (one flag per row) which is passed back into the stocks df
#to extract all rows whose date is the first day of a month

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Day of Week
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-02-01,28.480000,27.920000,28.389999,28.410000,85931100.0,22.238642,Monday
2010-03-01,29.049999,28.530001,28.770000,29.020000,43805400.0,22.822367,Monday
2010-04-01,29.540001,28.620001,29.350000,29.160000,74768100.0,22.932465,Thursday
2010-06-01,26.309999,25.520000,25.530001,25.889999,76152400.0,20.452698,Tuesday
2010-07-01,23.320000,22.730000,23.090000,23.160000,92239400.0,18.296041,Thursday
...,...,...,...,...,...,...,...
2019-11-01,144.419998,142.970001,144.259995,143.720001,33128400.0,142.445740,Friday
2020-04-01,157.750000,150.820007,153.000000,152.110001,57969900.0,151.687546,Wednesday
2020-05-01,178.639999,174.009995,175.800003,174.570007,39370500.0,174.085175,Friday
2020-06-01,183.000000,181.460007,182.539993,182.830002,22622400.0,182.830002,Monday


In [262]:
#can also store the boolean flags as a new column at the end of the stocks dataframe.
#plain column assignment is used instead of .insert() so that re-running the cell does not fail:
#.insert() raises a ValueError once 'Is Month Start' already exists, while assignment just overwrites it.
stocks['Is Month Start'] = stocks.index.is_month_start
stocks

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close,Day of Week,Is Month Start
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,31.100000,30.590000,30.620001,30.950001,38409100.0,24.226894,Monday,False
2010-01-05,31.100000,30.639999,30.850000,30.959999,49749600.0,24.234720,Tuesday,False
2010-01-06,31.080000,30.520000,30.879999,30.770000,58182400.0,24.085989,Wednesday,False
2010-01-07,30.700001,30.190001,30.629999,30.450001,50559700.0,23.835503,Thursday,False
2010-01-08,30.879999,30.240000,30.280001,30.660000,51197400.0,23.999893,Friday,False
...,...,...,...,...,...,...,...,...
2020-07-09,216.380005,211.470001,216.330002,214.320007,33121700.0,214.320007,Thursday,False
2020-07-10,214.080002,211.080002,213.619995,213.669998,26177600.0,213.669998,Friday,False
2020-07-13,215.800003,206.500000,214.479996,207.070007,38135600.0,207.070007,Monday,False
2020-07-14,208.850006,202.029999,206.130005,208.350006,37534700.0,208.350006,Tuesday,False


### The pd.DateOffset Object

In [270]:
#reload the MSFT daily prices so this section starts without the extra columns added above
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [265]:
#pd.DateOffset allows us to add or subtract a set amount of time from every date in our DatetimeIndex.
#add 5 days to every date
stocks.index + pd.DateOffset(days = 5)
#returns a new DatetimeIndex object with 5 days added to every Timestamp

DatetimeIndex(['2010-01-09', '2010-01-10', '2010-01-11', '2010-01-12',
               '2010-01-13', '2010-01-16', '2010-01-17', '2010-01-18',
               '2010-01-19', '2010-01-20',
               ...
               '2020-07-06', '2020-07-07', '2020-07-11', '2020-07-12',
               '2020-07-13', '2020-07-14', '2020-07-15', '2020-07-18',
               '2020-07-19', '2020-07-20'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [266]:
#subtract 5 days from every date (again returns a new DatetimeIndex; stocks itself is unchanged)
stocks.index - pd.DateOffset(days = 5)

DatetimeIndex(['2009-12-30', '2009-12-31', '2010-01-01', '2010-01-02',
               '2010-01-03', '2010-01-06', '2010-01-07', '2010-01-08',
               '2010-01-09', '2010-01-10',
               ...
               '2020-06-26', '2020-06-27', '2020-07-01', '2020-07-02',
               '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-08',
               '2020-07-09', '2020-07-10'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [269]:
#the arithmetic only returns a new DatetimeIndex, so we need to save it back into the index of the dataframe
#NOTE: this overwrites the index in place, so every re-run of this cell shifts the dates back another 5 days
stocks.index = stocks.index - pd.DateOffset(days = 5)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-12-30,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2009-12-31,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-01,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [272]:
#reload the data to get the original (unshifted) index back
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.index

DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
               '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
               '2010-01-14', '2010-01-15',
               ...
               '2020-07-01', '2020-07-02', '2020-07-06', '2020-07-07',
               '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-13',
               '2020-07-14', '2020-07-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [273]:
#other offset units work the same way: here every date is moved forward by 2 weeks
stocks.index + pd.DateOffset(weeks = 2)

DatetimeIndex(['2010-01-18', '2010-01-19', '2010-01-20', '2010-01-21',
               '2010-01-22', '2010-01-25', '2010-01-26', '2010-01-27',
               '2010-01-28', '2010-01-29',
               ...
               '2020-07-15', '2020-07-16', '2020-07-20', '2020-07-21',
               '2020-07-22', '2020-07-23', '2020-07-24', '2020-07-27',
               '2020-07-28', '2020-07-29'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [275]:
#move every date forward by 3 calendar months
stocks.index + pd.DateOffset(months = 3)

DatetimeIndex(['2010-04-04', '2010-04-05', '2010-04-06', '2010-04-07',
               '2010-04-08', '2010-04-11', '2010-04-12', '2010-04-13',
               '2010-04-14', '2010-04-15',
               ...
               '2020-10-01', '2020-10-02', '2020-10-06', '2020-10-07',
               '2020-10-08', '2020-10-09', '2020-10-10', '2020-10-13',
               '2020-10-14', '2020-10-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [277]:
#move every date back by 8 years
stocks.index - pd.DateOffset(years = 8)

DatetimeIndex(['2002-01-04', '2002-01-05', '2002-01-06', '2002-01-07',
               '2002-01-08', '2002-01-11', '2002-01-12', '2002-01-13',
               '2002-01-14', '2002-01-15',
               ...
               '2012-07-01', '2012-07-02', '2012-07-06', '2012-07-07',
               '2012-07-08', '2012-07-09', '2012-07-10', '2012-07-13',
               '2012-07-14', '2012-07-15'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [279]:
#several units can be combined in one DateOffset; the result now carries a time of day as well
combined_offset = pd.DateOffset(
    years=1,
    months=4,
    days=10,
    hours=6,
    minutes=2,
    seconds=26,
)
stocks.index + combined_offset

DatetimeIndex(['2011-05-14 06:02:26', '2011-05-15 06:02:26',
               '2011-05-16 06:02:26', '2011-05-17 06:02:26',
               '2011-05-18 06:02:26', '2011-05-21 06:02:26',
               '2011-05-22 06:02:26', '2011-05-23 06:02:26',
               '2011-05-24 06:02:26', '2011-05-25 06:02:26',
               ...
               '2021-11-11 06:02:26', '2021-11-12 06:02:26',
               '2021-11-16 06:02:26', '2021-11-17 06:02:26',
               '2021-11-18 06:02:26', '2021-11-19 06:02:26',
               '2021-11-20 06:02:26', '2021-11-23 06:02:26',
               '2021-11-24 06:02:26', '2021-11-25 06:02:26'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

### Timeseries Offsets

In [280]:
#reload the MSFT daily prices for this section and preview the first 3 rows
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.226894
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.23472
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,24.085989


In [286]:
#can also add a dynamic amount of time.
#here we'll set each of the Timestamps to the last day of its month.
stocks.index + pd.tseries.offsets.MonthEnd()
#kind of weird because all the days that actually fall on the last day of the month get pushed to the last day of the following month.
stocks.index - pd.tseries.offsets.MonthEnd()
#each timestamp gets set to the end of the previous month. only this last expression is displayed by the cell.

DatetimeIndex(['2009-12-31', '2009-12-31', '2009-12-31', '2009-12-31',
               '2009-12-31', '2009-12-31', '2009-12-31', '2009-12-31',
               '2009-12-31', '2009-12-31',
               ...
               '2020-06-30', '2020-06-30', '2020-06-30', '2020-06-30',
               '2020-06-30', '2020-06-30', '2020-06-30', '2020-06-30',
               '2020-06-30', '2020-06-30'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [288]:
#MonthBegin works the same way: adding it rolls each timestamp forward to the first day of the next month
stocks.index + pd.tseries.offsets.MonthBegin()
#subtracting it rolls each timestamp back to the first day of its month; a date that already is the 1st
#moves to the 1st of the previous month (2020-07-01 becomes 2020-06-01 in the output below)
stocks.index - pd.tseries.offsets.MonthBegin()

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2020-06-01', '2020-07-01', '2020-07-01', '2020-07-01',
               '2020-07-01', '2020-07-01', '2020-07-01', '2020-07-01',
               '2020-07-01', '2020-07-01'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [289]:
#pd.tseries.offsets can be imported under a shorter name to make things easier
#NOTE(review): imports normally live in the import cell at the top of the notebook
from pandas.tseries import offsets
#then we can call offsets.MonthBegin() instead of pd.tseries.offsets.MonthBegin()
stocks.index + offsets.MonthBegin()

DatetimeIndex(['2010-02-01', '2010-02-01', '2010-02-01', '2010-02-01',
               '2010-02-01', '2010-02-01', '2010-02-01', '2010-02-01',
               '2010-02-01', '2010-02-01',
               ...
               '2020-08-01', '2020-08-01', '2020-08-01', '2020-08-01',
               '2020-08-01', '2020-08-01', '2020-08-01', '2020-08-01',
               '2020-08-01', '2020-08-01'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [290]:
#roll all timestamps forward to the business end of month (the last business day, e.g. 2010-01-29)
stocks.index + offsets.BMonthEnd()

DatetimeIndex(['2010-01-29', '2010-01-29', '2010-01-29', '2010-01-29',
               '2010-01-29', '2010-01-29', '2010-01-29', '2010-01-29',
               '2010-01-29', '2010-01-29',
               ...
               '2020-07-31', '2020-07-31', '2020-07-31', '2020-07-31',
               '2020-07-31', '2020-07-31', '2020-07-31', '2020-07-31',
               '2020-07-31', '2020-07-31'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [291]:
#roll all timestamps forward to the business quarter end (the last business day of the quarter)
stocks.index + offsets.BQuarterEnd()

DatetimeIndex(['2010-03-31', '2010-03-31', '2010-03-31', '2010-03-31',
               '2010-03-31', '2010-03-31', '2010-03-31', '2010-03-31',
               '2010-03-31', '2010-03-31',
               ...
               '2020-09-30', '2020-09-30', '2020-09-30', '2020-09-30',
               '2020-09-30', '2020-09-30', '2020-09-30', '2020-09-30',
               '2020-09-30', '2020-09-30'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

In [292]:
#roll all timestamps forward to the last day of the year (December 31st)
stocks.index + offsets.YearEnd()

DatetimeIndex(['2010-12-31', '2010-12-31', '2010-12-31', '2010-12-31',
               '2010-12-31', '2010-12-31', '2010-12-31', '2010-12-31',
               '2010-12-31', '2010-12-31',
               ...
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='Date', length=2651, freq=None)

### The Timedelta Object

In [None]:
#reload the MSFT daily prices for this section and preview the first 3 rows
stocks = data.DataReader('MSFT', data_source = 'yahoo', start = '2010-01-01', end = '2020-12-31')
stocks.head(3)

In [None]:
#the Timedelta object represents a time span/ duration of time/ difference between 2 times
#my birthday is in 6 months, 10 days

In [294]:
#the simplest way to get a Timedelta is to take the difference between two Timestamps
timea, timeb = pd.Timestamp('2020-03-31'), pd.Timestamp('2020-03-30')
timea - timeb
#the result is a Timedelta object ('1 days 00:00:00')

Timedelta('1 days 00:00:00')

In [296]:
#Timestamps can carry a time of day too; the difference then includes hours, minutes and seconds
timea, timeb = pd.Timestamp('2020-03-31 04:35:16PM'), pd.Timestamp('2020-03-20 02:15:49AM')
timea - timeb

Timedelta('11 days 14:19:27')

In [297]:
#what if we take timeb - timea?
#the result is a negative Timedelta, displayed as '-12 days +09:40:33'
#(minus 12 days plus 9:40:33, i.e. the same 11 days 14:19:27 span in the other direction)
timeb - timea

Timedelta('-12 days +09:40:33')

In [299]:
#can add a timedelta directly to a timestamp to get a new timestamp (here 3 days later, same time of day)
timea + pd.Timedelta(days = 3)

Timestamp('2020-04-03 16:35:16')

In [305]:
#additional Timedelta params
pd.Timedelta(days = 3, minutes = 45)
pd.Timedelta(weeks = 8, days = 3, hours = 12, minutes = 45, seconds = 23)
#the units are added together: 8 weeks + 3 days is shown as 59 days. only this last expression is displayed.
#years and months do not work as inputs into a timedelta

Timedelta('59 days 12:45:23')

In [306]:
#can also feed in the time span as a string
pd.Timedelta('5 minutes')

Timedelta('0 days 00:05:00')

In [307]:
#the string can combine several units
pd.Timedelta('6 hours 12 minutes')

Timedelta('0 days 06:12:00')

In [308]:
#days, hours, minutes and seconds in a single string
pd.Timedelta('14 days 6 hours 12 minutes 49 seconds')

Timedelta('14 days 06:12:49')

### Timedeltas in a Dataset

In [312]:
#load the orders: ID becomes the index and parse_dates converts both date columns to datetimes
shipping = pd.read_csv('./data/ecommerce.csv', index_col = 'ID', parse_dates = ['order_date','delivery_date'])
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1998-05-24,1999-02-05
2,1992-04-22,1998-03-06
4,1991-02-10,1992-08-26


In [314]:
#find the shipping time per item
#subtracting one datetime column from another gives a series of timedeltas, stored here as a new column.
shipping['Delivery Time'] = shipping['delivery_date'] - shipping['order_date']

In [315]:
#preview the dataframe with the new 'Delivery Time' column
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1998-05-24,1999-02-05,257 days
2,1992-04-22,1998-03-06,2144 days
4,1991-02-10,1992-08-26,563 days


In [317]:
#can also add the timedeltas in the 'Delivery Time' column to the datetimes in the delivery_date column
shipping['Twice as Long'] = shipping['delivery_date'] + shipping['Delivery Time']
#datetime + timedelta gives a new series of datetimes, stored here as the 'Twice as Long' column
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice as Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1998-05-24,1999-02-05,257 days,1999-10-20
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12


In [318]:
#check the dtypes: the difference column is timedelta64[ns], the date columns are datetime64[ns]
shipping.dtypes

order_date        datetime64[ns]
delivery_date     datetime64[ns]
Delivery Time    timedelta64[ns]
Twice as Long     datetime64[ns]
dtype: object

In [321]:
#keep only the orders whose delivery took more than a year (365 days)
took_over_a_year = shipping['Delivery Time'] > pd.Timedelta('365 days')
shipping[took_over_a_year]

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice as Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18
9,1990-01-25,1994-10-02,1711 days,1999-06-09
...,...,...,...,...
986,1990-12-10,1992-12-16,737 days,1994-12-23
990,1991-06-24,1996-02-02,1684 days,2000-09-12
991,1991-09-09,1998-03-30,2394 days,2004-10-18
993,1990-11-16,1998-04-27,2719 days,2005-10-06


In [323]:
#keep only the orders whose delivery took more than 3000 days
took_over_3000_days = shipping['Delivery Time'] > pd.Timedelta('3000 days')
shipping[took_over_3000_days]

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice as Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
32,1990-01-20,1998-07-24,3107 days,2007-01-25
130,1990-04-02,1999-08-16,3423 days,2008-12-29
151,1991-01-29,1999-08-05,3110 days,2008-02-09
229,1990-04-13,1998-11-17,3140 days,2007-06-23
314,1990-03-07,1999-12-25,3580 days,2009-10-13
331,1990-09-18,1999-12-19,3379 days,2009-03-20
348,1990-02-27,1999-01-04,3233 days,2007-11-11
392,1990-12-24,1999-12-04,3267 days,2008-11-13
590,1990-03-25,1998-12-20,3192 days,2007-09-16
634,1991-04-04,1999-07-21,3030 days,2007-11-06


In [324]:
#can also do max and min on timedelta columns; each returns a single Timedelta
shipping['Delivery Time'].max()
#longest shipment took 3583 days

Timedelta('3583 days 00:00:00')

In [326]:
shipping['Delivery Time'].min()
#shortest delivery only took 8 days

Timedelta('8 days 00:00:00')