In [1]:
import pandas as pd
pd.__version__

'1.1.3'

In [2]:
import datetime as dt # datetime module from the Python standard library. 

# datetime is not an external library (like Pandas), it is part of the Python language from the standard library. But not imported
# by default. 

This section focuses exclusively on working with datetimes, both in the Python standard library and in the pandas library.

Datetimes are very useful in the real world. We need them for things like:
- tracking trends over time.
- calculating the duration between two points in time.
- adding a given amount of time to a date or a moment in time.
- calculating quarterly periods.
- calculating the day of the week.

Review of Python's datetime module


A module is a Python source file that acts as a library. Standard-library modules such as datetime ship with Python, but Python only loads them on demand, when they are imported.

Note: Within the datetime module, there is also a datetime class, whose constructor creates a datetime object.

We will see how Pandas offers its own twists to those objects. 

In [6]:
# A date object holds a calendar date only: a year, a month and a day.

# The constructor takes the year, month and day, in that order.
someday = dt.date(year=2016, month=4, day=12)

# someday now represents 12 April 2016.

In [7]:
# Each attribute reads one component of the date stored in someday.
someday.year
someday.month
someday.day

# year, month and day are read in turn; a notebook cell only displays its last
# expression, so the output shown is someday.day.

12

In [8]:
# A datetime object stores a date plus a time of day (hour, minute, second).
# The time components are optional when constructing one.

dt.datetime(year=2010, month=1, day=20)

# No time was supplied, so it defaults to midnight: the hour and minute are both 0.

datetime.datetime(2010, 1, 20, 0, 0)

In [10]:
# Positional order of the parameters: year, month, day, hour, minute, second.
# Keywords are used here so that each component is self-describing.

sometime = dt.datetime(year=2010, month=1, day=20, hour=17, minute=13, second=50)
sometime
# 20 January 2010 at 5:13:50 pm.

datetime.datetime(2010, 1, 20, 17, 13, 50)

In [11]:
# str() renders a datetime object in a more readable "YYYY-MM-DD HH:MM:SS" form.

str(dt.datetime(year=2010, month=1, day=20, hour=17, minute=13, second=50))

'2010-01-20 17:13:50'

In [16]:
# Each attribute reads one component of the datetime stored in sometime.
sometime.year
sometime.month
sometime.day
sometime.hour
sometime.minute
sometime.second

# These attributes are useful whenever a single component has to be extracted from a date.
# For example, extract the month from each date object in order to aggregate by month.

50

The pandas Timestamp Object

The pandas version of datetime: a Timestamp represents a single moment in time. If no time is provided, it defaults to midnight on that day.

In [17]:
# Timestamp accepts many string layouts. The first five strings carry a date only.
pd.Timestamp(ts_input="2015-03-31")
pd.Timestamp(ts_input="2015/03/31")
pd.Timestamp(ts_input="2015, 03, 31")
pd.Timestamp(ts_input="31/03/2015")  # 31 cannot be a month, so it is read as the day.
pd.Timestamp(ts_input="04/03/2015")  # Ambiguous, so it is read month-first (American format): 3rd April 2015.
pd.Timestamp(ts_input="2021-03-08 08:35:15")
pd.Timestamp(ts_input="2021-03-08 6:13:29PM")

# When the string carries no time, the Timestamp defaults to midnight.
# The first four layouts are different spellings of the same date, 31 March 2015.

Timestamp('2021-03-08 18:13:29')

In [18]:
# Timestamp also accepts Python's own date and datetime objects.
pd.Timestamp(dt.date(year=2015, month=1, day=1))
pd.Timestamp(dt.datetime(year=2015, month=1, day=1, hour=6, minute=4, second=27))

# The result is the same kind of Timestamp that the string inputs produce.
# A pandas Timestamp can do a lot more than a Python datetime object.

Timestamp('2015-01-01 06:04:27')

The pandas DateTimeIndex Object

Essentially a collection of Pandas timestamps.

In [19]:
dates = [
    "2016-01-02",
    "2016-04-12",
    "2009-09-07",
]
pd.DatetimeIndex(dates)

# Given a list of date strings, the constructor:
# 1) converts each string into a pandas Timestamp, and
# 2) stores all of those Timestamps in one new object, the DatetimeIndex.

DatetimeIndex(['2016-01-02', '2016-04-12', '2009-09-07'], dtype='datetime64[ns]', freq=None)

In [20]:
# A list of Python date objects.
dates = [
    dt.date(2016, 1, 10),
    dt.date(1994, 6, 13),
    dt.date(2003, 12, 29),
]
dates

# Each date object is converted to a pandas Timestamp, and the results are bundled
# together into a single DatetimeIndex.
dtIndex = pd.DatetimeIndex(dates)

# The purpose of a DatetimeIndex is to serve as the index of a pandas Series or DataFrame.

In [21]:
# The DatetimeIndex built earlier (dtIndex) becomes the index of the new Series,
# so each value is labelled by a Timestamp.
values = [100, 200, 300]
pd.Series(data=values, index=dtIndex)

2016-01-10    100
1994-06-13    200
2003-12-29    300
dtype: int64

The pd.to_datetime() Method

A convenient method to convert an existing object into a Pandas time related object.

For example, it can accept an input of a string, or a list of string, or a Python date/datetime object and it can convert it to something relevant in Pandas like a timestamp or a DatetimeIndex. 

In [22]:
pd.to_datetime("2001-04-19") # A single string becomes a pandas Timestamp.
pd.to_datetime(dt.date(2015, 1, 1)) # A Python date object becomes a Timestamp.
pd.to_datetime(dt.datetime(2015, 1, 1, 14, 35, 20)) # A Python datetime object becomes a Timestamp.
pd.to_datetime(["2015-01-03", "2014/02/08", "2016", "July 4th 1996"]) 
# A list of strings becomes a DatetimeIndex; note that the strings can even use different date formats.
# The bare year "2016" is filled in as 2016-01-01 because no month or day was supplied.

DatetimeIndex(['2015-01-03', '2014-02-08', '2016-01-01', '1996-07-04'], dtype='datetime64[ns]', freq=None)

In [25]:
# The most common use of to_datetime is converting an existing pandas Series into Timestamps.

# A Series built from date-like strings is not parsed automatically; it stays a Series of strings.
times = pd.Series(
    ["2015-01-03", "2014/02/08", "2016", "July 4th 1996"]
)
times

0       2015-01-03
1       2014/02/08
2             2016
3    July 4th 1996
dtype: object

In [26]:
# Convert every string in the Series to a Timestamp. The result is still a Series,
# now with dtype datetime64[ns].

pd.to_datetime(times)
# Note: pandas has also normalized the values into one consistent date format.

0   2015-01-03
1   2014-02-08
2   2016-01-01
3   1996-07-04
dtype: datetime64[ns]

In [27]:
# pd.to_datetime() is very powerful but not perfect: it runs into trouble when the data is bad.

# Some of these strings are valid dates and some are not.
dates = pd.Series(
    ["July 4th 1996", "10/04/1991", "Hello", "2015-02-33"]
)
dates

0    July 4th 1996
1       10/04/1991
2            Hello
3       2015-02-33
dtype: object

In [28]:
# errors = "raise" is the default: as soon as to_datetime meets a value it cannot
# convert ("Hello", or the non-existent date "2015-02-33"), it raises an error.
# The exception is the point of this cell, so it is caught and displayed instead of
# being left to halt a Restart & Run All.
try:
    pd.to_datetime(dates)
except ValueError as err:  # the date-parsing errors raised here are ValueError subclasses
    print(f"{type(err).__name__}: {err}")

ParserError: Unknown string format: Hello

In [29]:
# errors="coerce": valid inputs are converted to Timestamps; anything that cannot be
# converted is replaced with NaT (Not a Time) instead of raising an error.
pd.to_datetime(dates, errors="coerce")

0   1996-07-04
1   1991-10-04
2          NaT
3          NaT
dtype: datetime64[ns]

In [42]:
# Unix time stores a moment as the number of seconds elapsed since midnight on 01-01-1970.
# Being a plain number, it converts to a date and time without any formatting issues.

pd.to_datetime([1349720105, 1349806505], unit="s")  # unit="s": the inputs are counted in seconds.

DatetimeIndex(['2012-10-08 18:15:05', '2012-10-09 18:15:05'], dtype='datetime64[ns]', freq=None)

Create Range of Dates with the pd.date_range() Method - Part 1 of 3

This method allows us to generate date ranges (DatetimeIndexes) on the fly. We will go through different combinations of the parameters we could use.

This lesson covers the following parameters:
- start.
- end.
- freq. 

In [17]:
# date_range builds a DatetimeIndex from a combination of start, end and periods;
# two of those three must be supplied for the method to work.

# start : left bound; the first candidate date of the range.
# end   : right bound; the end point of the DatetimeIndex, which is inclusive.
# freq  : the increment separating consecutive points in the resulting DatetimeIndex.

times = pd.date_range(start="2016-01-01", end="2016-01-10", freq="D")
times

# freq="D" (days) is also the default frequency.
# The result is a DatetimeIndex that starts at 2016-01-01 and proceeds to 2016-01-10,
# with consecutive timestamps one day apart.

# The advantage is that the timestamps never have to be written out by hand: the method
# automatically generates a DatetimeIndex consisting of valid timestamps.

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06', '2016-01-07', '2016-01-08',
               '2016-01-09', '2016-01-10'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# The container is a DatetimeIndex; each element inside it is a Timestamp.
type(times) # DatetimeIndex
type(times[0]) # Timestamp

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# freq="2D": step through the range two days at a time.

pd.date_range(start="2016-01-01", end="2016-01-10", freq="2D")

# The DatetimeIndex starts at 2016-01-01 and proceeds towards 2016-01-10 in increments of 2 days.

In [8]:
# freq="B": business days only, so weekends are excluded.

pd.date_range(start="2016-01-01", end="2016-01-10", freq="B")

# The DatetimeIndex runs from 2016-01-01 towards 2016-01-10 and keeps only the business days.

DatetimeIndex(['2016-01-01', '2016-01-04', '2016-01-05', '2016-01-06',
               '2016-01-07', '2016-01-08'],
              dtype='datetime64[ns]', freq='B')

In [11]:
# freq="W": weekly frequency, i.e. one day per week.

pd.date_range(start="2016-01-01", end="2016-01-15", freq="W")
# A bare "W" means "W-SUN": the DatetimeIndex holds one day per week, and that day
# is a Sunday.

pd.date_range(start="2016-01-01", end="2016-01-15", freq="W-FRI")
# The weekday can be chosen explicitly. Here it is set to Friday.

DatetimeIndex(['2016-01-01', '2016-01-08', '2016-01-15'], dtype='datetime64[ns]', freq='W-FRI')

In [12]:
# freq = "H" : hourly frequency. Produces one timestamp per hour between the start and the end.

pd.date_range(start = "2016-01-01", end = "2016-01-15", freq = "H")
# Every hour from the start of 2016-01-01 up to the start of 2016-01-15.
# With no multiplier in front of "H", the step is 1 hour.

pd.date_range(start = "2016-01-01", end = "2016-01-15", freq = "6H")
# A multiplier changes the step: one timestamp every 6 hours over the same range.

DatetimeIndex(['2016-01-01 00:00:00', '2016-01-01 01:00:00',
               '2016-01-01 02:00:00', '2016-01-01 03:00:00',
               '2016-01-01 04:00:00', '2016-01-01 05:00:00',
               '2016-01-01 06:00:00', '2016-01-01 07:00:00',
               '2016-01-01 08:00:00', '2016-01-01 09:00:00',
               ...
               '2016-01-14 15:00:00', '2016-01-14 16:00:00',
               '2016-01-14 17:00:00', '2016-01-14 18:00:00',
               '2016-01-14 19:00:00', '2016-01-14 20:00:00',
               '2016-01-14 21:00:00', '2016-01-14 22:00:00',
               '2016-01-14 23:00:00', '2016-01-15 00:00:00'],
              dtype='datetime64[ns]', length=337, freq='H')

In [15]:
# freq = "M" : month end. Selects the last day of each month.
# freq = "MS" : month start. Selects the first day of each month.

pd.date_range(start = "2016-01-01", end = "2016-01-15", freq = "M")
# Returns the month ends that fall within the date range. The result is empty because
# no month end falls between 2016-01-01 and 2016-01-15.

pd.date_range(start = "2016-01-01", end = "2016-12-31", freq = "M")
# With the end date extended to 2016-12-31, every month end within the range is returned.

pd.date_range(start = "2016-01-01", end = "2016-01-15", freq = "MS")
# "MS" keeps the dates between the start and end that fall on the first day of a month.

DatetimeIndex(['2016-01-01'], dtype='datetime64[ns]', freq='MS')

In [16]:
# freq = "A" : year end. Selects the last day of each year.

pd.date_range(start = "2016-01-01", end = "2050-01-01", freq = "A")
# Returns every date between the start and end that is the last day of a year (31 December).

DatetimeIndex(['2016-12-31', '2017-12-31', '2018-12-31', '2019-12-31',
               '2020-12-31', '2021-12-31', '2022-12-31', '2023-12-31',
               '2024-12-31', '2025-12-31', '2026-12-31', '2027-12-31',
               '2028-12-31', '2029-12-31', '2030-12-31', '2031-12-31',
               '2032-12-31', '2033-12-31', '2034-12-31', '2035-12-31',
               '2036-12-31', '2037-12-31', '2038-12-31', '2039-12-31',
               '2040-12-31', '2041-12-31', '2042-12-31', '2043-12-31',
               '2044-12-31', '2045-12-31', '2046-12-31', '2047-12-31',
               '2048-12-31', '2049-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

Create Range of Dates with the pd.date_range() Method - Part 2 of 3

In this lesson, the combination of parameters we will use is start, periods and freq.
- start : specifies the date we are starting from. 
- periods : specifies the number of periods (or timestamps) you want to generate from the starting point. 
- freq: specifies the separation between each timestamp. 

In [19]:
# The start and periods parameters.

# periods: the number of results wanted, i.e. how many Timestamps to generate. With
# periods, no end date is needed: pandas begins at the start date and generates the
# requested number of timestamps.

pd.date_range(start="2012-09-09", periods=25, freq="D")

# Returns a DatetimeIndex with exactly 25 timestamps. It begins at the start date and
# advances one day (the frequency) at a time, so the 25 dates are all 1 day apart.

pandas._libs.tslibs.timestamps.Timestamp

In [21]:
# 50 business days. 2012-09-09 is not a business day, so the range begins on 2012-09-10.
pd.date_range(start="2012-09-09", periods=50, freq="B")

DatetimeIndex(['2012-09-10', '2012-09-11', '2012-09-12', '2012-09-13',
               '2012-09-14', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-24', '2012-09-25',
               '2012-09-26', '2012-09-27', '2012-09-28', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-08', '2012-10-09', '2012-10-10', '2012-10-11',
               '2012-10-12', '2012-10-15', '2012-10-16', '2012-10-17',
               '2012-10-18', '2012-10-19', '2012-10-22', '2012-10-23',
               '2012-10-24', '2012-10-25', '2012-10-26', '2012-10-29',
               '2012-10-30', '2012-10-31', '2012-11-01', '2012-11-02',
               '2012-11-05', '2012-11-06', '2012-11-07', '2012-11-08',
               '2012-11-09', '2012-11-12', '2012-11-13', '2012-11-14',
               '2012-11-15', '2012-11-16'],
              dtype='datetime64[ns]', freq='B')

In [25]:
pd.date_range(start = "2012-09-09", periods = 50, freq = "W-WED")
# 50 dates, each one week apart, beginning with the first Wednesday from 2012-09-09.

pd.date_range(start = "2012-09-09", periods = 50, freq = "MS")
# The starts of the next 50 months.

pd.date_range(start = "2012-09-09", periods = 50, freq = "A")
# 50 year-end dates, beginning with the end of 2012.

pd.date_range(start = "2012-09-09", periods = 50, freq = "H")
# 50 timestamps starting from midnight on 2012-09-09, with 1 hour between each timestamp.

pd.date_range(start = "2012-09-09", periods = 50, freq = "6H")
# 50 timestamps starting from midnight on 2012-09-09, with 6 hours between each timestamp.

DatetimeIndex(['2012-09-09 00:00:00', '2012-09-09 06:00:00',
               '2012-09-09 12:00:00', '2012-09-09 18:00:00',
               '2012-09-10 00:00:00', '2012-09-10 06:00:00',
               '2012-09-10 12:00:00', '2012-09-10 18:00:00',
               '2012-09-11 00:00:00', '2012-09-11 06:00:00',
               '2012-09-11 12:00:00', '2012-09-11 18:00:00',
               '2012-09-12 00:00:00', '2012-09-12 06:00:00',
               '2012-09-12 12:00:00', '2012-09-12 18:00:00',
               '2012-09-13 00:00:00', '2012-09-13 06:00:00',
               '2012-09-13 12:00:00', '2012-09-13 18:00:00',
               '2012-09-14 00:00:00', '2012-09-14 06:00:00',
               '2012-09-14 12:00:00', '2012-09-14 18:00:00',
               '2012-09-15 00:00:00', '2012-09-15 06:00:00',
               '2012-09-15 12:00:00', '2012-09-15 18:00:00',
               '2012-09-16 00:00:00', '2012-09-16 06:00:00',
               '2012-09-16 12:00:00', '2012-09-16 18:00:00',
               '2012-09-

Create Range of Dates with the pd.date_range() Method - Part 3 of 3

Parameters:
- end: specifies the end point of the DatetimeIndex.
- periods: specifies the number of values in the resulting DatetimeIndex. 

In [27]:
# 20 timestamps, one day apart, finishing on 1999-12-31.
# pandas anchors the range at the end point and steps backwards one day at a time until
# it has 20 values, so the first timestamp is 19 days before the end date.
pd.date_range(end="1999-12-31", periods=20, freq="D")

DatetimeIndex(['1999-12-12', '1999-12-13', '1999-12-14', '1999-12-15',
               '1999-12-16', '1999-12-17', '1999-12-18', '1999-12-19',
               '1999-12-20', '1999-12-21', '1999-12-22', '1999-12-23',
               '1999-12-24', '1999-12-25', '1999-12-26', '1999-12-27',
               '1999-12-28', '1999-12-29', '1999-12-30', '1999-12-31'],
              dtype='datetime64[ns]', freq='D')

In [31]:
# The freq variations from the previous parts of this topic work here as well.

pd.date_range(end="1999-12-31", periods=20, freq="W-FRI")
# Works backwards from the end point to the nearest Friday, then generates 20 timestamps
# one week apart (every Friday).

pd.date_range(end="1999-12-31", periods=20, freq="MS")
# 1999-12-31 is not a month start, so it is not included. The range ends on the closest
# month start going backwards from it, 1999-12-01, and proceeds backwards in increments
# of 1 month until it holds 20 month starts.

DatetimeIndex(['1998-05-01', '1998-06-01', '1998-07-01', '1998-08-01',
               '1998-09-01', '1998-10-01', '1998-11-01', '1998-12-01',
               '1999-01-01', '1999-02-01', '1999-03-01', '1999-04-01',
               '1999-05-01', '1999-06-01', '1999-07-01', '1999-08-01',
               '1999-09-01', '1999-10-01', '1999-11-01', '1999-12-01'],
              dtype='datetime64[ns]', freq='MS')

The .dt Accessor

Similar to the .str accessor, where we had to pass the .str prefix to the string methods when working with string series. 

The .dt accessor is the same thing. In order to access an attribute of a series that consists of datetimes, we need to prefix it with the .dt accessor. 

In [37]:
# A DatetimeIndex (think of it as an array or list of dates) running from 2000-01-01
# to 2010-12-31, with consecutive timestamps 24 days apart.
bunch_of_dates = pd.date_range(start="2000-01-01", end="2010-12-31", freq="24D")
bunch_of_dates

DatetimeIndex(['2000-01-01', '2000-01-25', '2000-02-18', '2000-03-13',
               '2000-04-06', '2000-04-30', '2000-05-24', '2000-06-17',
               '2000-07-11', '2000-08-04',
               ...
               '2010-05-20', '2010-06-13', '2010-07-07', '2010-07-31',
               '2010-08-24', '2010-09-17', '2010-10-11', '2010-11-04',
               '2010-11-28', '2010-12-22'],
              dtype='datetime64[ns]', length=168, freq='24D')

In [38]:
# Passing the DatetimeIndex to the Series constructor as its data source yields a
# Series of datetime values.
s = pd.Series(data=bunch_of_dates)
s

0     2000-01-01
1     2000-01-25
2     2000-02-18
3     2000-03-13
4     2000-04-06
         ...    
163   2010-09-17
164   2010-10-11
165   2010-11-04
166   2010-11-28
167   2010-12-22
Length: 168, dtype: datetime64[ns]

In [51]:
# The .dt accessor exposes datetime attributes and methods for every value in the series.

s.dt.day
# Returns a brand new series holding the day of the month of each value in s.

s.dt.month
# Gives us the month of each value.


s.dt.day_name()
# Gives us the weekday name of each date.

0       Saturday
1        Tuesday
2         Friday
3         Monday
4       Thursday
         ...    
163       Friday
164       Monday
165     Thursday
166       Sunday
167    Wednesday
Length: 168, dtype: object

In [58]:
mask = s.dt.is_quarter_start
# A boolean series: True where the date falls on the start of a quarter, False where it does not.

# The boolean series can be used to extract or filter dates.
s[mask]
# Returns a series of only the dates that are the start of a quarter.


0     2000-01-01
19    2001-04-01
38    2002-07-01
137   2009-01-01
dtype: datetime64[ns]

In [60]:
mask2 = s.dt.is_month_end # True for the dates that fall on the last day of a month.
s[mask2]
# Returns only the dates that fall on the end of a month.

5     2000-04-30
57    2003-09-30
71    2004-08-31
90    2005-11-30
123   2008-01-31
161   2010-07-31
dtype: datetime64[ns]

Import Financial Data Set with pandas_datareader Library

pandas_datareader fetches up to date financial datasets from the internet. 

In [9]:
import pandas as pd
import datetime as dt
from pandas_datareader import data

In [12]:
# Fetch Microsoft (MSFT) stock information from the internet (Yahoo), covering
# 2010-01-01 through 2020-12-31.
stocks = data.DataReader(
    name="MSFT",
    data_source="yahoo",
    start="2010-01-01",
    end="2020-12-31",
)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [13]:
# Attributes

stocks.values # Returns a NumPy ndarray holding all the values contained in stocks.
stocks.columns # Returns the column index of stocks.
stocks.index # Returns the DatetimeIndex that serves as the row index of stocks.
stocks.axes 
# Returns both axes in a list. The first element is the DatetimeIndex and the second element is the column index.

[DatetimeIndex(['2010-01-04', '2010-01-05', '2010-01-06', '2010-01-07',
                '2010-01-08', '2010-01-11', '2010-01-12', '2010-01-13',
                '2010-01-14', '2010-01-15',
                ...
                '2020-12-11', '2020-12-14', '2020-12-15', '2020-12-16',
                '2020-12-17', '2020-12-18', '2020-12-21', '2020-12-22',
                '2020-12-23', '2020-12-24'],
               dtype='datetime64[ns]', name='Date', length=2765, freq=None),
 Index(['High', 'Low', 'Open', 'Close', 'Volume', 'Adj Close'], dtype='object')]

Selecting Rows from a DataFrame with a DatetimeIndex

In this lesson, we will learn how to extract one or more rows with the index label or index position from a DataFrame with a DatetimeIndex. 

In [25]:
# A date string works as a row label.
stocks.loc["2010-01-04"]

# It is a better idea to wrap the date inside a pandas Timestamp object. Both forms work here,
# but there are situations where only the Timestamp form will work.
stocks.loc[pd.Timestamp("2010-01-04")]

# .iloc selects by index position.
stocks.iloc[0]
# Returns the first row of the stocks DataFrame. The labels of the returned row are the
# column headers of the stocks DataFrame.

stocks.iloc[-1] 
# Returns the last row of the stocks DataFrame.

High         2.236000e+02
Low          2.212100e+02
Open         2.214200e+02
Close        2.227500e+02
Volume       1.055057e+07
Adj Close    2.227500e+02
Name: 2020-12-24 00:00:00, dtype: float64

In [27]:
# With both the .loc[] and .iloc[] accessors, an error is raised if the index label or index position does not exist.
# e.g. stocks.loc["2030-02-12"]
# stocks.iloc[500000]

In [30]:
# A list of labels selects several rows at once.
stocks.loc[["2010-01-04", "2010-01-05"]]

# Or equivalently: 
stocks.loc[[pd.Timestamp("2010-01-04"), pd.Timestamp("2010-01-05")]]

# If even one of the listed timestamps or dates does not exist in the index, an error is raised.

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.113148


In [34]:
# Slicing: pull everything from a given label/position through to another label/position.

# .loc[] : both dates provided are inclusive.
stocks.loc["2010-01-04": "2010-01-07"] 

# .truncate() : yields the same result; the before and after dates are both inclusive.
stocks.truncate(before = "2010-01-04", after = "2010-01-07")

# .iloc[] : the end point is exclusive.
stocks.iloc[1000: 1005]
# Only pulls positions 1000 through 1004.

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-12-23,36.889999,36.549999,36.810001,36.619999,25128700.0,31.664244
2013-12-24,37.169998,36.639999,36.720001,37.080002,14243000.0,32.061996
2013-12-26,37.490002,37.169998,37.200001,37.439999,17612800.0,32.373272
2013-12-27,37.619999,37.169998,37.580002,37.290001,14563000.0,32.243584
2013-12-30,37.380001,36.900002,37.220001,37.290001,16290500.0,32.243584


In [77]:
# Goal: find the stock price of Amazon on each of my birthdays.
#
# Plan:
#   1) pull the price history from the internet with data.DataReader
#   2) build a DatetimeIndex of my birthdays with pd.date_range
#   3) build a boolean mask with amz.index.isin(birthdays)
#   4) filter the price history with that mask

# 1) Pull the stock prices and assign them to the variable amz.
# NOTE(review): the symbol requested here is "amz", while Amazon's NASDAQ ticker is
# "AMZN" - confirm that this symbol returns the intended listing.
amz = data.DataReader(name="amz", data_source="yahoo", start="1980-01-01", end="2020-12-25")

# 2) Create a DatetimeIndex of my birthdays.
# freq=pd.DateOffset(years=1) steps by calendar year, so the same day is selected each
# year regardless of whether a leap year is crossed.
birthdays = pd.date_range(start="1995-08-14", end="2020-12-25", freq=pd.DateOffset(years=1))
birthdays

# 3) Create the boolean mask: True where a date in amz's index lies on a birthday.
# Index.isin returns a boolean NumPy array (not a Series).
# The name must match the one used below; the original assigned to a differently
# spelled variable, which raises a NameError on a fresh kernel.
birthday_stocks = amz.index.isin(birthdays)
birthday_stocks

# 4) Return the stock prices that lie on my birthday, using the boolean mask.
# Birthdays that are not present in amz's index simply produce no row.
amz[birthday_stocks]

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2012-08-14,191.5,188.300003,189.199997,190.25,960.0,190.25
2013-08-14,223.0,219.699997,220.5,221.100006,1597.0,221.100006
2014-08-14,247.149994,243.5,244.5,247.149994,1532.0,247.149994
2015-08-14,481.0,471.950012,475.399994,478.299988,2865.0,478.299988
2017-08-14,836.25,825.0,825.450012,834.799988,5279.0,834.799988


Timestamp Object Attributes and Methods

In this lesson, we will look at the Timestamp object's attributes and methods, and see how we can apply them to an entire DatetimeIndex.

In [1]:
import pandas as pd
import datetime as dt
from pandas_datareader import data

In [6]:
# Download the daily MSFT price history for 2010-2020 and preview the first three rows.
stocks = data.DataReader(
    name = "MSFT",
    data_source = "yahoo",
    start = "2010-01-01",
    end = "2020-12-31",
)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [10]:
# A DatetimeIndex is a collection of Timestamp objects, each one representing a single datetime.
# Indexing it with a position therefore hands back one Timestamp.
someday = stocks.index[500]  # Position 500, i.e. the 501st entry.
someday

Timestamp('2011-12-27 00:00:00')

In [11]:
# Some useful Timestamp attributes (only the last expression is displayed as the cell output):

someday.day # Returns the day of the timestamp. 
someday.month # Returns the month of the timestamp. 
someday.year # Returns the year of the timestamp. 
someday.is_month_start # Returns True for timestamps which lie on the first day of the month. 
someday.is_month_end # Returns True for timestamps which lie on the last day of the month. 
someday.is_quarter_start # Returns True for timestamps which fall on the start of a quarter. 

False

In [14]:
# Unlike the attributes above, these are methods, so they need parentheses.
someday.month_name()  # Name of the month, as a string.
someday.day_name()  # Name of the weekday the timestamp falls on, as a string.

'Tuesday'

In [18]:
# Every attribute and method shown above is also available on the DatetimeIndex itself.
# Calling it there applies it to each Timestamp and returns one value per entry.

# Weekday name for every date in the index.
day_name_index = stocks.index.day_name()

In [43]:
# DataFrame.insert() adds a new column in place. Its parameters, in order, are:
#   loc    : position at which to place the new column
#   column : name of the new column
#   value  : the values used to populate the new column

stocks.insert(0, "Day of Week", day_name_index)
stocks.head(3)

# The DataFrame now starts with a "Day of Week" column holding the weekday each index date falls on.

Unnamed: 0_level_0,Day of Week,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-04,Monday,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,Tuesday,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,Wednesday,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [55]:
# Same idea as before, this time using an attribute of the index rather than a method.

stocks.insert(loc = 1, column = "Is Start of Month", value = stocks.index.is_month_start)
stocks.head(3)

# The new column sits at column position 1. It holds True when the row's index date is the first
# day of its month and False otherwise.

# To remove the column again:
# stocks.drop("Is Start of Month", axis = "columns", inplace = True)

Unnamed: 0_level_0,Day of Week,Is Start of Month,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,Monday,False,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,Tuesday,False,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,Wednesday,False,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [58]:
# Use the boolean column as a mask: keep only the rows whose date is the first day of a month.
is_first_of_month = stocks["Is Start of Month"]
stocks.loc[is_first_of_month]

Unnamed: 0_level_0,Day of Week,Is Start of Month,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-01,Monday,True,28.480000,27.920000,28.389999,28.410000,85931100.0,22.127089
2010-03-01,Monday,True,29.049999,28.530001,28.770000,29.020000,43805400.0,22.707878
2010-04-01,Thursday,True,29.540001,28.620001,29.350000,29.160000,74768100.0,22.817423
2010-06-01,Tuesday,True,26.309999,25.520000,25.530001,25.889999,76152400.0,20.350094
2010-07-01,Thursday,True,23.320000,22.730000,23.090000,23.160000,92239400.0,18.204264
...,...,...,...,...,...,...,...,...
2020-06-01,Monday,True,183.000000,181.460007,182.539993,182.830002,22622400.0,181.912857
2020-07-01,Wednesday,True,206.350006,201.770004,203.139999,204.699997,32061200.0,203.673141
2020-09-01,Tuesday,True,227.449997,224.429993,225.509995,227.270004,25791200.0,226.676559
2020-10-01,Thursday,True,213.990005,211.320007,213.490005,212.460007,27158400.0,211.905228


The pd.DateOffset Object

The pd.DateOffset object allows us to add or subtract a set amount of time from every date in our DatetimeIndex. 

In [59]:
# Reload a fresh copy of the daily MSFT prices for 2010-2020 and preview the first three rows.
stocks = data.DataReader(
    name = "MSFT",
    data_source = "yahoo",
    start = "2010-01-01",
    end = "2020-12-31",
)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [69]:
# Add 5 days to every date, e.g. Jan 4th --> Jan 9th. 

stocks.index + pd.DateOffset(days = 5)
# Without the pd.DateOffset() object, there is ambiguity as to whether we are referring to the day, month or year etc. 
# The pd.DateOffset() object helps us clarify that. 

stocks.index + pd.DateOffset(day = 5)
# Using the singular day instead of the plural days REPLACES the day of every date with 5, rather than adding 5 days. 

stocks.index + pd.DateOffset(months = 5) # Added 5 months to the original date. 
stocks.index + pd.DateOffset(month = 5) # Replaced the month with 5 for all timestamps. 

stocks.index = stocks.index + pd.DateOffset(days = 5)
# Reassign the stocks index after adding on 5 days. The plural keyword is required here: the singular
# day = 5 would overwrite every day-of-month with 5 instead of shifting the dates.
# Note: this line changes stocks, so re-running the cell shifts the index by another 5 days.

Timeseries Offset

In this lesson, we will focus on a more challenging problem: what if we want to add a dynamic amount of time to each datetime, as opposed to adding a fixed number of days/months/years to each datetime?

In [2]:
# Reload a fresh copy of the daily MSFT prices for 2010-2020 and preview the first three rows.
stocks = data.DataReader(
    name = "MSFT",
    data_source = "yahoo",
    start = "2010-01-01",
    end = "2020-12-31",
)
stocks.head(3)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,31.1,30.59,30.620001,30.950001,38409100.0,24.10536
2010-01-05,31.1,30.639999,30.85,30.959999,49749600.0,24.113148
2010-01-06,31.08,30.52,30.879999,30.77,58182400.0,23.965164


In [6]:
# Example: move each date to the end of its month (Jan 4th --> Jan 31st, Jan 5th --> Jan 31st,
# Feb 2nd --> Feb 28th, and so on). The number of days to add differs from date to date, so a
# fixed DateOffset cannot express it.

# The time series offsets live one layer deeper, in the pd.tseries.offsets package.
month_end = pd.tseries.offsets.MonthEnd()

# Plus sign: move forward to the month end, e.g. Jan 5th --> Jan 31st.
stocks.index + month_end

# Minus sign: move back to the previous month end, e.g. 01-05 --> 12-31.
stocks.index - month_end

DatetimeIndex(['2010-01-31', '2010-01-31', '2010-01-31', '2010-01-31',
               '2010-01-31', '2010-01-31', '2010-01-31', '2010-01-31',
               '2010-01-31', '2010-01-31',
               ...
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

In [8]:
# Some caveats to MonthEnd()
# If the date already falls on the month end, then adding MonthEnd() (plus sign) moves it forward to the end of the next month. 
# For example, 04-30 --> 05-31. The same caveat applies to the month/year beginning/end offsets. 

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-23,223.559998,220.800003,223.110001,221.020004,18699600.0,221.020004
2020-12-24,223.610001,221.199997,221.419998,222.75,10550600.0,222.75
2020-12-28,226.029999,223.020004,224.449997,224.960007,17522554.0,224.960007


In [12]:
# MonthBegin() is the counterpart of MonthEnd().
month_begin = pd.tseries.offsets.MonthBegin()

# Plus sign: move each datetime forward to the next month beginning, e.g. 01-05 --> 02-01.
stocks.index + month_begin

# Minus sign: move each datetime backwards to the previous month beginning, e.g. 01-05 --> 01-01.
stocks.index - month_begin

DatetimeIndex(['2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01', '2010-01-01', '2010-01-01',
               '2010-01-01', '2010-01-01',
               ...
               '2020-12-01', '2020-12-01', '2020-12-01', '2020-12-01',
               '2020-12-01', '2020-12-01', '2020-12-01', '2020-12-01',
               '2020-12-01', '2020-12-01'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

In [15]:
# Typing "pd.tseries.offsets" every time is verbose. Python's "from ... import ..." syntax can
# bring the nested offsets package straight into the notebook's namespace, the same way the
# data module was imported from pandas_datareader.

from pandas.tseries import offsets

# With the import in place, the "pandas.tseries" prefix is no longer needed, and the earlier
# operations shorten to the following.

stocks.index + offsets.MonthEnd()

DatetimeIndex(['2010-01-31', '2010-01-31', '2010-01-31', '2010-01-31',
               '2010-01-31', '2010-01-31', '2010-01-31', '2010-01-31',
               '2010-01-31', '2010-01-31',
               ...
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

In [17]:
# A couple more examples of using offsets to derive dates in the future or in the past.

# Business month end: the last weekday of the month. The normal offsets all have business variants.
# E.g. if the 31st falls on a Saturday, the business month ends on the 30th (Friday).
business_month_end = offsets.BMonthEnd()

stocks.index + business_month_end  # Forward to the business end of the month.
stocks.index - business_month_end  # Backward to the previous business end of the month.

DatetimeIndex(['2009-12-31', '2009-12-31', '2009-12-31', '2009-12-31',
               '2009-12-31', '2009-12-31', '2009-12-31', '2009-12-31',
               '2009-12-31', '2009-12-31',
               ...
               '2020-11-30', '2020-11-30', '2020-11-30', '2020-11-30',
               '2020-11-30', '2020-11-30', '2020-11-30', '2020-11-30',
               '2020-11-30', '2020-11-30'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

In [18]:
# Forward to the year end date for every datetime.
stocks.index + offsets.YearEnd()

# Forward to the beginning of the next year for every datetime.
stocks.index + offsets.YearBegin()

DatetimeIndex(['2010-12-31', '2010-12-31', '2010-12-31', '2010-12-31',
               '2010-12-31', '2010-12-31', '2010-12-31', '2010-12-31',
               '2010-12-31', '2010-12-31',
               ...
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='Date', length=2766, freq=None)

The Timedelta Object

The Pandas Timestamp object represents a given moment in time; it is something we can plot on a calendar. In comparison, a Timedelta object represents a time span/time duration/passage of time. It is not connected to a specific date; it is simply a measure of time. Note: delta in Mathematics means the difference between two things. 

In [22]:
# Subtracting one Timestamp from another produces a Timedelta.

time_a = pd.Timestamp("2020-12-29 04:35:16PM")  # The later moment.
time_b = pd.Timestamp("2020-12-25 02:15:49AM")  # The earlier moment.

# Later minus earlier gives a positive Timedelta.
time_a - time_b

Timedelta('4 days 14:19:27')

In [24]:
# Earlier minus later: subtracting the future timestamp from the past one gives a negative Timedelta.
time_b - time_a

Timedelta('-5 days +09:40:33')

In [26]:
# A Timedelta can also be built from scratch by passing each component as a keyword argument.

pd.Timedelta(
    weeks = 8,
    days = 3,
    hours = 12,
    minutes = 45,
    seconds = 20,
)

Timedelta('59 days 12:45:20')

In [32]:
# The constructor also accepts a string describing the duration.

duration_text = "6 hours 12 minutes 5 seconds"
pd.Timedelta(duration_text)

Timedelta('0 days 06:12:05')

In [30]:
# Arithmetic with Timedelta objects: adding one to a Timestamp gives a new, later Timestamp.

three_days_five_hours = pd.Timedelta(days = 3, hours = 5)
time_a + three_days_five_hours

Timestamp('2021-01-01 21:35:16')

Timedelta in a Dataset

In [36]:
# Load the orders file, using the ID column as the index and parsing both date columns as datetimes.
shipping = pd.read_csv(
    "ecommerce.csv",
    index_col = "ID",
    parse_dates = ["order_date", "delivery_date"],
)
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1998-05-24,1999-02-05
2,1992-04-22,1998-03-06
4,1991-02-10,1992-08-26


In [40]:
# How long did each order take to arrive?
# Subtracting one datetime column from the other gives a Series of pandas Timedeltas,
# which is stored as a new "Delivery Time" column.

shipping["Delivery Time"] = shipping["delivery_date"] - shipping["order_date"]

In [41]:
# Preview the first three rows, now including the "Delivery Time" column.
shipping.head(n = 3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1998-05-24,1999-02-05,257 days
2,1992-04-22,1998-03-06,2144 days
4,1991-02-10,1992-08-26,563 days


In [47]:
# Timestamp and Timedelta columns can be combined to build a brand new Series.

# Example: what would the delivery date be if delivery had taken twice as long?
# Adding the delivery time to the delivery date answers that. The two columns have different
# data types (datetime and timedelta), and the result is a datetime column.
shipping["Twice As Long"] = shipping["delivery_date"] + shipping["Delivery Time"]
shipping.head(3)

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice As Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1998-05-24,1999-02-05,257 days,1999-10-20
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12


In [48]:
# List the data type of every column in the shipping DataFrame.
shipping.dtypes

order_date        datetime64[ns]
delivery_date     datetime64[ns]
Delivery Time    timedelta64[ns]
Twice As Long     datetime64[ns]
dtype: object

In [56]:
# With a Timedelta column in place, all the usual operations are available.
# Example: keep only the orders that took longer than a year to deliver.

# Steps:
# 1) Compare the "Delivery Time" column against 365 days --> a boolean Series, stored in a variable.
# 2) Use that boolean Series to select the matching rows of shipping.

one_year = pd.Timedelta("365 days")
condition = shipping["Delivery Time"] > one_year
shipping.loc[condition]

Unnamed: 0_level_0,order_date,delivery_date,Delivery Time,Twice As Long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18
9,1990-01-25,1994-10-02,1711 days,1999-06-09
...,...,...,...,...
986,1990-12-10,1992-12-16,737 days,1994-12-23
990,1991-06-24,1996-02-02,1684 days,2000-09-12
991,1991-09-09,1998-03-30,2394 days,2004-10-18
993,1990-11-16,1998-04-27,2719 days,2005-10-06


In [59]:
# Aggregation functions also work on the Timedelta column.
delivery_times = shipping["Delivery Time"]

delivery_times.max()  # Longest delivery time in the column.
delivery_times.min()  # Shortest delivery time in the column.

Timedelta('8 days 00:00:00')