<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_dates_and_times.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Dates and Times
- Python has a built-in `datetime` module for working with dates and times

In [2]:
# libraries needed
import numpy as np
import pandas as pd
import datetime as dt   

# datetime is not an external library
# it's part of Python; part of standard library; think of as internal libraries
# not imported automatically to preserve memory

# but imported the same way as pandas is to keep things consistent and simple

## Review of Python's datetime module
- internal library Python loads on demand
- not automatically loaded to preserve memory
- module is (loosely) synonymous with library
- dt.date() creates a date object (strictly, `date` is a class and this calls its constructor)
- dt.datetime() creates a datetime object, which is date plus time hh:mm:ss

In [3]:
# .datetime() method is part of datetime module/library
# .datetime() method creates a datetime object

# date object: YYYY-MM-DD
# datetime object: YYYY-MM-DD also includes timestamp

In [4]:
# create date object using dt.date() method
new_years_day = dt.date(year = 2023, month = 1, day = 1)
new_years_day 

datetime.date(2023, 1, 1)

In [5]:
# attributes on new_years_day
print(new_years_day.year)
print(new_years_day.month)
print(new_years_day.day)

2023
1
1


In [6]:
# create datetime object using dt.datetime() method
maddie_bday = dt.datetime(
    year = 2022, month = 3, day = 3,
    hour = 1, minute = 39
)

maddie_bday

datetime.datetime(2022, 3, 3, 1, 39)

In [7]:
# view in alternative format
str(maddie_bday)

'2022-03-03 01:39:00'

In [8]:
# attributes of maddie_bday
print(maddie_bday.year)
print(maddie_bday.month)
print(maddie_bday.day)

print(maddie_bday.hour)
print(maddie_bday.minute)
print(maddie_bday.second)

2022
3
3
1
39
0


## Pandas timestamp object
- equivalent of Python datetime object
  - pandas Timestamp is better for data analysis; custom 
  built, so do things like add and subtract chunks of time; 
  more efficient
- basically a single moment in time: a date and a time, but called a timestamp
- don't need hour:minute:second part; default to midnight, 00:00:00
- pd.Timestamp() method
  - input is a string, can be flexible in string format
  - but best stick with YYYY-MM-DD format

In [9]:
# January 1, 2023
pd.Timestamp('2023-01-01')

Timestamp('2023-01-01 00:00:00')

In [10]:
# also January 1, 2023
pd.Timestamp('01/01/23')
  # note the midnight default for time component

Timestamp('2023-01-01 00:00:00')

In [11]:
# March 3, 2022 at 1:39am, Maddie's birthday
pd.Timestamp('2022-03-03 01:39:00')

Timestamp('2022-03-03 01:39:00')

In [12]:
# also March 3, 2022 at 1:39am, Maddie's birthday
pd.Timestamp('2022-03-03 1:39am')

Timestamp('2022-03-03 01:39:00')

In [13]:
# feed it a Python datetime object; pandas converts it to a Timestamp
pd.Timestamp(dt.datetime(2023, 1, 1))

Timestamp('2023-01-01 00:00:00')

## pd.DatetimeIndex object
- a collection of pandas Timestamps

In [14]:
# follow along video
list_of_dates = ['2023-01-01', '2023-01-02', '2023-01-03']
pd.DatetimeIndex(list_of_dates)

# convert date column/Series to index --> more relevant use case you'll encounter

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [15]:
# using Python date
another_list_of_dates = [dt.date(2023, 1, 1), dt.date(2023, 1, 2), dt.date(2023, 1, 3)]
pd.DatetimeIndex(another_list_of_dates)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [16]:
# create a Series where index is datetime
pd.Series(
    data = [1, 2, 3],
    index = pd.DatetimeIndex(list_of_dates)         # list_of_dates defined above
)

2023-01-01    1
2023-01-02    2
2023-01-03    3
dtype: int64

In [17]:
# small purchases table whose date column will later be converted and used as an index
# NOTE(review): 'purchase_amout' looks like a typo for 'purchase_amount'; the key is kept
# unchanged here because the saved outputs (and possibly later cells) refer to it
purchase_rows = [
    ('2023-01-01', 100, 'pen'),
    ('2023-01-02', 200, 'pencil'),
    ('2023-01-03', 300, 'paper'),
]

fake_dat = pd.DataFrame(
    purchase_rows,
    columns = ['purchase_date', 'purchase_amout', 'item']    # dates are still plain strings at this point
)

fake_dat

Unnamed: 0,purchase_date,purchase_amout,item
0,2023-01-01,100,pen
1,2023-01-02,200,pencil
2,2023-01-03,300,paper


In [18]:
# convert purchase_date to datetime
fake_dat['purchase_date'] = pd.to_datetime(fake_dat['purchase_date'])
fake_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   purchase_date   3 non-null      datetime64[ns]
 1   purchase_amout  3 non-null      int64         
 2   item            3 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 200.0+ bytes


In [19]:
# make purchase_date the index
(
    fake_dat
      .set_index('purchase_date')
      .index
)

# index is already a datetime index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', name='purchase_date', freq=None)

## pd.to_datetime() method
- convert to Pandas time related object
- e.g., string or Python date object to Pandas datetime object
- errors = 'coerce' for NaT, not a time, when something can't be converted to a Pandas datetime
  - default is errors = 'raise'

In [20]:
# convert strings to Pandas datetime
pd.to_datetime('2023-01-01')

Timestamp('2023-01-01 00:00:00')

In [21]:
# can also convert a list of strings
pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03'])

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [22]:
# convert a Python datetime object to a pandas Timestamp
pd.to_datetime(dt.datetime(2023, 1, 1))

# same with list

Timestamp('2023-01-01 00:00:00')

In [23]:
# common use is converting existing Series to datetime
purchase_dates = pd.Series(
    data = ['2023-01-01', '2023-01-02', '2023-01-03']
)

purchase_dates

0    2023-01-01
1    2023-01-02
2    2023-01-03
dtype: object

In [24]:
# convert purchase_dates Series to datetime
purchase_dates = pd.to_datetime(purchase_dates)

purchase_dates

0   2023-01-01
1   2023-01-02
2   2023-01-03
dtype: datetime64[ns]

In [25]:
# what if some of the dates aren't a date?
(
    pd.to_datetime(
        pd.Series(
            data = ['2023-01-01', '2023-01-02', 'hello', '2023-02-31'],  # first two are fine; hello and feb 31 aren't dates
        ),
        errors = 'coerce'                                                # default is 'raise'

    )
)

# NaT is not a time; basically can't be converted in pandas datetime
# akin to NaN

0   2023-01-01
1   2023-01-02
2          NaT
3          NaT
dtype: datetime64[ns]

In [26]:
# unix time: number of seconds since Jan 1, 1970 at midnight
# need additional parameter: unit = 's'
(
    pd.to_datetime(
        [1, 2, 3],                # unix time
        unit = 's'
    )
)

DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02',
               '1970-01-01 00:00:03'],
              dtype='datetime64[ns]', freq=None)

## pd.date_range() method
- create a date index quickly based on start, end, and frequency (freq)

In [27]:
# start, end, periods, and freq
# exactly 3 of these 4 must be known; freq defaults to 'D' when only 2 of start/end/periods are given

In [28]:
# from jan 1 thru dec 31, 2023
(
    pd.date_range(
        start = '2023-01-01',
        end = '2023-12-31', 
        freq = 'D'                # D for 1 Day; increments of 1 day; can do 2D for 2 days, 3D for 3 days
    )
)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-12-22', '2023-12-23', '2023-12-24', '2023-12-25',
               '2023-12-26', '2023-12-27', '2023-12-28', '2023-12-29',
               '2023-12-30', '2023-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [29]:
# other freq arguments
# B for business days; excludes weekends but NOT holidays (Dec 25 still appears in the result)
(
    pd.date_range(
        start = '2018-12-20', 
        end = '2018-12-31', 
        freq = 'B'                  # excludes weekends, but includes Christmas day
    )
)

DatetimeIndex(['2018-12-20', '2018-12-21', '2018-12-24', '2018-12-25',
               '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-31'],
              dtype='datetime64[ns]', freq='B')

In [30]:
# W for week; see documentation; W-SUN, etc.
# H for hour; can precede with number for number of hours
# M for month end, last day of month
# note: pandas 2.2+ deprecates 'H', 'M', and 'A' in favor of 'h', 'ME', and 'YE'

In [31]:
pd.date_range(
    start = '2023-01-01', 
    end = '2023-12-01',      
    freq = 'M'                  # Dec 31 is excluded because it falls after the end parameter of Dec 1
)

DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30'],
              dtype='datetime64[ns]', freq='M')

In [32]:
# MS for month start
pd.date_range(
    start = '2023-01-01', 
    end = '2023-12-01',      
    freq = 'MS'                  # now we get start of each month
)

DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
               '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',
               '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01'],
              dtype='datetime64[ns]', freq='MS')

In [33]:
# A for year end; last day of each year
pd.date_range(
  start = '2023-01-01', 
  end = '2050-01-01',
  freq = 'A'
)

DatetimeIndex(['2023-12-31', '2024-12-31', '2025-12-31', '2026-12-31',
               '2027-12-31', '2028-12-31', '2029-12-31', '2030-12-31',
               '2031-12-31', '2032-12-31', '2033-12-31', '2034-12-31',
               '2035-12-31', '2036-12-31', '2037-12-31', '2038-12-31',
               '2039-12-31', '2040-12-31', '2041-12-31', '2042-12-31',
               '2043-12-31', '2044-12-31', '2045-12-31', '2046-12-31',
               '2047-12-31', '2048-12-31', '2049-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

## pd.date_range() method, continued
- create a DatetimeIndex of a specified length using periods parameter

In [34]:
# 25 days starting on sep 12, 2012
(
    pd.date_range(
        start = '2012-09-12', 
        periods = 25,
        freq = 'D'
    )
)

# length of resulting DatetimeIndex is 25

DatetimeIndex(['2012-09-12', '2012-09-13', '2012-09-14', '2012-09-15',
               '2012-09-16', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-22', '2012-09-23',
               '2012-09-24', '2012-09-25', '2012-09-26', '2012-09-27',
               '2012-09-28', '2012-09-29', '2012-09-30', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-06'],
              dtype='datetime64[ns]', freq='D')

In [35]:
# 50 business days starting on april 15, 2022
(
    pd.date_range(
        start = '2022-04-15', 
        periods = 50,
        freq = 'B'                 # exclude weekends
    )
)

# length of resulting DatetimeIndex is 50

DatetimeIndex(['2022-04-15', '2022-04-18', '2022-04-19', '2022-04-20',
               '2022-04-21', '2022-04-22', '2022-04-25', '2022-04-26',
               '2022-04-27', '2022-04-28', '2022-04-29', '2022-05-02',
               '2022-05-03', '2022-05-04', '2022-05-05', '2022-05-06',
               '2022-05-09', '2022-05-10', '2022-05-11', '2022-05-12',
               '2022-05-13', '2022-05-16', '2022-05-17', '2022-05-18',
               '2022-05-19', '2022-05-20', '2022-05-23', '2022-05-24',
               '2022-05-25', '2022-05-26', '2022-05-27', '2022-05-30',
               '2022-05-31', '2022-06-01', '2022-06-02', '2022-06-03',
               '2022-06-06', '2022-06-07', '2022-06-08', '2022-06-09',
               '2022-06-10', '2022-06-13', '2022-06-14', '2022-06-15',
               '2022-06-16', '2022-06-17', '2022-06-20', '2022-06-21',
               '2022-06-22', '2022-06-23'],
              dtype='datetime64[ns]', freq='B')

In [36]:
# 10 weeks, starting on Mondays
# starting on Jan 1, 2023

pd.date_range(
    start = '2023-01-01', 
    periods = 10, 
    freq = 'W-MON'
)

DatetimeIndex(['2023-01-02', '2023-01-09', '2023-01-16', '2023-01-23',
               '2023-01-30', '2023-02-06', '2023-02-13', '2023-02-20',
               '2023-02-27', '2023-03-06'],
              dtype='datetime64[ns]', freq='W-MON')

## pd.date_range() method, continued
- create a DatetimeIndex of a specific length and end date using the end and periods parameters

In [37]:
# 30 Mondays up to and including 2023-01-02
pd.date_range(
    end = '2023-01-02', 
    periods = 30,
    freq = 'W-MON'
)

DatetimeIndex(['2022-06-13', '2022-06-20', '2022-06-27', '2022-07-04',
               '2022-07-11', '2022-07-18', '2022-07-25', '2022-08-01',
               '2022-08-08', '2022-08-15', '2022-08-22', '2022-08-29',
               '2022-09-05', '2022-09-12', '2022-09-19', '2022-09-26',
               '2022-10-03', '2022-10-10', '2022-10-17', '2022-10-24',
               '2022-10-31', '2022-11-07', '2022-11-14', '2022-11-21',
               '2022-11-28', '2022-12-05', '2022-12-12', '2022-12-19',
               '2022-12-26', '2023-01-02'],
              dtype='datetime64[ns]', freq='W-MON')

In [38]:
# 53 months ending Feb 2023
pd.date_range(
    end = '2023-02-01', 
    periods = 53, 
    freq = 'MS'
)

DatetimeIndex(['2018-10-01', '2018-11-01', '2018-12-01', '2019-01-01',
               '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01',
               '2019-06-01', '2019-07-01', '2019-08-01', '2019-09-01',
               '2019-10-01', '2019-11-01', '2019-12-01', '2020-01-01',
               '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01',
               '2020-06-01', '2020-07-01', '2020-08-01', '2020-09-01',
               '2020-10-01', '2020-11-01', '2020-12-01', '2021-01-01',
               '2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01',
               '2021-06-01', '2021-07-01', '2021-08-01', '2021-09-01',
               '2021-10-01', '2021-11-01', '2021-12-01', '2022-01-01',
               '2022-02-01', '2022-03-01', '2022-04-01', '2022-05-01',
               '2022-06-01', '2022-07-01', '2022-08-01', '2022-09-01',
               '2022-10-01', '2022-11-01', '2022-12-01', '2023-01-01',
               '2023-02-01'],
              dtype='datetime64[ns]', freq='MS'

## .dt accessor
- akin to .str prefix before string methods
- useful for pulling out elements of a date
  - year
  - month: both number and name
  - day: both number and day of week (e.g., Sunday)
- need to precede with .dt accessor
- parallels R lubridate package

In [39]:
# create a DatetimeIndex
bunch_of_dates = pd.date_range(
    start = '2019-07-06', 
    end = '2023-07-06', 
    freq = '2W'
)

bunch_of_dates

DatetimeIndex(['2019-07-07', '2019-07-21', '2019-08-04', '2019-08-18',
               '2019-09-01', '2019-09-15', '2019-09-29', '2019-10-13',
               '2019-10-27', '2019-11-10',
               ...
               '2023-02-26', '2023-03-12', '2023-03-26', '2023-04-09',
               '2023-04-23', '2023-05-07', '2023-05-21', '2023-06-04',
               '2023-06-18', '2023-07-02'],
              dtype='datetime64[ns]', length=105, freq='2W-SUN')

In [40]:
# wrap bunch_of_dates in a one-column DataFrame; the column is named 'date'
# and the rows get the default 0..n-1 integer index
bunch_of_dates_df = pd.DataFrame({'date': bunch_of_dates})

bunch_of_dates_df.head()

Unnamed: 0,date
0,2019-07-07
1,2019-07-21
2,2019-08-04
3,2019-08-18
4,2019-09-01


In [41]:
# get year of each date and assign it to its own column/Series
bunch_of_dates_df['year'] = bunch_of_dates_df['date'].dt.year      # need to use .dt accessor

bunch_of_dates_df.head()

Unnamed: 0,date,year
0,2019-07-07,2019
1,2019-07-21,2019
2,2019-08-04,2019
3,2019-08-18,2019
4,2019-09-01,2019


In [42]:
# get month of each date as a word and assign it to its own column/Series
bunch_of_dates_df['month_name'] = bunch_of_dates_df['date'].dt.month_name()

bunch_of_dates_df.head()

Unnamed: 0,date,year,month_name
0,2019-07-07,2019,July
1,2019-07-21,2019,July
2,2019-08-04,2019,August
3,2019-08-18,2019,August
4,2019-09-01,2019,September


In [43]:
# get month of each date as a number and assign it to its own column/Series
bunch_of_dates_df['month_number'] = bunch_of_dates_df['date'].dt.month

bunch_of_dates_df.head()

Unnamed: 0,date,year,month_name,month_number
0,2019-07-07,2019,July,7
1,2019-07-21,2019,July,7
2,2019-08-04,2019,August,8
3,2019-08-18,2019,August,8
4,2019-09-01,2019,September,9


In [44]:
# get each day as a day of week (Mon, Tue, Wed, etc.) and assign it to its own column/Series
bunch_of_dates_df['day_of_week_name'] = bunch_of_dates_df['date'].dt.day_name()

bunch_of_dates_df.head()

Unnamed: 0,date,year,month_name,month_number,day_of_week_name
0,2019-07-07,2019,July,7,Sunday
1,2019-07-21,2019,July,7,Sunday
2,2019-08-04,2019,August,8,Sunday
3,2019-08-18,2019,August,8,Sunday
4,2019-09-01,2019,September,9,Sunday


In [45]:
# get each day as a number and assign it to its own column/Series
bunch_of_dates_df['day'] = bunch_of_dates_df['date'].dt.day

bunch_of_dates_df.head()

Unnamed: 0,date,year,month_name,month_number,day_of_week_name,day
0,2019-07-07,2019,July,7,Sunday,7
1,2019-07-21,2019,July,7,Sunday,21
2,2019-08-04,2019,August,8,Sunday,4
3,2019-08-18,2019,August,8,Sunday,18
4,2019-09-01,2019,September,9,Sunday,1


In [46]:
# booleans for whether date is something
bunch_of_dates_df['date'].dt.is_quarter_start

# useful for filtering, boolean mask

0      False
1      False
2      False
3      False
4      False
       ...  
100    False
101    False
102    False
103    False
104    False
Name: date, Length: 105, dtype: bool

## install pandas-datareader library

In [47]:
!pip freeze

# pandas-datareader already installed

absl-py==1.3.0
aeppl==0.0.33
aesara==2.7.9
aiohttp==3.8.3
aiosignal==1.3.1
alabaster==0.7.12
albumentations==1.2.1
altair==4.2.0
appdirs==1.4.4
arviz==0.12.1
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
async-timeout==4.0.2
atari-py==0.2.9
atomicwrites==1.4.1
attrs==22.2.0
audioread==3.0.0
autograd==1.5
Babel==2.11.0
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==5.0.1
blis==0.7.9
bokeh==2.3.3
branca==0.6.0
bs4==0.0.1
CacheControl==0.12.11
cachetools==5.2.0
catalogue==2.0.8
certifi==2022.12.7
cffi==1.15.1
cftime==1.6.2
chardet==4.0.0
charset-normalizer==2.1.1
click==7.1.2
clikit==0.6.2
cloudpickle==1.5.0
cmake==3.22.6
cmdstanpy==1.0.8
colorcet==3.0.1
colorlover==0.3.0
community==1.0.0b1
confection==0.0.3
cons==0.4.5
contextlib2==0.5.5
convertdate==2.4.0
crashtest==0.3.1
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.3.0
cvxpy==1.2.2
cycler==0.11.0
cymem==2.0.7
Cython==0.29.32
daft==0.0.4
dask==2022.2.1
datascience==0.17.5
db-dtypes==1.0.5
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
desc

## Import financial data with pandas_datareader library

In [48]:
#!pip install --upgrade pandas
#!pip install --upgrade pandas-datareader

In [49]:
from pandas_datareader import data   # data can be called at top level without a prefix

In [50]:
# MSFT stock data
#data.DataReader(
 #   name = "MSFT", 
  #  start = '2022-01-01', 
   # end = '2022-12-31', 
    #data_source = 'yahoo'
#)

In [51]:
# datareader didn't work; just work with another dataset that has a date index

## Selecting rows from a DataFrame with a DatetimeIndex
- using .loc[ ]

In [52]:
# DatetimeIndex
pd.date_range(
    start = '2022-01-01', 
    end = '2022-12-31', 
    freq = 'D'
)

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10',
               ...
               '2022-12-22', '2022-12-23', '2022-12-24', '2022-12-25',
               '2022-12-26', '2022-12-27', '2022-12-28', '2022-12-29',
               '2022-12-30', '2022-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [53]:
# 365 random integers between 1 and 10 (high = 11 is exclusive)
np.random.randint(low = 1, high = 11, size = 365)

array([ 9,  5,  1,  5,  5,  3,  8,  7,  3,  7,  4,  3,  1,  1,  9,  5,  7,
        6,  8,  3,  2,  9, 10,  1,  7,  5,  1,  6,  4, 10,  7, 10,  5,  3,
        5,  7,  9,  5,  2,  4,  7, 10,  9,  1,  9,  6,  4,  9,  8, 10,  3,
        6,  2,  8, 10,  8,  9,  4,  8, 10,  7,  2, 10,  7,  4,  7,  4,  6,
        3, 10, 10,  9,  4,  5, 10,  1,  2,  6,  7,  9,  1,  3, 10,  3,  1,
        9,  1, 10,  8,  1,  2,  9,  7,  8,  9,  5,  4,  3,  3,  3,  1,  8,
        1,  5,  9,  3,  2,  1,  9,  1,  4,  8,  9,  1,  2, 10,  6,  4,  7,
        5, 10,  4,  7,  8,  3,  1,  3,  2,  3,  3, 10,  2,  3,  7, 10,  2,
        4,  2, 10,  6,  4,  9,  2,  7,  4,  8,  8,  5,  5,  1,  1,  5,  2,
        9,  1,  8,  2,  3,  2,  6,  4,  3,  6,  5,  4,  4,  1,  8,  1,  5,
       10,  2,  5,  1,  4,  6,  1,  7,  5,  9,  5,  9,  9,  5,  3,  8,  5,
        8,  6,  9,  7,  6,  5,  9,  3,  4,  3,  7,  5,  6,  6,  6,  4,  2,
        7,  8,  5,  7,  6,  1,  5, 10,  5, 10,  8,  7,  7,  8,  9,  9,  6,
        6,  3,  4,  8,  6

In [54]:
# build one year of fake daily purchases, indexed by purchase date
# a seeded generator makes the amounts identical on every Restart & Run All
fake_df_dates = pd.date_range(start = '2022-01-01', end = '2022-12-31', freq = 'D')
fake_df_rng = np.random.default_rng(42)

fake_df = pd.DataFrame(
    {
        'purchase_date': fake_df_dates,
        # integers from 1 to 10 (high is exclusive); one amount per date, so the
        # size follows the date range instead of repeating the literal 365
        'amount': fake_df_rng.integers(low = 1, high = 11, size = len(fake_df_dates))
    }
).set_index('purchase_date')

fake_df.head()

Unnamed: 0_level_0,amount
purchase_date,Unnamed: 1_level_1
2022-01-01,2
2022-01-02,3
2022-01-03,5
2022-01-04,9
2022-01-05,10


In [55]:
# pull out jan 1, 2022
fake_df.loc['2022-01-01']

amount    2
Name: 2022-01-01 00:00:00, dtype: int64

In [56]:
# preferred way to pull out jan 1, 2022
fake_df.loc[pd.Timestamp('2022-01-01')]

amount    2
Name: 2022-01-01 00:00:00, dtype: int64

In [57]:
# the video says a list of labels must be Timestamps, but plain date strings work here too
# pull out jan 1 and jan 2
fake_df.loc[['2022-01-01', '2022-01-02'],:]

Unnamed: 0_level_0,amount
purchase_date,Unnamed: 1_level_1
2022-01-01,2
2022-01-02,3


In [58]:
# feb 1 thru feb 15, 2022
fake_df.loc['2022-02-01':'2022-02-15', :]    # comma separates arguments; colon indicates all columns

Unnamed: 0_level_0,amount
purchase_date,Unnamed: 1_level_1
2022-02-01,2
2022-02-02,9
2022-02-03,4
2022-02-04,3
2022-02-05,7
2022-02-06,10
2022-02-07,5
2022-02-08,9
2022-02-09,6
2022-02-10,8


In [59]:
# 14th of each month
month_14th = pd.date_range(start = '2022-01-14', end = '2022-12-14', freq = pd.DateOffset(months = 1))
  # 2022-01-14, 2022-02-14, ...

# boolean mask
fake_df.index.isin(month_14th)

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [60]:
# values on 14th of each month
fake_df.loc[fake_df.index.isin(month_14th), :]

Unnamed: 0_level_0,amount
purchase_date,Unnamed: 1_level_1
2022-01-14,7
2022-02-14,2
2022-03-14,6
2022-04-14,10
2022-05-14,9
2022-06-14,3
2022-07-14,6
2022-08-14,3
2022-09-14,5
2022-10-14,6


## Pandas Timestamp object's methods and attributes
- df.insert() method; called on dataframe; for inserting a column

In [61]:
fake_df.head()

Unnamed: 0_level_0,amount
purchase_date,Unnamed: 1_level_1
2022-01-01,2
2022-01-02,3
2022-01-03,5
2022-01-04,9
2022-01-05,10


In [62]:
# make purchase_date a column instead of the index
# the guard keeps this cell idempotent: once purchase_date is a column the index is
# unnamed, and calling reset_index() again would add a spurious 'index' column
if fake_df.index.name == 'purchase_date':
    fake_df = fake_df.reset_index()

fake_df.head()

Unnamed: 0,purchase_date,amount
0,2022-01-01,2
1,2022-01-02,3
2,2022-01-03,5
3,2022-01-04,9
4,2022-01-05,10


In [63]:
# year of purchase_date as own column
fake_df['purchase_date_year'] = (
    fake_df['purchase_date']
      .dt.year
)

fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year
0,2022-01-01,2,2022
1,2022-01-02,3,2022
2,2022-01-03,5,2022
3,2022-01-04,9,2022
4,2022-01-05,10,2022


In [64]:
# month of purchase_date as own column
fake_df['purchase_date_month'] = (
    fake_df['purchase_date']
      .dt.month
)

fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month
0,2022-01-01,2,2022,1
1,2022-01-02,3,2022,1
2,2022-01-03,5,2022,1
3,2022-01-04,9,2022,1
4,2022-01-05,10,2022,1


In [65]:
# month name of purchase date as own column
fake_df['purchase_date_month_name'] = (
    fake_df['purchase_date']
      .dt.month_name()
)

fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name
0,2022-01-01,2,2022,1,January
1,2022-01-02,3,2022,1,January
2,2022-01-03,5,2022,1,January
3,2022-01-04,9,2022,1,January
4,2022-01-05,10,2022,1,January


In [66]:
# day of purchase_date as own column
fake_df['purchase_date_day'] = (
    fake_df['purchase_date']
      .dt.day
)

fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day
0,2022-01-01,2,2022,1,January,1
1,2022-01-02,3,2022,1,January,2
2,2022-01-03,5,2022,1,January,3
3,2022-01-04,9,2022,1,January,4
4,2022-01-05,10,2022,1,January,5


In [67]:
# weekday (e.g., Monday, Tuesday) of purchase_date as own column
fake_df['purchase_date_weekday'] = (
    fake_df['purchase_date']
      .dt.day_name()
)

fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
1,2022-01-02,3,2022,1,January,2,Sunday
2,2022-01-03,5,2022,1,January,3,Monday
3,2022-01-04,9,2022,1,January,4,Tuesday
4,2022-01-05,10,2022,1,January,5,Wednesday


In [68]:
# purchase amounts on last day of each month
fake_df[fake_df['purchase_date'].dt.is_month_end]

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
30,2022-01-31,4,2022,1,January,31,Monday
58,2022-02-28,5,2022,2,February,28,Monday
89,2022-03-31,9,2022,3,March,31,Thursday
119,2022-04-30,4,2022,4,April,30,Saturday
150,2022-05-31,10,2022,5,May,31,Tuesday
180,2022-06-30,2,2022,6,June,30,Thursday
211,2022-07-31,3,2022,7,July,31,Sunday
242,2022-08-31,3,2022,8,August,31,Wednesday
272,2022-09-30,4,2022,9,September,30,Friday
303,2022-10-31,9,2022,10,October,31,Monday


In [69]:
# purchases on first day of each month
fake_df[fake_df['purchase_date'].dt.is_month_start]

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
31,2022-02-01,2,2022,2,February,1,Tuesday
59,2022-03-01,3,2022,3,March,1,Tuesday
90,2022-04-01,8,2022,4,April,1,Friday
120,2022-05-01,1,2022,5,May,1,Sunday
151,2022-06-01,7,2022,6,June,1,Wednesday
181,2022-07-01,1,2022,7,July,1,Friday
212,2022-08-01,6,2022,8,August,1,Monday
243,2022-09-01,1,2022,9,September,1,Thursday
273,2022-10-01,10,2022,10,October,1,Saturday


In [70]:
# purchases between jan 1 and jan 10
(
    fake_df
      [
          fake_df['purchase_date']
            .between('2022-01-01', '2022-01-10')
      ]
)

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
1,2022-01-02,3,2022,1,January,2,Sunday
2,2022-01-03,5,2022,1,January,3,Monday
3,2022-01-04,9,2022,1,January,4,Tuesday
4,2022-01-05,10,2022,1,January,5,Wednesday
5,2022-01-06,5,2022,1,January,6,Thursday
6,2022-01-07,4,2022,1,January,7,Friday
7,2022-01-08,1,2022,1,January,8,Saturday
8,2022-01-09,9,2022,1,January,9,Sunday
9,2022-01-10,4,2022,1,January,10,Monday


In [71]:
# purchases on 22nd of each month with amount > 0 (shows combining two boolean masks with &)
(
    fake_df
    [
        (fake_df['purchase_date_day'] == 22) 
          & (fake_df['amount'] > 0)
    ]
)

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
21,2022-01-22,4,2022,1,January,22,Saturday
52,2022-02-22,4,2022,2,February,22,Tuesday
80,2022-03-22,6,2022,3,March,22,Tuesday
111,2022-04-22,5,2022,4,April,22,Friday
141,2022-05-22,1,2022,5,May,22,Sunday
172,2022-06-22,2,2022,6,June,22,Wednesday
202,2022-07-22,10,2022,7,July,22,Friday
233,2022-08-22,7,2022,8,August,22,Monday
264,2022-09-22,7,2022,9,September,22,Thursday
294,2022-10-22,9,2022,10,October,22,Saturday


## pd.DateOffset object
- for adding time to and subtracting time from a timestamp
- parameters are years, months, weeks, days, hours, minutes, seconds

In [72]:
fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
1,2022-01-02,3,2022,1,January,2,Sunday
2,2022-01-03,5,2022,1,January,3,Monday
3,2022-01-04,9,2022,1,January,4,Tuesday
4,2022-01-05,10,2022,1,January,5,Wednesday


In [73]:
# add 5 days to each purchase_date
(
    fake_df['purchase_date']
      .add(pd.DateOffset(days = 5))
)

0     2022-01-06
1     2022-01-07
2     2022-01-08
3     2022-01-09
4     2022-01-10
         ...    
360   2023-01-01
361   2023-01-02
362   2023-01-03
363   2023-01-04
364   2023-01-05
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [74]:
# subtract 10 days from each purchase_date
(
    fake_df['purchase_date']
      .subtract(pd.DateOffset(days = 10))
)

0     2021-12-22
1     2021-12-23
2     2021-12-24
3     2021-12-25
4     2021-12-26
         ...    
360   2022-12-17
361   2022-12-18
362   2022-12-19
363   2022-12-20
364   2022-12-21
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [75]:
# add 2 weeks to each date
(
    fake_df['purchase_date']
      .add(pd.DateOffset(weeks = 2))
)

0     2022-01-15
1     2022-01-16
2     2022-01-17
3     2022-01-18
4     2022-01-19
         ...    
360   2023-01-10
361   2023-01-11
362   2023-01-12
363   2023-01-13
364   2023-01-14
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [76]:
# subtract 1 year from each date
(
    fake_df['purchase_date']
      .subtract(pd.DateOffset(years = 1))
)

0     2021-01-01
1     2021-01-02
2     2021-01-03
3     2021-01-04
4     2021-01-05
         ...    
360   2021-12-27
361   2021-12-28
362   2021-12-29
363   2021-12-30
364   2021-12-31
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [77]:
# add 1 month to each date
(
    fake_df['purchase_date']
      .add(pd.DateOffset(months = 1))
)

0     2022-02-01
1     2022-02-02
2     2022-02-03
3     2022-02-04
4     2022-02-05
         ...    
360   2023-01-27
361   2023-01-28
362   2023-01-29
363   2023-01-30
364   2023-01-31
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [78]:
# mix and match parameters in pd.DateOffset
# add 1 year, 2 months, 3 hours, and 4 minutes to each date
(
    fake_df['purchase_date']
      .add(
          pd.DateOffset(
              years = 1, months = 2, hours = 3, minutes = 4
          )
      )
)

0     2023-03-01 03:04:00
1     2023-03-02 03:04:00
2     2023-03-03 03:04:00
3     2023-03-04 03:04:00
4     2023-03-05 03:04:00
              ...        
360   2024-02-27 03:04:00
361   2024-02-28 03:04:00
362   2024-02-29 03:04:00
363   2024-02-29 03:04:00
364   2024-02-29 03:04:00
Name: purchase_date, Length: 365, dtype: datetime64[ns]

## Timeseries offsets
- the pandas library is organized into nested subpackages/modules ("folders")
  - some names live at the top level, e.g., pd.DataFrame(), pd.Series()
  - others are nested inside subpackages
- offsets is one of those: pd.tseries.offsets

In [79]:
fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
1,2022-01-02,3,2022,1,January,2,Sunday
2,2022-01-03,5,2022,1,January,3,Monday
3,2022-01-04,9,2022,1,January,4,Tuesday
4,2022-01-05,10,2022,1,January,5,Wednesday


In [80]:
# truncate purchase_date to month begin, e.g., Jan 2 to Jan 1
(
    fake_df['purchase_date']
      .dt.to_period('M')            # monthly period: YYYY-MM, no day part; dtype becomes period
      .dt.to_timestamp()           # back to a timestamp; the output shows day 01 of each month
)

# the above wasn't part of the video tutorial but could be helpful

0     2022-01-01
1     2022-01-01
2     2022-01-01
3     2022-01-01
4     2022-01-01
         ...    
360   2022-12-01
361   2022-12-01
362   2022-12-01
363   2022-12-01
364   2022-12-01
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [81]:
# truncate purchase_date to year begin, e.g., any 2022 date to 2022-01-01
(
    fake_df['purchase_date']
      .dt.to_period('Y')           # yearly period: only the year is kept
      .dt.to_timestamp()           # back to a timestamp; the output shows Jan 1 of each year
)

# the above wasn't part of the video tutorial but could be helpful

0     2022-01-01
1     2022-01-01
2     2022-01-01
3     2022-01-01
4     2022-01-01
         ...    
360   2022-01-01
361   2022-01-01
362   2022-01-01
363   2022-01-01
364   2022-01-01
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [82]:
# now back to video tutorial
# roll each purchase_date forward to the next month begin
(
    fake_df['purchase_date']
      .add(pd.tseries.offsets.MonthBegin())     # MonthBegin() is in the offsets "folder", which is inside the tseries "folder"
)

0     2022-02-01
1     2022-02-01
2     2022-02-01
3     2022-02-01
4     2022-02-01
         ...    
360   2023-01-01
361   2023-01-01
362   2023-01-01
363   2023-01-01
364   2023-01-01
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [83]:
# note that the first purchase_date is 2022-01-01, which is already a month begin
# adding MonthBegin() turned it into 2022-02-01, the next month begin
# a date already on a month begin doesn't count; it rolls forward to the next one
fake_df.head()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
0,2022-01-01,2,2022,1,January,1,Saturday
1,2022-01-02,3,2022,1,January,2,Sunday
2,2022-01-03,5,2022,1,January,3,Monday
3,2022-01-04,9,2022,1,January,4,Tuesday
4,2022-01-05,10,2022,1,January,5,Wednesday


In [84]:
# roll each purchase_date back to the previous month's end
fake_df['purchase_date'] - pd.tseries.offsets.MonthEnd()

0     2021-12-31
1     2021-12-31
2     2021-12-31
3     2021-12-31
4     2021-12-31
         ...    
360   2022-11-30
361   2022-11-30
362   2022-11-30
363   2022-11-30
364   2022-11-30
Name: purchase_date, Length: 365, dtype: datetime64[ns]

In [85]:
# last purchase_date is 2022-12-31, which is already a month end
# subtracting MonthEnd() converted it to 2022-11-30, which is the previous month end
# a date already on a month end doesn't count; it gets moved back to the previous one
# hence, the offsets folder (pd.tseries.offsets)
fake_df.tail()

Unnamed: 0,purchase_date,amount,purchase_date_year,purchase_date_month,purchase_date_month_name,purchase_date_day,purchase_date_weekday
360,2022-12-27,2,2022,12,December,27,Tuesday
361,2022-12-28,4,2022,12,December,28,Wednesday
362,2022-12-29,10,2022,12,December,29,Thursday
363,2022-12-30,7,2022,12,December,30,Friday
364,2022-12-31,5,2022,12,December,31,Saturday


## Timedelta object
- time duration; time between two timestamps
- timestamps are moments in time; a timedelta is the time between two timestamps
- arrive at a timedelta by subtracting one timestamp from another

In [92]:
# two moments in time that are exactly 7 days apart
time_a, time_b = pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-08')

In [93]:
# Timestamp scalars can't use .add() or .sub(); use the - operator instead
time_b - time_a

# delta of positive 7 days

Timedelta('7 days 00:00:00')

In [94]:
# swapping the operands flips the sign of the result
time_a - time_b

# delta of negative 7 days

Timedelta('-7 days +00:00:00')

In [96]:
# the string passed to pd.Timestamp can also include hh:mm:ss and am/pm,
# e.g., pd.Timestamp('2023-01-01 3:30:00 pm'); pandas will parse this accordingly

In [100]:
# create a Timedelta without subtracting two Timestamps
pd.Timedelta(days = 3, hours = 12, minutes = 3, seconds = 1)

# see documentation for other parameters
# no years parameter, because a year is not a fixed length (365 vs. 366 days)
# best practice: put larger units first, i.e., weeks before days before hours before minutes before seconds

Timedelta('3 days 12:03:01')

In [101]:
# adding (or subtracting) a Timedelta to a Timestamp gives a new Timestamp
pd.Timestamp('2023-01-01') + pd.Timedelta(days = 1)

Timestamp('2023-01-02 00:00:00')

In [102]:
# a Timedelta can also be created from a string
pd.Timedelta("5 minutes")

Timedelta('0 days 00:05:00')

In [105]:
# the string can combine several units
pd.Timedelta("6 hours 12 minutes")

Timedelta('0 days 06:12:00')

## Timedeltas in a DataFrame

In [106]:
# load the ecommerce orders from the course repo
# as the preview shows, order_date and delivery_date come in as m/d/yy text (e.g., 5/24/98)
ecommerce = pd.read_csv(
    'https://raw.githubusercontent.com/jack-cao-623/python_learning/main/pandas/ecommerce.csv'
)

ecommerce.head()

Unnamed: 0,ID,order_date,delivery_date
0,1,5/24/98,2/5/99
1,2,4/22/92,3/6/98
2,4,2/10/91,8/26/92
3,5,7/21/92,11/20/97
4,7,9/2/93,6/10/98


In [111]:
# make ID the index
# guard so this cell can be re-run safely: once ID is the index it is no longer
# a column, and calling set_index('ID') a second time would raise a KeyError
if ecommerce.index.name != 'ID':
    ecommerce = ecommerce.set_index('ID')

ecommerce.head()

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5/24/98,2/5/99
2,4/22/92,3/6/98
4,2/10/91,8/26/92
5,7/21/92,11/20/97
7,9/2/93,6/10/98


In [113]:
# order_date and delivery_date to datetime
# the raw strings look like 5/24/98 (month/day/2-digit year); spell the format out
# instead of letting pandas guess it, so month and day can't be swapped and
# every row is parsed the same way
# %y maps 69-99 to 1969-1999 and 00-68 to 2000-2068
ecommerce['order_date'] = pd.to_datetime(ecommerce['order_date'], format = '%m/%d/%y')
ecommerce['delivery_date'] = pd.to_datetime(ecommerce['delivery_date'], format = '%m/%d/%y')

ecommerce.head()

Unnamed: 0_level_0,order_date,delivery_date
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1998-05-24,1999-02-05
2,1992-04-22,1998-03-06
4,1991-02-10,1992-08-26
5,1992-07-21,1997-11-20
7,1993-09-02,1998-06-10


In [121]:
# timedelta from order to delivery: delivery_date minus order_date
ecommerce['days_between_order_and_delivery'] = (
    ecommerce['delivery_date'] - ecommerce['order_date']
)

ecommerce.head()

Unnamed: 0_level_0,order_date,delivery_date,days_between_order_and_delivery
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1998-05-24,1999-02-05,257 days
2,1992-04-22,1998-03-06,2144 days
4,1991-02-10,1992-08-26,563 days
5,1992-07-21,1997-11-20,1948 days
7,1993-09-02,1998-06-10,1742 days


In [125]:
# what would hypothetical delivery date be if it took 2x as long to deliver?
# a timestamp plus (or minus) a timedelta is a new timestamp, so pushing the
# actual delivery date out by one more order-to-delivery gap doubles the wait
ecommerce['hypothetical_delivery_date_2x_long'] = (
    ecommerce['delivery_date'] + ecommerce['days_between_order_and_delivery']
)

ecommerce.head()

Unnamed: 0_level_0,order_date,delivery_date,days_between_order_and_delivery,hypothetical_delivery_date_2x_long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1998-05-24,1999-02-05,257 days,1999-10-20
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18


In [128]:
# equivalent to the hypothetical_delivery_date_2x_long column:
# twice the order-to-delivery gap, added to the order date
ecommerce['days_between_order_and_delivery'] * 2 + ecommerce['order_date']

ID
1     1999-10-20
2     2004-01-18
4     1994-03-12
5     2003-03-22
7     2003-03-18
         ...    
990   2000-09-12
991   2004-10-18
993   2005-10-06
994   1993-06-23
997   1993-07-01
Length: 501, dtype: datetime64[ns]

In [132]:
# orders that took longer than 365 days to deliver; regular boolean mask filtering
# compare against an explicit pd.Timedelta instead of the string '365 days', so the
# threshold is a typed value and doesn't rely on pandas parsing a string on the fly
(
    ecommerce
      [
          ecommerce['days_between_order_and_delivery'] > pd.Timedelta(days = 365)
      ]
)

Unnamed: 0_level_0,order_date,delivery_date,days_between_order_and_delivery,hypothetical_delivery_date_2x_long
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1992-04-22,1998-03-06,2144 days,2004-01-18
4,1991-02-10,1992-08-26,563 days,1994-03-12
5,1992-07-21,1997-11-20,1948 days,2003-03-22
7,1993-09-02,1998-06-10,1742 days,2003-03-18
9,1990-01-25,1994-10-02,1711 days,1999-06-09
...,...,...,...,...
986,1990-12-10,1992-12-16,737 days,1994-12-23
990,1991-06-24,1996-02-02,1684 days,2000-09-12
991,1991-09-09,1998-03-30,2394 days,2004-10-18
993,1990-11-16,1998-04-27,2719 days,2005-10-06


In [133]:
# these aggregation methods work on a timedelta column
# shortest order-to-delivery gap
ecommerce['days_between_order_and_delivery'].min()

Timedelta('8 days 00:00:00')

In [134]:
# longest order-to-delivery gap
ecommerce['days_between_order_and_delivery'].max()

Timedelta('3583 days 00:00:00')

In [135]:
# median order-to-delivery gap
ecommerce['days_between_order_and_delivery'].median()

Timedelta('998 days 00:00:00')

In [136]:
# mean order-to-delivery gap; as the output shows, it need not be a whole number of days
ecommerce['days_between_order_and_delivery'].mean()

Timedelta('1217 days 22:53:53.532934128')

In [None]:
# end of section on dates and times