<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_dates_and_times.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Dates and Times
- Python has a built-in `datetime` module for working with dates and times

In [27]:
# libraries needed
import numpy as np
import pandas as pd
import datetime as dt   

# datetime is not an external library
# it's part of Python; part of standard library; think of as internal libraries
# not imported automatically to preserve memory

# but imported the same way as pandas is to keep things consistent and simple

## Review of Python's datetime module
- internal library Python loads on demand
- not automatically loaded to preserve memory
- module is synonymous with library
- dt.date() constructor creates a date object
- dt.datetime() constructor creates a datetime object, which is a date plus a time hh:mm:ss

In [28]:
# datetime is a class inside the datetime module/library (the module and the class share a name)
# calling dt.datetime(...) creates a datetime object

# date object: YYYY-MM-DD
# datetime object: YYYY-MM-DD plus a time of day (hh:mm:ss)

In [29]:
# build a date object positionally: dt.date(year, month, day)
new_years_day = dt.date(2023, 1, 1)
new_years_day

datetime.date(2023, 1, 1)

In [30]:
# a date object exposes its parts as attributes; print each on its own line
print(new_years_day.year, new_years_day.month, new_years_day.day, sep = '\n')

2023
1
1


In [31]:
# build a datetime object positionally: dt.datetime(year, month, day, hour, minute)
# seconds are not given here, so they default to 0
maddie_bday = dt.datetime(2022, 3, 3, 1, 39)

maddie_bday

datetime.datetime(2022, 3, 3, 1, 39)

In [32]:
# human-readable text form; isoformat with a space separator is the same text str() produces
maddie_bday.isoformat(sep = ' ')

'2022-03-03 01:39:00'

In [33]:
# date parts of maddie_bday, one per line
print(maddie_bday.year, maddie_bday.month, maddie_bday.day, sep = '\n')

# time parts of maddie_bday, one per line
print(maddie_bday.hour, maddie_bday.minute, maddie_bday.second, sep = '\n')

2022
3
3
1
39
0


## Pandas timestamp object
- the pandas equivalent of Python's datetime object
  - pandas Timestamp is better suited to data analysis: it is purpose-built, so you can do things like add and subtract chunks of time, and it is more efficient
- basically a single moment in time: a date and a time, but called a timestamp
- the hour:minute:second part is optional; it defaults to midnight, 00:00:00
- created with the pd.Timestamp() constructor
  - input is usually a string, and the string format can be flexible
  - but it's best to stick with the YYYY-MM-DD format
  - a Python datetime object is accepted as well

In [34]:
# January 1, 2023, written as a YYYY-MM-DD string
iso_stamp = pd.Timestamp('2023-01-01')
iso_stamp

Timestamp('2023-01-01 00:00:00')

In [35]:
# also January 1, 2023, this time written as MM/DD/YY
# no time is given, so the time component falls back to midnight
slash_stamp = pd.Timestamp('01/01/23')
slash_stamp

Timestamp('2023-01-01 00:00:00')

In [36]:
# Maddie's birthday: March 3, 2022 at 1:39am, with the time in 24-hour HH:MM:SS form
bday_stamp = pd.Timestamp('2022-03-03 01:39:00')
bday_stamp

Timestamp('2022-03-03 01:39:00')

In [37]:
# the same moment (March 3, 2022 at 1:39am), with the time written in am/pm form
bday_stamp_ampm = pd.Timestamp('2022-03-03 1:39am')
bday_stamp_ampm

Timestamp('2022-03-03 01:39:00')

In [38]:
# feed a Python datetime object (from the standard-library datetime module);
# pandas converts it to the equivalent Timestamp
pd.Timestamp(dt.datetime(2023, 1, 1))

Timestamp('2023-01-01 00:00:00')

## pd.DatetimeIndex object
- a collection of pandas Timestamps

In [39]:
# a DatetimeIndex built from a plain list of date strings
list_of_dates = [
    '2023-01-01',
    '2023-01-02',
    '2023-01-03',
]
pd.DatetimeIndex(data = list_of_dates)

# the use case you'll meet more often: turning a date column/Series into the index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [40]:
# the same three days, this time as Python date objects instead of strings
another_list_of_dates = [dt.date(2023, 1, day_of_month) for day_of_month in range(1, 4)]
pd.DatetimeIndex(data = another_list_of_dates)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [41]:
# a Series whose row labels are dates: build the DatetimeIndex first, then attach it
date_index = pd.DatetimeIndex(list_of_dates)    # list_of_dates comes from an earlier cell
pd.Series([1, 2, 3], index = date_index)

2023-01-01    1
2023-01-02    2
2023-01-03    3
dtype: int64

In [42]:
# small purchases table; purchase_date starts out as plain strings
# and is turned into the index in the cells that follow
# NOTE: the column name 'purchase_amout' (sic) is kept as spelled, since the displayed outputs use it
fake_purchases = {
    'purchase_date': ['2023-01-01', '2023-01-02', '2023-01-03'],
    'purchase_amout': [100, 200, 300],
    'item': ['pen', 'pencil', 'paper'],
}
fake_dat = pd.DataFrame(data = fake_purchases)

fake_dat

Unnamed: 0,purchase_date,purchase_amout,item
0,2023-01-01,100,pen
1,2023-01-02,200,pencil
2,2023-01-03,300,paper


In [43]:
# overwrite the string column with its datetime version, then check the dtypes
fake_dat['purchase_date'] = fake_dat['purchase_date'].pipe(pd.to_datetime)
fake_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   purchase_date   3 non-null      datetime64[ns]
 1   purchase_amout  3 non-null      int64         
 2   item            3 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 200.0+ bytes


In [44]:
# move purchase_date into the index and look at the index that results
indexed_by_date = fake_dat.set_index('purchase_date')
indexed_by_date.index

# the column was already datetime, so the index comes out as a DatetimeIndex

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', name='purchase_date', freq=None)

## pd.to_datetime() method
- converts its input to a pandas time-related object
- e.g., a string or a Python date/datetime object becomes a pandas Timestamp; a list becomes a DatetimeIndex; a Series becomes a datetime64 Series
- errors = 'coerce' turns anything that can't be converted into NaT (not a time) instead of raising an error
  - default is errors = 'raise'

In [45]:
# a single date string comes back as a single Timestamp
single_stamp = pd.to_datetime('2023-01-01')
single_stamp

Timestamp('2023-01-01 00:00:00')

In [46]:
# a list of date strings comes back as a DatetimeIndex
date_strings = ['2023-01-01', '2023-01-02', '2023-01-03']
pd.to_datetime(date_strings)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [47]:
# convert a Python datetime object to a pandas Timestamp
pd.to_datetime(dt.datetime(2023, 1, 1))

# a list of Python datetime objects works the same way and gives a DatetimeIndex

Timestamp('2023-01-01 00:00:00')

In [48]:
# the everyday case: a Series of date strings that still needs converting
purchase_dates = pd.Series(['2023-01-01', '2023-01-02', '2023-01-03'])

purchase_dates

0    2023-01-01
1    2023-01-02
2    2023-01-03
dtype: object

In [49]:
# swap the string Series for its datetime version
purchase_dates = purchase_dates.pipe(pd.to_datetime)

purchase_dates

0   2023-01-01
1   2023-01-02
2   2023-01-03
dtype: datetime64[ns]

In [50]:
# what happens when some entries aren't real dates?
# the first two are fine; 'hello' and Feb 31 are not dates
messy_dates = pd.Series(['2023-01-01', '2023-01-02', 'hello', '2023-02-31'])

pd.to_datetime(messy_dates, errors = 'coerce')    # default is errors = 'raise'

# NaT means "not a time": the value couldn't be converted into a pandas datetime
# it plays the same role NaN plays for numbers

0   2023-01-01
1   2023-01-02
2          NaT
3          NaT
dtype: datetime64[ns]

In [51]:
# unix time counts seconds from midnight on Jan 1, 1970
# unit = 's' tells pandas the numbers are seconds
unix_seconds = [1, 2, 3]
pd.to_datetime(unix_seconds, unit = 's')

DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02',
               '1970-01-01 00:00:03'],
              dtype='datetime64[ns]', freq=None)

## pd.date_range() method
- create a date index quickly based on start, end, and frequency (freq)

In [53]:
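# start, end, periods, and freq
# pandas needs three of these four to pin down the range;
# freq falls back to 'D' (daily) when left out, so start + end alone is enough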

In [54]:
# every calendar day of 2023, Jan 1 through Dec 31
# 'D' steps one day at a time; '2D' would step two days, '3D' three days
pd.date_range('2023-01-01', '2023-12-31', freq = 'D')

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06', '2023-01-07', '2023-01-08',
               '2023-01-09', '2023-01-10',
               ...
               '2023-12-22', '2023-12-23', '2023-12-24', '2023-12-25',
               '2023-12-26', '2023-12-27', '2023-12-28', '2023-12-29',
               '2023-12-30', '2023-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [56]:
# other freq arguments
# B for business days: skips Saturdays and Sundays only; holidays are NOT skipped
# (skipping holidays needs a custom business-day offset with a holiday calendar)
(
    pd.date_range(
        start = '2018-12-20', 
        end = '2018-12-31', 
        freq = 'B'                  # excludes weekends, but includes Christmas Day (2018-12-25)
    )
)

DatetimeIndex(['2018-12-20', '2018-12-21', '2018-12-24', '2018-12-25',
               '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-31'],
              dtype='datetime64[ns]', freq='B')

In [57]:
# W for week; see documentation; W-SUN, W-MON, etc. pick the anchoring weekday
# H for hour; can precede with a number for that many hours (spelled 'h' in pandas 2.2+)
# M for month end, last day of month (spelled 'ME' in pandas 2.2+)

In [58]:
# month-end dates between Jan 1 and Dec 1, 2023
# pd.offsets.MonthEnd() is the offset behind the 'M' alias; the alias was renamed
# to 'ME' in pandas 2.2 (plain 'M' is deprecated there), while the offset object
# works on both older and newer versions
pd.date_range(
    start = '2023-01-01', 
    end = '2023-12-01',      
    freq = pd.offsets.MonthEnd()    # last date is Nov 30: Dec 31 falls after end = Dec 1, so it is left out
)

DatetimeIndex(['2023-01-31', '2023-02-28', '2023-03-31', '2023-04-30',
               '2023-05-31', '2023-06-30', '2023-07-31', '2023-08-31',
               '2023-09-30', '2023-10-31', '2023-11-30'],
              dtype='datetime64[ns]', freq='M')

In [59]:
# MS = month start: the first day of every month in the range
# the end date, Dec 1, is itself a month start, so it is included
pd.date_range('2023-01-01', '2023-12-01', freq = 'MS')

DatetimeIndex(['2023-01-01', '2023-02-01', '2023-03-01', '2023-04-01',
               '2023-05-01', '2023-06-01', '2023-07-01', '2023-08-01',
               '2023-09-01', '2023-10-01', '2023-11-01', '2023-12-01'],
              dtype='datetime64[ns]', freq='MS')

In [61]:
# year end: the last day (Dec 31) of each year in the range
# pd.offsets.YearEnd() is the offset behind the 'A' alias; the alias is deprecated
# since pandas 2.2 (spelled 'YE' there), while the offset object works on both
# older and newer versions
pd.date_range(
  start = '2023-01-01', 
  end = '2050-01-01',
  freq = pd.offsets.YearEnd()    # last date is 2049-12-31: 2050-12-31 falls after end
)

DatetimeIndex(['2023-12-31', '2024-12-31', '2025-12-31', '2026-12-31',
               '2027-12-31', '2028-12-31', '2029-12-31', '2030-12-31',
               '2031-12-31', '2032-12-31', '2033-12-31', '2034-12-31',
               '2035-12-31', '2036-12-31', '2037-12-31', '2038-12-31',
               '2039-12-31', '2040-12-31', '2041-12-31', '2042-12-31',
               '2043-12-31', '2044-12-31', '2045-12-31', '2046-12-31',
               '2047-12-31', '2048-12-31', '2049-12-31'],
              dtype='datetime64[ns]', freq='A-DEC')

## pd.date_range() method, continued
- create a DatetimeIndex of a specified length using periods parameter

In [62]:
# 25 consecutive days starting on Sep 12, 2012
# periods sets how many dates to generate, so no end date is needed
(
    pd.date_range(
        start = '2012-09-12', 
        periods = 25,
        freq = 'D'
    )
)

# length of resulting DatetimeIndex is 25 (last date: 2012-10-06)

DatetimeIndex(['2012-09-12', '2012-09-13', '2012-09-14', '2012-09-15',
               '2012-09-16', '2012-09-17', '2012-09-18', '2012-09-19',
               '2012-09-20', '2012-09-21', '2012-09-22', '2012-09-23',
               '2012-09-24', '2012-09-25', '2012-09-26', '2012-09-27',
               '2012-09-28', '2012-09-29', '2012-09-30', '2012-10-01',
               '2012-10-02', '2012-10-03', '2012-10-04', '2012-10-05',
               '2012-10-06'],
              dtype='datetime64[ns]', freq='D')

In [65]:
# 50 business days starting on April 15, 2022
# 'B' leaves out weekends
pd.date_range('2022-04-15', periods = 50, freq = 'B')

# length of resulting DatetimeIndex is 50

DatetimeIndex(['2022-04-15', '2022-04-18', '2022-04-19', '2022-04-20',
               '2022-04-21', '2022-04-22', '2022-04-25', '2022-04-26',
               '2022-04-27', '2022-04-28', '2022-04-29', '2022-05-02',
               '2022-05-03', '2022-05-04', '2022-05-05', '2022-05-06',
               '2022-05-09', '2022-05-10', '2022-05-11', '2022-05-12',
               '2022-05-13', '2022-05-16', '2022-05-17', '2022-05-18',
               '2022-05-19', '2022-05-20', '2022-05-23', '2022-05-24',
               '2022-05-25', '2022-05-26', '2022-05-27', '2022-05-30',
               '2022-05-31', '2022-06-01', '2022-06-02', '2022-06-03',
               '2022-06-06', '2022-06-07', '2022-06-08', '2022-06-09',
               '2022-06-10', '2022-06-13', '2022-06-14', '2022-06-15',
               '2022-06-16', '2022-06-17', '2022-06-20', '2022-06-21',
               '2022-06-22', '2022-06-23'],
              dtype='datetime64[ns]', freq='B')

In [66]:
# 10 weekly dates, each falling on a Monday ('W-MON')
# the range starts from Jan 1, 2023, which is a Sunday,
# so the first date produced is Monday, Jan 2

pd.date_range('2023-01-01', periods = 10, freq = 'W-MON')

DatetimeIndex(['2023-01-02', '2023-01-09', '2023-01-16', '2023-01-23',
               '2023-01-30', '2023-02-06', '2023-02-13', '2023-02-20',
               '2023-02-27', '2023-03-06'],
              dtype='datetime64[ns]', freq='W-MON')

## pd.date_range() method, continued