<a href="https://colab.research.google.com/github/jack-cao-623/python_learning/blob/main/pandas_dates_and_times.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Pandas Dates and Times
- Python has a built-in `datetime` module for working with dates and times

In [65]:
# libraries needed
import numpy as np
import pandas as pd
import datetime as dt   

# datetime is not an external library
# it's part of Python; part of standard library; think of as internal libraries
# not imported automatically to preserve memory

# but imported the same way as pandas is to keep things consistent and simple

## Review of Python's datetime module
- internal (standard-library) module that Python loads on demand
- not automatically loaded, to preserve memory
- "module" is used here as a synonym for "library"
- dt.date() creates a date object
- dt.datetime() creates a datetime object, which is a date plus a time (hh:mm:ss)

In [66]:
# dt.datetime is part of the datetime module/library
# calling dt.datetime() creates a datetime object

# date object: YYYY-MM-DD
# datetime object: YYYY-MM-DD plus a time of day (hh:mm:ss)

In [67]:
# build a date object by calling dt.date with year, month, day (in that order)
new_years_day = dt.date(2023, 1, 1)
new_years_day

datetime.date(2023, 1, 1)

In [68]:
# year, month and day are plain attributes of the date object;
# print them one per line
print(new_years_day.year, new_years_day.month, new_years_day.day, sep = '\n')

2023
1
1


In [69]:
# build a datetime object by calling dt.datetime with
# year, month, day, hour, minute (in that order); seconds are left at their default
maddie_bday = dt.datetime(2022, 3, 3, 1, 39)

maddie_bday

datetime.datetime(2022, 3, 3, 1, 39)

In [70]:
# view in alternative format: str() gives the 'YYYY-MM-DD HH:MM:SS' text form
str(maddie_bday)

'2022-03-03 01:39:00'

In [71]:
# each component of the datetime is exposed as an attribute; print one per line
# (second was not supplied when maddie_bday was built, so it prints 0)
print(
    maddie_bday.year, maddie_bday.month, maddie_bday.day,        # date part
    maddie_bday.hour, maddie_bday.minute, maddie_bday.second,    # time part
    sep = '\n'
)

2022
3
3
1
39
0


## Pandas timestamp object
- equivalent of Python datetime object
  - pandas Timestamp is better for data analysis: it is custom
  built for pandas, so you can do things like add and subtract chunks of time,
  and it is more efficient
- basically a single moment in time: a date and a time, but called a timestamp
- the hour:minute:second part is optional; it defaults to midnight, 00:00:00
- pd.Timestamp() method
  - input is typically a string, and the string format can be flexible (a Python datetime object also works)
  - but it is best to stick with the YYYY-MM-DD format

In [72]:
# January 1, 2023, from a 'YYYY-MM-DD' string
# no time is given, so the time component defaults to midnight
pd.Timestamp('2023-01-01')

Timestamp('2023-01-01 00:00:00')

In [73]:
# also January 1, 2023, this time from a slash-separated string with a two-digit year
# (this format is harder to read unambiguously; 'YYYY-MM-DD' is the safer choice)
pd.Timestamp('01/01/23')
  # note the midnight default for the time component

Timestamp('2023-01-01 00:00:00')

In [74]:
# March 3, 2022 at 1:39am, Maddie's birthday
# date followed by a 24-hour 'HH:MM:SS' time
pd.Timestamp('2022-03-03 01:39:00')

Timestamp('2022-03-03 01:39:00')

In [75]:
# also March 3, 2022 at 1:39am, Maddie's birthday
# a 12-hour time with an am/pm suffix parses to the same Timestamp
pd.Timestamp('2022-03-03 1:39am')

Timestamp('2022-03-03 01:39:00')

In [76]:
# feed a Python datetime object (from the standard-library datetime module)
# pd.Timestamp converts it to the pandas equivalent
pd.Timestamp(dt.datetime(2023, 1, 1))

Timestamp('2023-01-01 00:00:00')

## pd.DatetimeIndex object
- a collection of pandas Timestamps

In [77]:
# follow along video: three consecutive days written as 'YYYY-MM-DD' strings
list_of_dates = [
    '2023-01-01',
    '2023-01-02',
    '2023-01-03',
]

# a DatetimeIndex built directly from the list of strings
pd.DatetimeIndex(data = list_of_dates)

# the more relevant use case you'll encounter: converting a date column/Series to the index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [78]:
# same three days, this time as Python date objects instead of strings
another_list_of_dates = [dt.date(2023, 1, day_of_month) for day_of_month in range(1, 4)]

# DatetimeIndex accepts the Python date objects as well
pd.DatetimeIndex(data = another_list_of_dates)

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [79]:
# a Series whose values are 1, 2, 3 and whose index is a DatetimeIndex
# built from list_of_dates (defined in an earlier cell)
pd.Series([1, 2, 3], index = pd.DatetimeIndex(list_of_dates))

2023-01-01    1
2023-01-02    2
2023-01-03    3
dtype: int64

In [80]:
# toy purchases table used to show turning a date column/Series into the index;
# purchase_date starts out as plain strings here
# NOTE(review): 'purchase_amout' looks like a typo for 'purchase_amount'; the
# spelling is left unchanged in case other cells refer to the column by this name
fake_dat = pd.DataFrame(
    data = dict(
        purchase_date = ['2023-01-01', '2023-01-02', '2023-01-03'],
        purchase_amout = [100, 200, 300],
        item = ['pen', 'pencil', 'paper'],
    )
)

fake_dat

Unnamed: 0,purchase_date,purchase_amout,item
0,2023-01-01,100,pen
1,2023-01-02,200,pencil
2,2023-01-03,300,paper


In [81]:
# parse the purchase_date strings and write the result back into the same column,
# then check the column dtypes with .info()
fake_dat['purchase_date'] = pd.to_datetime(arg = fake_dat['purchase_date'])
fake_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   purchase_date   3 non-null      datetime64[ns]
 1   purchase_amout  3 non-null      int64         
 2   item            3 non-null      object        
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 200.0+ bytes


In [82]:
# make purchase_date the index and look at the resulting index object
# (the result is not assigned back, so this cell only inspects it)
fake_dat.set_index(keys = 'purchase_date').index

# because the column was already datetime, the index comes out as a DatetimeIndex

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', name='purchase_date', freq=None)

## pd.to_datetime() method
- converts its input to a pandas time-related object
- e.g., a string or Python date object becomes a pandas Timestamp; a list becomes a DatetimeIndex; a Series becomes a datetime64 Series
- errors = 'coerce' returns NaT ("not a time") when something can't be converted to a pandas datetime
  - default is errors = 'raise'

In [83]:
# convert a single date string to a pandas datetime (a Timestamp)
pd.to_datetime('2023-01-01')

Timestamp('2023-01-01 00:00:00')

In [84]:
# can also convert a list of strings; the result is a DatetimeIndex
pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03'])

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03'], dtype='datetime64[ns]', freq=None)

In [85]:
# convert a Python datetime object to a pandas datetime (a Timestamp)
pd.to_datetime(dt.datetime(2023, 1, 1))

# same with a list of Python datetime objects

Timestamp('2023-01-01 00:00:00')

In [86]:
# common use: an existing Series of date strings that needs converting to datetime
# (here the values are still plain strings)
purchase_dates = pd.Series(['2023-01-01', '2023-01-02', '2023-01-03'])

purchase_dates

0    2023-01-01
1    2023-01-02
2    2023-01-03
dtype: object

In [87]:
# run the purchase_dates Series through pd.to_datetime and
# rebind the name to the converted (datetime) Series
purchase_dates = purchase_dates.pipe(pd.to_datetime)

purchase_dates

0   2023-01-01
1   2023-01-02
2   2023-01-03
dtype: datetime64[ns]

In [93]:
# what if some of the values aren't valid dates?
# the first two entries are fine; 'hello' and Feb 31 aren't dates
# errors = 'coerce' turns unparseable entries into NaT (the default, 'raise', would error instead)
pd.to_datetime(
    pd.Series(['2023-01-01', '2023-01-02', 'hello', '2023-02-31']),
    errors = 'coerce',
)

# NaT means "not a time": the value can't be converted into a pandas datetime
# akin to NaN

0   2023-01-01
1   2023-01-02
2          NaT
3          NaT
dtype: datetime64[ns]

In [96]:
# unix time: number of seconds since Jan 1, 1970 at midnight (UTC)
# the values are plain integers, so unit = 's' is needed to say they are seconds
pd.to_datetime([1, 2, 3], unit = 's')

DatetimeIndex(['1970-01-01 00:00:01', '1970-01-01 00:00:02',
               '1970-01-01 00:00:03'],
              dtype='datetime64[ns]', freq=None)

## pd.date_range() method