In [1]:
# Python has a built-in datatime object in the datetime library
from datetime import datetime
import pandas as pd

In [2]:
# to get the current time 
time_now = datetime.now()
print(time_now)

2020-07-04 13:44:25.310029


In [3]:
type(time_now)

datetime.datetime

In [4]:
time = datetime(1970,12,25) # yyyy mm dd
diff = time_now - time

In [5]:
print(diff)

18089 days, 13:44:25.310029


In [6]:
type(diff)

datetime.timedelta

# Conversion to datetime

In [7]:
import pandas as pd
import numpy as np
data = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\country_timeseries.csv")

In [8]:
data.shape

(122, 18)

In [9]:
data.head(10)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,0.0
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,0.0
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,0.0
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,0.0
5,12/28/2014,281,2706.0,8018.0,9446.0,,,,,,1708.0,3423.0,2758.0,,,,,
6,12/27/2014,280,2695.0,,9409.0,,,,,,1697.0,,2732.0,,,,,
7,12/24/2014,277,2630.0,7977.0,9203.0,,,,,,,3413.0,2655.0,,,,,
8,12/21/2014,273,2597.0,,9004.0,,,,,,1607.0,,2582.0,,,,,
9,12/20/2014,272,2571.0,7862.0,8939.0,,,,,,1586.0,3384.0,2556.0,,,,,


In [10]:
data.info() # you might noticed that the Date column is encoded as a generic string object in Pandas

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            17 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.3+ KB


In [11]:
# the following line create a new column that convert the "Date" column into a datetime object

data["Date_dt"] = pd.to_datetime(data["Date"])

In [12]:
# we can format the date as follows : 
data["Data_dt_1"] = pd.to_datetime(data["Date"], format = '%m/%d/%Y')

In [13]:
data["Date_dt_2"] = pd.to_datetime(data["Date"]).dt.strftime('%d-%m-%Y') 
# this creates a new column and format the date as d/m/yyyy

# Loading data that contains dates

In [14]:
# the read csv function has a lot of parameters such as parse_date, inher_datetime_format, keep_date_col
# we can parse the Date column directly by specifying the column we want in the parse_dates parameter
# for more information, refer to < https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html >

In [15]:
data_revised = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\country_timeseries.csv", parse_dates = [0])
# by specifying the date column index in the read_csv function under the parse_date parameters, the "Date" column is automatically read as a DateTime object

In [16]:
data_revised.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null datetime64[ns]
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            17 non-null float64
dtypes: datetime64[ns](1), float64(16), int64(1)
memory usage: 17.3 KB


# Extracting Date Components

In [17]:
d = pd.to_datetime("2016-03-12")
print(d)

2016-03-12 00:00:00


In [18]:
print(type(d)) # if we pass a single string, we get a Timestamp

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [19]:
# we can access the various date component

print(d.year)
print(d.month)
print(d.day)

2016
3
12


In [20]:
# We can create a new column that store year based on the "Date" column 

data["Year"] = data["Date_dt"].dt.year # note that Date column in the data variable is a object, NOT a datetime object 
data[["Date", "Date_dt","Year"]].head()

Unnamed: 0,Date,Date_dt,Year
0,1/5/2015,2015-01-05,2015
1,1/4/2015,2015-01-04,2015
2,1/3/2015,2015-01-03,2015
3,1/2/2015,2015-01-02,2015
4,12/31/2014,2014-12-31,2014


In [21]:
data_revised["Year"] = data_revised["Date"].dt.year # note that the Date column in the data_revised variable is a datetime object 
data_revised[["Date","Year"]].head()

Unnamed: 0,Date,Year
0,2015-01-05,2015
1,2015-01-04,2015
2,2015-01-03,2015
3,2015-01-02,2015
4,2014-12-31,2014


In [22]:
data_revised["Month"], data_revised["Day"] = data_revised["Date"].dt.month, data_revised["Date"].dt.day
data_revised[["Year","Month","Day"]].head()

Unnamed: 0,Year,Month,Day
0,2015,1,5
1,2015,1,4
2,2015,1,3
3,2015,1,2
4,2014,12,31


# Date Calculations and Timedeltas

In [23]:
# to obtain the first day of the outbreak, i.e. the earliest day of the outbreak, we need to get the minimum date value

outbreak = data_revised["Date"].min()
print(outbreak)

2014-03-22 00:00:00


In [24]:
type(outbreak)

pandas._libs.tslibs.timestamps.Timestamp

In [25]:
data_revised["days_since_outbreak"] = data_revised["Date"] - outbreak

In [26]:
data_revised.info() # notice that the column "days_since_outbreak" is a timedelta object
# note that whenever perform calculation on a datetime object, we return a timedelta object


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 21 columns):
Date                   122 non-null datetime64[ns]
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            17 non-null float64
Year                   122 non-null int64
Month                  122 non-null int64
days_since_outbreak    122 non

In [27]:
data_revised.tail()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,...,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali,Year,Month,days_since_outbreak
117,2014-03-27,27,103.0,8.0,6.0,,,,,,...,6.0,5.0,,,,,,2014,3,5 days
118,2014-03-26,26,86.0,,,,,,,,...,,,,,,,,2014,3,4 days
119,2014-03-25,25,86.0,,,,,,,,...,,,,,,,,2014,3,3 days
120,2014-03-24,24,86.0,,,,,,,,...,,,,,,,,2014,3,2 days
121,2014-03-22,22,49.0,,,,,,,,...,,,,,,,,2014,3,0 days


# Datetime Methods 

In [28]:
banks = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\banklist.csv", parse_dates= [5,6])
# note that we are parsing the date column at column index value of 5 and 6

In [29]:
banks.columns

Index(['Bank Name', 'City', 'ST', 'CERT', 'Acquiring Institution',
       'Closing Date', 'Updated Date'],
      dtype='object')

In [30]:
banks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 553 entries, 0 to 552
Data columns (total 7 columns):
Bank Name                553 non-null object
City                     553 non-null object
ST                       553 non-null object
CERT                     553 non-null int64
Acquiring Institution    553 non-null object
Closing Date             553 non-null datetime64[ns]
Updated Date             553 non-null datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 30.4+ KB


In [31]:
banks["close_year"], banks["close_quarter"], banks["close_month"] = banks["Closing Date"].dt.year, banks["Closing Date"].dt.quarter, banks["Closing Date"].dt.month 

In [32]:
close_year_report = banks.groupby(["close_year"]).size() # size method simply the count the number of row items 
close_year_report_quarter = banks.groupby(["close_year","close_quarter"]).size()

In [33]:
print(close_year_report)

close_year
2000      2
2001      4
2002     11
2003      3
2004      4
2007      3
2008     25
2009    140
2010    157
2011     92
2012     51
2013     24
2014     18
2015      8
2016      5
2017      6
dtype: int64


In [34]:
print(close_year_report_quarter)

close_year  close_quarter
2000        4                 2
2001        1                 1
            2                 1
            3                 2
2002        1                 6
            2                 2
            3                 1
            4                 2
2003        1                 1
            2                 1
            4                 1
2004        1                 3
            2                 1
2007        1                 1
            3                 1
            4                 1
2008        1                 2
            2                 2
            3                 9
            4                12
2009        1                21
            2                24
            3                50
            4                45
2010        1                41
            2                45
            3                41
            4                30
2011        1                26
            2                22
            3 

In [35]:
import matplotlib.pyplot as plt 

fig, ax = plt.subplots()
ax = close_year_report.plot()
plt.show()

fig, ax = plt.subplots()
ax = close_year_report_quarter.plot()
plt.show()

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

# Getting stock data

In [36]:
tesla = pd.read_csv(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\tesla_stock_yahoo.csv", parse_dates= [0])
tesla.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300
1,2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100
2,2010-07-01,25.0,25.92,20.27,21.959999,21.959999,8218800
3,2010-07-02,23.0,23.1,18.709999,19.200001,19.200001,5139800
4,2010-07-06,20.0,20.0,15.83,16.110001,16.110001,6866900


In [37]:
tesla.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1791 entries, 0 to 1790
Data columns (total 7 columns):
Date         1791 non-null datetime64[ns]
Open         1791 non-null float64
High         1791 non-null float64
Low          1791 non-null float64
Close        1791 non-null float64
Adj Close    1791 non-null float64
Volume       1791 non-null int64
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 98.1 KB


In [38]:
tesla.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1791.0,1791.0,1791.0,1791.0,1791.0,1791.0
mean,143.053222,145.496315,140.449425,143.039464,143.039464,4448296.0
std,101.898965,103.311576,100.320158,101.886058,101.886058,4308184.0
min,16.139999,16.629999,14.98,15.8,15.8,118500.0
25%,30.564999,31.355,29.730001,30.66,30.66,1261750.0
50%,176.160004,180.470001,171.199997,177.110001,177.110001,3427400.0
75%,227.209999,230.794999,222.860001,226.940002,226.940002,5974700.0
max,386.690002,386.98999,379.350006,383.450012,383.450012,37163900.0


In [39]:
tesla.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')

In [40]:
# if we need data from June 2010, we can : 

tesla_june2010 = tesla.loc[(tesla.Date.dt.year == 2010) & (tesla.Date.dt.month == 6)]
tesla_june2010.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300
1,2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100


# Datetimeindex Object

In [41]:
# in Pandas, by default, the row number is the row index
tesla.index = tesla["Date"] # here we assign the row index to the Date column
print(tesla.index)

DatetimeIndex(['2010-06-29', '2010-06-30', '2010-07-01', '2010-07-02',
               '2010-07-06', '2010-07-07', '2010-07-08', '2010-07-09',
               '2010-07-12', '2010-07-13',
               ...
               '2017-07-26', '2017-07-27', '2017-07-28', '2017-07-31',
               '2017-08-01', '2017-08-02', '2017-08-03', '2017-08-04',
               '2017-08-07', '2017-08-08'],
              dtype='datetime64[ns]', name='Date', length=1791, freq=None)


In [42]:
# with the index set as a date object, the index is now a datetimeindex
# with the index set as the datetimeindex object, we can use the date directly to subset the row data

In [43]:
print(tesla["2015"].head(2))
# if the date is not set to the index, tesla["2015"] will generate an error

Date        Open    High         Low       Close   Adj Close  \
Date                                                                            
2015-01-02 2015-01-02  222.869995  223.25  213.259995  219.309998  219.309998   
2015-01-05 2015-01-05  214.550003  216.50  207.160004  210.089996  210.089996   

             Volume  
Date                 
2015-01-02  4764400  
2015-01-05  5368500  


In [44]:
tesla.Date.dt.year.unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017], dtype=int64)

In [45]:
tesla["2010-06"].iloc[:,:5]

Unnamed: 0_level_0,Date,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-06-29,2010-06-29,19.0,25.0,17.540001,23.889999
2010-06-30,2010-06-30,25.790001,30.42,23.299999,23.83


# Timedeltaindex object

In [46]:
# similar to how we use a datetime object to create a datetimeindex, we can use timedelta object to create a timedeltaindex

tesla["ref_date"] = tesla["Date"] - tesla["Date"].min() # this minus the date column with the min (earliest) value of the date column
print(tesla["ref_date"].head())
tesla.index = tesla["ref_date"]

Date
2010-06-29   0 days
2010-06-30   1 days
2010-07-01   2 days
2010-07-02   3 days
2010-07-06   7 days
Name: ref_date, dtype: timedelta64[ns]


In [47]:
print(tesla.head(5)) # the index column will alway be on the extreme left of the DataFrame object

Date       Open   High        Low      Close  Adj Close  \
ref_date                                                                 
0 days   2010-06-29  19.000000  25.00  17.540001  23.889999  23.889999   
1 days   2010-06-30  25.790001  30.42  23.299999  23.830000  23.830000   
2 days   2010-07-01  25.000000  25.92  20.270000  21.959999  21.959999   
3 days   2010-07-02  23.000000  23.10  18.709999  19.200001  19.200001   
7 days   2010-07-06  20.000000  20.00  15.830000  16.110001  16.110001   

            Volume ref_date  
ref_date                     
0 days    18766300   0 days  
1 days    17187100   1 days  
2 days     8218800   2 days  
3 days     5139800   3 days  
7 days     6866900   7 days  


# Date Range 

In [48]:
# Not every dataset will have a fixed frequency of value
# in the data_revised variable which contain the ebola dataset, we do not have an observations for every day in the date range 
# for example, 2015-01-01 and 2014-03-23 is missing from the dataset
# it is common practise to create a date range to reindex a dataset, we can leverage on date_range function for this purpose 

In [49]:
# first, lets create a date range for specific range. for example the head range

head_range = pd.date_range(start="2014-12-31", end="2015-01-05")
print(head_range)

DatetimeIndex(['2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
               '2015-01-04', '2015-01-05'],
              dtype='datetime64[ns]', freq='D')


In [50]:
data_revised_5 = data_revised.head(5) # we will work with the first 5 rows of the data 

In [51]:
# second, we set the original Date column as the index 
data_revised_5.index = data_revised_5["Date"]

In [52]:
# third, we reindex our data 
data_revised_5.reindex(head_range)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,...,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali,Year,Month,days_since_outbreak
2014-12-31,2014-12-31,31.0,2730.0,8115.0,9633.0,,,,,,...,3471.0,2827.0,,,,,0.0,2014.0,12.0,284 days
2015-01-01,NaT,,,,,,,,,,...,,,,,,,,,,NaT
2015-01-02,2015-01-02,2.0,,8157.0,,,,,,,...,3496.0,,,,,,0.0,2015.0,1.0,286 days
2015-01-03,2015-01-03,3.0,2769.0,8166.0,9722.0,,,,,,...,3496.0,2915.0,,,,,0.0,2015.0,1.0,287 days
2015-01-04,2015-01-04,4.0,2775.0,,9780.0,,,,,,...,,2943.0,,,,,0.0,2015.0,1.0,288 days
2015-01-05,2015-01-05,5.0,2776.0,,10030.0,,,,,,...,,2977.0,,,,,0.0,2015.0,1.0,289 days


In [53]:
print(data_revised_5.iloc[:,:5])

Date  Day  Cases_Guinea  Cases_Liberia  Cases_SierraLeone
Date                                                                      
2015-01-05 2015-01-05    5        2776.0            NaN            10030.0
2015-01-04 2015-01-04    4        2775.0            NaN             9780.0
2015-01-03 2015-01-03    3        2769.0         8166.0             9722.0
2015-01-02 2015-01-02    2           NaN         8157.0                NaN
2014-12-31 2014-12-31   31        2730.0         8115.0             9633.0


# Frequencies 

In [54]:
# When we created our head_range, the print statement included a parameter called freq
# in the above example, freq was "D" which stands for "day"
# that is the values in our date range were stepped through using a day-by-day increment 

pd.date_range(start="2017-01-01", end="2017-01-07", freq = "B" ) # note that 2017-01-01 was a Sunday
# refer to page 229 for the list of frequencies 

DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05',
               '2017-01-06'],
              dtype='datetime64[ns]', freq='B')

In [57]:
pd.date_range(start="2020-01-01", periods=5, freq="MS")

DatetimeIndex(['2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01',
               '2020-05-01'],
              dtype='datetime64[ns]', freq='MS')

# Shifting Values

In [62]:
import matplotlib.pyplot as plt
data.index = data["Date"]

In [65]:
plt.plot(data)

TypeError: unhashable type: 'numpy.ndarray'