# Part 2 - Crude Oil Export in CAD and GDP Contribution of Oil Export from Canada

In [1]:
# Dependencies and Setup
import pandas as pd
import numpy as np
import datetime

In [2]:
# Locations of the raw input files used in this notebook, in table-number order
# Table 4: Canadian oil export by type (Excel workbook)
table_4_file = "Resources/Raw Data/Table 4 - Canadian Oil Export by Type.xls"
# Table 6: USD-CAD exchange rate (CSV produced by an API call)
table_6_file = "Resources/Raw Data/Table 6 - USD-CAD Exchange Rate (API Output).csv"
# Table 8: Canada GDP data (CSV)
table_8_file = "Resources/Raw Data/Table 8 - Canada GDP Data.csv"


# Table 4 - Canadian Oil Export by Type

In [3]:
# Load the raw Table 4 workbook untouched and preview its first 20 rows
table_4_df = pd.read_excel(io = table_4_file)
table_4_df.head(n = 20)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,,,,,,,,,,,,
1,,,,,,,,,,,,
2,Crude Oil Exports - Summary by Type - Monthly\...,,,,,,,,,,,
3,,,,,,,,,,,,
4,,,,Light\n(> 30 API),,,Medium\n(25 - 30 API),Heavy\n(<25 API),,,,
5,Period:,,,,,,,,,Total,,
6,2000-01-01,,,102223,,,25889.1,90128.1,,218240,,
7,2000-02-01,,,104746,,,27786.6,81843.1,,214376,,
8,2000-03-01,,,101307,,,26496,91580.9,,219384,,
9,2000-04-01,,,106014,,,25062.3,78239.9,,209316,,


In [6]:
# Row 6 is the first row holding actual data in the preview of the raw sheet.
# The columns that are non-null in that row are the ones that carry data
# (date, the three oil grades and the total); all others are empty spacers.
sample_row = table_4_df.iloc[6]
list_columns = sample_row.dropna().index

# Keep only the data columns, drop every row with a missing value (this removes
# the title/header rows above the data) and renumber the rows from 0.
# The steps are chained without inplace = True so the result is a new,
# independent frame: mutating a column selection of table_4_df in place raises
# SettingWithCopyWarning and makes the cell harder to re-run safely.
cleaned_table_4_df = (
    table_4_df[list_columns]
    .dropna(how = "any")
    .reset_index(drop = True)
)

# Give the five surviving columns readable names
cleaned_table_4_df.columns = ["Date", "Light Oil", "Medium Oil", "Heavy Oil", "Oil Export (m3 per day)"]
cleaned_table_4_df.head(20)

Unnamed: 0,Date,Light Oil,Medium Oil,Heavy Oil,Oil Export (m3 per day)
0,2000-01-01,102223.0,25889.1,90128.1,218240
1,2000-02-01,104746.0,27786.6,81843.1,214376
2,2000-03-01,101307.0,26496.0,91580.9,219384
3,2000-04-01,106014.0,25062.3,78239.9,209316
4,2000-05-01,108434.0,31567.8,97939.7,237941
5,2000-06-01,114142.0,24275.0,93868.2,232286
6,2000-07-01,94025.7,30742.3,87349.9,212118
7,2000-08-01,98057.1,28718.4,84657.0,211432
8,2000-09-01,95258.6,28253.1,80536.2,204048
9,2000-10-01,100042.0,28735.0,90494.5,219271


In [8]:
# Keep only the date and the total export volume, restricted to 2005 onwards
# NOTE(review): the 2005 cut-off presumably aligns with the exchange-rate table,
# whose preview starts in January 2005 — confirm.
export_columns = ["Date", "Oil Export (m3 per day)"]
from_2005_mask = cleaned_table_4_df["Date"] >= "2005-01-01"

final_table_4_df = (
    cleaned_table_4_df
    .loc[from_2005_mask, export_columns]
    .reset_index(drop = True)
)
final_table_4_df.head()

Unnamed: 0,Date,Oil Export (m3 per day)
0,2005-01-01,238379
1,2005-02-01,236555
2,2005-03-01,230862
3,2005-04-01,246359
4,2005-05-01,278832


# Table 8 - Canada GDP Data

In [45]:
# Load the raw Canada GDP csv untouched and preview its first 20 rows
table_8_df = pd.read_csv(filepath_or_buffer = table_8_file)
table_8_df.head(n = 20)

Unnamed: 0,"Gross domestic product (GDP) at basic prices, by industry, monthly, industry detail (x 1,000,000) 1 2",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 239,Unnamed: 240,Unnamed: 241,Unnamed: 242,Unnamed: 243,Unnamed: 244,Unnamed: 245,Unnamed: 246,Unnamed: 247,Unnamed: 248
0,Monthly,,,,,,,,,,...,,,,,,,,,,
1,Table: 36-10-0434-04 (formerly CANSIM 379-0031),,,,,,,,,,...,,,,,,,,,,
2,Geography: Canada,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,Seasonally adjusted at annual rates,,,,,,,,,...,,,,,,,,,,
5,,Chained (2012) dollars 3,,,,,,,,,...,,,,,,,,,,
6,,Canada,,,,,,,,,...,,,,,,,,,,
7,North American Industry Classification System ...,Jan-00,Feb-00,Mar-00,Apr-00,May-00,Jun-00,Jul-00,Aug-00,Sep-00,...,19-Nov,19-Dec,20-Jan,20-Feb,20-Mar,20-Apr,20-May,20-Jun,20-Jul,20-Aug
8,,Dollars,,,,,,,,,...,,,,,,,,,,
9,"Agriculture, forestry, fishing and hunting [11]",28095,27638,28461,28216,28397,28848,28793,28054,27649,...,41436,40718,40665,41001,41397,41228,41985,42196,42008,42010


In [130]:
# Drop every row that has at least one missing cell; in the raw preview this
# removes the title/metadata rows that sit above the data
new_table_8_df = table_8_df.dropna(axis = 0, how = "any")

# Build two row masks on the first column (the industry label):
#   filter_1 - the header row, whose label mentions "NAICS" and whose cells hold the month labels
#   filter_2 - every industry row whose label contains the word "Oil"
label_column = new_table_8_df.columns[0]
table_8_filter_1 = new_table_8_df[label_column].str.contains("NAICS")
table_8_filter_2 = new_table_8_df[label_column].str.contains("Oil")

# Stack the header row on top of the oil rows.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and gives the same row order.
new_table_8_df = pd.concat(
    [new_table_8_df[table_8_filter_1], new_table_8_df[table_8_filter_2]]
).reset_index(drop = True)
new_table_8_df

Unnamed: 0,"Gross domestic product (GDP) at basic prices, by industry, monthly, industry detail (x 1,000,000) 1 2",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 239,Unnamed: 240,Unnamed: 241,Unnamed: 242,Unnamed: 243,Unnamed: 244,Unnamed: 245,Unnamed: 246,Unnamed: 247,Unnamed: 248
0,North American Industry Classification System ...,Jan-00,Feb-00,Mar-00,Apr-00,May-00,Jun-00,Jul-00,Aug-00,Sep-00,...,19-Nov,19-Dec,20-Jan,20-Feb,20-Mar,20-Apr,20-May,20-Jun,20-Jul,20-Aug
1,Oil and gas extraction [211],69825,69154,68513,69640,69412,69491,68357,68179,68884,...,109386,110088,109236,109549,109250,103118,99818,100173,97703,93892
2,Oil and gas extraction (except oil sands) [21...,..,..,..,..,..,..,..,..,..,...,55738,53935,52645,53965,53619,49829,48269,50583,49222,49122
3,Oil sands extraction [21114],..,..,..,..,..,..,..,..,..,...,54466,56914,57321,56351,56388,53982,52220,50325,49194,45508


In [154]:
# Keep the first two rows: the month-label header row and "Oil and gas extraction [211]".
# set_index is used without inplace = True so that the result is a new frame;
# mutating an iloc slice in place raises SettingWithCopyWarning.
oil_table_8_df = new_table_8_df.iloc[0:2].set_index(new_table_8_df.columns[0])
oil_table_8_df.index.name = None

# Transpose so each month becomes a row, and discard the old "Unnamed: n" labels
clean_table_8_df = oil_table_8_df.T.reset_index(drop = True)

# Rename columns
clean_table_8_df.columns = ["Date", "Oil and gas extraction [211]"]

# Remove rows with year 2000: their labels look like "Jan-00", which does not
# match the "%y-%b" layout (e.g. "19-Nov") used to parse the remaining labels
table_8_filter_3 = clean_table_8_df["Date"].str.contains("00")

# .copy() makes the filtered frame independent, so the column assignment below
# is not a chained assignment on a slice of clean_table_8_df
final_table_8_df = clean_table_8_df[~table_8_filter_3].copy()
final_table_8_df["Date"] = pd.to_datetime(final_table_8_df["Date"], format = "%y-%b")

# Keep 2005 onwards and renumber the rows from 0
# NOTE(review): the GDP values are left exactly as read from the csv (likely
# strings) — confirm they are converted to numbers before any arithmetic.
final_table_8_df = final_table_8_df[final_table_8_df["Date"]>= "2005-01-01"].reset_index(drop = True)
final_table_8_df

Unnamed: 0,Date,Oil and gas extraction [211]
0,2005-01-01,75199
1,2005-02-01,73681
2,2005-03-01,72409
3,2005-04-01,74726
4,2005-05-01,75464
...,...,...
183,2020-04-01,103118
184,2020-05-01,99818
185,2020-06-01,100173
186,2020-07-01,97703


# Table 6 - USD-CAD Exchange Rate (API Output)

In [12]:
# Load the USD-CAD exchange-rate csv and preview its first 30 rows
table_6_df = pd.read_csv(filepath_or_buffer = table_6_file)
table_6_df.head(n = 30)

Unnamed: 0,Date,USD/CAD,CAD/USD
0,2005-01-31,1.240278,0.806271
1,2005-02-28,1.232134,0.8116
2,2005-03-31,1.211088,0.825704
3,2005-04-30,1.254543,0.797103
4,2005-05-31,1.255365,0.796581
5,2005-06-30,1.228558,0.813962
6,2005-07-31,1.22644,0.815368
7,2005-08-31,1.190318,0.840112
8,2005-09-30,1.163774,0.859273
9,2005-10-31,1.17981,0.847594
