## Data Exploration

### 1. Open Canada

Fuel Consumption ratings of cars

Main link: https://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64

Datasets provide model-specific fuel consumption ratings and estimated carbon dioxide emissions for new light-duty vehicles for retail sale in Canada.

In [None]:
import pandas as pd
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import re

In [None]:
# Download the raw HTML of the Open Canada dataset landing page.
url = "https://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64"
# The context manager closes the HTTP response even if read() raises,
# which a trailing explicit close() call cannot guarantee.
with uReq(url) as uClient:
    page_html = uClient.read()

In [None]:
# Parse the downloaded HTML bytes using the "html.parser" backend so the
# page can be queried by tag and attribute below.
page_soup = soup(page_html, "html.parser")

In [None]:
# Collect the <span property="url"> elements (resource links) and the English
# <span property="name"> elements (resource titles) from the parsed page.
# find_all is the current spelling of the deprecated findAll alias.
csv_links = page_soup.find_all("span", {"property": "url"})
table_name = page_soup.find_all("span", {"property": "name", "lang": 'en'})

# Extract the link text, and split every title once on "(": the part before
# the first "(" is used as the table name and the part after it (with ")"
# removed) as the table date; titles without "(" get an empty date.
csv_list = [link.text for link in csv_links]
title_parts = [title.text.split("(") for title in table_name]
names_list = [parts[0] for parts in title_parts]
dates_list = [parts[1].replace(")", "") if len(parts) > 1 else "" for parts in title_parts]

# Build dataframe (pandas raises ValueError if the three lists differ in length)
data_df = pd.DataFrame({"Table Name": names_list, "Table Date": dates_list, "Table Url": csv_list})

# There are duplications of the datasets - one link in English, one in French
data_df.drop_duplicates(subset=['Table Name'], inplace=True)

# Clean up: drop the trailing whitespace left in front of the removed "(date)"
data_df['Table Name'] = data_df['Table Name'].str.rstrip()

In [None]:
# Preview the first 10 rows of the scraped table index.
data_df.head(10)

### Exploring 2022 Fuel Consumption Ratings

In [None]:
# Look up the download URL of the ratings table for the chosen model year.
# Indexing with [0] raises IndexError when no table carries that exact name.
year = "2022"
name = f"{year} Fuel Consumption Ratings"
matching_urls = data_df.loc[data_df['Table Name'] == name, 'Table Url']
fuel_consumption_2022 = matching_urls.values[0]

In [None]:
# Download the selected CSV and save it in the working directory.
csv_req = requests.get(fuel_consumption_2022)
# Fail loudly on an HTTP error status instead of silently saving the
# error response body under a ".csv" name.
csv_req.raise_for_status()
url_content = csv_req.content
file_name = f'{name.replace(" ","_")}.csv'

# The context manager guarantees the handle is closed even if write() raises.
with open(file_name, 'wb') as csv_file:
    csv_file.write(url_content)

In [None]:
# Load the saved CSV. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, avoiding mixed-type columns.
sample_df = pd.read_csv(file_name, sep=",", low_memory=False)

In [None]:
# Display the raw table exactly as loaded, before any cleaning.
sample_df

### Data cleaning

In [None]:
# Discard columns, then rows, that contain no values at all
# (i.e. keep anything with at least one non-missing entry).
non_empty_columns = sample_df.dropna(how="all", axis="columns")
sample_df_col = non_empty_columns.dropna(how="all", axis="index")

In [None]:
# Keep only rows that have at least 3 non-missing values. Judging by the
# variable name this is meant to strip sparse footer/notes rows at the end
# of the file -- NOTE(review): confirm the threshold against the raw CSV.
sample_df_no_footer = sample_df_col.dropna(thresh=3 ,axis=0)

In [None]:
# Inspect 'Model' and 'Make' for the last 22 rows of the frame *before* the
# thresh=3 row filter was applied, to see what that filter removes.
sample_df_col[['Model','Make']].iloc[-22:]

In [None]:
# Display the table after the sparse rows have been dropped.
sample_df_no_footer

In [None]:
# Flatten the two-level header into single "<top>_<sub>" column names.

# Top level: the existing column labels, with every "Unnamed: N" placeholder
# replaced by "Fuel Consumption". Labels without the placeholder pass through
# the substitution unchanged.
cols = sample_df_no_footer.columns
unnamed_label = re.compile(r'Unnamed: \d*')
cleaned_cols = [unnamed_label.sub("Fuel Consumption", label) for label in cols]

# Second level: the first row of the frame, stringified; missing cells
# (which stringify to 'nan') become empty strings.
first_row = sample_df_no_footer.iloc[0]
str_item_cols = [str(cell) for cell in first_row]
str_non_nan = [text if text != 'nan' else "" for text in str_item_cols]

# Join both levels; a column with an empty second level ends in "_".
new_cols = [f'{top}_{sub}' for top, sub in zip(cleaned_cols, str_non_nan)]

In [None]:
# Drop the first row (it was folded into the column names above) and work on
# an independent copy so the renaming does not touch sample_df_no_footer.
co2_consumption = sample_df_no_footer.iloc[1:, ].copy()
# Assign the flattened "<top>_<sub>" names built in new_cols.
co2_consumption.columns = new_cols

In [None]:
# Summarise column dtypes and non-null counts before type conversion.
co2_consumption.info()

In [None]:
# Preview the first two rows to check the renamed columns.
co2_consumption.head(2)

Assign the correct data types to the numeric columns

In [None]:
# Columns that must be numeric for describe() and the plots below.
col_list_float = [
    'Model_Year',
    'Engine Size_(L)',
    'Cylinders_',
    'Fuel Consumption_City (L/100 km)',
    'Fuel Consumption_Hwy (L/100 km)',
    'Fuel Consumption_Comb (L/100 km)',
    'Fuel Consumption_Comb (mpg)',
    'CO2 Emissions_(g/km)',
    'CO2_Rating',
    'Smog_Rating',
]

# Convert one column at a time; pd.to_numeric raises on any value it cannot
# parse, so bad data surfaces here rather than in the plots.
for column in col_list_float:
    converted = pd.to_numeric(co2_consumption[column])
    co2_consumption[column] = converted

In [None]:
# Summary statistics of the numeric columns after conversion.
co2_consumption.describe()

### Visualization

In [None]:
# Pairwise relationship grid of the numeric columns, coloured by 'Make_',
# written to a PNG whose name includes the selected year.
pairplor_data_fig = sns.pairplot(co2_consumption, hue='Make_')
pairplor_data_fig.savefig(f'pairplot_co2_{year}_make.png')

In [None]:
# Same pairwise grid, coloured by 'Vehicle Class_', saved to its own PNG.
pairplor_data_fig = sns.pairplot(co2_consumption, hue='Vehicle Class_')
pairplor_data_fig.savefig(f'pairplot_co2_{year}_vehicle_class.png')

In [None]:
# Same pairwise grid, coloured by 'Model.1_' ("Model.1" looks like pandas'
# de-duplicated label for a second 'Model' header -- presumably the vehicle
# model name; verify against the raw CSV). Saved to its own PNG.
pairplor_data_fig = sns.pairplot(co2_consumption, hue='Model.1_')
pairplor_data_fig.savefig(f'pairplot_co2_{year}_model.png')

In [None]:
# Same pairwise grid, coloured by 'Transmission_', saved to its own PNG.
pairplor_data_fig = sns.pairplot(co2_consumption, hue='Transmission_')
pairplor_data_fig.savefig(f'pairplot_co2_{year}_transmission.png')

### Data distribution

In [None]:
# Apply seaborn's default theme to all figures drawn from here on.
sns.set()

In [None]:
# Numeric columns to plot; 'Model_Year' is deliberately left out so the
# remaining nine columns fill the 3x3 grid exactly.
col_list_float = [
    'Engine Size_(L)',
    'Cylinders_',
    'Fuel Consumption_City (L/100 km)',
    'Fuel Consumption_Hwy (L/100 km)',
    'Fuel Consumption_Comb (L/100 km)',
    'Fuel Consumption_Comb (mpg)',
    'CO2 Emissions_(g/km)',
    'CO2_Rating',
    'Smog_Rating',
]

# One 10-bin histogram per column, all panels sharing the y axis.
fig, axes = plt.subplots(3, 3, sharey=True, figsize=(20,10))
fig.suptitle('Data distribution')
# axes.flat walks the grid row by row, matching the order of the column list.
for panel, variable in zip(axes.flat, col_list_float):
    sns.histplot(ax=panel, data=co2_consumption, x=variable, bins=10)
plt.show()

In [None]:
# Numeric columns to plot; 'Model_Year' is deliberately left out so the
# remaining nine columns fill the 3x3 grid exactly.
col_list_float = [
    'Engine Size_(L)',
    'Cylinders_',
    'Fuel Consumption_City (L/100 km)',
    'Fuel Consumption_Hwy (L/100 km)',
    'Fuel Consumption_Comb (L/100 km)',
    'Fuel Consumption_Comb (mpg)',
    'CO2 Emissions_(g/km)',
    'CO2_Rating',
    'Smog_Rating',
]

# One box plot per column, grouped by transmission, all panels sharing the x axis.
fig, axes = plt.subplots(3, 3, sharex=True, figsize=(30,10))
fig.suptitle('Data distribution')

# axes.flat walks the grid row by row, matching the order of the column list.
for panel, variable in zip(axes.flat, col_list_float):
    sns.boxplot(ax=panel, data=co2_consumption, y=variable, x="Transmission_")
plt.show()