## Data Wrangling

In [3]:
import pandas as pd
import bs4
import requests
import requests
import seaborn as sns
import numpy as np
import re
from pathlib import Path
import json

from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

## Step 1: obtain data and metadata

### Topic: Fuel consumption ratings (Statistics Canada)

Site https://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64

In [4]:
url_open_canada = "https://open.canada.ca/data/api/action/package_show?id=98f1a129-f628-4ce4-b24d-6f16bf24dd64"

json_resp = requests.get(url_open_canada)

In [5]:
json_resp.headers.get('Content-Type','')

'application/json;charset=utf-8'

In [6]:
if json_resp.status_code == 200 and 'application/json' in json_resp.headers.get('Content-Type',''):
    open_canada_data = json_resp.json()
else:
    print("Error")

In [7]:
open_canada_data.keys()

dict_keys(['help', 'success', 'result'])

In [8]:
# Data wrangling
data_entries = pd.json_normalize(open_canada_data['result'], record_path="resources")
data_entries['language'] = data_entries['language'].apply(lambda col: col[0])
data_entries_english = data_entries[data_entries['language']=='en']


In [9]:
data_entries_english.head(2)

Unnamed: 0,cache_last_updated,unique_identifier,package_id,datastore_contains_all_records_of_source_file,validation_status,datastore_active,character_set,validation_timestamp,id,state,...,language,created,url,last_modified,resource_type,position,revision_id,data_quality,name_translated.fr,name_translated.en
0,,,98f1a129-f628-4ce4-b24d-6f16bf24dd64,False,,False,,,026e45b4-eb63-451f-b34f-d9308ea3a3d9,active,...,en,2017-03-31T09:40:17.244058,https://www.nrcan.gc.ca/sites/nrcan/files/oee/...,,dataset,0,ffd5fc97-9248-48ed-b1bc-7fa542cdad40,[],Véhicules électriques à batterie 2012-2022 (20...,Battery-electric vehicles 2012-2022 (2022-05-16)
2,,,98f1a129-f628-4ce4-b24d-6f16bf24dd64,False,,False,,,8812228b-a6aa-4303-b3d0-66489225120d,active,...,en,2017-03-31T09:40:17.244000,https://www.nrcan.gc.ca/sites/nrcan/files/oee/...,,dataset,2,28104dc6-6147-4ecc-913c-ad13f4f565bf,[],Véhicules hybrides électriques rechargeables 2...,Plug-in hybrid electric vehicles 2012-2022 (20...


In [11]:
data_entries_english.loc[:,"start_year"] =  data_entries_english['name'].str.extract('(\d+)')[0].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


#### Automate data extraction

In [18]:
folder = "C:/Users/Laura GF/Documents/GitHub/machine-learning-capstone/data/"

total_no_records = 0
for item in data_entries_english[['name','url','start_year']].iterrows():
    # Form file name
    file_name = f'{item[1]["name"].replace(" ","_")}.csv'
    # Perform query
    csv_req = requests.get(item[1]['url'])
    # Parse content
    url_content = csv_req.content
    
    # Save content into file
    csv_file = open(Path(folder + "raw-data",file_name), 'wb',)
    csv_file.write(url_content)
    csv_file.close()
    
    # Data cleaning
    df = pd.read_csv(Path(folder + "raw-data",file_name), sep=",", low_memory=False, encoding='cp1252')
    sample_df_col = df.dropna(thresh=1 ,axis=1).dropna(thresh=1 ,axis=0)
    sample_df_no_footer = sample_df_col.dropna(thresh=3 ,axis=0)
    
    # Remove Unnamed cols
    cols = sample_df_no_footer.columns
    cleaned_cols = [re.sub(r'Unnamed: \d*', "Fuel Consumption", item) if "Unnamed" in item else item for item in cols]

    # Clean row 1 on df
    str_item_cols = [str(item) for item in sample_df_no_footer.iloc[0:1,].values[0]]
    str_non_nan = ["" if item=='nan' else item for item in str_item_cols]

    # Form new columns
    new_cols = []
    for itema,itemb in zip(cleaned_cols, str_non_nan):
        new_cols.append(f'{itema}_{itemb}')


    final_df = sample_df_no_footer.iloc[1:, ].copy()
    final_df.columns = new_cols
    
    # Save clean df
    print("Number of records in file", file_name, ":", final_df.size)
    total_no_records += final_df.size
    final_df.to_csv(Path(folder + "clean-data",f'{file_name}'))

Number of records in file Battery-electric_vehicles_2012-2022_(2022-05-16).csv : 5580
Number of records in file Plug-in_hybrid_electric_vehicles_2012-2022_(2022-03-28).csv : 4340
Number of records in file 2022_Fuel_Consumption_Ratings_(2022-08-18).csv : 14325
Number of records in file 2021_Fuel_Consumption_Ratings_(2022-08-09).csv : 14115
Number of records in file 2020_Fuel_Consumption_Ratings_(2021-09-29).csv : 14520
Number of records in file 2019_Fuel_Consumption_Ratings_(2021-09-29).csv : 15840
Number of records in file 2018_Fuel_Consumption_Ratings_(2021-09-29).csv : 16245
Number of records in file 2017_Fuel_Consumption_Ratings_(2020-03-17).csv : 15870
Number of records in file 2016_Fuel_Consumption_Ratings_(2020-03-17).csv : 15540
Number of records in file 2015_Fuel_Consumption_Ratings_(2020-03-17).csv : 14716
Number of records in file 2010-2014_Fuel_Consumption_Ratings_(2020-03-17).csv : 69667
Number of records in file 2005-2009_Fuel_Consumption_Ratings_(2020-01-31).csv : 67665
N

In [19]:
print("Total number of records for this data set", total_no_records)

Total number of records for this data set 594294


In [20]:
final_df

Unnamed: 0,MODEL_YEAR,MAKE_,MODEL.1_# = high output engine,VEHICLE CLASS_,ENGINE SIZE_(L),CYLINDERS_,TRANSMISSION_,FUEL_TYPE,FUEL CONSUMPTION_CITY (L/100 km),Fuel Consumption_HWY (L/100 km),Fuel Consumption_COMB (L/100 km),Fuel Consumption_COMB (mpg),CO2 EMISSIONS _(g/km)
1,1995,ACURA,INTEGRA,SUBCOMPACT,1.8,4.0,A4,X,10.2,7.0,8.8,32,202
2,1995,ACURA,INTEGRA,SUBCOMPACT,1.8,4.0,M5,X,9.6,7.0,8.4,34,193
3,1995,ACURA,INTEGRA GS-R,SUBCOMPACT,1.8,4.0,M5,Z,9.4,7.0,8.3,34,191
4,1995,ACURA,LEGEND,COMPACT,3.2,6.0,A4,Z,12.6,8.9,10.9,26,251
5,1995,ACURA,LEGEND COUPE,COMPACT,3.2,6.0,A4,Z,13.0,9.3,11.3,25,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3515,1999,VOLVO,V70 T5 TURBO WAGON,STATION WAGON - MID-SIZE,2.3,5.0,A4,Z,11.9,7.9,10.1,28,232
3516,1999,VOLVO,V70 T5 TURBO WAGON,STATION WAGON - MID-SIZE,2.3,5.0,M5,Z,11.9,7.9,10.1,28,232
3517,1999,VOLVO,V70 WAGON,STATION WAGON - MID-SIZE,2.4,5.0,A4,Z,11.6,7.7,9.8,29,225
3518,1999,VOLVO,V70 WAGON,STATION WAGON - MID-SIZE,2.4,5.0,M5,Z,11.2,7.6,9.6,29,221


###  To scrape cars sold in Canada and US by make

https://www.goodcarbadcar.net/


To do: write a Scrapy script that extracts data on cars (for each make found in the 

### Topic: New motor vehicle registrations, quarterly (Statistics Canada)

Site https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2010002401



In [21]:
vehicle_registrations = "https://www150.statcan.gc.ca/n1/tbl/csv/20100024-eng.zip"
resp = urlopen(vehicle_registrations)
myzip = ZipFile(BytesIO(resp.read()))
file_name = [item for item in myzip.namelist() if "MetaData" not in item]

In [22]:
vehicle_reg_csv = myzip.open(file_name[0])
vehicle_reg_df = pd.read_csv(vehicle_reg_csv)

In [23]:
vehicle_reg_df.drop(columns=['DGUID',
                             'UOM_ID',
                             'SCALAR_ID',
                             'VECTOR',
                             'COORDINATE',
                             'STATUS',
                             'SYMBOL',
                             'TERMINATED',
                             'DECIMALS'], inplace=True)

In [29]:
vehicle_reg_df.to_csv(Path(folder+"clean-data", "new_motor_vehicle_reg.csv"))

In [30]:
vehicle_reg_df

Unnamed: 0,REF_DATE,GEO,Fuel type,Vehicle type,Statistics,UOM,SCALAR_FACTOR,VALUE
0,2017-01,Canada,All fuel types,"Total, vehicle type",Number of vehicles,Units,units,425043.0
1,2017-01,Canada,All fuel types,Passenger cars,Number of vehicles,Units,units,130741.0
2,2017-01,Canada,All fuel types,Pickup trucks,Number of vehicles,Units,units,89797.0
3,2017-01,Canada,All fuel types,Multi-purpose vehicles,Number of vehicles,Units,units,176937.0
4,2017-01,Canada,All fuel types,Vans,Number of vehicles,Units,units,27568.0
...,...,...,...,...,...,...,...,...
8465,2022-04,British Columbia and the Territories,Other fuel types,"Total, vehicle type",Number of vehicles,Units,units,9.0
8466,2022-04,British Columbia and the Territories,Other fuel types,Passenger cars,Number of vehicles,Units,units,9.0
8467,2022-04,British Columbia and the Territories,Other fuel types,Pickup trucks,Number of vehicles,Units,units,0.0
8468,2022-04,British Columbia and the Territories,Other fuel types,Multi-purpose vehicles,Number of vehicles,Units,units,0.0


### Topic: New zero-emission vehicle registrations, quarterly (Statistics Canada)

https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2010002501

In [31]:
near_zero_vehicle = "https://www150.statcan.gc.ca/n1/tbl/csv/20100025-eng.zip"

resp = urlopen(near_zero_vehicle)
myzip = ZipFile(BytesIO(resp.read()))
file_name = [item for item in myzip.namelist() if "MetaData" not in item]

In [32]:
near_zero_vehicle_reg_csv = myzip.open(file_name[0])
near_zero_vehicle_reg_df = pd.read_csv(near_zero_vehicle_reg_csv)

In [33]:
near_zero_vehicle_reg_df.drop(columns=['DGUID',
                                       'UOM_ID',
                                       'SCALAR_ID',
                                       'VECTOR',
                                       'COORDINATE',
                                       'STATUS',
                                       'SYMBOL','TERMINATED','DECIMALS'], inplace=True)

In [35]:
near_zero_vehicle_reg_df.to_csv(Path(folder+"clean-data", "near_zero_vehicle_registrations.csv"))

In [36]:
near_zero_vehicle_reg_df

Unnamed: 0,REF_DATE,GEO,Zero-Emission Vehicles Fuel Type,Vehicle type,Statistics,UOM,SCALAR_FACTOR,VALUE
0,2017-01,Canada,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,units,3488.0
1,2017-01,Canada,Battery electric,"Total, vehicle type",Number of vehicles,Units,units,1664.0
2,2017-01,Canada,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,units,1824.0
3,2017-01,Newfoundland and Labrador,All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,units,
4,2017-01,Newfoundland and Labrador,Battery electric,"Total, vehicle type",Number of vehicles,Units,units,
...,...,...,...,...,...,...,...,...
352765,2022-04,Taloyoak,Battery electric,"Total, vehicle type",Number of vehicles,Units,units,
352766,2022-04,Taloyoak,Plug-in hybrid electric,"Total, vehicle type",Number of vehicles,Units,units,
352767,2022-04,"Kitikmeot, Unorganized",All zero-emission vehicles,"Total, vehicle type",Number of vehicles,Units,units,
352768,2022-04,"Kitikmeot, Unorganized",Battery electric,"Total, vehicle type",Number of vehicles,Units,units,


### Topic: Sales of fuel used for road motor vehicles, annual (Statistics Canada)

https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2310006601

In [37]:
fuel_sold = "https://www150.statcan.gc.ca/n1/tbl/csv/23100066-eng.zip"

resp = urlopen(fuel_sold)
myzip = ZipFile(BytesIO(resp.read()))
file_name = [item for item in myzip.namelist() if "MetaData" not in item]

fuel_sold_csv = myzip.open(file_name[0])
fuel_sold_df = pd.read_csv(fuel_sold_csv)

fuel_sold_df.drop(columns=['DGUID',
                                       'UOM_ID',
                                       'SCALAR_ID',
                                       'VECTOR',
                                       'COORDINATE',
                                       'STATUS',
                                       'SYMBOL',
                                       'TERMINATED',
                                       'DECIMALS'], inplace=True)

fuel_sold_df.to_csv(Path(folder+"clean-data", "fuel_sold_motor_vehicles.csv"))

In [38]:
fuel_sold_df

Unnamed: 0,REF_DATE,GEO,Type of fuel sales,UOM,SCALAR_FACTOR,VALUE
0,1987,Canada,Net sales of gasoline,Litres,thousands,31089401.0
1,1987,Canada,Gross sales of gasoline,Litres,thousands,32819878.0
2,1987,Canada,Net sales of diesel oil,Litres,thousands,4418838.0
3,1987,Canada,Net sales of liquefied petroleum gas,Litres,thousands,40919.0
4,1987,Newfoundland and Labrador,Net sales of gasoline,Litres,thousands,535421.0
...,...,...,...,...,...,...
1911,2021,Northwest Territories,Net sales of liquefied petroleum gas,Litres,thousands,0.0
1912,2021,Nunavut,Net sales of gasoline,Litres,thousands,22922.0
1913,2021,Nunavut,Gross sales of gasoline,Litres,thousands,22922.0
1914,2021,Nunavut,Net sales of diesel oil,Litres,thousands,40581.0


### Topic: Daily average time spent in hours on transport to and from activities, by mode of transport (Statistics Canada)

https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=4510001403

In [39]:
hours_on_transport = "https://www150.statcan.gc.ca/n1/tbl/csv/45100014-eng.zip"

resp = urlopen(hours_on_transport)
myzip = ZipFile(BytesIO(resp.read()))
file_name = [item for item in myzip.namelist() if "MetaData" not in item]

hours_on_transport_csv = myzip.open(file_name[0])
hours_on_transport_df = pd.read_csv(hours_on_transport_csv)

hours_on_transport_df.drop(columns=['DGUID',
                                       'UOM_ID',
                                       'SCALAR_ID',
                                       'VECTOR',
                                       'COORDINATE',
                                       'STATUS',
                                       'SYMBOL',
                                       'TERMINATED',
                                       'DECIMALS'], inplace=True)



In [40]:
hours_on_transport_df['Activity group'].unique()

array(['Sleep and personal activities',
       'Sleeping, resting, relaxing, sick in bed', 'Personal care',
       'Eating or drinking', 'Paid work activities',
       'Studying or learning', 'Transport to and from activity',
       'Private vehicle', 'Active transport', 'Public transport',
       'Unpaid work activities', 'Household chores',
       'Care of household children under 18 years',
       'Care of household adults', 'Shopping for goods or services',
       'Socializing', 'Civic, religious and organizational activities',
       'Active sports and events', 'Active sports', 'Active leisure',
       'Use of technology',
       'Arts and hobbies, leisure activities or writing',
       'Passive leisure', 'Watching television or videos',
       'Reading or listening to music or radio',
       'Other or unknown activity'], dtype=object)

In [41]:
condition_1 = 'Transport to and from activity'
condition_2 = 'Private vehicle'
condition_3 = 'Active transport'
condition_4 = 'Public transport'
time_spent_on_transport = hours_on_transport_df[(hours_on_transport_df['Activity group']==condition_4) | 
(hours_on_transport_df['Activity group']==condition_4) |
(hours_on_transport_df['Activity group']==condition_4) |
(hours_on_transport_df['Activity group']==condition_4)]

In [42]:
time_spent_on_transport

Unnamed: 0,REF_DATE,GEO,Activity group,Age group,Sex,Statistics,UOM,SCALAR_FACTOR,VALUE
1512,2015,Canada,Public transport,"Total, 15 years and over",Both sexes,"Daily average time, population",Hours,units,0.1
1513,2015,Canada,Public transport,"Total, 15 years and over",Both sexes,"Daily average time, participants",Hours,units,1.6
1514,2015,Canada,Public transport,"Total, 15 years and over",Both sexes,"Proportion of day, population",Percent,units,0.4
1515,2015,Canada,Public transport,"Total, 15 years and over",Both sexes,"Proportion of day, participants",Percent,units,6.7
1516,2015,Canada,Public transport,"Total, 15 years and over",Both sexes,Participation rate,Percent,units,9.5
...,...,...,...,...,...,...,...,...,...
23515,2015,British Columbia,Public transport,65 years and over,Female,"Proportion of day, population",Percent,units,
23516,2015,British Columbia,Public transport,65 years and over,Female,"Proportion of day, participants",Percent,units,
23517,2015,British Columbia,Public transport,65 years and over,Female,Participation rate,Percent,units,8.9
23518,2015,British Columbia,Public transport,65 years and over,Female,Low 95% confidence interval,Passengers-miles,units,5.5


In [44]:
time_spent_on_transport.to_csv(Path(folder+"clean-data","average_on_transport.csv"))