In [1]:
#|default_exp datadownload

In [1]:
from pathlib import Path
import requests
import csv
from io import StringIO
from rich import inspect
import pandas as pd
import json

# Data download

In [2]:
def extract_metadata(metadata_url):
    """
    Extracts a list of filenames and urls from Open Cananda metadata url.

    Parameters
    ----------
    metadata_url : str
        Fuel consumption ratings metadata url from Open Canada website.

    Returns
    -------
    english_resources_df : pd.DataFrame
        DataFrame of file names and urls for energy consumption ratings.
    """
    try:      
        metadata_resp = requests.get(metadata_url)
    except requests.exceptions.RequestException as e:
        # If request fails, return an error message and stop.
        print(f'Error making url request: {e}')
    
    try:     
        metadata_json = metadata_resp.json()
    except json.JSONDecodeError:
        # If parsing json fails, return an error message and stop.
        print(f'Error: Response is not valid json')
        
    # Access list of downloadable resources
    resources_df = pd.DataFrame(metadata_json['result']['resources'])

    # Change language coding and extract English only resources
    resources_df['language'] = resources_df['language'].apply(lambda item : item[0])
    english_resources_df = resources_df[resources_df['language'] == 'en']
    
    return english_resources_df[['name', 'url']]

In [3]:
#|export
def extract_raw_data(url, file_name):
    """
    Extract raw data from a URL

    Parameters
    ----------
    url : str
        URL to extract data from
        
    file_name : str or Path object
        file name for raw data dump
        
    """
    
    try:
        # Request data from url
        response = requests.get(url)
        content_type = response.headers['content-type']
        response_text = response.text
        print(f'Response status: {response.status_code}\nContent Type: {content_type}')

        # Save request content to csv file
        with open(file_name, mode='w', newline='') as csvfile:
            csvfile.write(response_text)

        print(f'csv file: {file_name} saved')
        
    # Catch errors    
    except requests.exceptions.HTTPError as err_h:
        print(f'HTTP error occured:{err_h}')
    except requests.exceptions.ConnectionError as err_c:
        print(f'Error connecting:{err_c}')
    except requests.exceptions.Timeout as err_t:
        print(f'Timeout Error:{err_t}')
    except requests.exceptions.RequestException as err:
        print(f'There was an unknown error:{err}')

In [4]:
def merge_top_two_rows(input_file, output_file):
    # Open the input CSV file for reading
    with open(input_file, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        
        # Read the first two rows from the input file
        header_row = next(reader)
        second_row = next(reader)
        
        # Merge the two rows into one header
        merged_header = [f"{header_row[i]} {second_row[i]}" for i in range(len(header_row))]
        
        # Open the output CSV file for writing
        with open(output_file, mode='w', newline='') as output_csvfile:
            writer = csv.writer(output_csvfile)
            
            # Write the merged header to the output file
            writer.writerow(merged_header)
            
            # Copy the rest of the rows from the input file to the output file
            for row in reader:
                writer.writerow(row)

In [124]:
def rename_columns(df):
    """
    Removes unwanted DataFrame columns and rows, then cleans and renames column headers.

    Parameters
    ----------
    df: DataFrame
        DataFrame with columns to clean

    Returns
    -------
    df: DataFrame
        DataFrame with cleaned column headers

    """
    # Drop empty columns and rows from the DataFrame
    df.dropna(axis=1, how='all', inplace=True)
    df.dropna(axis=0, thresh=5, inplace=True)

    # Remove whitespace, replace spaces with _ and change to lower case
    cleaned_cols = (df.columns.str.lower()
                    .str.strip()
                    .str.replace(' # = high output engine', '')
                    .str.replace('*', '')
                    .str.replace('  ', ' ')
                    .str.replace(' ', '_')
                    .str.replace('(', '')
                    .str.replace(')', '')
                    .str.replace('/', '_')
    )

    col_mapper = dict(list(zip(df.columns, cleaned_cols))) # build a dictionary to map old column names to new
    df.rename(columns=col_mapper, inplace=True)
    return df  

In [127]:
def clean_content(df):
    return df

In [6]:
def clean_fuel_dataframe(df):
    """
    Clean fuel-based vehicle DataFrame.

    Assumes DataFrame has already been passed to rename_columns()

    """
    pass

In [7]:
def clean_hybrid_dataframe(df):
    """
    Clean hybrid vehicle DataFrame.

    Assumes DataFrame has already been passed to rename_columns()

    """
    pass

In [8]:
def clean_electric_dataframe(df):
    """
    Clean electric vehicle DataFrame.

    Assumes DataFrame has already been passed to rename_columns()

    """
    pass

## Script

In [9]:
url = 'https://natural-resources.canada.ca/sites/nrcan/files/oee/files/csv/MY2023%20Fuel%20Consumption%20Ratings.csv'
metadata_url = 'https://open.canada.ca/data/api/action/package_show?id=98f1a129-f628-4ce4-b24d-6f16bf24dd64'

In [74]:
# Build list of available resources
resources_df = extract_metadata(metadata_url)

# Remove unwanted old resources
resources_df = resources_df[~resources_df['name'].str.contains('Original')]

# Build filenames for desired resources and add to resources_df
file_names = (resources_df['name']
     .str.replace(' ', '_')
     .str.replace('(', 'v_')
     .str.replace(')', '')
     .str.lower()
)
resources_df.loc[:,'file_name'] = file_names

# Build raw data file path
path = Path.cwd()
raw_path = path / 'data' / 'raw'
merged_header_path = path / 'data' / 'merged-headers'

# Download and save raw data for each resource
for idx, row in resources_df.iterrows():
    url = row[1]
    file_name = row[2]
    raw_file_name = raw_path / f'{file_name}.csv'
    merged_header_file_name = merged_header_path / f'{file_name}.csv'
    extract_raw_data(url, raw_file_name)
    merge_top_two_rows(raw_file_name, merged_header_file_name)

In [132]:
# Start a list of column headers and initiate a master_df
union_of_headers = set()
master_df = pd.DataFrame()

# Build the master DataFrame
for idx, row in resources_df.iterrows():
    # Open each csv file
    url = row[1]
    file_name = row[2]
    merged_header_file_name = merged_header_path / f'{file_name}.csv'

    # Rename the columns and add to set of unique headers
    df = pd.read_csv(merged_header_file_name)
    df = rename_columns(df)
    union_of_headers = set.union(union_of_headers, set(df.columns))

    # Add any missing column headers to master DataFrame columns
    missing_cols = set(master_df.columns) - union_of_headers
    if len(missing_cols) > 0:
        for col in missing_cols:
            master_df[col] = pd.Series()
        
    # Concatenate current df with master_df
    master_df = pd.concat([master_df, df], ignore_index=True)

    # If hybrid and electric only dfs needed then add here

master_df.head()

  df = pd.read_csv(merged_header_file_name)
  df = pd.read_csv(merged_header_file_name)
  df = pd.read_csv(merged_header_file_name)
  df = pd.read_csv(merged_header_file_name)
  df = pd.read_csv(merged_header_file_name)


Unnamed: 0,model_year,make,model,vehicle_class,motor_kw,transmission,fuel_type,consumption_city_kwh_100_km,hwy_kwh_100_km,comb_kwh_100_km,...,fuel_type_1,consumption_combined_le_100_km,range_1_km,fuel_type_2,consumption_city_l_100_km,hwy_l_100_km,comb_l_100_km,range_2_km,fuel_consumption_city_l_100_km,comb_mpg
0,2012,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
1,2012,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,
2,2013,Ford,Focus Electric,Compact,107.0,A1,B,19.0,21.1,20.0,...,,,,,,,,,,
3,2013,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
4,2013,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,


In [None]:
# Clean the master_df

In [137]:
master_df

Unnamed: 0,model_year,make,model,vehicle_class,motor_kw,transmission,fuel_type,consumption_city_kwh_100_km,hwy_kwh_100_km,comb_kwh_100_km,...,fuel_type_1,consumption_combined_le_100_km,range_1_km,fuel_type_2,consumption_city_l_100_km,hwy_l_100_km,comb_l_100_km,range_2_km,fuel_consumption_city_l_100_km,comb_mpg
0,2012,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
1,2012,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,
2,2013,Ford,Focus Electric,Compact,107.0,A1,B,19.0,21.1,20.0,...,,,,,,,,,,
3,2013,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
4,2013,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27752,1999,VOLVO,V70 T5 TURBO WAGON,STATION WAGON - MID-SIZE,,A4,Z,,,,...,,,,,,9.4,11.6,,13.4,24.0
27753,1999,VOLVO,V70 T5 TURBO WAGON,STATION WAGON - MID-SIZE,,M5,Z,,,,...,,,,,,9.4,11.6,,13.4,24.0
27754,1999,VOLVO,V70 WAGON,STATION WAGON - MID-SIZE,,A4,Z,,,,...,,,,,,9.1,11.3,,13.1,25.0
27755,1999,VOLVO,V70 WAGON,STATION WAGON - MID-SIZE,,M5,Z,,,,...,,,,,,9.0,11.0,,12.7,26.0


In [None]:
# Clean electric vehicle headers and add to list 

# Clean hybrid vehcile file headers and add to list 

# Clean fuel vehicle files headers and add to list 

    # Clean and rename columns depending on whether fuel-based, hybrid, or electric
    # Build a list of column headers for each file 
    # Save down processed file

# Find the union of column headers
# Open each file as df and append to a master df
# Save down master df

In [75]:
file_name = resources_df['file_name'][26]
file_name

'2000-2004_fuel_consumption_ratings'

In [76]:
raw_file_name = merged_header_path /  f'{file_name}.csv'

In [77]:
test_df = pd.read_csv(raw_file_name)

In [103]:
test_df.tail(20)

Unnamed: 0,MODEL YEAR,MAKE,MODEL # = high output engine,VEHICLE CLASS,ENGINE SIZE (L),CYLINDERS,TRANSMISSION,FUEL TYPE,FUEL CONSUMPTION* CITY (L/100 km),HWY (L/100 km),COMB (L/100 km),COMB (mpg),CO2 EMISSIONS (g/km)
3781,Understanding the Table,,,,,,,,,,,,
3782,Model,4WD/4X4 = Four-wheel drive,,,,,,,,,,,
3783,,AWD = All-wheel drive,,,,,,,,,,,
3784,,CNG = Compressed natural gas,,,,,,,,,,,
3785,,FFV = Flexible-fuel vehicle,,,,,,,,,,,
3786,,NGV = Natural gas vehicle,,,,,,,,,,,
3787,,# = High output engine that provides more powe...,,,,,,,,,,,
3788,Transmission,A = Automatic,,,,,,,,,,,
3789,,AM = Automated manual,,,,,,,,,,,
3790,,AS = Automatic with select shift,,,,,,,,,,,


In [104]:
cleaned_cols = (test_df.columns.str.lower()
                .str.strip()
                .str.replace(' # = high output engine', '')
                .str.replace('*', '')
                .str.replace('  ', ' ')
                .str.replace(' ', '_')
                .str.replace('(', '')
                .str.replace(')', '')
                .str.replace('/', '_')
)

In [115]:
col_mapper = dict(list(zip(test_df.columns, cleaned_cols)))

In [117]:
test_df.rename(columns=col_mapper, inplace=True)
test_df.head()

Unnamed: 0,model_year,make,model,vehicle_class,engine_size_l,cylinders,transmission,fuel_type,fuel_consumption_city_l_100_km,hwy_l_100_km,comb_l_100_km,comb_mpg,co2_emissions_g_km
0,2000,ACURA,1.6EL,COMPACT,1.6,4.0,A4,X,10.5,8.0,9.4,30.0,216.0
1,2000,ACURA,1.6EL,COMPACT,1.6,4.0,M5,X,9.8,7.8,8.9,32.0,205.0
2,2000,ACURA,3.2TL,MID-SIZE,3.2,6.0,AS5,Z,13.7,8.8,11.5,25.0,265.0
3,2000,ACURA,3.5RL,MID-SIZE,3.5,6.0,A4,Z,15.0,10.9,13.1,22.0,301.0
4,2000,ACURA,INTEGRA,SUBCOMPACT,1.8,4.0,A4,X,11.4,8.3,10.0,28.0,230.0


In [94]:
hybrid_fn = resources_df['file_name'][2]
hybrid_path = merged_header_path /  f'{hybrid_fn}.csv'
hybrid_df = pd.read_csv(hybrid_path)
hybrid_df.head()

Unnamed: 0,Model Year,Make,Model,Vehicle Class,Motor (kW),Engine Size (L),Cylinders,Transmission,Fuel Type 1,Consumption Combined Le/100 km,Range 1 (km),Recharge Time (h),Fuel Type 2,Consumption City (L/100 km),Hwy (L/100 km),Comb (L/100 km),Range 2 (km),CO2 Emissions (g/km),CO2 Rating,Smog Rating
0,2012,Chevrolet,Volt,Compact,111.0,1.4,4.0,AV,B,2.5 (22.3 kWh/100 km),56.0,4.0,Z,6.7,5.9,6.4,550.0,54.0,,
1,2013,Chevrolet,Volt,Compact,111.0,1.4,4.0,AV,B,2.4 (21.4 kWh/100 km),61.0,4.0,Z,6.7,5.9,6.4,550.0,45.0,,
2,2013,Ford,C-MAX Energi,Mid-size,35.0,2.0,4.0,AV,B/X,2.7 ([23.2 kWh + 0.1 L]/100 km),32.0,2.5,X,5.8,6.5,6.1,856.0,80.0,,
3,2013,Ford,Fusion Energi,Mid-size,35.0,2.0,4.0,AV,B/X,2.7 ([23.2 kWh + 0.1 L]/100 km),32.0,2.5,X,5.8,6.5,6.1,856.0,80.0,,
4,2013,Toyota,Prius Plug-in Hybrid,Mid-size,60.0,1.8,4.0,AV,B/X,2.5 ([18.0 kWh + 0.4 L]/100 km),18.0,1.5,X,4.7,4.8,4.7,845.0,101.0,,


In [95]:
(hybrid_df.columns.str.lower()
                .str.strip()
                .str.replace(' # = high output engine', '')
                .str.replace('*', '')
                .str.replace('  ', ' ')
                .str.replace(' ', '_')
                .str.replace('(', '')
                .str.replace(')', '')
                .str.replace('/', '_')
)

Index(['model_year', 'make', 'model', 'vehicle_class', 'motor_kw',
       'engine_size_l', 'cylinders', 'transmission', 'fuel_type_1',
       'consumption_combined_le_100_km', 'range_1_km', 'recharge_time_h',
       'fuel_type_2', 'consumption_city_l_100_km', 'hwy_l_100_km',
       'comb_l_100_km', 'range_2_km', 'co2_emissions_g_km', 'co2_rating',
       'smog_rating'],
      dtype='object')

In [96]:
elec_fn = resources_df['file_name'][0]
elec_fn

'battery-electric_vehicles_2012-2023_v_2023-08-18'

In [118]:
elec_path = merged_header_path /  f'{elec_fn}.csv'
elec_df = pd.read_csv(elec_path)
elec_df.head()

Unnamed: 0,Model Year,Make,Model,Vehicle Class,Motor (kW),Transmission,Fuel Type,Consumption City (kWh/100 km),Hwy (kWh/100 km),Comb (kWh/100 km),...,.193,.194,.195,.196,.197,.198,.199,.200,.201,.202
0,2012,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
1,2012,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,
2,2013,Ford,Focus Electric,Compact,107.0,A1,B,19.0,21.1,20.0,...,,,,,,,,,,
3,2013,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,...,,,,,,,,,,
4,2013,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,...,,,,,,,,,,


In [125]:
cleaned_df = rename_columns(elec_df)

In [126]:
cleaned_df.head()

Unnamed: 0,model_year,make,model,vehicle_class,motor_kw,fuel_type,consumption_city_kwh_100_km,consumption_city_kwh_100_km.1,hwy_kwh_100_km,comb_kwh_100_km,city_le_100_km,hwy_le_100_km,comb_le_100_km,range_km,co2_emissions_g_km,co2__rating,smog_rating,recharge_time_h
0,2012,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,1.9,2.4,2.1,100.0,0.0,,,7.0
1,2012,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,2.2,2.6,2.4,117.0,0.0,,,7.0
2,2013,Ford,Focus Electric,Compact,107.0,A1,B,19.0,21.1,20.0,2.1,2.4,2.2,122.0,0.0,,,4.0
3,2013,Mitsubishi,i-MiEV,Subcompact,49.0,A1,B,16.9,21.4,18.7,1.9,2.4,2.1,100.0,0.0,,,7.0
4,2013,Nissan,LEAF,Mid-size,80.0,A1,B,19.3,23.0,21.1,2.2,2.6,2.4,117.0,0.0,,,7.0


In [99]:
(elec_df.columns.str.lower()
                .str.strip()
                .str.replace(' # = high output engine', '')
                .str.replace('*', '')
                .str.replace('  ', ' ')
                .str.replace(' ', '_')
                .str.replace('(', '')
                .str.replace(')', '')
                .str.replace('/', '_')
)

Index(['model_year', 'make', 'model', 'vehicle_class', 'motor_kw',
       'transmission', 'fuel_type', 'consumption_city_kwh_100_km',
       'hwy_kwh_100_km', 'comb_kwh_100_km',
       ...
       '.193', '.194', '.195', '.196', '.197', '.198', '.199', '.200', '.201',
       '.202'],
      dtype='object', length=221)

In [44]:
for idx, row in resources_df.iterrows():
    print(row[2])

battery-electric_vehicles_2012-2023_v_2023-08-18
plug-in_hybrid_electric_vehicles_2012-2023_v_2023-08-18
2023_fuel_consumption_ratings_v_2023-08-18
2022_fuel_consumption_ratings_v_2023-08-18
2021_fuel_consumption_ratings_v_2023-02-03
2020_fuel_consumption_ratings_v_2023-02-03
2019_fuel_consumption_ratings_v_2021-09-29
2018_fuel_consumption_ratings_v_2021-09-29
2017_fuel_consumption_ratings_v_2020-03-17
2016_fuel_consumption_ratings_v_2020-03-17
2015_fuel_consumption_ratings_v_2020-03-17
2010-2014_fuel_consumption_ratings_v_2020-03-17
2005-2009_fuel_consumption_ratings_v_2020-01-31
2000-2004_fuel_consumption_ratings
1995-1999_fuel_consumption_ratings


In [129]:
metadata_resp = requests.get(meta_url)

metadata_json = metadata_resp.json()
type(metadata_json)

dict

In [130]:
resources_df = pd.DataFrame(metadata_json['result']['resources'])

resources_df.head()

resources_df['language'] = resources_df.loc[:,'language'].apply(lambda item : item[0])

english_resources_df = resources_df.loc[resources_df['language'] == 'en',:]

In [11]:
resources_df

Unnamed: 0,name,url
0,Battery-electric vehicles 2012-2023 (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
2,Plug-in hybrid electric vehicles 2012-2023 (20...,https://natural-resources.canada.ca/sites/nrca...
4,2023 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
6,2022 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
8,2021 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...
10,2020 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...
12,2019 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...
14,2018 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...
16,2017 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...
18,2016 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...


In [12]:
resources_df = resources_df[~resources_df['name'].str.contains('Original')]

In [13]:
file_names = (resources_df.loc[:,'name']
     .str.replace(' ', '_')
     .str.replace('(', 'v_')
     .str.replace(')', '')
     .str.lower()
)

In [14]:
type(file_names)

pandas.core.series.Series

In [15]:
resources_df

Unnamed: 0,name,url
0,Battery-electric vehicles 2012-2023 (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
2,Plug-in hybrid electric vehicles 2012-2023 (20...,https://natural-resources.canada.ca/sites/nrca...
4,2023 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
6,2022 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...
8,2021 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...
10,2020 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...
12,2019 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...
14,2018 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...
16,2017 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...
18,2016 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...


In [16]:
file_names

0     battery-electric_vehicles_2012-2023_uploaded_2...
2     plug-in_hybrid_electric_vehicles_2012-2023_upl...
4     2023_fuel_consumption_ratings_uploaded_2023-08-18
6     2022_fuel_consumption_ratings_uploaded_2023-08-18
8     2021_fuel_consumption_ratings_uploaded_2023-02-03
10    2020_fuel_consumption_ratings_uploaded_2023-02-03
12    2019_fuel_consumption_ratings_uploaded_2021-09-29
14    2018_fuel_consumption_ratings_uploaded_2021-09-29
16    2017_fuel_consumption_ratings_uploaded_2020-03-17
18    2016_fuel_consumption_ratings_uploaded_2020-03-17
20    2015_fuel_consumption_ratings_uploaded_2020-03-17
22    2010-2014_fuel_consumption_ratings_uploaded_20...
24    2005-2009_fuel_consumption_ratings_uploaded_20...
26                   2000-2004_fuel_consumption_ratings
28                   1995-1999_fuel_consumption_ratings
Name: name, dtype: object

In [17]:
resources_df.loc[:,'file_name'] = file_names

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resources_df.loc[:,'filename'] = filenames


In [18]:
resources_df

Unnamed: 0,name,url,filename
0,Battery-electric vehicles 2012-2023 (2023-08-18),https://natural-resources.canada.ca/sites/nrca...,battery-electric_vehicles_2012-2023_uploaded_2...
2,Plug-in hybrid electric vehicles 2012-2023 (20...,https://natural-resources.canada.ca/sites/nrca...,plug-in_hybrid_electric_vehicles_2012-2023_upl...
4,2023 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...,2023_fuel_consumption_ratings_uploaded_2023-08-18
6,2022 Fuel Consumption Ratings (2023-08-18),https://natural-resources.canada.ca/sites/nrca...,2022_fuel_consumption_ratings_uploaded_2023-08-18
8,2021 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...,2021_fuel_consumption_ratings_uploaded_2023-02-03
10,2020 Fuel Consumption Ratings (2023-02-03),https://natural-resources.canada.ca/sites/nrca...,2020_fuel_consumption_ratings_uploaded_2023-02-03
12,2019 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...,2019_fuel_consumption_ratings_uploaded_2021-09-29
14,2018 Fuel Consumption Ratings (2021-09-29),https://natural-resources.canada.ca/sites/nrca...,2018_fuel_consumption_ratings_uploaded_2021-09-29
16,2017 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...,2017_fuel_consumption_ratings_uploaded_2020-03-17
18,2016 Fuel Consumption Ratings (2020-03-17),https://natural-resources.canada.ca/sites/nrca...,2016_fuel_consumption_ratings_uploaded_2020-03-17


In [20]:
resources_df['url'][0], resources_df['filename'][0], 

('https://natural-resources.canada.ca/sites/nrcan/files/oee/files/csv/MY2012-2023%20Battery%20Electric%20Vehicles.csv',
 'battery-electric_vehicles_2012-2023_uploaded_2023-08-18')

In [21]:
path = Path.cwd()
path

PosixPath('/home/jsh/vehicle-co2')

In [24]:
filename = resources_df['filename'][0]

In [26]:
url = resources_df['url'][0]
url

'https://natural-resources.canada.ca/sites/nrcan/files/oee/files/csv/MY2012-2023%20Battery%20Electric%20Vehicles.csv'

In [28]:
raw_file_path = path / 'data' / 'raw' / f'{filename}.csv'
raw_file_path

PosixPath('/home/jsh/vehicle-co2/data/raw/battery-electric_vehicles_2012-2023_uploaded_2023-08-18.csv')

In [30]:
extract_raw_data(url, raw_file_path)

Response status: 200
Content Type: text/csv
csv file: /home/jsh/vehicle-co2/data/raw/battery-electric_vehicles_2012-2023_uploaded_2023-08-18.csv saved


In [38]:
# Download meta data and use JSON normalise.

In [None]:
# Adjust so that we have a list of URLs and we cycle through each of traditional, hybrid and electric.

In [10]:
# Declare file names
raw_path = path / 'data' / 'raw'
raw_file_name = raw_path / 'fuel-ratings-raw.csv'
merged_headers_file_name = raw_path / 'fuel-ratings-raw-headers-merged.csv'

In [11]:
extract_raw_data(url, raw_file_name)

Response status: 200
Content Type: text/csv
csv file: /home/jsh/vehicle-co2/data/raw/fuel-ratings-raw.csv saved


In [12]:
merge_top_two_rows(raw_file_name, merged_headers_file_name)

In [31]:
unprocessed_fuel_ratings_df = pd.read_csv(merged_headers_file_name)

  unprocessed_fuel_ratings_df = pd.read_csv(merged_headers_file_name)


In [32]:
unprocessed_fuel_ratings_df.dropna(axis=1, how='all', inplace=True)

In [33]:
unprocessed_fuel_ratings_df.dropna(axis=0, thresh=10, inplace=True)

In [34]:
unprocessed_fuel_ratings_df

Unnamed: 0,Model Year,Make,Model,Vehicle Class,Engine Size (L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Hwy (L/100 km),Comb (L/100 km),Comb (mpg),CO2 Emissions (g/km),CO2 Rating,Smog Rating
0,2023,Acura,Integra,Full-size,1.5,4.0,AV7,Z,7.9,6.3,7.2,39.0,167.0,6.0,7.0
1,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,AV7,Z,8.1,6.5,7.4,38.0,172.0,6.0,7.0
2,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,M6,Z,8.9,6.5,7.8,36.0,181.0,6.0,6.0
3,2023,Acura,MDX SH-AWD,SUV: Small,3.5,6.0,AS10,Z,12.6,9.4,11.2,25.0,263.0,4.0,5.0
4,2023,Acura,MDX SH-AWD Type S,SUV: Standard,3.0,6.0,AS10,Z,13.8,11.2,12.4,23.0,291.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828,2023,Volvo,XC40 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.2,7.9,9.2,31.0,215.0,5.0,5.0
829,2023,Volvo,XC60 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.3,8.2,9.4,30.0,218.0,5.0,5.0
830,2023,Volvo,XC60 B6 AWD,SUV: Small,2.0,4.0,AS8,Z,11.1,8.7,10.0,28.0,233.0,5.0,7.0
831,2023,Volvo,XC90 B5 AWD,SUV: Standard,2.0,4.0,AS8,Z,10.5,8.4,9.6,29.0,223.0,5.0,5.0


In [39]:
rename_columns(unprocessed_fuel_ratings_df)
unprocessed_fuel_ratings_df

Unnamed: 0,model_year,make,model,vehicle_class,engine_size_(l),cylinders,transmission,fuel_type,fuel_consumption_city_(l/100_km),hwy_(l/100_km),comb_(l/100_km),comb_(mpg),co2_emissions_(g/km),co2_rating,smog_rating
0,2023,Acura,Integra,Full-size,1.5,4.0,AV7,Z,7.9,6.3,7.2,39.0,167.0,6.0,7.0
1,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,AV7,Z,8.1,6.5,7.4,38.0,172.0,6.0,7.0
2,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,M6,Z,8.9,6.5,7.8,36.0,181.0,6.0,6.0
3,2023,Acura,MDX SH-AWD,SUV: Small,3.5,6.0,AS10,Z,12.6,9.4,11.2,25.0,263.0,4.0,5.0
4,2023,Acura,MDX SH-AWD Type S,SUV: Standard,3.0,6.0,AS10,Z,13.8,11.2,12.4,23.0,291.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828,2023,Volvo,XC40 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.2,7.9,9.2,31.0,215.0,5.0,5.0
829,2023,Volvo,XC60 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.3,8.2,9.4,30.0,218.0,5.0,5.0
830,2023,Volvo,XC60 B6 AWD,SUV: Small,2.0,4.0,AS8,Z,11.1,8.7,10.0,28.0,233.0,5.0,7.0
831,2023,Volvo,XC90 B5 AWD,SUV: Standard,2.0,4.0,AS8,Z,10.5,8.4,9.6,29.0,223.0,5.0,5.0


In [42]:
unprocessed_fuel_ratings_df['make'].str.lower()

AttributeError: 'Series' object has no attribute 'lower'

In [None]:
clean_fuel_ratings_df = clean_dataframe(unprocessed_fuel_ratings_df)

In [88]:
def extract_first_table(file_path):
    try:
        with open(file_path, 'r', newline='') as csvfile:
            reader = csv.reader(csvfile)
            rows = list(reader)

        # Find index of first empty row
        empty_row_idx = -1
        for i, row in enumerate(rows):
            if not any(row):
                empty_row_idx = i
                break
                
        if empty_row_idx >= 0:
        # Create a new CSV file with only the rows before the first empty row
            with open('test7.csv', 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows(rows[:empty_row_idx])
            print(f'Removed rows after the first empty row.')
        else:
            print(f'No empty row found in the file')
    except FileNotFoundError:
        print(f'File {file_path} not found.')
    except Exception as e:
        print('An error occured while processing the file:', e)

In [89]:
# extract_first_table(file_path)

Removed rows after the first empty row.


In [None]:
# Export notebook to module.
from nbdev.export import nb_export
path = Path.cwd()
lib_path = path
nb_path = path / 'datadownload.ipynb'
nb_export(nb_path, lib_path)