In [1]:
#|default_exp datadownload

In [36]:
from pathlib import Path
import requests
import csv
from io import StringIO
from rich import inspect
import pandas as pd

# Data download

In [3]:
def extract_metadata():
    """
    Placeholder for extracting metadata about the fuel emission datasets.

    Not implemented yet: the function takes no arguments, performs no work
    and returns ``None``.

    """
    return None

In [33]:
#|export
def extract_raw_data(url, file_name, timeout=30):
    """
    Download the resource at a URL and save its text to a file.

    The response status and content type are printed, then the decoded
    response body is written to ``file_name``. Request failures (including
    4xx/5xx responses) are reported with ``print`` instead of being raised,
    and in that case nothing is written to disk. The function returns
    ``None`` in every case.

    Parameters
    ----------
    url : str
        URL to extract data from

    file_name : str or Path object
        file name for raw data dump; missing parent directories are created

    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30.

    """
    # Imported locally so the function is self-contained when nbdev exports it:
    # only cells marked ``#|export`` are written to the generated module.
    from pathlib import Path
    import requests

    try:
        # Request data from url; without a timeout a stalled server blocks forever
        response = requests.get(url, timeout=timeout)
        # .get() avoids a KeyError when the server sends no content-type header
        content_type = response.headers.get('content-type')
        print(f'Response status: {response.status_code}\nContent Type: {content_type}')

        # Turn 4xx/5xx responses into HTTPError so an error page is never
        # saved as if it were the dataset
        response.raise_for_status()

    # Catch errors
    except requests.exceptions.HTTPError as err_h:
        print(f'HTTP error occured:{err_h}')
        return
    except requests.exceptions.ConnectionError as err_c:
        print(f'Error connecting:{err_c}')
        return
    except requests.exceptions.Timeout as err_t:
        print(f'Timeout Error:{err_t}')
        return
    except requests.exceptions.RequestException as err:
        print(f'There was an unknown error:{err}')
        return

    # Save request content to csv file; open() does not create directories,
    # so make sure the target folder exists first
    destination = Path(file_name)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with open(destination, mode='w', newline='') as csvfile:
        csvfile.write(response.text)

    print(f'csv file: {file_name} saved')

In [8]:
def merge_top_two_rows(input_file, output_file):
    """
    Collapse the first two rows of a CSV file into a single header row.

    Each merged column name is the first-row cell and the second-row cell
    joined by one space. Cells are stripped first and empty halves are
    skipped, so no name starts or ends with a stray space. When the two rows
    have different lengths the shorter one is padded with empty cells, so no
    column is lost. All remaining rows are copied unchanged.

    Parameters
    ----------
    input_file : str or Path object
        CSV file whose header is spread over its first two rows

    output_file : str or Path object
        CSV file to write with the merged single-row header

    Raises
    ------
    ValueError
        If ``input_file`` contains fewer than two rows. The output file is
        not created in that case.

    """
    # Open the input CSV file for reading
    with open(input_file, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile)

        # Read the first two rows; a default avoids leaking StopIteration
        header_row = next(reader, None)
        second_row = next(reader, None)
        if header_row is None or second_row is None:
            raise ValueError(f'{input_file} needs at least two rows to build a merged header')

        # Pad the shorter row so every column of either row gets a name
        width = max(len(header_row), len(second_row))
        header_row = header_row + [''] * (width - len(header_row))
        second_row = second_row + [''] * (width - len(second_row))

        # Merge the two rows into one header, skipping empty halves
        merged_header = [
            ' '.join(part for part in (top.strip(), bottom.strip()) if part)
            for top, bottom in zip(header_row, second_row)
        ]

        # Open the output CSV file for writing
        with open(output_file, mode='w', newline='') as output_csvfile:
            writer = csv.writer(output_csvfile)

            # Write the merged header, then copy the rest of the rows as they are
            writer.writerow(merged_header)
            writer.writerows(reader)

In [9]:
def rename_columns(df, min_non_null=5):
    """
    Rename dataframe columns so they are database friendly.

    The frame is modified in place in three steps:

    1. columns that are entirely empty are dropped;
    2. rows with fewer than ``min_non_null`` non-null values are dropped;
    3. every column name is converted to text, stripped, lower-cased, and each
       run of characters other than ``a-z`` / ``0-9`` is replaced by a single
       underscore (leading and trailing underscores are removed), e.g.
       ``'CO2 Emissions (g/km)'`` becomes ``'co2_emissions_g_km'``.

    Distinct headers can collapse to the same name; no de-duplication is
    attempted.

    Parameters
    ----------
    df : pandas.DataFrame
        frame to tidy; it is mutated in place

    min_non_null : int, optional
        minimum number of non-null values a row needs to be kept.
        Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        the same ``df`` object, returned for convenience

    """
    # Drop empty columns and sparse rows from the DataFrame
    df.dropna(axis=1, how='all', inplace=True)
    df.dropna(axis=0, thresh=min_non_null, inplace=True)

    # astype(str) keeps the .str accessor usable for non-string labels
    df.columns = (
        df.columns.astype(str)
        .str.strip()
        .str.lower()
        .str.replace(r'[^0-9a-z]+', '_', regex=True)
        .str.strip('_')
    )
    return df

In [10]:
def clean_dataframe(df=None):
    """
    Clean the fuel consumption dataframe.

    Placeholder: no cleaning steps are implemented yet, so the argument is
    handed back unchanged. The parameter is optional so that the function can
    still be called without arguments, in which case it returns ``None``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        frame to clean

    Returns
    -------
    pandas.DataFrame or None
        ``df`` exactly as it was passed in

    """
    # TODO: implement the actual cleaning steps
    return df

## Script

In [11]:
# Download source: the 'MY2023 Fuel Consumption Ratings.csv' file on natural-resources.canada.ca
url = 'https://natural-resources.canada.ca/sites/nrcan/files/oee/files/csv/MY2023%20Fuel%20Consumption%20Ratings.csv'

In [12]:
# Base directory for the data paths built below: the current working directory
path = Path.cwd()
path

PosixPath('/home/jsh/vehicle-co2')

In [13]:
# Where the raw download and its header-merged copy live (both under path/data/raw)
raw_path = path.joinpath('data', 'raw')
raw_file_name = raw_path.joinpath('fuel-ratings-raw.csv')
merged_headers_file_name = raw_path.joinpath('fuel-ratings-raw-headers-merged.csv')

In [32]:
# Download the ratings CSV and write it to raw_file_name
extract_raw_data(url, raw_file_name)

Response status: 200
Content Type: text/csv
csv file: /home/jsh/vehicle-co2/data/raw/fuel-ratings-raw.csv saved


In [34]:
# Fold the first two rows of the raw file into one header row, written to a second file
merge_top_two_rows(raw_file_name, merged_headers_file_name)

In [41]:
# Load the header-merged CSV into the frame used by the following cells.
# NOTE(review): the saved output under this cell looks like the tail of a pandas
# warning — confirm whether read_csv needs explicit dtypes for this file.
unprocessed_fuel_ratings_df = pd.read_csv(merged_headers_file_name)

  unprocessed_fuel_ratings_df = pd.read_csv(merged_headers_file_name)


In [48]:
# Discard columns that contain no values at all
unprocessed_fuel_ratings_df = unprocessed_fuel_ratings_df.dropna(axis=1, how='all')

In [50]:
# Keep only the rows that have at least 10 non-null values
unprocessed_fuel_ratings_df = unprocessed_fuel_ratings_df.dropna(axis=0, thresh=10)

In [51]:
# Inspect the frame after the empty columns and sparse rows were dropped
unprocessed_fuel_ratings_df

Unnamed: 0,Model Year,Make,Model,Vehicle Class,Engine Size (L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Hwy (L/100 km),Comb (L/100 km),Comb (mpg),CO2 Emissions (g/km),CO2 Rating,Smog Rating
0,2023,Acura,Integra,Full-size,1.5,4.0,AV7,Z,7.9,6.3,7.2,39.0,167.0,6.0,7.0
1,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,AV7,Z,8.1,6.5,7.4,38.0,172.0,6.0,7.0
2,2023,Acura,Integra A-SPEC,Full-size,1.5,4.0,M6,Z,8.9,6.5,7.8,36.0,181.0,6.0,6.0
3,2023,Acura,MDX SH-AWD,SUV: Small,3.5,6.0,AS10,Z,12.6,9.4,11.2,25.0,263.0,4.0,5.0
4,2023,Acura,MDX SH-AWD Type S,SUV: Standard,3.0,6.0,AS10,Z,13.8,11.2,12.4,23.0,291.0,4.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
828,2023,Volvo,XC40 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.2,7.9,9.2,31.0,215.0,5.0,5.0
829,2023,Volvo,XC60 B5 AWD,SUV: Small,2.0,4.0,AS8,Z,10.3,8.2,9.4,30.0,218.0,5.0,5.0
830,2023,Volvo,XC60 B6 AWD,SUV: Small,2.0,4.0,AS8,Z,11.1,8.7,10.0,28.0,233.0,5.0,7.0
831,2023,Volvo,XC90 B5 AWD,SUV: Standard,2.0,4.0,AS8,Z,10.5,8.4,9.6,29.0,223.0,5.0,5.0


In [None]:
# Works on the frame in place; the call's return value is not used here
rename_columns(unprocessed_fuel_ratings_df)

In [None]:
# Run the cleaning step and bind its result to a separate name
clean_fuel_ratings_df = clean_dataframe(unprocessed_fuel_ratings_df)

In [88]:
def extract_first_table(file_path, output_path='test7.csv'):
    """
    Copy the rows that precede the first empty row of a CSV file to a new file.

    A row counts as empty when it has no cells or when every cell is an empty
    string. Reading stops at the first such row. If the input contains no
    empty row, nothing is written. Problems are reported with ``print``
    rather than raised, and the function returns ``None``.

    Parameters
    ----------
    file_path : str or Path object
        CSV file to read

    output_path : str or Path object, optional
        CSV file to write the first table to. Defaults to ``'test7.csv'``
        in the current working directory.

    """
    try:
        first_table = []
        found_empty_row = False

        # Stream the file and stop at the first empty row instead of loading
        # every row into memory first
        with open(file_path, 'r', newline='') as csvfile:
            for row in csv.reader(csvfile):
                if not any(row):
                    found_empty_row = True
                    break
                first_table.append(row)

        if found_empty_row:
            # Create a new CSV file with only the rows before the first empty row
            with open(output_path, 'w', newline='') as csvfile:
                csv.writer(csvfile).writerows(first_table)
            print('Removed rows after the first empty row.')
        else:
            print('No empty row found in the file')
    except FileNotFoundError as err:
        # err.filename names whichever path was missing (input file or output folder)
        print(f'File {err.filename} not found.')
    except Exception as e:
        # Deliberate best-effort: report the problem instead of raising
        print('An error occured while processing the file:', e)

In [89]:
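# Disabled example call: `file_path` is not defined in the cells above, so set it before uncommenting.
# extract_first_table(file_path)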

Removed rows after the first empty row.


In [None]:
# Write the cells marked for export in this notebook out as a Python module.
from nbdev.export import nb_export

# The notebook and the generated module both live in the current working directory
path = Path.cwd()
lib_path = path
nb_path = path.joinpath('datadownload.ipynb')
nb_export(nb_path, lib_path)