## Data Wrangling

In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Root data folder. Set the CAPSTONE_DATA_DIR environment variable to run the
# notebook on another machine; the original local path remains the default.
folder = os.environ.get(
    'CAPSTONE_DATA_DIR',
    'C:/Users/Laura GF/Documents/GitHub/machine-learning-capstone/data/',
)
# The sub-folder paths below are built by plain string concatenation, so the
# root must end with a path separator.
if not folder.endswith(('/', '\\')):
    folder += '/'
raw_data = f'{folder}raw-data/'
clean_data = f'{folder}clean-data/'


### Topic: Fuel consumption ratings (Natural Resources Canada)

Site: https://open.canada.ca/data/en/dataset/98f1a129-f628-4ce4-b24d-6f16bf24dd64

In [3]:
# Load the cleaned fuel-consumption ratings table from the clean-data folder.
fuel_df = pd.read_csv(
    Path(clean_data).joinpath("1995_2022_vehicle_fuel_consumption.csv")
)

In [7]:
# Mean co2_rating over the rows whose make_ equals "ford".
fuel_df.loc[fuel_df['make_'] == "ford", 'co2_rating'].mean()

4.240625

### Topic: Number of cars sold in Canada by make and model

Site: https://www.goodcarbadcar.net/


The data was scraped from the site with Scrapy and stored as one JSON file per year (for example `2021_canada_vehicle_sales.json`).

In [35]:
# Three-letter month labels mapped to zero-padded month numbers. The keys are
# expected to match the month column names of the scraped JSON files.
month_dic = {
    'jan': "01",
    'feb': "02",
    'mar': "03",
    'apr': "04",
    'may': "05",
    'jun': "06",
    'jul': "07",
    'aug': "08",
    'sep': "09",
    'oct': "10",
    'nov': "11",
    'dec': "12",
}


def _split_make_model(model_names):
    """Split "<make> <model>" strings on the first space into `make` and `model_` columns."""
    # `n` is passed by keyword: pandas 2.x no longer accepts it positionally.
    # NOTE(review): a make written as two words (e.g. "Land Rover") is split
    # after its first word, exactly as in the original implementation.
    parts = model_names.str.split(' ', n=1, expand=True)
    # Guarantee both columns exist even when no name contains a space.
    parts = parts.reindex(columns=[0, 1])
    return parts.rename(columns={0: 'make', 1: 'model_'})


def process_json_car_sales(json_filen_name, path) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Read one yearly car-sales JSON file and return it in long and wide format.

    The file must contain a "model" column holding "<make> <model>" strings and
    one column per month whose name is a key of `month_dic`. The year is taken
    from the part of the file name before the first underscore.

    Parameters:
        json_filen_name (str or Path): Name of the JSON file, e.g. "2021_canada_vehicle_sales.json"
        path (str or Path): Path to the folder where the JSON file is located
    Returns:
        df_expanded_long (pd.DataFrame): One row per model and month with the columns
            number_units_sold (int), make, model_ and date (first day of the month).
            Models whose month values are all missing are dropped.
        df_expanded_wide (pd.DataFrame): The original month columns plus make, model_
            and year (as a string); no rows are dropped.
    Raises:
        ValueError: If a month column name is not a key of `month_dic`, or if a
            sales value is missing or cannot be converted to an integer.
    """
    file_name = Path(json_filen_name).name
    # Use the base name so a directory component in the argument cannot leak into the year.
    year = file_name.split("_")[0]

    # Read the file once; both output formats are derived from the same frame.
    raw_df = pd.read_json(Path(path, json_filen_name))

    # Wide format
    df_expanded_wide = raw_df.join(_split_make_model(raw_df['model'])).drop(columns=["model"])
    df_expanded_wide['year'] = year

    # Long format: one row per (model, month), ordered by model and then by month column.
    json_df = raw_df.set_index("model").dropna(how="all")
    long_format_df = (
        pd.DataFrame(json_df.T.unstack())
        .reset_index()
        .rename(columns={"level_1": "month", 0: "number_units_sold"})
    )
    df_expanded_long = (
        long_format_df
        .join(_split_make_model(long_format_df['model']))
        .drop(columns=["model"])
    )

    # Translate month labels to numbers and fail loudly on anything unexpected
    # instead of surfacing later as an opaque NaN-to-int cast error.
    month_numbers = df_expanded_long['month'].map(month_dic)
    unknown_mask = month_numbers.isna()
    if unknown_mask.any():
        unknown = sorted(set(df_expanded_long.loc[unknown_mask, 'month'].astype(str)))
        raise ValueError(f"Unrecognised month column(s) in {file_name}: {unknown}")

    # Remove thousands separators. read_json may already have parsed a month
    # column as numbers when none of its values contains a comma, so only
    # string values are touched.
    units = df_expanded_long['number_units_sold'].map(
        lambda value: value.replace(",", "") if isinstance(value, str) else value
    )
    df_expanded_long['number_units_sold'] = units.astype('int')

    # Combine year and month into a datetime column (first day of each month).
    df_expanded_long['date'] = pd.to_datetime(year + "-" + month_numbers, format='%Y-%m')
    df_expanded_long = df_expanded_long.drop(columns=['month'])

    return df_expanded_long, df_expanded_wide


In [37]:
# Parse each yearly scrape into its (long, wide) pair of frames.
long_format_2021_sep, df_2021 = process_json_car_sales(
    json_filen_name="2021_canada_vehicle_sales.json",
    path=raw_data,
)
long_format_2020_sep, df_2020 = process_json_car_sales(
    json_filen_name="2020_canada_vehicle_sales.json",
    path=raw_data,
)
long_format_2019_sep, df_2019 = process_json_car_sales(
    json_filen_name="2019_canada_vehicle_sales.json",
    path=raw_data,
)

In [33]:
# Load the long-format sales table from the clean-data folder.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, presumably an
# index saved with the CSV — consider index_col=0 after checking downstream cells.
df_all_sales = pd.read_csv(
    Path(clean_data).joinpath("long_format_car_sales.csv")
)

In [34]:
# Display the sales table read from long_format_car_sales.csv.
df_all_sales

Unnamed: 0,number_units_sold,make,model_,date
0,86,Acura,ILX,2019-01-01
1,56,Acura,ILX,2019-02-01
2,218,Acura,ILX,2019-03-01
3,286,Acura,ILX,2019-04-01
4,209,Acura,ILX,2019-05-01
...,...,...,...,...
10615,364,Volvo,XC90,2021-08-01
10616,220,Volvo,XC90,2021-09-01
10617,192,Volvo,XC90,2021-10-01
10618,249,Volvo,XC90,2021-11-01


### Topic: New motor vehicle registrations, quarterly (Statistics Canada)

Site: https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2010002401



### Topic: New zero-emission vehicle registrations, quarterly (Statistics Canada)

https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2010002501

### Topic: Sales of fuel used for road motor vehicles, annual (Statistics Canada)

https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2310006601

### Topic: Vehicle registrations, by type of vehicle (Statistics Canada)

https://open.canada.ca/data/en/dataset/9aea572f-f54f-42a1-b411-0b06390ed9f9