In [None]:
"""
Create the database file with a common configuration (variable naming, column selection) for the different datasets.
For now we only use *.csv exports from the EU database:
https://www.eea.europa.eu/en/datahub/datahubitem-view/fa8b1229-3db6-495d-b18e-9c9b3267c02b?activeAccordion=

This notebook needs to be executed for each dataset we want to use later in the data analysis and model training process.
"""

import pandas as pd
from config import COLS_PRE_DROP, COLS_MAPPER, RAW_DATA_FILES, DENSITY_THRESHOLD, raw_csv_to_proc_csv_converter

In [4]:
# Read the raw export selected from the RAW_DATA_FILES array in config
# (update the index to process a different file).
# In a real application we'd consider an eu_preprocessing_pipeline, e.g. through a class representation.
file = RAW_DATA_FILES[0]
# low_memory=False lets pandas infer every column's dtype from the whole file instead of
# chunk by chunk. NOTE(review): the warning echoed below this cell is presumably the
# mixed-type DtypeWarning that this option addresses - confirm on the next run.
# It costs more memory while parsing; pass an explicit dtype mapping if that becomes a problem.
df = pd.read_csv(file, low_memory=False)

  df = pd.read_csv(file)


In [5]:
# Summarize the raw frame (number of entries, column names and dtypes) before any cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10734898 entries, 0 to 10734897
Data columns (total 40 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Country               object 
 2   VFN                   object 
 3   Mp                    object 
 4   Mh                    object 
 5   Man                   object 
 6   MMS                   float64
 7   Tan                   object 
 8   T                     object 
 9   Va                    object 
 10  Ve                    object 
 11  Mk                    object 
 12  Cn                    object 
 13  Ct                    object 
 14  Cr                    object 
 15  r                     int64  
 16  m (kg)                float64
 17  Mt                    float64
 18  Enedc (g/km)          float64
 19  Ewltp (g/km)          float64
 20  W (mm)                float64
 21  At1 (mm)              float64
 22  At2 (mm)              float64
 23  Ft   

In [6]:
# Set the index to the EU table id.
# The guard keeps this cell re-runnable: once "ID" has become the index it is no longer a
# column, so repeating the duplicate check / set_index would raise a KeyError.
# If "ID" is neither the index nor a column, the lookup below still raises a KeyError.
if df.index.name != "ID":
    if not df["ID"].is_unique:
        # ValueError is a subclass of Exception, so existing `except Exception` handlers still work
        raise ValueError("Found duplicates in the ID column. Please check the data.")
    df = df.set_index("ID")

In [7]:
# Remove every column listed in config COLS_PRE_DROP (learnings from data exploration)
df = df.drop(columns=COLS_PRE_DROP)

In [8]:
# Apply the column names defined in config COLS_MAPPER, then display the resulting columns
df = df.rename(columns=COLS_MAPPER)
df.columns

Index(['member_state', 'manufacturer_name_eu', 'vehicle_type',
       'commercial_name', 'eu_category', 'mass_vehicle', 'weltp_test_mass',
       'specific_co2_emissions', 'fuel_type', 'fuel_mode', 'engine_capacity',
       'engine_power', 'electric_energy_consumption',
       'innovative_technologies', 'erwltp', 'year', 'fuel_consumption',
       'electric_range'],
      dtype='object')

In [9]:
# Overview of the share of missing values per variable
# (stored as a fraction of all rows, not multiplied by 100)
missing_percentage = df.isna().mean()
print(missing_percentage)

member_state                   0.000000
manufacturer_name_eu           0.000000
vehicle_type                   0.000544
commercial_name                0.000036
eu_category                    0.001231
mass_vehicle                   0.000017
weltp_test_mass                0.015047
specific_co2_emissions         0.001245
fuel_type                      0.000000
fuel_mode                      0.000000
engine_capacity                0.155570
engine_power                   0.004721
electric_energy_consumption    0.773027
innovative_technologies        0.349143
erwltp                         0.353081
year                           0.000000
fuel_consumption               0.175966
electric_range                 0.773778
dtype: float64


In [10]:
# Threshold-based deletion: a column is dropped when its missing share exceeds
# DENSITY_THRESHOLD, unless its name is one of the COLS_MAPPER target names
# (those are treated as an indication of interesting variables).
# Many variables/cols were already dropped through the COLS_PRE_DROP step.
cols_drop_ignore = list(COLS_MAPPER.values())
cols_to_be_dropped = [
    name
    for name, share in missing_percentage.items()
    if share > DENSITY_THRESHOLD and name not in cols_drop_ignore
]

print(f"Dropping: {missing_percentage[cols_to_be_dropped]}")

df = df.drop(columns=cols_to_be_dropped)

Dropping: Series([], dtype: float64)


In [11]:
# Split variables into categorical and numerical
cat_vars = df.select_dtypes(include="object")
num_vars = df.select_dtypes(include=["float64", "int64"])

# Reorder columns: categorical vars first, numerical vars next, targets at the end
potential_targets = ['electric_energy_consumption', 'fuel_consumption', 'specific_co2_emissions']

# Only targets that exist in this dataset can be moved. Appending a target that is
# missing from df (e.g. dropped earlier or absent from this export) would make the
# .loc selection below fail with a KeyError.
targets_present = [target for target in potential_targets if target in df.columns]

# Categorical columns, then numerical columns, with the targets taken out ...
col_order = [
    col
    for col in [*cat_vars.columns, *num_vars.columns]
    if col not in targets_present
]
# ... and re-added at the end, in the order given by potential_targets
col_order.extend(targets_present)

# Apply the new column order to df
df = df.loc[:, col_order]

In [12]:
# Preview the first five rows of the cleaned and reordered dataset
df.head(n=5)

Unnamed: 0_level_0,member_state,manufacturer_name_eu,vehicle_type,commercial_name,eu_category,fuel_type,fuel_mode,innovative_technologies,mass_vehicle,weltp_test_mass,engine_capacity,engine_power,erwltp,year,electric_range,electric_energy_consumption,fuel_consumption,specific_co2_emissions
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
132193881,DE,VOLKSWAGEN,A1,T-ROC,M1,petrol,M,e13 29,1345.0,1477.0,1498.0,110.0,1.17,2023,,,6.3,143.0
132193882,DE,STELLANTIS EUROPE,356,FIAT TIPO,M1,petrol,H,e3 32,1425.0,1506.0,1469.0,96.0,1.35,2023,,,5.2,118.0
132193883,DE,VOLKSWAGEN,A1,T-ROC,M1,petrol,M,e13 29,1496.0,1595.0,999.0,81.0,1.17,2023,,,6.6,150.0
132193884,DE,BMW AG,FML2E,COOPER SE,M1,electric,E,,1440.0,1557.0,,135.0,,2023,227.0,157.0,,0.0
132193885,DE,VOLKSWAGEN,E2,ID4 GTX 220 KW,M1,electric,E,,2239.0,2409.0,,220.0,,2023,491.0,179.0,,0.0


In [None]:
# Export the DataFrame in .csv format (index written as well);
# the output path is whatever raw_csv_to_proc_csv_converter returns for the raw file
export_file_name = raw_csv_to_proc_csv_converter(file)
df.to_csv(path_or_buf=export_file_name, index=True)

In [16]:
# Column -> dtype mapping; use this output to update config.py DATABASE_FILE_DTYPES:
#   dtype('O')       -> 'object'
#   dtype('float64') -> 'float64'
#   dtype('int64')   -> 'int64'
dict(df.dtypes.items())

{'member_state': dtype('O'),
 'manufacturer_name_eu': dtype('O'),
 'vehicle_type': dtype('O'),
 'commercial_name': dtype('O'),
 'eu_category': dtype('O'),
 'fuel_type': dtype('O'),
 'fuel_mode': dtype('O'),
 'innovative_technologies': dtype('O'),
 'mass_vehicle': dtype('float64'),
 'weltp_test_mass': dtype('float64'),
 'engine_capacity': dtype('float64'),
 'engine_power': dtype('float64'),
 'erwltp': dtype('float64'),
 'year': dtype('int64'),
 'electric_range': dtype('float64'),
 'electric_energy_consumption': dtype('float64'),
 'fuel_consumption': dtype('float64'),
 'specific_co2_emissions': dtype('float64')}