In [None]:
"""
Read in raw data from .csv and give an overview about missing values and data types.
"""

import pandas as pd
from config import PRE_ANALYSIS_FILE, DENSITY_THRESHOLD, COLS_PRE_DROP

In [None]:
df = pd.read_csv(PRE_ANALYSIS_FILE)

print(f"Dimension: {df.shape}")
print(f"Columns: {df.columns}")
print(f"Data types:\n{df.dtypes}")

  df = pd.read_csv(FIRST_ANALYSIS_FILE)


Dimension: (10734898, 40)
Columns: Index(['ID', 'Country', 'VFN', 'Mp', 'Mh', 'Man', 'MMS', 'Tan', 'T', 'Va',
       'Ve', 'Mk', 'Cn', 'Ct', 'Cr', 'r', 'm (kg)', 'Mt', 'Enedc (g/km)',
       'Ewltp (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'Ft', 'Fm',
       'ec (cm3)', 'ep (KW)', 'z (Wh/km)', 'IT', 'Ernedc (g/km)',
       'Erwltp (g/km)', 'De', 'Vf', 'Status', 'year', 'Date of registration',
       'Fuel consumption ', 'ech', 'RLFI', 'Electric range (km)'],
      dtype='object')
Data types:
ID                        int64
Country                  object
VFN                      object
Mp                       object
Mh                       object
Man                      object
MMS                     float64
Tan                      object
T                        object
Va                       object
Ve                       object
Mk                       object
Cn                       object
Ct                       object
Cr                       object
r                     

In [3]:
df.head(5)

Unnamed: 0,ID,Country,VFN,Mp,Mh,Man,MMS,Tan,T,Va,...,Erwltp (g/km),De,Vf,Status,year,Date of registration,Fuel consumption,ech,RLFI,Electric range (km)
0,132193881,DE,IP-MQB37SZ_A3_1036-WVW-1,VOLKSWAGEN,VOLKSWAGEN,VOLKSWAGEN AG,,E13*2007/46*1845*26,A1,DXDBX0AC4,...,1.17,,,F,2023,2023-03-14,6.3,,RL-MQ281_6F_20_001-WVW-1,
1,132193882,DE,IP-03_356_0299-ZFA-1,STELLANTIS,STELLANTIS EUROPE,STELLANTIS EUROPE SPA,,E3*2007/46*0373*33,356,HXS12,...,1.35,,,F,2023,2023-01-27,5.2,,RL-03_BU_334_0112-1C4-1,
2,132193883,DE,IP-MQB37SZ_A0_0564-WVW-1,VOLKSWAGEN,VOLKSWAGEN,VOLKSWAGEN AG,,E13*2007/46*1845*27,A1,DLAAX0AE2,...,1.17,,,F,2023,2023-05-15,6.6,,RL-MQ200_6F_18_019-WVW-1,
3,132193884,DE,IP-0000667-WBA-1,BMW,BMW AG,BAYERISCHE MOTOREN WERKE AG,,E1*2007/46*2063*05,FML2E,11DJ,...,,,,F,2023,2023-11-10,,,RL-0100492-WBA-1,227.0
4,132193885,DE,IP-MEB31AZ_A0_1902-WVW-1,VOLKSWAGEN,VOLKSWAGEN,VOLKSWAGEN AG,,E1*2018/858*00004*12,E2,4ACX1EBL1GX1,...,,,,F,2023,2023-08-10,,,RL-EQ151_1K_21_001-WVW-1,491.0


In [4]:
# Give overview about missing percentages for each variable
missing_percentage = df.isna().sum() / len(df)
print(missing_percentage)

ID                      0.000000
Country                 0.000000
VFN                     0.012152
Mp                      0.089450
Mh                      0.000000
Man                     0.000000
MMS                     1.000000
Tan                     0.003023
T                       0.000544
Va                      0.002753
Ve                      0.003527
Mk                      0.000029
Cn                      0.000036
Ct                      0.001231
Cr                      0.000000
r                       0.000000
m (kg)                  0.000017
Mt                      0.015047
Enedc (g/km)            1.000000
Ewltp (g/km)            0.001245
W (mm)                  1.000000
At1 (mm)                1.000000
At2 (mm)                1.000000
Ft                      0.000000
Fm                      0.000000
ec (cm3)                0.155570
ep (KW)                 0.004721
z (Wh/km)               0.773027
IT                      0.349143
Ernedc (g/km)           1.000000
Erwltp (g/

In [6]:
cols_to_be_dropped = list()

for col, percentage in missing_percentage.items():
    if percentage > DENSITY_THRESHOLD:
        cols_to_be_dropped.append(col)

print(f"Columns to be dropped due to availability below threshold: {cols_to_be_dropped}")

Columns to be dropped due to availability below threshold: ['MMS', 'Enedc (g/km)', 'W (mm)', 'At1 (mm)', 'At2 (mm)', 'z (Wh/km)', 'Ernedc (g/km)', 'De', 'Vf', 'RLFI', 'Electric range (km)']


In [7]:
# compare this with config setup for cross checks
for col in cols_to_be_dropped:
    if not col in COLS_PRE_DROP:
        print(f"Column {col} should be dropped based on threshold, but is not in config setup.")

Column z (Wh/km) should be dropped based on threshold, but is not in config setup.
Column Electric range (km) should be dropped based on threshold, but is not in config setup.


The next step is to generate datasets following a common standard (names, types, etc.). The knowledge of this notebook will be used in 1_1-prep_database_file_generator.ipynb

---

### Data Assessment Summary

Based on the initial exploration, we identified the following issues:

#### 1️⃣ **Column Name Inconsistencies**

Some column names do not fully match the table description, which may cause confusion and require renaming:

- `"Country"` ≠ `"MS"`
- `"Electric Range"` ≠ `"Zr"`
- `"Fuel Consumption"` ≠ `"Fc"`
- `"r"` ≠ `"R"`
- `"m (kg)"` ≠ `"M (kg)"`
- `"ec (cm³)"` ≠ `"Ec (cm³)"`
- `"ep (KW)"` ≠ `"Ep (KW)"`
- `"z (Wh/km)"` ≠ `"Z (Wh/km)"`
- `"year"` ≠ `"Year"`

#### 2️⃣ **Columns with Many Missing Values**

Several columns appear to be **completely empty** or contain mostly missing data, requiring verification before further processing:

- `"MMS"`
- `"Enedc (g/km)"`
- `"W (mm)"`
- `"At1 (mm)"`
- `"At2 (mm)"`
- `"Ernedc (g/km)"`
- `"De"`
- `"Vf"`

#### 3️⃣ **Redundant or Less Informative Variables**

Certain columns provide little additional information compared to other, more relevant columns. These might be considered for removal:

- Less informative: `"ID"`, `"Mp"`, `"VFN"`, `"Mk"`, `"Man"`, `"Tan"`, `"Va"`, `"Ve"`, `"Cr"`
- More relevant alternatives: `"T"`, `"Mh"`, `"Cn"`, `"CT"`

#### 4️⃣ **Potentially Constant Columns**

The variable `"r"` appears to always be equal to `1`, which suggests it may not be useful for analysis.

#### 5️⃣ **Deprecated Variables (Relevant Only Until 2016)**

The following columns contain values that are no longer meaningful after 2016 and may be excluded from the analysis:

- `"E (g/km)"`
- `"Er (g/km)"`

#### 6️⃣ **Metadata Columns**

The following columns contain metadata rather than analytical data and should be treated separately:

- `"Status"`
- `"Version_file"`

#### 7️⃣ **Redundant Information**

- `"Year"` and `"Dr"` provide the same information, making one of them unnecessary.