In [1]:
from databricks.connect import DatabricksSession

# Location of the raw CO2 passenger-car emissions feed: base path of the raw
# layer, the feed's folder name, and the full path built from the two.
raw_layer_base_path = 'dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/raw'
co2_emissions_feed  = 'co2_passenger_cars_emissions'
co2_emissions_raw_path = f'{raw_layer_base_path}/{co2_emissions_feed}'

# Obtain (or reuse) a Databricks Connect session configured by the "DEFAULT" profile.
spark = DatabricksSession.builder.profile("DEFAULT").getOrCreate()

# Load the feed as JSON with the multiline option enabled, so a single JSON
# document may span several lines of a file.
json_reader = spark.read.option('multiline', 'true')
df_co2_emissions = json_reader.json(co2_emissions_raw_path)

In [2]:
# Replace spaces in column names with underscores (“_”). Additionally, remove parentheses from column names.
import re
import pyspark.sql.functions as F

# Snapshot the incoming column names once; they drive both the log line and the rename.
co2_emissions_columns = df_co2_emissions.columns
print(f"Original Column names:: {co2_emissions_columns}")

# Build one aliased column per original column: spaces become underscores first,
# then every "(" and ")" is stripped. Other characters (e.g. "/") are left as is.
renamed_columns = []
for original_name in co2_emissions_columns:
    cleaned_name = re.sub('[()]', '', original_name.replace(' ', '_'))
    renamed_columns.append(F.col(original_name).alias(cleaned_name))

# Project the same columns, in the same order, under their cleaned names.
df_co2_emissions = df_co2_emissions.select(renamed_columns)

print(f"Updated Column names:: {df_co2_emissions.columns}")

Original Column names:: ['At1 (mm)', 'At2 (mm)', 'Cn', 'Cr', 'Ct', 'De', 'E (g/km)', 'Enedc (g/km)', 'Er (g/km)', 'Ernedc (g/km)', 'Erwltp (g/km)', 'Ewltp (g/km)', 'Fm', 'Ft', 'ID', 'It', 'MMS', 'MS', 'Man', 'Mh', 'Mk', 'Mp', 'Mt', 'Status', 'T', 'TAN', 'VFN', 'Va', 'Ve', 'Vf', 'W (mm)', 'Zr', 'ec (cm3)', 'ep (KW)', 'm (kg)', 'r', 'version_file', 'year', 'z (Wh/km)']
Updated Column names:: ['At1_mm', 'At2_mm', 'Cn', 'Cr', 'Ct', 'De', 'E_g/km', 'Enedc_g/km', 'Er_g/km', 'Ernedc_g/km', 'Erwltp_g/km', 'Ewltp_g/km', 'Fm', 'Ft', 'ID', 'It', 'MMS', 'MS', 'Man', 'Mh', 'Mk', 'Mp', 'Mt', 'Status', 'T', 'TAN', 'VFN', 'Va', 'Ve', 'Vf', 'W_mm', 'Zr', 'ec_cm3', 'ep_KW', 'm_kg', 'r', 'version_file', 'year', 'z_Wh/km']


In [3]:
# Remove rows in which every column is null; a row with at least one
# non-null value is kept. Row counts are logged before and after.
rows_before_null_drop = df_co2_emissions.count()
print(f"Record count prior to dropping null values:: {rows_before_null_drop}")

df_co2_emissions = df_co2_emissions.na.drop(how="all")

rows_after_null_drop = df_co2_emissions.count()
print(f"Record count after to dropping null values:: {rows_after_null_drop}")

Record count prior to dropping null values:: 300000
Record count after to dropping null values:: 300000


In [4]:
# Remove rows that are exact duplicates across all columns, logging the
# row count before and after.
rows_before_dedup = df_co2_emissions.count()
print(f"Record count prior to dropping duplicate values:: {rows_before_dedup}")

df_co2_emissions = df_co2_emissions.drop_duplicates()

rows_after_dedup = df_co2_emissions.count()
print(f"Record count after to dropping duplicate values:: {rows_after_dedup}")

Record count prior to dropping duplicate values:: 300000
Record count after to dropping duplicate values:: 300000


In [5]:
# Keep only rows whose member state code (column MS) matches the pattern
# below: two characters, each an uppercase letter A-Z. Rows where MS is
# null do not satisfy the predicate and are therefore dropped as well.
ms_code_pattern = '^[A-Z][A-Z]$'

rows_before_ms_filter = df_co2_emissions.count()
print(f"Record count prior to filtered state code:: {rows_before_ms_filter}")

has_valid_ms_code = df_co2_emissions['MS'].rlike(ms_code_pattern)
df_co2_emissions = df_co2_emissions.where(has_valid_ms_code)

rows_after_ms_filter = df_co2_emissions.count()
print(f"Record count with filtered state code:: {rows_after_ms_filter}")

Record count prior to filtered state code:: 300000
Record count with filtered state code:: 299996


In [9]:
from datetime import datetime
# Write the cleaned emissions data to the curated layer as Parquet, with one
# `year=<value>` directory per distinct value of the data's `year` column.
#
# The target is the dataset root. It must not itself end in a `year=...`
# segment: partitionBy('year') already creates that directory level, so a
# root such as `.../co2_emissions/year=<run year>/` produces a nested
# `year=<run year>/year=<data year>/` layout in which the outer level is the
# wall-clock year of the run rather than an attribute of the data, and the
# duplicated `year` level conflicts with partition discovery when the dataset
# is read from its root.
# NOTE(review): output written by earlier runs under `year=<run year>/year=.../`
# is not cleaned up by this cell — confirm whether it needs a one-off removal,
# and whether any downstream reader points at the old path.
dbfs_wdi_data_path = "dbfs:/Volumes/emissions_datapipeline_workspace/default/datalake/curated/co2_emissions/"

# Shuffle so that rows sharing a `year` value sit in the same Spark partition
# before the partitioned write.
df_co2_emissions = df_co2_emissions.repartition('year')

# "overwrite" combined with dynamic partition overwrite replaces only the
# `year=` directories present in this DataFrame; other years already stored
# under the dataset root are left untouched.
(df_co2_emissions.write
    .mode("overwrite")
    .option("partitionOverwriteMode", "dynamic")
    .partitionBy('year')
    .parquet(dbfs_wdi_data_path))