In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell (getOrCreate reuses an
# existing session when there is one instead of building a second one)
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .getOrCreate()
)

# Report the effective configuration, the active catalog and the UI address
spark_context = spark.sparkContext
print("Spark Running")
print(spark_context.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark_context.uiWebUrl)

In [None]:
from pathlib import Path

# Location of the source JSON file; its stem (file name without extension)
# is reused as the name of the raw table and of the temporary view.
emissions_data_path = "/home/iceberg/data/emissions_data"
file_path = f"{emissions_data_path}/co2_emissions_passenger_cars_2020.json"
file_name = Path(file_path).stem

temp_view_name = f"{file_name}_tempTable"
create_table_sql = (
    f"CREATE TABLE IF NOT EXISTS raw.co2_passenger_cars_emissions.{file_name} "
    f"as select * from {temp_view_name}"
)

# Read the multi-line JSON document and expose it to SQL through a temp view
df = spark.read.option("multiline","true").json(file_path)
df.createOrReplaceTempView(temp_view_name)

# Materialise the raw table from the view (skipped when the table already
# exists), then remove the view because it is only needed for this statement
spark.sql(create_table_sql)
spark.catalog.dropTempView(temp_view_name)

In [None]:
# Load the raw 2020 table into the DataFrame that the following cells clean,
# and print its schema so the column names and types can be inspected
raw_table_name = "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2020"
co2_emissions_2020_df = spark.read.table(raw_table_name)
co2_emissions_2020_df.printSchema()


In [None]:
import re
import pyspark.sql.functions as F

# ------ Applying the data quality filters --------
#
# Each count() below is a Spark action that evaluates the DataFrame built so
# far. Every stage is therefore counted exactly once and the number is reused
# for the "before" message of the next stage, instead of triggering a second,
# identical job.

# Replace spaces in column names with underscores and remove ()
print(f"Original Column names:: {co2_emissions_2020_df.columns}")

co2_emissions_2020_df = co2_emissions_2020_df.select(
    [
        F.col(name).alias(re.sub('[()]', '', name.replace(' ', '_')))
        for name in co2_emissions_2020_df.columns
    ]
)

print(f"Updated Column names:: {co2_emissions_2020_df.columns}")

# Drop null records ('all' removes a row only when every one of its values is null)
rows_before_null_drop = co2_emissions_2020_df.count()
print(f"Number of records of CO2 emissions dataframe before dropping nulls: {rows_before_null_drop}")
co2_emissions_2020_df = co2_emissions_2020_df.na.drop('all')
rows_after_null_drop = co2_emissions_2020_df.count()
print(f"Number of records of CO2 emissions dataframe after dropping nulls: {rows_after_null_drop}")

# Drop duplicates
print(f"Number of records of CO2 emissions dataframe before dropping duplicates: {rows_after_null_drop}")
co2_emissions_2020_df = co2_emissions_2020_df.distinct()
rows_after_dedup = co2_emissions_2020_df.count()
print(f"Number of records of CO2 emissions dataframe after dropping duplicates: {rows_after_dedup}")

# Filter records with corrupt Member State code - We keep values with two uppercase letters
print(f"Number of records of CO2 emissions dataframe before MS filter: {rows_after_dedup}")
co2_emissions_2020_df = co2_emissions_2020_df.filter(co2_emissions_2020_df['MS'].rlike('^[A-Z][A-Z]$'))
print(f"Number of records of CO2 emissions dataframe after MS filter: {co2_emissions_2020_df.count()}")

In [None]:
# Notice the new column in the output (Enedc_g/km_V2)
# show() prints the rows itself and returns None, so it is called directly:
# wrapping it in display() would only add a stray "None" to the cell output.
co2_emissions_2020_df.show()

In [None]:
from pyspark.sql.types import LongType
# Repartitioning by 'year' before the write is meant to produce one file per
# partition value.
# The column z_Wh/km only contains null values for this year. It is kept and
# cast to LongType (it is not dropped) - presumably so the all-null column is
# written with a concrete numeric type; TODO confirm the intended type.
# Do the same to other columns that may cause issues, except the column Enedc_g/km_V2
co2_emissions_2020_df = (
    co2_emissions_2020_df
    .repartition('year')
    .withColumn('z_Wh/km', F.col('z_Wh/km').cast(LongType()))
)

# Append the cleaned rows to the curated Iceberg table, partitioned by year
curated_writer = (
    co2_emissions_2020_df.write
    .format('iceberg')
    .mode('append')
    .partitionBy('year')
)
curated_writer.saveAsTable('curated.co2_passenger_cars_emissions')

In [None]:
# Updating the columns to match business requirements:
#   Enedc_g/km    -> Enedc_g/km_deprecated
#   Enedc_g/km_V2 -> Enedc_g/km
#
# Both steps below are guarded so the cell can be executed again in the same
# kernel. Without the guards a second run would rename the *new* Enedc_g/km
# to Enedc_g/km_deprecated (producing two columns with that name), and the
# ALTER TABLE would fail because the column already exists.
if 'Enedc_g/km_deprecated' not in co2_emissions_2020_df.columns:
    co2_emissions_2020_df = (co2_emissions_2020_df
                          .withColumnRenamed('Enedc_g/km', 'Enedc_g/km_deprecated')
                          .withColumnRenamed('Enedc_g/km_V2', 'Enedc_g/km')
                          )

# Add the deprecated column to the curated table only when it is not there yet
curated_columns = spark.read.table("curated.co2_passenger_cars_emissions").columns
if 'Enedc_g/km_deprecated' not in curated_columns:
    spark.sql("""
      ALTER TABLE curated.co2_passenger_cars_emissions
      ADD COLUMNS (`Enedc_g/km_deprecated` DOUBLE)
    """)

# Replace the curated table with the renamed DataFrame, partitioned by year.
# NOTE(review): `format` and `mode` look like DataFrameWriter (v1) settings; with
# writeTo() the provider is normally set through .using() and the replace
# semantics come from createOrReplace() - confirm whether these options (and
# the `mergeSchema` spelling) have any effect here.
(co2_emissions_2020_df
.repartition("year")
.writeTo("curated.co2_passenger_cars_emissions")
.partitionedBy("year")
.options(format="iceberg", mode="overwrite", mergeSchema="true")
.createOrReplace()
)

In [None]:
# Load the `history` metadata table of the curated Iceberg table
history_table = "curated.co2_passenger_cars_emissions.history"
history_df = spark.read.format("iceberg").load(history_table)

# Print the history rows without truncating long column values
history_df.show(truncate=False)

In [None]:
# Stop the Spark session created in the first cell now that the notebook's work is done
spark.stop()