In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook (getOrCreate reuses an active one).
spark = SparkSession.builder \
    .appName("Iceberg Catalog Setup") \
    .getOrCreate()

print("Spark Running")
# NOTE(review): the full configuration dump may include catalog or storage
# credentials -- clear this cell's output before sharing the notebook.
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
# uiWebUrl is the address of the Spark web UI, not a memory metric,
# so the label now says what is actually printed.
print("Spark UI URL:", spark.sparkContext.uiWebUrl)

In [None]:
# Load one raw emissions table per reporting year.
df_co2_emissions_2017 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2017"
)
df_co2_emissions_2018 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2018"
)
df_co2_emissions_2019 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2019"
)

# Stack the yearly frames on top of each other, matching columns by name
# rather than by position.
combined_co2_emissions_df = df_co2_emissions_2017
for later_year_df in (df_co2_emissions_2018, df_co2_emissions_2019):
    combined_co2_emissions_df = combined_co2_emissions_df.unionByName(later_year_df)

# Quick look at the combined schema and the first ten untruncated rows.
combined_co2_emissions_df.printSchema()
combined_co2_emissions_df.show(n=10, truncate=False)

In [None]:
# Normalize column names: replace spaces with underscores ("_") and remove parentheses.
import re
import pyspark.sql.functions as F

# Snapshot the column names as they are before renaming.
co2_emissions_columns = combined_co2_emissions_df.columns
print(f"Original Column names:: {co2_emissions_columns}")

# Build one aliased column per original column: spaces become underscores
# first, then any parentheses are stripped from the result.
renamed_columns = []
for original_name in co2_emissions_columns:
    underscored_name = original_name.replace(' ', '_')
    cleaned_name = re.sub('[()]', '', underscored_name)
    renamed_columns.append(F.col(original_name).alias(cleaned_name))

combined_co2_emissions_df = combined_co2_emissions_df.select(renamed_columns)

print(f"Updated Column names:: {combined_co2_emissions_df.columns}")

In [None]:
# Remove rows in which every column is null; rows with at least one value are kept.
rows_before_null_drop = combined_co2_emissions_df.count()
print(f"Record count prior to dropping null values:: {rows_before_null_drop}")

combined_co2_emissions_df = combined_co2_emissions_df.dropna(how="all")

rows_after_null_drop = combined_co2_emissions_df.count()
print(f"Record count after to dropping null values:: {rows_after_null_drop}")

In [None]:
# Remove rows that are exact duplicates across all columns.

rows_before_dedup = combined_co2_emissions_df.count()
print(f"Record count prior to dropping duplicate values:: {rows_before_dedup}")

combined_co2_emissions_df = combined_co2_emissions_df.dropDuplicates()

rows_after_dedup = combined_co2_emissions_df.count()
print(f"Record count after to dropping duplicate values:: {rows_after_dedup}")

In [None]:
# Keep only records whose member state code (column: MS) is exactly two
# uppercase ASCII letters; everything else, including null codes, is dropped.

print(f"Record count prior to filtered state code:: {combined_co2_emissions_df.count()}")
# rlike uses Java regular expressions, where `$` also matches just before a
# trailing line terminator, so a value such as "DE\n" would slip through.
# `\z` anchors at the true end of the input and enforces the two-character length.
combined_co2_emissions_df = combined_co2_emissions_df.filter(
    combined_co2_emissions_df['MS'].rlike(r'^[A-Z]{2}\z')
)
print(f"Record count with filtered state code:: {combined_co2_emissions_df.count()}")

In [None]:
# Persist the cleaned data as an Iceberg table partitioned by "year".
# `format=` and `mode=` are concepts of the v1 DataFrameWriter; passed through
# `.options()` on the v2 writer they are plain write options and do not select
# a table provider or a save mode. The v2 way to name the provider is
# `.using(...)`, and `createOrReplace()` already replaces an existing table.
(
    combined_co2_emissions_df
    # Shuffle by the partition column so rows for one year are grouped together before the write.
    .repartition("year")
    .writeTo("curated.co2_passenger_cars_emissions")
    .using("iceberg")
    .partitionedBy("year")
    .createOrReplace()
)

In [None]:
# Stop the Spark session created at the top of the notebook now that the write is done.
spark.stop()