In [1]:
import os

from pyspark.sql import SparkSession

# Initialize Spark session
#
# Registers a Spark catalog named "iceberg" that uses the Iceberg REST catalog
# implementation at http://iceberg-rest:8181, plus S3 settings (endpoint,
# path-style access, credentials) for the object store at http://minio:9000.
#
# The object-store credentials are taken from the environment when available
# (ICEBERG_S3_ACCESS_KEY / ICEBERG_S3_SECRET_KEY). The fallbacks are the values
# this cell previously hard-coded, so an unconfigured environment behaves
# exactly as before.
s3_access_key = os.environ.get("ICEBERG_S3_ACCESS_KEY", "admin")
s3_secret_key = os.environ.get("ICEBERG_S3_SECRET_KEY", "password")

# NOTE(review): getOrCreate() returns an already-running session when one
# exists; Spark then warns that only runtime SQL configurations take effect.
# Confirm the catalog settings below are actually applied in your environment.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", s3_access_key)
    .config("spark.sql.catalog.iceberg.s3.secret-key", s3_secret_key)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")

# Print the SparkContext configuration, masking any entry whose key looks like
# a credential so secrets do not end up in the saved notebook output.
sensitive_key_markers = ("secret", "password", "token", "access-key")
spark_conf_for_display = [
    (
        conf_key,
        "*****" if any(marker in conf_key.lower() for marker in sensitive_key_markers) else conf_value,
    )
    for conf_key, conf_value in spark.sparkContext.getConf().getAll()
]
print(spark_conf_for_display)

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.port', '46609'), ('spark.driver.host', 'eaacf646f70f'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=AL

24/07/14 00:43:35 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Load the yearly raw CO2 emission tables (2017, 2018 and 2019).
df_co2_emissions_2017 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2017"
)
df_co2_emissions_2018 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2018"
)
df_co2_emissions_2019 = spark.read.table(
    "raw.co2_passenger_cars_emissions.co2_emissions_passenger_cars_2019"
)

# Stack the yearly frames into one, matching columns by name rather than by position.
combined_co2_emissions_df = df_co2_emissions_2017
for yearly_emissions_df in (df_co2_emissions_2018, df_co2_emissions_2019):
    combined_co2_emissions_df = combined_co2_emissions_df.unionByName(yearly_emissions_df)

# Inspect the combined schema and the first 10 rows without truncating cell values.
combined_co2_emissions_df.printSchema()
combined_co2_emissions_df.show(10, truncate=False)

root
 |-- At1 (mm): long (nullable = true)
 |-- At2 (mm): long (nullable = true)
 |-- Cn: string (nullable = true)
 |-- Cr: string (nullable = true)
 |-- Ct: string (nullable = true)
 |-- De: string (nullable = true)
 |-- E (g/km): string (nullable = true)
 |-- Enedc (g/km): long (nullable = true)
 |-- Er (g/km): string (nullable = true)
 |-- Ernedc (g/km): double (nullable = true)
 |-- Erwltp (g/km): string (nullable = true)
 |-- Ewltp (g/km): long (nullable = true)
 |-- Fm: string (nullable = true)
 |-- Ft: string (nullable = true)
 |-- ID: long (nullable = true)
 |-- It: string (nullable = true)
 |-- MMS: string (nullable = true)
 |-- MS: string (nullable = true)
 |-- Man: string (nullable = true)
 |-- Mh: string (nullable = true)
 |-- Mk: string (nullable = true)
 |-- Mp: string (nullable = true)
 |-- Mt: long (nullable = true)
 |-- Status: string (nullable = true)
 |-- T: string (nullable = true)
 |-- TAN: string (nullable = true)
 |-- VFN: string (nullable = true)
 |-- Va: string

24/07/14 01:10:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+--------+--------+------------------------+---+---+----+--------+------------+---------+-------------+-------------+------------+---+------+-------+---+--------------------------+---+-----------------------+-------+-------+-----------+----+------+----+------------------+---+----------+------------+----+------+----+--------+-------+------+---+------------+----+---------+
|At1 (mm)|At2 (mm)|Cn                      |Cr |Ct |De  |E (g/km)|Enedc (g/km)|Er (g/km)|Ernedc (g/km)|Erwltp (g/km)|Ewltp (g/km)|Fm |Ft    |ID     |It |MMS                       |MS |Man                    |Mh     |Mk     |Mp         |Mt  |Status|T   |TAN               |VFN|Va        |Ve          |Vf  |W (mm)|Zr  |ec (cm3)|ep (KW)|m (kg)|r  |version_file|year|z (Wh/km)|
+--------+--------+------------------------+---+---+----+--------+------------+---------+-------------+-------------+------------+---+------+-------+---+--------------------------+---+-----------------------+-------+-------+-----------+----+------+----

In [4]:
# Normalise column names: every space becomes an underscore ("_") and
# parentheses are removed. Other characters (e.g. "/") are left untouched.
import re
import pyspark.sql.functions as F

co2_emissions_columns = combined_co2_emissions_df.columns
print(f"Original Column names:: {co2_emissions_columns}")

# Build one aliased column expression per original column, preserving column order.
renamed_column_exprs = []
for original_name in co2_emissions_columns:
    cleaned_name = re.sub('[()]', '', original_name.replace(' ', '_'))
    renamed_column_exprs.append(F.col(original_name).alias(cleaned_name))

combined_co2_emissions_df = combined_co2_emissions_df.select(renamed_column_exprs)

print(f"Updated Column names:: {combined_co2_emissions_df.columns}")

Original Column names:: ['At1 (mm)', 'At2 (mm)', 'Cn', 'Cr', 'Ct', 'De', 'E (g/km)', 'Enedc (g/km)', 'Er (g/km)', 'Ernedc (g/km)', 'Erwltp (g/km)', 'Ewltp (g/km)', 'Fm', 'Ft', 'ID', 'It', 'MMS', 'MS', 'Man', 'Mh', 'Mk', 'Mp', 'Mt', 'Status', 'T', 'TAN', 'VFN', 'Va', 'Ve', 'Vf', 'W (mm)', 'Zr', 'ec (cm3)', 'ep (KW)', 'm (kg)', 'r', 'version_file', 'year', 'z (Wh/km)']
Updated Column names:: ['At1_mm', 'At2_mm', 'Cn', 'Cr', 'Ct', 'De', 'E_g/km', 'Enedc_g/km', 'Er_g/km', 'Ernedc_g/km', 'Erwltp_g/km', 'Ewltp_g/km', 'Fm', 'Ft', 'ID', 'It', 'MMS', 'MS', 'Man', 'Mh', 'Mk', 'Mp', 'Mt', 'Status', 'T', 'TAN', 'VFN', 'Va', 'Ve', 'Vf', 'W_mm', 'Zr', 'ec_cm3', 'ep_KW', 'm_kg', 'r', 'version_file', 'year', 'z_Wh/km']


In [5]:
# Remove rows in which every column is null, reporting the row count before and after.
rows_before_null_drop = combined_co2_emissions_df.count()
print(f"Record count prior to dropping null values:: {rows_before_null_drop}")

# how="all" drops a row only when all of its values are null.
combined_co2_emissions_df = combined_co2_emissions_df.na.drop(how="all")

rows_after_null_drop = combined_co2_emissions_df.count()
print(f"Record count after to dropping null values:: {rows_after_null_drop}")

Record count prior to dropping null values:: 300000
Record count after to dropping null values:: 300000


In [6]:
# Remove rows that are exact duplicates across all columns,
# reporting the row count before and after.
rows_before_dedup = combined_co2_emissions_df.count()
print(f"Record count prior to dropping duplicate values:: {rows_before_dedup}")

combined_co2_emissions_df = combined_co2_emissions_df.dropDuplicates()

rows_after_dedup = combined_co2_emissions_df.count()
print(f"Record count after to dropping duplicate values:: {rows_after_dedup}")

Record count prior to dropping duplicate values:: 300000


[Stage 12:>                                                         (0 + 8) / 8]

Record count after to dropping duplicate values:: 300000


                                                                                

In [7]:
# Keep only rows whose member state code (column: MS) consists of exactly two
# uppercase letters A-Z; every other row is dropped.
# NOTE(review): rows with a null MS do not satisfy the predicate and are removed
# too — confirm that is intended.
# NOTE(review): in Java regex, `$` may also match just before a trailing line
# terminator, so a value such as "DE\n" could pass — verify if that matters here.
rows_before_ms_filter = combined_co2_emissions_df.count()
print(f"Record count prior to filtered state code:: {rows_before_ms_filter}")

has_valid_member_state = combined_co2_emissions_df['MS'].rlike('^[A-Z][A-Z]$')
combined_co2_emissions_df = combined_co2_emissions_df.where(has_valid_member_state)

rows_after_ms_filter = combined_co2_emissions_df.count()
print(f"Record count with filtered state code:: {rows_after_ms_filter}")

                                                                                

Record count prior to filtered state code:: 300000


[Stage 22:>                                                         (0 + 3) / 3]

Record count with filtered state code:: 299996


                                                                                

In [9]:
from datetime import datetime

# Calendar year at execution time. Not referenced again within this cell.
current_year = datetime.now().year

# Make sure the target namespace exists before writing.
spark.sql("CREATE NAMESPACE IF NOT EXISTS curated.co2_passenger_cars_emissions")

# Shuffle the data by 'year' ahead of the partitioned write below.
combined_co2_emissions_df = combined_co2_emissions_df.repartition('year')

# Replace any existing table contents and lay the data out partitioned by 'year'.
# NOTE(review): the identifier passed to saveAsTable is the same dotted name as
# the namespace created above — confirm this is the intended target table.
curated_writer = combined_co2_emissions_df.write.mode("overwrite").partitionBy('year')
curated_writer.saveAsTable(name="curated.co2_passenger_cars_emissions")

                                                                                