In [None]:
import os

from pyspark.sql import SparkSession

# Object-store credentials are read from the environment so that real secrets
# never have to be committed with the notebook. The fallbacks are the values
# this notebook previously hard-coded, so an unconfigured local run behaves
# exactly as before.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "admin")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "password")

# Initialize the Spark session and register an Iceberg REST catalog under the
# name "iceberg".
# NOTE(review): Iceberg's S3FileIO documents `s3.access-key-id` and
# `s3.secret-access-key`; confirm that the configured client factory really
# reads the `s3.access-key` / `s3.secret-key` keys used here.
# NOTE(review): no default catalog is set in this cell, while tables later in
# the notebook are referenced without an `iceberg.` prefix; confirm that the
# default catalog is configured elsewhere (e.g. spark-defaults.conf).
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", s3_access_key)
    .config("spark.sql.catalog.iceberg.s3.secret-key", s3_secret_key)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")

# Echo the effective configuration, but mask any value whose key looks like a
# credential so secrets do not end up in the saved notebook output.
sensitive_markers = ("secret", "password", "token", "access-key")
print([
    (key, "*****" if any(marker in key.lower() for marker in sensitive_markers) else value)
    for key, value in spark.sparkContext.getConf().getAll()
])
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

In [None]:
# Load the `data`, `country` and `series` tables of the curated
# world_development_indicators namespace as DataFrames.
wdi_data_df = spark.table("curated.world_development_indicators.data")
wdi_country_df = spark.table("curated.world_development_indicators.country")
wdi_series_df = spark.table("curated.world_development_indicators.series")

In [None]:
# Keep only the rows of the country table that reference actual countries:
# rows with a NULL Region are dropped. Only the descriptive columns listed
# below are carried forward.
country_columns = [
    'Country_Code',
    '2-alpha_code',
    'Currency_Unit',
    'Region',
    'Income_Group',
]
df_wdi_countries_filtered = (
    wdi_country_df
    .where('Region is not Null')
    .select(country_columns)
)

# The inner join on Country_Code restricts the main dataframe to the countries
# kept above and appends the selected country columns to every row.
df_wdi_country_data = wdi_data_df.join(
    df_wdi_countries_filtered,
    on='Country_Code',
    how='inner',
)

# Write the joined dataframe to the serving layer as the Iceberg table
# wdi_serving.countries_data (overwrite mode). The dataframe is repartitioned
# by Country_Code first, which is also the table's partition column.
countries_writer = (
    df_wdi_country_data
    .repartition('Country_Code')
    .write
    .format("iceberg")
    .mode("overwrite")
    .partitionBy('Country_Code')
)
countries_writer.saveAsTable('wdi_serving.countries_data')

In [None]:
# Reference the curated passenger-car CO2 emissions table once, then derive one
# DataFrame per year (2017, 2018, 2019) by filtering on the `year` column.
df_co2_emissions = spark.read.table("curated.co2_passenger_cars_emissions")

df_co2_emissions_2017 = df_co2_emissions.filter("year = 2017")
df_co2_emissions_2018 = df_co2_emissions.filter("year = 2018")
df_co2_emissions_2019 = df_co2_emissions.filter("year = 2019")

In [None]:
import pyspark.sql.functions as F

def _sum_emissions_by_ms(emissions_df, total_alias):
    """Sum the 'Enedc_g/km' column for each value of the 'MS' column.

    Args:
        emissions_df: DataFrame that contains the 'MS' and 'Enedc_g/km' columns.
        total_alias: Name given to the summed column in the result.

    Returns:
        A DataFrame with one row per 'MS' value, holding 'MS' and the sum
        stored under ``total_alias``.
    """
    return emissions_df.groupBy('MS').agg(F.sum('Enedc_g/km').alias(total_alias))


# Each per-year name is rebound to its aggregated form. Because the aggregated
# frames no longer contain 'Enedc_g/km', this cell must be preceded by the cell
# that loads the per-year frames whenever it is re-run.
df_co2_emissions_2017 = _sum_emissions_by_ms(df_co2_emissions_2017, 'sum_2017')
df_co2_emissions_2018 = _sum_emissions_by_ms(df_co2_emissions_2018, 'sum_2018')
df_co2_emissions_2019 = _sum_emissions_by_ms(df_co2_emissions_2019, 'sum_2019')

In [None]:
def _yoy_emission_diff(current_df, current_col, previous_df, previous_col, year_label):
    """Build the year-over-year emission difference between two yearly sums.

    Args:
        current_df: DataFrame with 'MS' and the current year's sum column.
        current_col: Name of the sum column in ``current_df``.
        previous_df: DataFrame with 'MS' and the previous year's sum column.
        previous_col: Name of the sum column in ``previous_df``.
        year_label: Literal value written to the 'year' column of every row.

    Returns:
        A DataFrame with the columns MS, sum_current_year, sum_previous_year,
        emission_diff_yoy and year. Only 'MS' values present in both inputs
        are kept, because the two frames are inner-joined.
    """
    current = current_df.withColumnRenamed(current_col, 'sum_current_year')
    previous = previous_df.withColumnRenamed(previous_col, 'sum_previous_year')
    joined = current.join(previous, on=['MS'], how='inner')
    # The diff is current minus previous; the year column is a constant literal.
    with_diff = joined.withColumn(
        'emission_diff_yoy',
        F.col('sum_current_year') - F.col('sum_previous_year'),
    )
    return with_diff.withColumn('year', F.lit(year_label))


# One diff frame per pair of consecutive years; the 'year' column carries the
# later year of each pair as a string literal.
df_diff_2018_2017 = _yoy_emission_diff(
    df_co2_emissions_2018, 'sum_2018',
    df_co2_emissions_2017, 'sum_2017',
    '2018',
)
df_diff_2019_2018 = _yoy_emission_diff(
    df_co2_emissions_2019, 'sum_2019',
    df_co2_emissions_2018, 'sum_2018',
    '2019',
)

# union() matches columns by position; both frames are built by the same helper
# and therefore share the same column order.
df_emissions_diff = df_diff_2018_2017.union(df_diff_2019_2018)

In [None]:
# Preview the first 25 rows as a formatted table. head(25) returns a Python
# list of Row objects, which display() would only render as a raw repr;
# show() prints aligned columns, and truncate=False keeps values unabridged.
df_emissions_diff.show(25, truncate=False)

In [None]:
# Write the year-over-year diff to the Iceberg table
# eea_serving.emissions_diff_yoy in overwrite mode, partitioned by `year`.
emissions_writer = (
    df_emissions_diff
    .write
    .format("iceberg")
    .mode("overwrite")
    .partitionBy('year')
)
emissions_writer.saveAsTable('eea_serving.emissions_diff_yoy')