In [10]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session shared by every cell in this notebook.
session_builder = SparkSession.builder.appName("Iceberg Catalog Setup")
spark = session_builder.getOrCreate()

# Print a quick health report: the full Spark configuration, the catalog
# that unqualified table names resolve against, and the Spark UI address.
spark_context = spark.sparkContext
print("Spark Running")
print(spark_context.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark_context.uiWebUrl)

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.cores', '4'), ('spark.task.cpus', '4'), ('spark.executor.cores', '4'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.driver.port', '32891'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.app.id', 'local-1721266203718'), ('spark.app.startTime', '1721266203673'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.memory', '8g'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', '2f37682e1403'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED 

24/07/18 01:30:03 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [11]:
# Load the three curated World Development Indicators tables:
# the indicator data itself plus its country and series reference tables.
table_reader = spark.read
wdi_data_df = table_reader.table("curated.world_development_indicators.data")
wdi_country_df = table_reader.table("curated.world_development_indicators.country")
wdi_series_df = table_reader.table("curated.world_development_indicators.series")

In [12]:
# Keep only the rows of the country table that reference actual countries,
# identified here by a non-null Region, and project the country attributes
# that should be attached to the indicator data.
df_wdi_countries_filtered = (wdi_country_df
                             .where('Region is not Null')
                             .select(
                               'Country_Code',
                               '2-alpha_code',
                               'Currency_Unit',
                               'Region',
                               'Income_Group'
                            ))

# Inner join on Country_Code: indicator rows whose code is not in the
# filtered country list are dropped, and the remaining rows gain the
# country attributes selected above.
df_wdi_country_data = (wdi_data_df
                       .join(
                         df_wdi_countries_filtered,
                         on = ['Country_Code'],
                         how = 'inner'
                       ))

# Write the joined dataframe to the serving layer as an Iceberg table
# partitioned by Country_Code.
#
# writeTo() returns a DataFrameWriterV2, where the table provider is set with
# using() and the "overwrite" behaviour comes from createOrReplace() itself.
# `format` and `mode` are DataFrameWriter (v1) concepts; passing them through
# options() only forwards them as data-source write options, so they are
# replaced by the explicit using("iceberg") call.
#
# repartition() first so that all rows of one country are in the same Spark
# partition when the partitioned table is written.
(df_wdi_country_data
.repartition('Country_Code')
.writeTo("wdi_serving.countries_data")
.using("iceberg")
.partitionedBy("Country_Code")
.createOrReplace()
)

                                                                                

In [13]:
# Read the curated CO2 passenger-car emissions table once per year of
# interest, keeping only the rows of that year.
emissions_table = "curated.co2_passenger_cars_emissions"

df_co2_emissions_2017 = spark.read.table(emissions_table).where("year = 2017")
df_co2_emissions_2018 = spark.read.table(emissions_table).where("year = 2018")
df_co2_emissions_2019 = spark.read.table(emissions_table).where("year = 2019")

In [14]:
import pyspark.sql.functions as F

def _total_emissions_by_ms(emissions_df, total_column):
    """Sum the Enedc_g/km column for each distinct MS value.

    Args:
        emissions_df: dataframe holding at least the MS and Enedc_g/km columns.
        total_column: name given to the resulting sum column.

    Returns:
        A dataframe with one row per MS value and two columns:
        MS and `total_column`.
    """
    total = F.sum('Enedc_g/km').alias(total_column)
    return emissions_df.groupBy('MS').agg(total)


# NOTE: each name is rebound from the per-year rows to the per-year totals,
# so this cell can only be re-run after the cell that loads the yearly
# dataframes has been run again (the totals no longer have Enedc_g/km).
df_co2_emissions_2017 = _total_emissions_by_ms(df_co2_emissions_2017, 'sum_2017')
df_co2_emissions_2018 = _total_emissions_by_ms(df_co2_emissions_2018, 'sum_2018')
df_co2_emissions_2019 = _total_emissions_by_ms(df_co2_emissions_2019, 'sum_2019')

In [15]:
def _emission_diff_yoy(current_df, current_total, previous_df, previous_total, year_label):
    """Build the year-over-year emission difference between two yearly totals.

    Args:
        current_df: per-MS totals of the current year.
        current_total: name of the sum column in `current_df`.
        previous_df: per-MS totals of the previous year.
        previous_total: name of the sum column in `previous_df`.
        year_label: literal value stored in the `year` column (current year).

    Returns:
        A dataframe with the columns MS, sum_current_year, sum_previous_year,
        emission_diff_yoy and year, in that order. Only MS values present in
        both inputs are kept (inner join).
    """
    current = current_df.withColumnRenamed(current_total, 'sum_current_year')
    previous = previous_df.withColumnRenamed(previous_total, 'sum_previous_year')
    yoy_diff = F.col('sum_current_year') - F.col('sum_previous_year')
    return (current
            .join(previous, on = ['MS'], how = 'inner')
            .withColumn('emission_diff_yoy', yoy_diff)
            .withColumn('year', F.lit(year_label)))


# One dataframe per pair of consecutive years.
df_diff_2018_2017 = _emission_diff_yoy(
    df_co2_emissions_2018, 'sum_2018',
    df_co2_emissions_2017, 'sum_2017',
    '2018',
)
df_diff_2019_2018 = _emission_diff_yoy(
    df_co2_emissions_2019, 'sum_2019',
    df_co2_emissions_2018, 'sum_2018',
    '2019',
)

# Stack both results into a single dataframe. union() matches columns by
# position, which is safe here because both inputs come from the same helper.
df_emissions_diff = df_diff_2018_2017.union(df_diff_2019_2018)

In [16]:
# Preview up to 25 rows of the combined result. No ordering is applied to
# the dataframe, so the rows shown are whichever ones Spark returns first.
emissions_diff_preview = df_emissions_diff.head(25)
display(emissions_diff_preview)

[Row(MS='LT', sum_current_year=41396, sum_previous_year=92977, emission_diff_yoy=-51581, year='2018'),
 Row(MS='FI', sum_current_year=18352, sum_previous_year=85816, emission_diff_yoy=-67464, year='2018'),
 Row(MS='RO', sum_current_year=98029, sum_previous_year=586722, emission_diff_yoy=-488693, year='2018'),
 Row(MS='NL', sum_current_year=219140, sum_previous_year=1057643, emission_diff_yoy=-838503, year='2018'),
 Row(MS='PL', sum_current_year=926631, sum_previous_year=3316037, emission_diff_yoy=-2389406, year='2018'),
 Row(MS='EE', sum_current_year=33268, sum_previous_year=280218, emission_diff_yoy=-246950, year='2018'),
 Row(MS='AT', sum_current_year=640929, sum_previous_year=466231, emission_diff_yoy=174698, year='2018'),
 Row(MS='HR', sum_current_year=49224, sum_previous_year=24368, emission_diff_yoy=24856, year='2018'),
 Row(MS='CZ', sum_current_year=393842, sum_previous_year=1203499, emission_diff_yoy=-809657, year='2018'),
 Row(MS='PT', sum_current_year=89572, sum_previous_year

In [17]:
# Write the year-over-year differences to the serving layer as an Iceberg
# table partitioned by year.
#
# writeTo() returns a DataFrameWriterV2: the table provider is declared with
# using() and the replace-if-exists behaviour is provided by createOrReplace().
# `format` and `mode` belong to the v1 DataFrameWriter; passed through
# options() they are only forwarded as data-source write options, so they are
# replaced by the explicit using("iceberg") call.
(df_emissions_diff
.writeTo("eea_serving.emissions_diff_yoy")
.using("iceberg")
.partitionedBy("year")
.createOrReplace()
)

In [18]:
# Stop the Spark session now that the notebook's work is done. The session
# setup cell must be run again before any other cell can be re-executed.
spark.stop()