In [8]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Iceberg Catalog Setup") \
    .getOrCreate()

print("Spark Running")
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.cores', '4'), ('spark.app.id', 'local-1721266067644'), ('spark.task.cpus', '4'), ('spark.executor.cores', '4'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.app.submitTime', '1721265657101'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.memory', '8g'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', '2f37682e1403'), ('spark.driver.port', '39963'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED

24/07/18 01:27:47 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [9]:
wdi_data_df = spark.read.table("curated.world_development_indicators.data")

print(wdi_data_df.rdd.getNumPartitions())

5


In [10]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType

schema_wdi = StructType([
               StructField('Country_Name', StringType(), True),
               StructField('Country_Code', StringType(), True),
               StructField('Indicator_Name', StringType(), True),
               StructField('Indicator_Code', StringType(), True),
               StructField('Indicator_Value', StringType(), True),
               StructField('year', StringType(), True)
             ])

emptyRDD              = spark.sparkContext.emptyRDD()
df_wdi_data_unpivoted = spark.createDataFrame(emptyRDD,schema_wdi)

# We loop through the years
# And then add the data of each year to the unpivoted dataframe
for year in range(1960, 2021):
  df_temp = (wdi_data_df
             .select(
               'Country_Name',
               'Country_Code', 
               'Indicator_Name', 
               'Indicator_Code',
               # We keep the column of the current year in the loop
               F.col(str(year)).alias('Indicator_Value')
             )
             .withColumn('year', F.lit(year)) # We add a column that contains the value of the year
            )
  # We append this year's data to the output dataframe via union()
  df_wdi_data_unpivoted = df_wdi_data_unpivoted.union(df_temp)

# Printing the number of partitions of the output dataframe
print(df_wdi_data_unpivoted.rdd.getNumPartitions())

df_wdi_data_unpivoted.printSchema()

df_wdi_data_unpivoted.show(10)

305
root
 |-- Country_Name: string (nullable = true)
 |-- Country_Code: string (nullable = true)
 |-- Indicator_Name: string (nullable = true)
 |-- Indicator_Code: string (nullable = true)
 |-- Indicator_Value: string (nullable = true)
 |-- year: string (nullable = true)

+--------------------+------------+--------------------+-----------------+----------------+----+
|        Country_Name|Country_Code|      Indicator_Name|   Indicator_Code| Indicator_Value|year|
+--------------------+------------+--------------------+-----------------+----------------+----+
|Africa Eastern an...|         AFE|Age dependency ra...|   SP.POP.DPND.OL|5.80595111963956|1960|
|Caribbean small s...|         CSS|   GDP (current US$)|   NY.GDP.MKTP.CD|1880306125.08709|1960|
|East Asia & Pacif...|         EAP|Population ages 0...|SP.POP.0014.FE.ZS|40.1022698469607|1960|
|           Euro area|         EMU|  Population, female|SP.POP.TOTL.FE.IN|       138020284|1960|
|Europe & Central ...|         ECS|Population ag

In [11]:
# Write unpivoted dataframe to a new table partitioned by year

(df_wdi_data_unpivoted
.writeTo("wdi_serving.wdi_data_unpivoted")
.partitionedBy("year")
.options(format="iceberg", mode="overwrite")
.createOrReplace()
)


                                                                                

In [12]:
# We use agg() method to perform aggregations
# We use avg() from the pyspark.sql.functions module to generate the average
# We apply the avg() function on a column from the grouped dataframe
df_wdi_data_average = (df_wdi_data_unpivoted
                       .groupBy(
                         'Country_Name',
                         'Country_Code', 
                         'Indicator_Name', 
                         'Indicator_Code',
                       )
                       .agg(
                        F.avg('Indicator_Value').alias('Indicator_Average_Value')
                       )
                      )

df_wdi_data_unpivoted.unpersist()


DataFrame[Country_Name: string, Country_Code: string, Indicator_Name: string, Indicator_Code: string, Indicator_Value: string, year: string]

In [13]:
(df_wdi_data_average
.repartition("Indicator_Code")
.writeTo("wdi_serving.partitioned_average_indicators")
.partitionedBy("Indicator_Code")
.options(format="iceberg", mode="overwrite")
.createOrReplace()
)

                                                                                

In [14]:
spark.stop()