In [None]:
import os
from functools import reduce

from pyspark.sql import SparkSession

# Object-store credentials are read from the environment when available so
# that real secrets do not have to be committed with the notebook. The
# fallbacks are the values this notebook previously hard-coded
# (presumably local development credentials — confirm before reusing).
S3_ACCESS_KEY = os.environ.get("ICEBERG_S3_ACCESS_KEY", "admin")
S3_SECRET_KEY = os.environ.get("ICEBERG_S3_SECRET_KEY", "password")

# Initialize Spark session with an Iceberg REST catalog registered as `iceberg`
# NOTE(review): the client factory below is a class from the StarRocks
# connector package; confirm it is really on the Spark classpath.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", S3_ACCESS_KEY)
    .config("spark.sql.catalog.iceberg.s3.secret-key", S3_SECRET_KEY)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")

# Cell output is saved with the notebook, so mask the value of any setting
# whose key looks credential-related before printing the configuration.
SENSITIVE_KEY_MARKERS = ("secret", "password", "access-key", "token")
redacted_conf = [
    (key, "********" if any(marker in key.lower() for marker in SENSITIVE_KEY_MARKERS) else value)
    for key, value in spark.sparkContext.getConf().getAll()
]
print(redacted_conf)
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

In [None]:
# Fully qualified name of the source table read by this notebook.
SOURCE_TABLE = "curated.world_development_indicators.data"

# Load the source table as a DataFrame.
wdi_data_df = spark.read.table(SOURCE_TABLE)

# Report how many partitions back the loaded DataFrame.
source_partition_count = wdi_data_df.rdd.getNumPartitions()
print(source_partition_count)

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType

# Schema of the unpivoted ("long") frame: one row per country, indicator and
# year. Every column, including the value and the year, is declared as a
# nullable string.
schema_wdi = StructType([
    StructField('Country_Name', StringType(), True),
    StructField('Country_Code', StringType(), True),
    StructField('Indicator_Name', StringType(), True),
    StructField('Indicator_Code', StringType(), True),
    StructField('Indicator_Value', StringType(), True),
    StructField('year', StringType(), True),
])

# Inclusive bounds of the per-year columns read from the source DataFrame.
FIRST_YEAR = 1960
LAST_YEAR = 2020


def _year_slice(year):
    """Return the rows of `wdi_data_df` for a single year column.

    The column named after `year` is exposed as `Indicator_Value`, and the
    year itself is added as a literal `year` column. Every column is then
    cast explicitly to the type declared in `schema_wdi`, so the later
    union never has to reconcile mismatched column types on its own.
    """
    typed_columns = [
        F.col(field.name).cast(field.dataType).alias(field.name)
        for field in schema_wdi.fields
    ]
    return (wdi_data_df
            .select(
                'Country_Name',
                'Country_Code',
                'Indicator_Name',
                'Indicator_Code',
                # We keep the column of the requested year
                F.col(str(year)).alias('Indicator_Value'),
                # We add a column that contains the value of the year
                F.lit(year).alias('year'),
            )
            .select(typed_columns))


# Empty, correctly typed starting frame; it contributes no rows and keeps
# every column of the result nullable, as declared in the schema.
emptyRDD = spark.sparkContext.emptyRDD()

# We fold the per-year slices into one dataframe via union(). union() matches
# columns by position, which is safe here because every slice is selected in
# schema order.
df_wdi_data_unpivoted = reduce(
    lambda accumulated, yearly: accumulated.union(yearly),
    (_year_slice(year) for year in range(FIRST_YEAR, LAST_YEAR + 1)),
    spark.createDataFrame(emptyRDD, schema_wdi),
)

# Printing the number of partitions of the output dataframe
print(df_wdi_data_unpivoted.rdd.getNumPartitions())

df_wdi_data_unpivoted.printSchema()

df_wdi_data_unpivoted.show(10)

In [None]:
# Write unpivoted dataframe to a new table partitioned by year

spark.sql("CREATE NAMESPACE IF NOT EXISTS wdi_serving")

# The CTAS statement below reads the dataframe through a temporary view.
df_wdi_data_unpivoted.createOrReplaceTempView("data_unpivoted_tempTable")

try:
    # IF NOT EXISTS: when the table is already present this statement does
    # nothing, so re-running the cell does not refresh the stored data.
    spark.sql("""
  CREATE TABLE IF NOT EXISTS wdi_serving.wdi_data_unpivoted 
  USING iceberg
  PARTITIONED BY (year) 
  AS SELECT * FROM data_unpivoted_tempTable
""")
finally:
    # Always remove the temporary view, even if the table creation fails,
    # so a failed run does not leave it registered in the session.
    spark.catalog.dropTempView("data_unpivoted_tempTable")


In [None]:
# Columns identifying one country/indicator pair; `year` is deliberately left
# out so that each group spans every year of that pair.
indicator_keys = ['Country_Name', 'Country_Code', 'Indicator_Name', 'Indicator_Code']

# avg() from pyspark.sql.functions reduces each group's Indicator_Value
# to a single mean, exposed as Indicator_Average_Value.
mean_indicator_value = F.avg('Indicator_Value').alias('Indicator_Average_Value')

# Group the unpivoted rows and apply the aggregation through agg().
df_wdi_data_average = df_wdi_data_unpivoted.groupBy(indicator_keys).agg(mean_indicator_value)

# Mark the unpivoted dataframe as no longer cached.
# NOTE(review): no cache()/persist() call on it is visible in this notebook,
# so this is presumably a no-op — confirm whether it is still needed.
df_wdi_data_unpivoted.unpersist()


In [None]:
# Writing the output data to the serving layer as an Iceberg table

spark.sql("CREATE NAMESPACE IF NOT EXISTS wdi_serving")

# Repartition once by Indicator_Code so rows that share a code sit in the
# same Spark partition before the table is written partitioned by that column.
repartitioned_df_wdi_data_average = df_wdi_data_average.repartition('Indicator_Code')

# mode('overwrite') replaces the table's previous contents on every run.
(repartitioned_df_wdi_data_average
 .write
 .mode('overwrite')
 .format('iceberg')
 .partitionBy('Indicator_Code')
 .saveAsTable('wdi_serving.partitioned_average_indicators')
)


In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()