In [11]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Iceberg Catalog Setup") \
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog") \
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181") \
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse") \
    .config("spark.sql.catalog.iceberg.s3.access-key", "admin") \
    .config("spark.sql.catalog.iceberg.s3.secret-key", "password") \
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000") \
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true") \
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory") \
    .getOrCreate()

print("Spark Running")
print(spark.sparkContext.getConf().getAll())

ConnectionRefusedError: [Errno 111] Connection refused

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/usr/local/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/usr/local/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/usr/local/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
    self._run_once()
  File "/usr/local/lib/python3.9/asyncio/base_events.py", line 1869, in _run_once
    event_list = self._selector.select(timeout)
  File "/usr/loc

In [2]:
wdi_data_df = spark.read.table("curated.world_development_indicators.data")

print(wdi_data_df.rdd.getNumPartitions())

5


In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType

schema_wdi = StructType([
               StructField('Country_Name', StringType(), True),
               StructField('Country_Code', StringType(), True),
               StructField('Indicator_Name', StringType(), True),
               StructField('Indicator_Code', StringType(), True),
               StructField('Indicator_Value', StringType(), True),
               StructField('year', StringType(), True)
             ])

dataframes_list = []

# Collect all DataFrames into a list
for year in range(1960, 2021):
    df_temp = (wdi_data_df
               .select(
                   'Country_Name',
                   'Country_Code',
                   'Indicator_Name',
                   'Indicator_Code',
                   F.col(str(year)).alias('Indicator_Value')
               )
               .withColumn('year', F.lit(year))
              )
    dataframes_list.append(df_temp)

# Union all DataFrames at once
df_wdi_data_unpivoted = dataframes_list[0]
for df_temp in dataframes_list[1:]:
    df_wdi_data_unpivoted = df_wdi_data_unpivoted.union(df_temp)

print(df_wdi_data_unpivoted.rdd.getNumPartitions())

df_wdi_data_unpivoted.printSchema()

df_wdi_data_unpivoted.show(10)

305
root
 |-- Country_Name: string (nullable = true)
 |-- Country_Code: string (nullable = true)
 |-- Indicator_Name: string (nullable = true)
 |-- Indicator_Code: string (nullable = true)
 |-- Indicator_Value: string (nullable = true)
 |-- year: integer (nullable = false)



                                                                                

+--------------------+------------+--------------------+-----------------+----------------+----+
|        Country_Name|Country_Code|      Indicator_Name|   Indicator_Code| Indicator_Value|year|
+--------------------+------------+--------------------+-----------------+----------------+----+
|Africa Eastern an...|         AFE|Age dependency ra...|   SP.POP.DPND.OL|5.80595111963956|1960|
|Caribbean small s...|         CSS|   GDP (current US$)|   NY.GDP.MKTP.CD|1880306125.08709|1960|
|East Asia & Pacif...|         EAP|Population ages 0...|SP.POP.0014.FE.ZS|40.1022698469607|1960|
|           Euro area|         EMU|  Population, female|SP.POP.TOTL.FE.IN|       138020284|1960|
|Europe & Central ...|         ECS|Population ages 5...|SP.POP.5054.FE.5Y|6.14985530006535|1960|
|Fragile and confl...|         FCS|Population ages 8...|SP.POP.80UP.MA.5Y|0.21932284916253|1960|
|         High income|         HIC|Age dependency ra...|   SP.POP.DPND.YG| 46.445585532069|1960|
|           IDA total|        

In [4]:
# Write unpivoted dataframe to a new table partitioned by year

spark.sql("CREATE NAMESPACE IF NOT EXISTS wdi_serving")
df_wdi_data_unpivoted.createOrReplaceTempView("data_unpivoted_tempTable")

spark.sql("""
  CREATE TABLE IF NOT EXISTS wdi_serving.wdi_data_unpivoted 
  USING iceberg
  PARTITIONED BY (year) 
  AS SELECT * FROM data_unpivoted_tempTable
""")

spark.catalog.dropTempView("data_unpivoted_tempTable")


24/07/14 17:29:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


True

In [5]:
spark.catalog.clearCache()

In [6]:
# We use agg() method to perform aggregations
# We use avg() from the pyspark.sql.functions module to generate the average
# We apply the avg() function on a column from the grouped dataframe
df_wdi_data_average = (df_wdi_data_unpivoted
                       .groupBy(
                         'Country_Name',
                         'Country_Code', 
                         'Indicator_Name', 
                         'Indicator_Code',
                       )
                       .agg(
                        F.avg('Indicator_Value').alias('Indicator_Average_Value')
                       )
                      )

In [10]:
# Writing the output data to the serving layer on DBFS
spark.sql("CREATE NAMESPACE IF NOT EXISTS wdi_serving")
repartitioned_df_wdi_data_average = df_wdi_data_average.repartition('Indicator_Code')
repartitioned_df_wdi_data_average.createOrReplaceTempView("average_indicators_tempTable")

spark.sql("""
  CREATE TABLE IF NOT EXISTS wdi_serving.average_indicators
  USING iceberg
  PARTITIONED BY (Indicator_Code) 
  AS SELECT * FROM average_indicators_tempTable
""")
# 
# spark.catalog.dropTempView("average_indicators_tempTable")

# (df_wdi_data_average
#  .repartition('Indicator_Code')
#  .write
#  .mode('overwrite')
#  .format('iceberg')
#  .partitionBy('Indicator_Code')
#  .saveAsTable('serving.world_development_indicators.average_indicators')
# )

ConnectionRefusedError: [Errno 111] Connection refused