In [1]:
import os

from pyspark.sql import SparkSession

# Initialize the Spark session and register an Iceberg REST catalog named "iceberg".
#
# The S3 credentials come from the standard AWS environment variables so that
# real secrets do not have to be stored in the notebook. The fallbacks are the
# values this cell used to hard-code, so existing setups keep working unchanged.
s3_access_key = os.environ.get("AWS_ACCESS_KEY_ID", "admin")
s3_secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY", "password")

# NOTE: when a session already exists, getOrCreate() hands it back and Spark
# warns that only runtime SQL configurations take effect (that warning appears
# in this cell's recorded output).
# NOTE(review): `client.factory` names a class from the StarRocks connector
# package — confirm it is intended and available on the Spark classpath.
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.catalog-impl", "org.apache.iceberg.rest.RESTCatalog")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg-rest:8181")
    .config("spark.sql.catalog.iceberg.warehouse", "warehouse")
    .config("spark.sql.catalog.iceberg.s3.access-key", s3_access_key)
    .config("spark.sql.catalog.iceberg.s3.secret-key", s3_secret_key)
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    .config("spark.sql.catalog.iceberg.client.factory", "com.starrocks.connector.iceberg.IcebergAwsClientFactory")
    .getOrCreate()
)

print("Spark Running")

# Mask credential-like entries so the saved cell output cannot leak secrets.
sensitive_markers = ("access-key", "secret", "password", "token")
print([
    (key, "***" if any(marker in key.lower() for marker in sensitive_markers) else value)
    for key, value in spark.sparkContext.getConf().getAll()
])

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.port', '37227'), ('spark.app.submitTime', '1720970383212'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.app.startTime', '1720970383329'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNN

24/07/14 15:19:45 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Load the source World Development Indicators table as a DataFrame.
wdi_data_df = spark.table("curated.world_development_indicators.data")

# Report how many partitions back the DataFrame's underlying RDD.
print(wdi_data_df.rdd.getNumPartitions())

4


In [3]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType

# All-string, nullable layout with one field per column of the long-format result.
# NOTE(review): nothing in this cell applies `schema_wdi`, and it declares `year`
# as a string while printSchema() below reports an integer — confirm it is needed.
schema_wdi = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        'Country_Name',
        'Country_Code',
        'Indicator_Name',
        'Indicator_Code',
        'Indicator_Value',
        'year',
    )
])

# One DataFrame per year column (1960 through 2020): keep the identifying
# columns, expose that year's column as `Indicator_Value`, and tag each row
# with the year as a literal.
dataframes_list = [
    wdi_data_df.select(
        'Country_Name',
        'Country_Code',
        'Indicator_Name',
        'Indicator_Code',
        F.col(str(year)).alias('Indicator_Value'),
    ).withColumn('year', F.lit(year))
    for year in range(1960, 2021)
]

# Fold the per-year frames together in year order, one union at a time.
yearly_frames = iter(dataframes_list)
df_wdi_data_unpivoted = next(yearly_frames)
for yearly_df in yearly_frames:
    df_wdi_data_unpivoted = df_wdi_data_unpivoted.union(yearly_df)

# Inspect the result: partition count, schema, and the first ten rows.
print(df_wdi_data_unpivoted.rdd.getNumPartitions())

df_wdi_data_unpivoted.printSchema()

df_wdi_data_unpivoted.show(n=10)

244
root
 |-- Country_Name: string (nullable = true)
 |-- Country_Code: string (nullable = true)
 |-- Indicator_Name: string (nullable = true)
 |-- Indicator_Code: string (nullable = true)
 |-- Indicator_Value: string (nullable = true)
 |-- year: integer (nullable = false)



                                                                                

+--------------------+------------+--------------------+--------------------+---------------+----+
|        Country_Name|Country_Code|      Indicator_Name|      Indicator_Code|Indicator_Value|year|
+--------------------+------------+--------------------+--------------------+---------------+----+
|Africa Eastern an...|         AFE|Access to clean f...|      EG.CFT.ACCS.ZS|           NULL|1960|
|Africa Eastern an...|         AFE|Access to electri...|      EG.ELC.ACCS.ZS|           NULL|1960|
|Africa Eastern an...|         AFE|Access to electri...|   EG.ELC.ACCS.RU.ZS|           NULL|1960|
|Africa Eastern an...|         AFE|Access to electri...|   EG.ELC.ACCS.UR.ZS|           NULL|1960|
|Africa Eastern an...|         AFE|Adjusted net nati...|   NY.ADJ.NNTY.KD.ZG|           NULL|1960|
|Africa Eastern an...|         AFE|Adjusted net nati...|      NY.ADJ.NNTY.KD|           NULL|1960|
|Africa Eastern an...|         AFE|Adjusted net nati...|      NY.ADJ.NNTY.CD|           NULL|1960|
|Africa Ea

In [4]:
# Write the unpivoted dataframe to an Iceberg table partitioned by year.
spark.sql("CREATE NAMESPACE IF NOT EXISTS serving.world_development_indicators")

# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises once the table exists, so a second Restart & Run All of
# the notebook would fail here. Re-running now replaces the table's contents.
(
    df_wdi_data_unpivoted.write
    .partitionBy('year')
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable(name="serving.world_development_indicators.wdi_data_unpivoted")
)

                                                                                