In [5]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Iceberg Catalog Setup") \
    .getOrCreate()

print("Spark Running")
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

Spark Running
[('spark.eventLog.enabled', 'true'), ('spark.driver.cores', '4'), ('spark.app.id', 'local-1721266403247'), ('spark.task.cpus', '4'), ('spark.app.submitTime', '1721266402522'), ('spark.executor.cores', '4'), ('spark.history.fs.logDirectory', '/home/iceberg/spark-events'), ('spark.sql.catalog.demo.s3.endpoint', 'http://minio:9000'), ('spark.eventLog.dir', '/home/iceberg/spark-events'), ('spark.serializer.objectStreamReset', '100'), ('spark.master', 'local[*]'), ('spark.executor.memory', '8g'), ('spark.submit.deployMode', 'client'), ('spark.driver.host', '2f37682e1403'), ('spark.driver.port', '39267'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED

In [6]:
# Reading the WDI unpivoted data
df_wdi_unpivoted = spark.read.table("wdi_serving.wdi_data_unpivoted")

In [7]:
from datetime import datetime
from pyspark.sql.functions import udf

# Python function to generate the decade
# This can be done in multiple ways
# One way is to simply keep the first three chars from the year
# and then add '0s'
def generate_decade_value(year: int) -> str:
  return str(year)[:3] + '0s'

# We register the function as a UDF
# We can also use the @udf annotation
generate_decade_udf = udf(generate_decade_value)

# We add the decade column
df_wdi_unpivoted_udf_v1 = df_wdi_unpivoted.withColumn('decade', generate_decade_udf('year'))


# we start the timer
start = datetime.now()

# We use noop format to simulate the write action
(df_wdi_unpivoted_udf_v1
 .write
 .mode("overwrite")
 .format("noop")
 .save()
 )

# We print the time taken by our job
print(f'Time taken: {datetime.now() - start}')



Time taken: 0:00:10.669684


                                                                                

In [8]:
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType


# This time we're using the annotation
# The Pandas UDF input and output is of type pd.Series
# We use a map function on the input Series to perform the modification
@pandas_udf(StringType())
def generate_decade_udf_v2(year: pd.Series) -> pd.Series:
  return year.map(lambda x: str(x)[:3] + '0s')


# We add the decade column
df_wdi_unpivoted_udf_v2 = df_wdi_unpivoted.withColumn('decade', generate_decade_udf_v2('year'))


# we start the timer
start = datetime.now()

# We use noop format to simulate the write action
(df_wdi_unpivoted_udf_v2
 .write
 .mode("overwrite")
 .format("noop")
 .save())

# We print the time taken by our job
print(f'Time taken: {datetime.now() - start}')



Time taken: 0:00:05.727843


                                                                                

The Pandas UDF is faster mainly because it leverages [Apache Arrow](https://arrow.apache.org/) for the data transfer.

In [9]:
(df_wdi_unpivoted_udf_v2
.repartition("decade")
.writeTo("wdi_serving.decade_level_datas")
.partitionedBy("decade")
.options(format="iceberg", mode="overwrite")
.createOrReplace()
)

                                                                                

In [10]:
display(df_wdi_unpivoted_udf_v2.show(10))

+--------------------+------------+--------------------+-----------------+-----------------+----+------+
|        Country_Name|Country_Code|      Indicator_Name|   Indicator_Code|  Indicator_Value|year|decade|
+--------------------+------------+--------------------+-----------------+-----------------+----+------+
|Africa Eastern an...|         AFE|Age dependency ra...|   SP.POP.DPND.OL| 5.59001872292559|2012| 2010s|
|Caribbean small s...|         CSS|   GDP (current US$)|   NY.GDP.MKTP.CD| 72051116148.0838|2012| 2010s|
|East Asia & Pacif...|         EAP|Population ages 0...|SP.POP.0014.FE.ZS|  20.522287762446|2012| 2010s|
|           Euro area|         EMU|  Population, female|SP.POP.TOTL.FE.IN|        172120855|2012| 2010s|
|Europe & Central ...|         ECS|Population ages 5...|SP.POP.5054.FE.5Y| 6.92774019899506|2012| 2010s|
|Fragile and confl...|         FCS|Population ages 8...|SP.POP.80UP.MA.5Y|0.349797743847993|2012| 2010s|
|         High income|         HIC|Age dependency ra...

None

In [11]:
spark.stop()