In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("Iceberg Catalog Setup")
    .getOrCreate()
)

# Diagnostics: confirm the session is up and show its configuration,
# the active catalog and the address of the Spark UI
print("Spark Running")
print(spark.sparkContext.getConf().getAll())
print("current catalog:", spark.catalog.currentCatalog())
print("Spark UI:", spark.sparkContext.uiWebUrl)

In [None]:
# Load the unpivoted WDI table as a DataFrame
df_wdi_unpivoted = spark.table("wdi_serving.wdi_data_unpivoted")

In [None]:
from datetime import datetime
from pyspark.sql.functions import udf

from typing import Optional

# Python function to generate the decade label.
# This can be done in multiple ways; here we simply keep the first
# three characters of the year and append '0s' (1995 -> '1990s').
def generate_decade_value(year: Optional[int]) -> Optional[str]:
  """Return the decade label for a year, e.g. 1995 -> '1990s'.

  Args:
    year: The year to convert. Spark passes None for null cells.

  Returns:
    The first three characters of the year followed by '0s', or None
    when the year is None.
  """
  # Without this guard str(None) would be sliced into the bogus label 'Non0s'
  if year is None:
    return None
  return str(year)[:3] + '0s'

# Wrap the plain Python function as a Spark UDF
# (decorating the function with @udf would work as well)
generate_decade_udf = udf(generate_decade_value)

# Derive the 'decade' column from 'year' through the UDF
df_wdi_unpivoted_udf_v1 = df_wdi_unpivoted.withColumn(
    'decade',
    generate_decade_udf('year'),
)


# Take the start time right before triggering the job
start = datetime.now()

# The "noop" format stands in for a real write action in this timing run
df_wdi_unpivoted_udf_v1.write.format("noop").mode("overwrite").save()

# Report how long the job took
print(f'Time taken: {datetime.now() - start}')

In [None]:
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import StringType


# This time we register the UDF with the decorator.
# The Pandas UDF input and output are both of type pd.Series.
@pandas_udf(StringType())
def generate_decade_udf_v2(year: pd.Series) -> pd.Series:
  """Map a Series of years to decade labels such as '1990s'.

  Args:
    year: Series of year values.

  Returns:
    A Series of the same length holding the first three characters of
    each year followed by '0s'; null years are left as null.
  """
  # na_action='ignore' skips null entries, which would otherwise be
  # stringified into bogus labels such as 'Non0s' or 'nan0s'
  return year.map(lambda x: str(x)[:3] + '0s', na_action='ignore')


# Derive the 'decade' column from 'year' through the Pandas UDF
df_wdi_unpivoted_udf_v2 = df_wdi_unpivoted.withColumn(
    'decade',
    generate_decade_udf_v2('year'),
)


# Take the start time right before triggering the job
start = datetime.now()

# The "noop" format stands in for a real write action in this timing run
df_wdi_unpivoted_udf_v2.write.format("noop").mode("overwrite").save()

# Report how long the job took
print(f'Time taken: {datetime.now() - start}')

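The Pandas UDF is faster mainly because it leverages [Apache Arrow](https://arrow.apache.org/) to transfer data between the JVM and the Python worker in columnar batches, instead of serializing and deserializing the rows one at a time as a regular Python UDF does.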

In [None]:
# Persist the result as an Iceberg table partitioned by decade.
# With writeTo() (DataFrameWriterV2) the table provider is chosen through
# .using(); "format" and "mode" belong to the v1 DataFrameWriter API and are
# not meaningful write options here. createOrReplace() already replaces the
# table when it exists, so no separate overwrite mode is needed.
(df_wdi_unpivoted_udf_v2
 .repartition("decade")
 .writeTo("wdi_serving.decade_level_datas")
 .using("iceberg")
 .partitionedBy("decade")
 .createOrReplace()
)

In [None]:
# Preview the first 10 rows. show() prints the table itself and returns None,
# so wrapping it in display() only added a stray "None" to the cell output.
df_wdi_unpivoted_udf_v2.show(10)

In [None]:
# Stop the Spark session
spark.stop()