In [1]:
import random
from faker import Faker

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth

### Spark Setup

In [3]:
spark_jar_packages = ",".join([
    "org.apache.hudi:hudi-spark3.5-bundle_2.12:1.0.0",
    "org.slf4j:slf4j-api:1.7.36",
    "org.apache.logging.log4j:log4j-slf4j-impl:2.24.3",
    "org.apache.hive:hive-metastore:3.1.3",
    "org.apache.hive:hive-exec:3.1.3",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
])

In [4]:
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("delta-hive-playground")
    .config("spark.jars.packages", spark_jar_packages)

    # Hudi-Hive Integration
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalogImplementation", "hive")
    .config("hive.metastore.uris", "thrift://localhost:9083")

    # S3 (MinIO Integration)
    .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:9000")
    .config("spark.hadoop.fs.s3a.access.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.secret.key", "minioadmin")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
    .config("spark.hadoop.fs.s3a.region", "us-east-1")

    .getOrCreate()
)

25/01/06 03:55:15 WARN Utils: Your hostname, Brunos-Macbook-Pro-16-M1-Max.local resolves to a loopback address: 127.0.0.1; using 192.168.15.86 instead (on interface en0)
25/01/06 03:55:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/Users/iobruno/.sdkman/candidates/spark/3.5.3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/iobruno/.ivy2/cache
The jars for the packages stored in: /Users/iobruno/.ivy2/jars
org.apache.hudi#hudi-spark3.5-bundle_2.12 added as a dependency
org.slf4j#slf4j-api added as a dependency
org.apache.logging.log4j#log4j-slf4j-impl added as a dependency
org.apache.hive#hive-metastore added as a dependency
org.apache.hive#hive-exec added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b08014a2-af2c-411f-9866-a94ab834fff5;1.0
	confs: [default]
	found org.apache.hudi#hudi-spark3.5-bundle_2.12;1.0.0 in central
	found org.apache.hive#hive-storage-api;2.8.1 in central
	found org.slf4j#slf4j-api;1.7.36 in central
	found org.apache.logging.log4j#log4j-slf4j-impl;2.24.3 in central
	found org.apache.logging.log4j#log4j-api;2.24.3 in central
	found org.apache.logging.log4j#log4j-core;2.24.3 in central
	found org.apache.hive#hi

### Dataset Generation

In [5]:
def generate_entry(faker: Faker, country_codes: list):
    return {
        "id": faker.unique.uuid4(),
        "name":  faker.name(),
        "email": faker.email(),
        "passport": faker.passport_number(),
        "country_code": random.choice(country_codes),
        "iban": faker.iban(),
        "swift": faker.swift11(),
        "created_at": faker.past_date(start_date='-90d').strftime('%Y-%m-%d')
    }

In [6]:
def generate_dataset(num: int, seed: int):
    country_codes = ['US', 'CA', 'JP', 'KR', 'FR', 'GE', 'UK', 'BR', 'AR']
    Faker.seed(seed)
    faker = Faker()
    return [generate_entry(faker, country_codes) for _ in range(num)]

In [7]:
dataset = generate_dataset(num=100_000, seed=739)

In [8]:
df = spark.createDataFrame(dataset)\
        .withColumn("year", year(col("created_at")))\
        .withColumn("month", month(col("created_at")))\
        .withColumn("day", dayofmonth(col("created_at")))

25/01/06 03:55:40 WARN DFSPropertiesConfiguration: Properties file file:/etc/hudi/conf/hudi-defaults.conf not found. Ignoring to load props file
25/01/06 03:55:40 WARN DFSPropertiesConfiguration: Cannot find HUDI_CONF_DIR, please set it as the dir of hudi-defaults.conf


### Hudi-Hive Integration

In [9]:
df.write.format("hudi") \
    .option("hoodie.database.name", "hudi_raw") \
    .option("hoodie.table.name", "accounts") \
    .option("hoodie.datasource.write.recordkey.field", "id") \
    .option("hoodie.datasource.write.precombine.field", "created_at") \
    .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \
    .option("hoodie.datasource.write.operation", "upsert") \
    .option("hoodie.datasource.meta.sync.enable", "true") \
    .option("hoodie.datasource.hive_sync.mode", "hms") \
    .option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083") \
    .option("hoodie.datasource.hive_sync.partition_fields", "year,month") \
    .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \
    .option("hoodie.datasource.write.hive_style_partitioning","true") \
    .partitionBy("year", "month") \
    .mode("overwrite") \
    .save("s3a://lakehouse-raw/hudi/accounts")

25/01/06 03:55:40 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/01/06 03:55:41 WARN HoodieSparkSqlWriterInternal: hoodie table at s3a://lakehouse-raw/hudi/accounts already exists. Deleting existing data & overwriting with new data.
25/01/06 03:55:43 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to hudi/accounts/.hoodie/metadata/files/.files-0000-0_00000000000000000.log.1_0-0-0. This is unsupported
25/01/06 03:55:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-hbase.properties,hadoop-metrics2.properties
25/01/06 03:55:45 WARN TaskSetManager: Stage 8 contains a task of very large size (1397 KiB). The maximum recommended task size is 1000 KiB.
                                                                                



### Upsert Dataset

In [10]:
entries = [
    # Existing entries
    dataset[2], 
    dataset[4], 
    dataset[7],
    dataset[11],
    # New entries
    *generate_dataset(4, seed=1037)
]

In [11]:
for entry in entries:
    username = entry['name'].lower().replace(" ", ".")
    entry['email'] = f"{username}@domain.com"

In [12]:
upsert_df = spark.createDataFrame(entries)\
        .withColumn("year", year(col("created_at")))\
        .withColumn("month", month(col("created_at")))\
        .withColumn("day", dayofmonth(col("created_at")))

In [13]:
upsert_df.show(8, truncate=False)

+------------+----------+---------------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+
|country_code|created_at|email                      |iban                  |id                                  |name            |passport |swift      |year|month|day|
+------------+----------+---------------------------+----------------------+------------------------------------+----------------+---------+-----------+----+-----+---+
|US          |2024-12-08|ann.cruz@domain.com        |GB55LFTZ50027083194346|b7e33adb-9bfe-465f-a533-1d57f8d9c9f6|Ann Cruz        |T22953641|HMAQGBCSXE8|2024|12   |8  |
|BR          |2024-12-02|cassidy.jones.md@domain.com|GB14AYNQ55188150393152|0daad7bc-25b6-4469-8a2f-2ba767f86791|Cassidy Jones MD|595954695|VTHYGBZMNOI|2024|12   |2  |
|UK          |2024-12-03|kara.thomas@domain.com     |GB02LAAF80272115976869|4cbbf121-caae-42aa-8508-3fd99bb2f762|Kara Thomas     |661814813|DULPGBWLTDU|2024|12 

In [14]:
upsert_df.createOrReplaceTempView("upsert_data")

### Upsert Strategy

In [15]:
upsert_df.write.format("hudi") \
    .option("hoodie.database.name", "hudi_raw") \
    .option("hoodie.table.name", "accounts") \
    .option("hoodie.datasource.write.recordkey.field", "id") \
    .option("hoodie.datasource.write.precombine.field", "created_at") \
    .option("hoodie.datasource.write.table.type", "COPY_ON_WRITE") \
    .option("hoodie.datasource.write.operation", "upsert") \
    .option("hoodie.datasource.meta.sync.enable", "true") \
    .option("hoodie.datasource.hive_sync.mode", "hms") \
    .option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083") \
    .option("hoodie.datasource.hive_sync.partition_fields", "year,month") \
    .option("hoodie.datasource.hive_sync.partition_extractor_class", "org.apache.hudi.hive.MultiPartKeysValueExtractor") \
    .option("hoodie.datasource.write.hive_style_partitioning","true") \
    .partitionBy("year", "month") \
    .mode("append") \
    .save("s3a://lakehouse-raw/hudi/accounts")

                                                                                

### Hudi Metadata (WIP)

In [16]:
spark.sql("""
    call show_commits (
        table => 'hudi_raw.accounts',
        from_commit => '0'
    )    
""").show(truncate=False)

+-----------------+---------------------+------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|commit_time      |state_transition_time|action|total_bytes_written|total_files_added|total_files_updated|total_partitions_written|total_records_written|total_update_records_written|total_errors|
+-----------------+---------------------+------+-------------------+-----------------+-------------------+------------------------+---------------------+----------------------------+------------+
|20250106035553934|20250106035556920    |commit|10431603           |0                |4                  |4                       |100004               |4                           |0           |
|20250106035542363|20250106035551210    |commit|10431115           |4                |0                  |4                       |100000               |0                           |0           |
+-----------------+-

In [17]:
hudi_changes = spark.read.format("hudi") \
    .option("hoodie.datasource.query.type", "incremental") \
    .option("hoodie.datasource.read.begin.instanttime", 0) \
    .load("s3a://lakehouse-raw/hudi/accounts")

In [18]:
hudi_changes.show()

+-------------------+--------------------+--------------------+----------------------+--------------------+------------+----------+--------------------+--------------------+--------------------+------------------+---------+-----------+---+----+-----+
|_hoodie_commit_time|_hoodie_commit_seqno|  _hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|country_code|created_at|               email|                iban|                  id|              name| passport|      swift|day|year|month|
+-------------------+--------------------+--------------------+----------------------+--------------------+------------+----------+--------------------+--------------------+--------------------+------------------+---------+-----------+---+----+-----+
|  20250106035542363|20250106035542363...|4297a060-3e46-425...|    year=2024/month=12|7443c079-1ca3-401...|          BR|2024-12-17|christianhughes@e...|GB82DJDV883608850...|4297a060-3e46-425...|    Carlos Parsons|C98606572|YFWOGBTO1WV| 17|2024|   