In [2]:
import requests
import json
import os
from dotenv import load_dotenv
import subprocess

# Load variables from a local .env file into the process environment, then
# read the Kafka path setting (None when the variable is not defined).
load_dotenv()
KAFKA_PATH = os.environ.get('KAFKA_PATH')

Activate Kafka-Connector

In [5]:
# Base URL of the Kafka Connect REST API.
KAFKA_CONNECT_URL = 'http://localhost:8083'


def create_hdfs_connector(topic_name):
    """Register an HDFS sink connector for ``topic_name`` with Kafka Connect.

    The connector definition is written to ``<topic_name>-connector.json`` in
    the current working directory and then POSTed to
    ``KAFKA_CONNECT_URL/connectors``.

    The request is sent with ``requests`` and the HTTP status is checked, so
    a rejected request (for example a connector that already exists or an
    invalid config) is reported as a failure rather than as "created".

    Args:
        topic_name: Kafka topic to sink. Also used to build the connector
            name (``hdfs-sink-<topic_name>``) and the JSON file name.

    Returns:
        None. A success or failure message is printed; request errors are
        caught and reported, not raised.
    """
    config = {
        "name": f"hdfs-sink-{topic_name}",
        "config": {
            "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
            "topics": topic_name,
            "hdfs.url": "hdfs://localhost:9000",
            "flush.size": "1000",
            "format.class": "io.confluent.connect.hdfs.json.JsonFormat",
            "schema.compatibility": "NONE",
            "value.converter": "org.apache.kafka.connect.json.JsonConverter",
            "value.converter.schemas.enable": "false",
            # Read existing data.
            # NOTE(review): per-connector consumer overrides are normally
            # spelled "consumer.override.auto.offset.reset" - confirm this
            # key is honoured by the Connect worker.
            "consumer.auto.offset.reset": "earliest",
        },
    }

    # Keep a copy of the submitted definition on disk, as before.
    with open(f'{topic_name}-connector.json', 'w') as f:
        json.dump(config, f)

    # Create connector
    try:
        response = requests.post(
            f'{KAFKA_CONNECT_URL}/connectors',
            json=config,  # also sets Content-Type: application/json
            timeout=30,   # do not hang the notebook if Connect is unreachable
        )
        # Turn 4xx/5xx answers into an exception so they are not mistaken
        # for a successful creation.
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to create connector for '{topic_name}', {e}")
    else:
        print(f"Connector for '{topic_name}' created")
        
# Register the HDFS sink connector for the 'superstore' topic.
create_hdfs_connector(topic_name='superstore')

Connector for 'superstore' created


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

Buat Session Spark

In [15]:
# Local Spark session (all cores) with the Kafka SQL connector requested as a
# package so the "kafka" data source is available.
spark = (
    SparkSession.builder
    .appName("KafkatoHDFS")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0",
    )
    .getOrCreate()
)

print("session berhadil dibuat")
spark.version

session berhadil dibuat


'4.0.0'

Pindahkan data dari Kafka ke HDFS (Data Superstore)

In [16]:
# Batch-read the 'superstore' topic from the earliest to the latest offset.
df = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "superstore",
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

# Keep only the payload (cast to string) and the Kafka record timestamp.
df_superstore = df.select(
    col("value").cast("string").alias("message"),
    col("timestamp").alias("timestamp_kafka"),
)

# Replace any previous copy of the dataset on HDFS with a parquet dump.
(
    df_superstore.write
    .format("parquet")
    .mode("overwrite")
    .save("hdfs://localhost:9000/hdfs_superstore")
)

print("berhasil")

berhasil


Cek Hasil di HDFS (Data Superstore)

In [17]:
# Read the parquet output back from HDFS and print it without truncating the
# column values, to verify the copy.
sparkcheck = (
    SparkSession.builder
    .appName("Cek output HDFS")
    .getOrCreate()
)

df2 = sparkcheck.read.format("parquet").load("hdfs://localhost:9000/hdfs_superstore")
df2.show(truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------+
|message                                                                                                                                                                                                                                                                                                                                                                               

Simpan dari HDFS ke Format DLH di Delta Lake

In [28]:
!pip install delta-spark==4.0.0




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
# Spark session configured for Delta Lake: Kafka + Delta packages, the Delta
# SQL extension and the Delta catalog.
# NOTE(review): getOrCreate() hands back an already-running session when one
# exists in this kernel; confirm the package/extension settings below actually
# take effect (e.g. by running from a fresh kernel).
sparkDelta = (
    SparkSession.builder
    .appName("DeltaLake")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,io.delta:delta-spark_2.13:4.0.0",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

print("session berhadil dibuat")
sparkDelta.version

session berhadil dibuat


'4.0.0'

In [26]:
# Re-read the parquet dataset from HDFS and rewrite it as a Delta table,
# replacing any existing table at the target path.
df2 = sparkDelta.read.format("parquet").load("hdfs://localhost:9000/hdfs_superstore")

(
    df2.write
    .mode("overwrite")
    .format("delta")
    .save("hdfs://localhost:9000/delta_superstore")
)

print("berhasil")

berhasil


Cek hasilnya di Delta Lake

In [27]:
# Load the Delta table back from HDFS and preview it with the default show().
df_delta = sparkDelta.read.load(
    "hdfs://localhost:9000/delta_superstore",
    format="delta",
)
df_delta.show()

+--------------------+--------------------+
|             message|     timestamp_kafka|
+--------------------+--------------------+
|{'Row ID': '1', '...|2025-06-25 23:05:...|
|{'Row ID': '2', '...|2025-06-25 23:05:...|
|{'Row ID': '3', '...|2025-06-25 23:05:...|
|{'Row ID': '4', '...|2025-06-25 23:05:...|
|{'Row ID': '5', '...|2025-06-25 23:05:...|
|{'Row ID': '6', '...|2025-06-25 23:05:...|
|{'Row ID': '7', '...|2025-06-25 23:05:...|
|{'Row ID': '8', '...|2025-06-25 23:05:...|
|{'Row ID': '9', '...|2025-06-25 23:05:...|
|{'Row ID': '10', ...|2025-06-25 23:05:...|
|{'Row ID': '11', ...|2025-06-25 23:05:...|
|{'Row ID': '12', ...|2025-06-25 23:05:...|
|{'Row ID': '13', ...|2025-06-25 23:05:...|
|{'Row ID': '14', ...|2025-06-25 23:05:...|
|{'Row ID': '15', ...|2025-06-25 23:05:...|
|{'Row ID': '16', ...|2025-06-25 23:05:...|
|{'Row ID': '17', ...|2025-06-25 23:05:...|
|{'Row ID': '18', ...|2025-06-25 23:05:...|
|{'Row ID': '19', ...|2025-06-25 23:05:...|
|{'Row ID': '20', ...|2025-06-25

In [28]:
# Print the column names and types of the Delta table as read back.
df_delta.printSchema()

root
 |-- message: string (nullable = true)
 |-- timestamp_kafka: timestamp (nullable = true)



In [29]:
# Table-level metadata of the Delta table (format, location, file count, size, ...).
delta_detail = sparkDelta.sql(
    "DESCRIBE DETAIL delta.`hdfs://localhost:9000/delta_superstore`"
)
delta_detail.show()

+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
|format|                  id|name|description|            location|           createdAt|        lastModified|partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties|minReaderVersion|minWriterVersion|       tableFeatures|
+------+--------------------+----+-----------+--------------------+--------------------+--------------------+----------------+-----------------+--------+-----------+----------+----------------+----------------+--------------------+
| delta|0b2a3436-8a88-48d...|NULL|       NULL|hdfs://localhost:...|2025-06-28 17:13:...|2025-06-28 17:13:...|              []|               []|       1|    1378385|        {}|               1|               2|[appendOnly, inva...|
+------+--------------------+----+-----------+--------------------+-----

In [30]:
# Commit history of the Delta table (one row per table version).
delta_history = sparkDelta.sql(
    "DESCRIBE HISTORY delta.`hdfs://localhost:9000/delta_superstore`"
)
delta_history.show()

+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      0|2025-06-28 17:13:...|  NULL|    NULL|    WRITE|{mode -> Overwrit...|NULL|    NULL|     NULL|       NULL|  Serializable|        false|{numFiles -> 1, n...|        NULL|Apache-Spark/4.0....|
+-------+--------------------+------+--------+---------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+

