In [1]:
from pyspark.sql import SparkSession

Superstore pre-processing

In [2]:
# Maven coordinates fetched when the session starts: the Kafka SQL connector
# and Delta Lake, both pinned to the _2.13 / 4.0.0 artifacts.
SPARK_PACKAGES = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0",
    "io.delta:delta-spark_2.13:4.0.0",
])

# Location of the Delta table holding the raw Superstore messages.
DELTA_SUPERSTORE_PATH = "hdfs://localhost:9000/delta_superstore"

# Local Spark session with the Delta Lake SQL extension/catalog and Hive
# support enabled. getOrCreate() reuses an already-running session, so
# re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName("DeltaLake with Hive Integration")
    .master("local[*]")
    .config("spark.jars.packages", SPARK_PACKAGES)
    # Delta Lake integration.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Relative warehouse directory named "hive".
    .config("spark.sql.warehouse.dir", "hive")
    # Long timeouts for the Python workers and the network layer.
    .config("spark.python.worker.timeout", "1200")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.python.worker.reuse", "true")
    # Arrow-based conversion is explicitly switched off.
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Adaptive query execution, including partition coalescing.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .enableHiveSupport()
    .getOrCreate()
)

# Raw Superstore records as stored in the Delta table.
df_superstore = spark.read.format("delta").load(DELTA_SUPERSTORE_PATH)

In [3]:
# Row count of the raw Delta table, followed by the first five untruncated
# values of the "message" column.
print(df_superstore.count())
df_superstore.select("message").show(n=5, truncate=False)

9936
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|message                                                                                                                                                                                                                                                                                                                                                                                                                               

In [4]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

def parse_superstore(df):
    """Turn the JSON text in ``message`` into typed, snake_case columns.

    Args:
        df: DataFrame providing a ``timestamp_kafka`` column and a ``message``
            column that holds one Superstore record as a JSON string.

    Returns:
        A DataFrame with ``timestamp_kafka`` followed by 21 columns extracted
        from the JSON: identifiers and descriptive fields as strings,
        ``order_date``/``ship_date`` as dates (parsed with ``M/d/yyyy``),
        ``row_id``/``postal_code``/``quantity`` as integers, ``sales`` and
        ``profit`` as DECIMAL(10,2) and ``discount`` as DECIMAL(5,4).
    """
    # Every JSON attribute is first read as a nullable string; the typed casts
    # happen in the projection below.
    source_fields = [
        "Row ID", "Order ID", "Order Date", "Ship Date", "Ship Mode",
        "Customer ID", "Customer Name", "Segment", "Country", "City",
        "State", "Postal Code", "Region", "Product ID", "Category",
        "Sub-Category", "Product Name", "Sales", "Quantity", "Discount",
        "Profit",
    ]
    superstore_schema = StructType(
        [StructField(field_name, StringType(), True) for field_name in source_fields]
    )

    def field(name):
        # Column reference to one attribute of the parsed JSON struct.
        return col(f"parsed_data.{name}")

    def as_date(name):
        # Dates arrive as text such as 5/4/2017.
        return to_date(field(name), "M/d/yyyy")

    with_struct = df.select(
        col("timestamp_kafka"),
        from_json(col("message"), superstore_schema).alias("parsed_data"),
    )

    projection = [
        col("timestamp_kafka"),
        field("Row ID").cast(IntegerType()).alias("row_id"),
        field("Order ID").alias("order_id"),
        as_date("Order Date").alias("order_date"),
        as_date("Ship Date").alias("ship_date"),
        field("Ship Mode").alias("ship_mode"),
        field("Customer ID").alias("customer_id"),
        field("Customer Name").alias("customer_name"),
        field("Segment").alias("segment"),
        field("Country").alias("country"),
        field("City").alias("city"),
        field("State").alias("state"),
        field("Postal Code").cast(IntegerType()).alias("postal_code"),
        field("Region").alias("region"),
        field("Product ID").alias("product_id"),
        field("Category").alias("category"),
        field("Sub-Category").alias("sub_category"),
        field("Product Name").alias("product_name"),
        field("Sales").cast(DecimalType(10, 2)).alias("sales"),
        field("Quantity").cast(IntegerType()).alias("quantity"),
        field("Discount").cast(DecimalType(5, 4)).alias("discount"),
        field("Profit").cast(DecimalType(10, 2)).alias("profit"),
    ]
    return with_struct.select(*projection)


df_parsed_superstore = parse_superstore(df_superstore)

In [5]:
# Summarise the parsed frame: column count, column names, then a 5-row preview.
print(len(df_parsed_superstore.columns))
print(f"{(df_parsed_superstore.columns)} \n")
# show() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
df_parsed_superstore.show(5)

22
['timestamp_kafka', 'row_id', 'order_id', 'order_date', 'ship_date', 'ship_mode', 'customer_id', 'customer_name', 'segment', 'country', 'city', 'state', 'postal_code', 'region', 'product_id', 'category', 'sub_category', 'product_name', 'sales', 'quantity', 'discount', 'profit'] 

+--------------------+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+------+--------+--------+-------+
|     timestamp_kafka|row_id|      order_id|order_date| ship_date|     ship_mode|customer_id|  customer_name|  segment|      country|           city|     state|postal_code|region|     product_id|       category|sub_category|        product_name| sales|quantity|discount| profit|
+--------------------+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+---------

In [6]:
# Preview the five records with the highest row_id.
(
    df_parsed_superstore
    .sort(col("row_id").desc())
    .limit(5)
    .show()
)

+--------------------+------+--------------+----------+----------+--------------+-----------+----------------+--------+-------------+-----------+----------+-----------+------+---------------+---------------+------------+--------------------+------+--------+--------+------+
|     timestamp_kafka|row_id|      order_id|order_date| ship_date|     ship_mode|customer_id|   customer_name| segment|      country|       city|     state|postal_code|region|     product_id|       category|sub_category|        product_name| sales|quantity|discount|profit|
+--------------------+------+--------------+----------+----------+--------------+-----------+----------------+--------+-------------+-----------+----------+-----------+------+---------------+---------------+------------+--------------------+------+--------+--------+------+
|2025-06-26 15:46:...|  9994|CA-2017-119914|2017-05-04|2017-05-09|  Second Class|   CC-12220|    Chris Cortes|Consumer|United States|Westminster|California|      92683|  West|OFF

Check Null Values

In [7]:
# Rows where row_id, order_id and customer_name are all NULL at the same time.
key_columns_missing = (
    col("row_id").isNull()
    & col("order_id").isNull()
    & col("customer_name").isNull()
)
all_null_rows = df_parsed_superstore.where(key_columns_missing)

all_null_rows.show(10, truncate=False)
print(f"Total null: {all_null_rows.count()}")

+-----------------------+------+--------+----------+---------+---------+-----------+-------------+-------+-------+----+-----+-----------+------+----------+--------+------------+------------+-----+--------+--------+------+
|timestamp_kafka        |row_id|order_id|order_date|ship_date|ship_mode|customer_id|customer_name|segment|country|city|state|postal_code|region|product_id|category|sub_category|product_name|sales|quantity|discount|profit|
+-----------------------+------+--------+----------+---------+---------+-----------+-------------+-------+-------+----+-----+-----------+------+----------+--------+------------+------------+-----+--------+--------+------+
|2025-06-26 15:42:39.472|NULL  |NULL    |NULL      |NULL     |NULL     |NULL       |NULL         |NULL   |NULL   |NULL|NULL |NULL       |NULL  |NULL      |NULL    |NULL        |NULL        |NULL |NULL    |NULL    |NULL  |
|2025-06-26 15:42:39.473|NULL  |NULL    |NULL      |NULL     |NULL     |NULL       |NULL         |NULL   |NULL  

In [8]:
# Discard every row that has a NULL in any column, then report how many remain.
# NOTE(review): this rebinds df_parsed_superstore, so the unfiltered frame is
# only recoverable by re-running the parsing cell.
df_parsed_superstore = df_parsed_superstore.na.drop(how="any")
print(df_parsed_superstore.count())

9668


Check Data Anomalies

In [9]:
# Count rows carrying a negative value in each numeric column that is checked.
negative_sales, negative_quantity, negative_discount = [
    df_parsed_superstore.where(col(column_name) < 0).count()
    for column_name in ("sales", "quantity", "discount")
]
print(negative_sales, negative_quantity, negative_discount)

0 0 0


Check Duplicates

In [10]:
# Compare the full row count with the count after removing exact duplicates.
# NOTE(review): the comparison covers every column, timestamp_kafka included,
# so the same record stored with two different timestamps would not be counted
# as a duplicate here - confirm that this is the intended definition.
deduplicated = df_parsed_superstore.dropDuplicates()

total_data = df_parsed_superstore.count()
distinct_data = deduplicated.count()
duplicate_data = total_data - distinct_data

print(f"Total data: {total_data}")
print(f"Distinct data: {distinct_data}")
print(f"Duplicate data: {duplicate_data}")

Total data: 9668
Distinct data: 9668
Duplicate data: 0


Create DW Schema

In [11]:
# Create the warehouse database; IF NOT EXISTS keeps this cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS db_tgp2")

DataFrame[]

In [13]:
# Warehouse tables as (table name, column definitions) pairs, in creation order.
DW_TABLES = [
    ("dim_date", "date_id INT, date DATE, month INT, year INT"),
    ("dim_ship", "ship_id INT, ship_mode STRING"),
    ("dim_city", "city_id INT, city_name STRING"),
    ("dim_customer", "customer_id INT, customer_name STRING, zipcode STRING"),
    ("dim_state", "state_id INT, state_name STRING"),
    ("dim_segment", "segment_id INT, segment STRING"),
    ("dim_region", "region_id INT, region STRING"),
    ("dim_product", "product_id INT, product_name STRING"),
    ("dim_product_category", "category_id INT, product_category STRING"),
    ("dim_product_subcategory", "subcategory_id INT, product_subcategory STRING"),
    (
        "order_fact",
        "order_id INT, sales DECIMAL(10,2), quantity INT, "
        "profits DECIMAL(10,2), discount DECIMAL(5,2)",
    ),
    ("dim_product_hierarchy", "product_id INT, category_id INT, subcategory_id INT"),
    (
        "dim_order",
        "order_id INT, product_id INT, customer_id INT, ship_mode_id INT, "
        "order_date_id INT, shipment_date_id INT",
    ),
    (
        "dim_customer_location",
        "customer_id INT, city_id INT, state_id INT, segment_id INT, region_id INT",
    ),
]

# Make db_tgp2 the current database so the unqualified table names land in it.
spark.sql("USE db_tgp2")

# Every statement uses IF NOT EXISTS, so re-running the cell leaves existing
# tables untouched. The names and column lists are constants defined above,
# never user input, so building the statement with an f-string is safe here.
for table_name, column_ddl in DW_TABLES:
    spark.sql(f"CREATE TABLE IF NOT EXISTS {table_name} ({column_ddl}) USING hive")

DataFrame[]

Send to Hive (Data Warehouse)