In [None]:
from pyspark.sql import SparkSession

Superstore pre-processing

In [None]:
# Local Spark session with the Kafka SQL connector and Delta Lake packages on
# the classpath, plus the Delta SQL extension and catalog registered.
spark = (
    SparkSession.builder
    .appName("DeltaLake")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,io.delta:delta-spark_2.13:4.0.0",
    )
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Raw Superstore records, read from the Delta table stored on HDFS.
df_superstore = (
    spark.read
    .format("delta")
    .load("hdfs://localhost:9000/delta_superstore")
)

In [None]:
# Size of the raw table, followed by five untruncated `message` payloads.
print(df_superstore.count())
df_superstore.select(df_superstore["message"]).show(n=5, truncate=False)

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

def parse_superstore(df):
    """Flatten JSON Superstore messages into typed, snake_case columns.

    Args:
        df: DataFrame exposing a ``timestamp_kafka`` column and a ``message``
            column holding one JSON document per row.

    Returns:
        DataFrame with ``timestamp_kafka`` followed by 21 Superstore columns.
        Every JSON field is first read as a string, then ``row_id``,
        ``postal_code`` and ``quantity`` are cast to integer, ``sales`` and
        ``profit`` to ``DecimalType(10,2)``, ``discount`` to
        ``DecimalType(5,4)``, and the two date fields are parsed with the
        ``M/d/yyyy`` pattern.
    """
    # Field names exactly as they appear in the JSON payload, in payload order.
    source_fields = [
        "Row ID", "Order ID", "Order Date", "Ship Date", "Ship Mode",
        "Customer ID", "Customer Name", "Segment", "Country", "City",
        "State", "Postal Code", "Region", "Product ID", "Category",
        "Sub-Category", "Product Name", "Sales", "Quantity", "Discount",
        "Profit",
    ]
    # Everything is parsed as a nullable string; typing happens in the projection.
    superstore_schema = StructType(
        [StructField(field_name, StringType(), True) for field_name in source_fields]
    )

    def field(name):
        # Column reference into the struct produced by from_json below.
        return col(f"parsed_data.{name}")

    with_struct = df.select(
        col("timestamp_kafka"),
        from_json(col("message"), superstore_schema).alias("parsed_data"),
    )

    date_pattern = "M/d/yyyy"
    projection = [
        col("timestamp_kafka"),
        field("Row ID").cast(IntegerType()).alias("row_id"),
        field("Order ID").alias("order_id"),
        to_date(field("Order Date"), date_pattern).alias("order_date"),
        to_date(field("Ship Date"), date_pattern).alias("ship_date"),
        field("Ship Mode").alias("ship_mode"),
        field("Customer ID").alias("customer_id"),
        field("Customer Name").alias("customer_name"),
        field("Segment").alias("segment"),
        field("Country").alias("country"),
        field("City").alias("city"),
        field("State").alias("state"),
        field("Postal Code").cast(IntegerType()).alias("postal_code"),
        field("Region").alias("region"),
        field("Product ID").alias("product_id"),
        field("Category").alias("category"),
        field("Sub-Category").alias("sub_category"),
        field("Product Name").alias("product_name"),
        field("Sales").cast(DecimalType(10,2)).alias("sales"),
        field("Quantity").cast(IntegerType()).alias("quantity"),
        field("Discount").cast(DecimalType(5,4)).alias("discount"),
        field("Profit").cast(DecimalType(10,2)).alias("profit"),
    ]

    return with_struct.select(*projection)
# Turn the raw Delta rows into the typed Superstore frame used by the cells below.
df_parsed_superstore = parse_superstore(df_superstore)

In [None]:
# Column count, column names, then a five-row preview of the parsed frame.
print(len(df_parsed_superstore.columns))
print(f"{(df_parsed_superstore.columns)} \n")
# show() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line after the table.
df_parsed_superstore.show(5)

In [None]:
# Five rows with the highest row_id, largest first.
(
    df_parsed_superstore
    .sort(col("row_id").desc())
    .limit(5)
    .show()
)

Check Null Values

In [None]:
# Rows where row_id, order_id and customer_name are all missing at once.
missing_key_fields = (
    col("row_id").isNull()
    & col("order_id").isNull()
    & col("customer_name").isNull()
)
all_null_rows = df_parsed_superstore.where(missing_key_fields)

all_null_rows.show(10, truncate=False)
print(f"Total null: {all_null_rows.count()}")

In [None]:
# Discard every row that has a null in any column, then report what is left.
# The name is rebound on purpose: the cells below work on the cleaned frame.
df_parsed_superstore = df_parsed_superstore.na.drop(how="any")
print(df_parsed_superstore.count())

Check Data Anomalies

In [None]:
# Count negative sales / quantity / discount values in a single aggregation
# instead of three separate filter-and-count jobs over the same frame.
# when() without otherwise() yields null for non-matching rows, and count()
# skips nulls, so each expression counts only the rows meeting its condition.
negative_counts = df_parsed_superstore.agg(
    count(when(col("sales") < 0, 1)).alias("negative_sales"),
    count(when(col("quantity") < 0, 1)).alias("negative_quantity"),
    count(when(col("discount") < 0, 1)).alias("negative_discount"),
).first()

negative_sales = negative_counts["negative_sales"]
negative_quantity = negative_counts["negative_quantity"]
negative_discount = negative_counts["negative_discount"]
print (negative_sales, negative_quantity , negative_discount)

Check Duplicates

In [None]:
# Compare the full row count with the count of fully distinct rows.
# NOTE(review): every column takes part in the comparison, timestamp_kafka
# included, so identical records ingested at different times are not counted
# as duplicates here - confirm that this is intended.
total_data = df_parsed_superstore.count()
distinct_data = df_parsed_superstore.dropDuplicates().count()
duplicate_data = total_data - distinct_data

print(
    f"Total data: {total_data}",
    f"Distinct data: {distinct_data}",
    f"Duplicate data: {duplicate_data}",
    sep="\n",
)

Create sparkHive Session

In [None]:
import warnings
from os.path import abspath

# Managed-table warehouse directory, resolved against the current working dir.
warehouse_location = abspath('spark-warehouse')
sparkHive = (
    SparkSession.builder
    .appName("Python Spark SQL Hive integration example")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# getOrCreate() returns the already-running session when one exists (the
# `spark` session built earlier in this notebook). Static settings such as the
# catalog implementation and the warehouse directory cannot be changed on a
# live session, so Hive support may not actually be active. Make that visible
# instead of continuing silently with a non-Hive catalog.
catalog_implementation = sparkHive.conf.get(
    "spark.sql.catalogImplementation", "in-memory"
)
if catalog_implementation != "hive":
    warnings.warn(
        "sparkHive reuses an existing SparkSession whose catalog implementation "
        f"is '{catalog_implementation}', so enableHiveSupport() and "
        "spark.sql.warehouse.dir were not applied. Enable Hive support on the "
        "first SparkSession created in this notebook (or restart the kernel "
        "and create the Hive-enabled session first)."
    )

Create DW Schema

Send to Hive (Data Warehouse)