# BRONZE LAYER Raw Ingestion

## Reading the November CSV and loading it into the Bronze layer

In [0]:
from pyspark.sql.functions import *

In [0]:
# Read the raw November CSV (first row is the header, column types are
# inferred) and add an Injested_Time column populated by current_timestamp().
bronze_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
    .withColumn("Injested_Time", current_timestamp())
)
bronze_df.display()


## Writing Bronze data to Delta Table

In [0]:
# Append the bronze rows to the Delta table at event_nov/, with the
# mergeSchema write option enabled.
# NOTE(review): because the mode is "append", every run of this cell adds the
# rows of bronze_df to the table again.
(
    bronze_df.write
    .format("Delta")
    .mode("append")
    .option("mergeSchema", "true")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/Delta/event_nov/")
)

# SILVER LAYER Clean & Standardized
## Remove invalid records
## Deduplicate events
## Standardize text columns

In [0]:
# Build the silver frame from the Delta table at event_nov/:
#   1. parse event_time into a timestamp
#   2. keep rows that have a user_id and whose price is missing or >= 0
#   3. keep one row per (user_session, event_time, event_type)
#   4. lower-case brand
# NOTE(review): the dedup key contains no item-level column, so distinct events
# that share session, timestamp and type collapse into one row -- confirm this
# is intended.
silver_df = (
    spark.read.load(
        "/Volumes/workspace/ecommerce/ecommerce_data/Delta/event_nov/",
        format="delta",
    )
    .withColumn("event_time", to_timestamp(col("event_time")))
    .where(
        col("user_id").isNotNull()
        & (col("price").isNull() | (col("price") >= 0))
    )
    .dropDuplicates(["user_session", "event_time", "event_type"])
    .withColumn("brand", lower(col("brand")))
)

## Writing Silver data to Delta Table

In [0]:
# Overwrite the Delta table at event_nov/ with the cleaned silver rows.
# NOTE(review): this is the same path the bronze cell appends to and the silver
# cell reads from, so the overwrite replaces the raw bronze rows with the
# cleaned ones. Consider a dedicated silver path (and pointing the gold reads
# at it) so the bronze table keeps the raw data.
(
    silver_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/Delta/event_nov/")
)

# GOLD LAYER Business Aggregates
## Serve analytics
## BI-ready tables
## No heavy transformations here

In [0]:
# Daily revenue per category: total price of "purchase" events, grouped by the
# calendar date of event_time and by category_code.
# The result is bound to gold_revenue_df because that is the name the following
# write cell saves; binding it only to gold_df left that cell failing with a
# NameError on a fresh run.
# `sum` here is pyspark.sql.functions.sum (from the star import), not the
# Python builtin.
gold_revenue_df = (
    spark.read.format("delta")
    .load("/Volumes/workspace/ecommerce/ecommerce_data/Delta/event_nov/")
    .filter(col("event_type") == "purchase")
    .withColumn("event_date", to_date("event_time"))
    .groupBy("event_date", "category_code")
    .agg(sum("price").alias("daily_revenue"))
)

# Keep the earlier name bound as well so any cell that still uses gold_df works.
gold_df = gold_revenue_df

In [0]:
# Overwrite the daily-revenue gold table with the current aggregate.
(
    gold_revenue_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/Delta/daily_Revenue/")
)

## User Lifetime Value

In [0]:
# Lifetime value per user: total price of all "purchase" events for each
# user_id in the Delta table at event_nov/.
# `sum` here is pyspark.sql.functions.sum (from the star import), not the
# Python builtin.
gold_ltv_df = (
    spark.read.load(
        "/Volumes/workspace/ecommerce/ecommerce_data/Delta/event_nov/",
        format="delta",
    )
    .where(col("event_type") == "purchase")
    .groupBy("user_id")
    .agg(sum(col("price")).alias("lifetime_value"))
)

In [0]:
# Overwrite the user-lifetime-value gold table with the current aggregate.
(
    gold_ltv_df.write
    .format("delta")
    .mode("overwrite")
    .save("/Volumes/workspace/ecommerce/ecommerce_data/Delta/user_lifetime_value/")
)