# Silver to Gold: Building BI-ready tables

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, IntegerType, FloatType, TimestampType, DateType
from pyspark.sql import Row

# Unity Catalog catalog that holds the silver and gold schemas used throughout this notebook.
catalog_name = "ecommerce"

In [0]:
# Load the three silver tables that feed the product dimension.
df_products, df_brands, df_category = (
    spark.table(f"{catalog_name}.silver.{silver_table}")
    for silver_table in ("slv_products", "slv_brands", "slv_category")
)

In [0]:
# Expose each silver DataFrame as a temporary view so the %sql cells can query it by name.
for _frame, _view_name in (
    (df_products, "products_v"),
    (df_brands, "brands_v"),
    (df_category, "category_v"),
):
    _frame.createOrReplaceTempView(_view_name)

In [0]:
# Preview the first five rows of the products view.
products_preview = spark.sql("select * from products_v limit 5")
display(products_preview)

In [0]:
# Preview the first five rows of the brands view.
brands_preview = spark.sql("select * from brands_v limit 5")
display(brands_preview)

In [0]:
# Preview the first five rows of the category view.
category_preview = spark.sql("select * from category_v limit 5")
display(category_preview)

In [0]:
# Switch the session's current catalog; the following %sql cell refers to
# gold.gld_dim_products without a catalog prefix.
use_catalog_sql = f"USE CATALOG {catalog_name}"
spark.sql(use_catalog_sql)

In [0]:
%sql
-- Build the product dimension and write it to the gold layer -> gold.gld_dim_products
-- (resolved against the current catalog set by the earlier USE CATALOG cell).
-- Each product row is enriched with the brand name and category name taken from a
-- brand -> category lookup; missing names are replaced with 'Not Available'.

CREATE OR REPLACE TABLE gold.gld_dim_products AS
WITH brands_category AS (
    -- One row per brand row in brands_v, with the name of the brand's category attached.
    SELECT
        b.brand_code,
        b.brand_name,
        c.category_code,
        c.category_name
    FROM brands_v b
    -- LEFT JOIN (not INNER): a brand whose category_code has no match in category_v
    -- must still keep its brand_name; only its category_name falls back to the default.
    LEFT JOIN category_v c
    ON b.category_code = c.category_code
)
SELECT
  p.product_id,
  p.sku,
  p.category_code,
  -- Single-quoted literals: double quotes are parsed as identifiers when
  -- spark.sql.ansi.doubleQuotedIdentifiers is enabled.
  COALESCE(bc.category_name, 'Not Available') AS category_name,
  p.brand_code,
  COALESCE(bc.brand_name, 'Not Available') AS brand_name,
  p.color,
  p.size,
  p.material,
  p.weight_grams,
  p.length_cm,
  p.width_cm,
  p.height_cm,
  p.rating_count,
  p.file_name,
  p.ingested_timestamp
FROM products_v p
-- NOTE(review): the join key is brand_code only, so category_name is the category of the
-- product's brand, not a lookup of p.category_code. This also assumes brand_code is unique
-- in brands_v; if a brand appears once per category, products will fan out -- TODO confirm.
LEFT JOIN brands_category bc
ON p.brand_code = bc.brand_code

# Customers

In [0]:
# Master lookup: country name -> {state/province/emirate code -> region label}.
# Codes that are absent from this mapping are handled by the join/fill step further down.
country_state_map = {
    "India": {
        "MH": "West",
        "GJ": "West",
        "RJ": "West",
        "KA": "South",
        "TN": "South",
        "TS": "South",
        "AP": "South",
        "KL": "South",
        "UP": "North",
        "WB": "North",
        "DL": "North",
    },
    "Australia": {
        "VIC": "SouthEast",
        "WA": "West",
        "NSW": "East",
        "QLD": "NorthEast",
    },
    "United Kingdom": {
        "ENG": "England",
        "WLS": "Wales",
        "NIR": "Northern Ireland",
        "SCT": "Scotland",
    },
    "United States": {
        "MA": "NorthEast",
        "FL": "South",
        "NJ": "NorthEast",
        "CA": "West",
        "NY": "NorthEast",
        "TX": "South",
    },
    "United Arab Emirates": {
        "AUH": "Abu Dhabi",
        "DU": "Dubai",
        "SHJ": "Sharjah",
    },
    "Singapore": {
        "SG": "Singapore",
    },
    "Canada": {
        "BC": "West",
        "AB": "West",
        "ON": "East",
        "QC": "East",
        "NS": "East",
        "IL": "Other",
    },
}

# Per-country aliases; each name refers to the same dict object stored in the master map.
india_region = country_state_map["India"]
australia_region = country_state_map["Australia"]
uk_region = country_state_map["United Kingdom"]
us_region = country_state_map["United States"]
uae_region = country_state_map["United Arab Emirates"]
singapore_region = country_state_map["Singapore"]
canada_region = country_state_map["Canada"]


In [0]:
# Show the combined country -> {state code -> region} lookup for a quick visual check.
country_state_map

In [0]:
# Flatten country_state_map into a list of Row(country, state, region) records,
# one per (country, state code) pair, in the dictionaries' insertion order.
rows = [
    Row(country=country_name, state=code, region=region_label)
    for country_name, state_to_region in country_state_map.items()
    for code, region_label in state_to_region.items()
]
# Peek at the first ten records.
rows[:10]

In [0]:
# Build the region lookup DataFrame from the flattened Row records
# (no explicit schema is passed, so Spark infers it from the Rows).
df_mapping_regions = spark.createDataFrame(data=rows)

# Print the first 20 rows (the default) without truncating cell values.
df_mapping_regions.show(n=20, truncate=False)

In [0]:
# Read the silver customers table and preview five rows.
customers_silver_table = f"{catalog_name}.silver.slv_customers"
df_silver = spark.table(customers_silver_table)
display(df_silver.limit(5))

In [0]:
# Attach a region to every customer via (country, state). The left join keeps customers
# that have no entry in the lookup; their null region is then replaced with "Other".
df_gold = (
    df_silver
    .join(df_mapping_regions, on=["country", "state"], how="left")
    .fillna({"region": "Other"})
)

display(df_gold.limit(5))

In [0]:
# Persist the region-enriched customers as a Delta table in the gold layer
# -> <catalog_name>.gold.gld_dim_customers (overwrite mode, mergeSchema enabled).
customers_gold_table = f"{catalog_name}.gold.gld_dim_customers"
(
    df_gold.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(customers_gold_table)
)

# Date / Calendar

In [0]:
# Read the silver calendar table and preview five rows.
# Note: df_silver is rebound here; from this cell on it refers to the calendar data.
calendar_silver_table = f"{catalog_name}.silver.slv_calendar"
df_silver = spark.table(calendar_silver_table)
display(df_silver.limit(5))

In [0]:
# Derive the extra calendar columns from the silver `date` and `day_name` columns.
date_col = F.col("date")

df_gold = (
    df_silver
    # date_id: the date formatted as yyyyMMdd, cast to an integer
    .withColumn("date_id", F.date_format(date_col, "yyyyMMdd").cast(IntegerType()))
    # month_name: full month name produced by the "MMMM" pattern
    .withColumn("month_name", F.date_format(date_col, "MMMM"))
    # is_weekend: 1 when day_name is Saturday or Sunday, otherwise 0
    .withColumn(
        "is_weekend",
        F.when(F.col("day_name").isin(["Saturday", "Sunday"]), 1).otherwise(0),
    )
)

display(df_gold.limit(5))

In [0]:
# Final column order for the date dimension; columns not listed here are dropped.
desired_columns_order = [
    "date_id",
    "date",
    "year",
    "month_name",
    "day_name",
    "is_weekend",
    "quarter",
    "week",
    "_ingested_at",
    "_source_file",
]
df_gold = df_gold.select(*desired_columns_order)

display(df_gold.limit(5))

In [0]:
# Persist the date dimension as a Delta table in the gold layer
# -> <catalog_name>.gold.gld_dim_date (overwrite mode, mergeSchema enabled).
# The catalog comes from catalog_name, like every other read/write in this notebook,
# so changing the catalog in one place redirects this table too.
df_gold.write.format("delta")\
    .mode("overwrite")\
    .option("mergeSchema", "true")\
    .saveAsTable(f"{catalog_name}.gold.gld_dim_date")

In [0]:
%sql
-- Inspect the schema and extended table metadata of the date dimension written above.
DESCRIBE TABLE EXTENDED ecommerce.gold.gld_dim_date;