In [0]:

# Check if ecommerce_prod catalog exists
try:
    display(spark.sql("SHOW CATALOGS LIKE 'ecommerce_prod'"))
except:
    print("No ecommerce_prod catalog found")

# Check if my_shop catalog exists
try:
    display(spark.sql("SHOW CATALOGS LIKE 'my_shop'"))
except:
    print("No my_shop catalog found")

catalog
ecommerce_prod


catalog
my_shop


In [0]:
# Load the November CSV to see columns
nov_df = spark.read.csv(
    "/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",
    header=True,
    inferSchema=True
)

print("=== CSV STRUCTURE ===")
print(f"Total rows: {nov_df.count():,}")
print(f"Columns: {len(nov_df.columns)}")
print("\nColumn names:")
for i, col in enumerate(nov_df.columns, 1):
    print(f"{i:2}. {col}")
    
print("\n=== Sample Data ===")
display(nov_df.limit(3))

print("\n=== Data Types ===")
nov_df.printSchema()

=== CSV STRUCTURE ===
Total rows: 67,501,979
Columns: 9

Column names:
 1. event_time
 2. event_type
 3. product_id
 4. category_id
 5. category_code
 6. brand
 7. price
 8. user_id
 9. user_session

=== Sample Data ===


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387



=== Data Types ===
root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
%sql
-- Create the SAME catalog 
CREATE CATALOG IF NOT EXISTS ecommerce_prod;
USE CATALOG ecommerce_prod;

-- Create the three-layer schema
CREATE SCHEMA IF NOT EXISTS bronze;
CREATE SCHEMA IF NOT EXISTS silver;
CREATE SCHEMA IF NOT EXISTS gold;

-- Verify
SHOW SCHEMAS;

databaseName
bronze
default
gold
information_schema
silver


In [0]:
# Based on common e-commerce CSV structure, map columns


from pyspark.sql.functions import col

# Option A: If CSV has exact column names we need
events_nov_df = nov_df.select(
    col("event_time").cast("timestamp").alias("event_time"),
    col("event_type"),
    col("product_id").cast("integer"),
    col("category_code"),
    col("brand"),
    col("price").cast("double"),
    col("user_id").cast("integer"),
    col("user_session")
)

# If columns don't match, use this simpler version:
# events_nov_df = nov_df  # Use all columns as-is

# Write to table
events_nov_df.write.mode("overwrite").saveAsTable("ecommerce_prod.silver.events_nov")

print(f"✓ Loaded {events_nov_df.count():,} rows into silver.events_nov")

✓ Loaded 67,501,979 rows into silver.events_nov


In [0]:
%sql
USE CATALOG ecommerce_prod;

-- Check table structure
DESCRIBE EXTENDED silver.events_nov;

-- Sample data
SELECT * FROM silver.events_nov LIMIT 5;

-- Counts by event type
SELECT 
    event_type,
    COUNT(*) as count
FROM silver.events_nov 
GROUP BY event_type 
ORDER BY count DESC;

event_type,count
view,63556110
cart,3028930
purchase,916939


In [0]:
%sql
USE CATALOG ecommerce_prod;
DESCRIBE silver.events_nov;

col_name,data_type,comment
event_time,timestamp,
event_type,string,
product_id,int,
category_code,string,
brand,string,
price,double,
user_id,int,
user_session,string,


In [0]:
%sql
USE CATALOG ecommerce_prod;

CREATE OR REPLACE TABLE gold.product_performance AS
SELECT
    product_id,
    CONCAT(
        COALESCE(MAX(brand), 'Unknown'), 
        ' - ', 
        COALESCE(
            -- Take first part of category_code (e.g., "electronics" from "electronics.smartphone")
            MAX(SPLIT(category_code, '\\.')[0]), 
            'Product'
        )
    ) as product_name,
    COUNT(DISTINCT CASE WHEN event_type = 'view' THEN user_id END) AS unique_viewers,
    COUNT(DISTINCT CASE WHEN event_type = 'purchase' THEN user_id END) AS unique_buyers,
    SUM(CASE WHEN event_type = 'purchase' THEN price END) AS total_revenue,
    ROUND((COUNT(DISTINCT CASE WHEN event_type = 'purchase' THEN user_id END) * 100.0) /
          NULLIF(COUNT(DISTINCT CASE WHEN event_type = 'view' THEN user_id END), 0), 2) AS conversion_rate
FROM silver.events_nov
GROUP BY product_id;

num_affected_rows,num_inserted_rows


In [0]:
%sql
USE CATALOG ecommerce_prod;

-- Count total rows
SELECT COUNT(*) as total_rows FROM silver.events_nov;

-- Check event types
SELECT event_type, COUNT(*) as count 
FROM silver.events_nov 
GROUP BY event_type 
ORDER BY count DESC;

-- Check sample data
SELECT * FROM silver.events_nov LIMIT 5;

event_time,event_type,product_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
2019-11-01T00:00:01.000Z,view,3601530,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
2019-11-01T00:00:01.000Z,view,1004775,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [0]:
%sql
USE CATALOG ecommerce_prod;

-- Check ALL event_type distribution
SELECT 
    event_type,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM silver.events_nov
GROUP BY event_type
ORDER BY count DESC;

-- Check if ANY purchase events exist
SELECT COUNT(*) as purchase_count 
FROM silver.events_nov 
WHERE event_type = 'purchase';

-- Check if ANY purchase events with price > 0
SELECT COUNT(*) as purchase_with_price
FROM silver.events_nov 
WHERE event_type = 'purchase' AND price > 0;

purchase_with_price
916939


In [0]:
%sql
USE CATALOG ecommerce_prod;

SELECT 
    event_type,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM silver.events_nov
GROUP BY event_type
ORDER BY count DESC;

event_type,count,percentage
view,63556110,94.15
cart,3028930,4.49
purchase,916939,1.36


In [0]:
%sql
USE CATALOG ecommerce_prod;

-- First, let's check purchase events have price > 0
SELECT 
    COUNT(*) as total_purchases,
    SUM(CASE WHEN price > 0 THEN 1 ELSE 0 END) as purchases_with_price,
    AVG(price) as avg_purchase_price
FROM silver.events_nov 
WHERE event_type = 'purchase';

-- Now create the table WITH category_code as product_name
CREATE OR REPLACE TABLE gold.product_performance AS
SELECT
    product_id,
    MAX(category_code) as product_name,
    COUNT(DISTINCT CASE WHEN event_type = 'view' THEN user_id END) AS unique_viewers,
    COUNT(DISTINCT CASE WHEN event_type = 'purchase' THEN user_id END) AS unique_buyers,
    SUM(CASE WHEN event_type = 'purchase' THEN price END) AS total_revenue,
    ROUND(
        (COUNT(DISTINCT CASE WHEN event_type = 'purchase' THEN user_id END) * 100.0) /
        NULLIF(COUNT(DISTINCT CASE WHEN event_type = 'view' THEN user_id END), 0), 
        2
    ) AS conversion_rate
FROM silver.events_nov
GROUP BY product_id;

-- Check result
SELECT COUNT(*) as total_products FROM gold.product_performance;
SELECT * FROM gold.product_performance LIMIT 10;

product_id,product_name,unique_viewers,unique_buyers,total_revenue,conversion_rate
5801482,electronics.audio.subwoofer,2256,42,2307.72,1.86
6600174,computers.components.memory,158,0,,0.0
5300605,,623,18,243.96000000000004,2.89
13104593,,3461,16,1330.19,0.46
54900005,apparel.costume,5037,30,2471.0400000000004,0.6
1401568,computers.desktop,328,0,,0.0
8801103,electronics.telephone,476,3,76.08000000000001,0.63
26401639,,409,9,1813.95,2.2
13103857,,22,0,,0.0
25800005,medicine.tools.tonometer,1465,69,3581.69,4.71


In [0]:
%sql
USE CATALOG ecommerce_prod;

SELECT
    product_name,
    SUM(unique_viewers) as views,
    SUM(unique_buyers) as purchases,
    ROUND(SUM(unique_buyers) * 100.0 / NULLIF(SUM(unique_viewers), 0), 2) as conversion_rate
FROM gold.product_performance
WHERE product_name IS NOT NULL
GROUP BY product_name
ORDER BY purchases DESC
LIMIT 15;

product_name,views,purchases,conversion_rate
electronics.smartphone,7235355,279194,3.86
electronics.audio.headphone,761434,29142,3.83
electronics.video.tv,950234,24925,2.62
electronics.clocks,1096978,18434,1.68
appliances.kitchen.washer,611685,16186,2.65
appliances.environment.vacuum,672970,15972,2.37
computers.notebook,1022983,14979,1.46
appliances.kitchen.refrigerators,691191,11204,1.62
apparel.shoes,1284688,8711,0.68
electronics.tablet,233251,5058,2.17


In [0]:
%sql
-- Create executive summary view
CREATE OR REPLACE VIEW gold.executive_summary AS
SELECT
    product_name,
    total_revenue,
    conversion_rate
FROM gold.product_performance
WHERE unique_buyers > 5
ORDER BY total_revenue DESC;

-- Grant permissions (use your email)
GRANT SELECT ON VIEW gold.executive_summary TO `yosephn22@gmail.com`;