## Statistical Summary on Curated Data

In [0]:
from pyspark.sql.functions import *

# Load the curated November events view; every later cell derives from `df`.
events_query = "SELECT * FROM ecommerce_catalog.gold.vw_events_nov_analytics"
df = spark.sql(events_query)


In [0]:
# Preview the curated events DataFrame in the notebook output.
display(df)

In [0]:


# One-row summary of the curated events: volume, brand/product cardinality and
# the price distribution. `count`, `avg`, `min` and `max` are the Spark column
# functions brought in by the star import, not the Python builtins.
summary_metrics = [
    count("*").alias("total_events"),
    countDistinct("brand").alias("distinct_brands"),
    countDistinct("product_id").alias("distinct_products"),
    avg("price").alias("avg_price"),
    min("price").alias("min_price"),
    max("price").alias("max_price"),
    # Exact median computed through the SQL `percentile` aggregate.
    expr("percentile(price, 0.5)").alias("median_price"),
]

df.select(*summary_metrics).display()


## Stats by Event Type

In [0]:
# Event volume and mean price per event type, busiest event type first.
event_type_stats = df.groupBy("event_type").agg(
    count("*").alias("event_count"),
    avg("price").alias("avg_price"),
)

event_type_stats.orderBy("event_count", ascending=False).display()

## Weekday vs Weekend Hypothesis Testing

In [0]:
# Tag each event with its day of week and a weekend flag.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday), so the
# weekend is {1, 7}.
with_day = df.withColumn("day_of_week", dayofweek("event_time"))

weekend_flag = when(col("day_of_week").isin(1, 7), 1).otherwise(0)
df_time = with_day.withColumn("is_weekend", weekend_flag)

df_time.display()

## Compare Weekday vs Weekend Behavior

### is_weekend = 1 → Weekend
### is_weekend = 0 → Weekday

## Percentage Difference (Simple Hypothesis Insight)

In [0]:
# Relative difference (in percent) between the weekend and weekday mean price.
#
# Both means come from a single grouped aggregation, so Spark scans the data
# once instead of running one filter+aggregate job per group.
avg_price_by_flag = {
    row["is_weekend"]: row["avg_price"]
    for row in (
        df_time.groupBy("is_weekend")
        .agg(avg("price").alias("avg_price"))
        .collect()
    )
}
weekday_avg = avg_price_by_flag.get(0)  # is_weekend = 0 -> weekday
weekend_avg = avg_price_by_flag.get(1)  # is_weekend = 1 -> weekend

# A group with no rows (or only NULL prices) has no mean, and a weekday mean
# of 0 would divide by zero; report that explicitly instead of crashing.
if weekday_avg is None or weekend_avg is None or weekday_avg == 0:
    percent_diff = None
    print(
        "Cannot compute weekend vs weekday price difference: "
        f"weekday_avg={weekday_avg}, weekend_avg={weekend_avg}"
    )
else:
    percent_diff = ((weekend_avg - weekday_avg) / weekday_avg) * 100
    display(percent_diff)

## Correlation Analysis
### Event Frequency per Product

In [0]:
# Per-product event volume and mean price; input to the correlation check.
events_by_product = df.groupBy("product_id")
product_stats = events_by_product.agg(
    count("*").alias("event_count"),
    avg("price").alias("avg_price"),
)

### Correlation Check

### Interpretation:

- Close to 1  → strong positive linear relationship
- Close to -1 → strong negative linear relationship
- Near 0      → weak or no linear relationship

In [0]:
# Pearson correlation between per-product event volume and mean price.
# The value is captured and printed explicitly: a bare expression that is not
# the last statement of a cell is evaluated and then discarded, so the
# coefficient would otherwise never appear in the output.
event_price_corr = product_stats.stat.corr("event_count", "avg_price")
print(f"corr(event_count, avg_price) = {event_price_corr}")
display(product_stats)

## Feature Engineering for ML

### Time-Based Features

In [0]:
# Time-of-day feature: the hour component of the event timestamp.
hour_of_day = hour("event_time")
df_features = df_time.withColumn("hour_of_day", hour_of_day)
display(df_features)

### Brand-Level Features

In [0]:
# Per-brand event volume. The variable keeps its historical name, but the
# grouping key is `brand`, not a user identifier.
user_features = df_features.groupBy("brand").agg(
    count("*").alias("brand_event_count")
)

# Attach the brand count to every event using null-safe equality. A plain
# join on "brand" drops all events whose brand is NULL (NULL never equals NULL
# in an equi-join) even though groupBy produced a count for the NULL group.
# The key is renamed on the right side so the join condition is unambiguous
# and the helper column can be dropped afterwards.
brand_counts = user_features.withColumnRenamed("brand", "_brand_key")
df_features = df_features.join(
    brand_counts,
    col("brand").eqNullSafe(col("_brand_key")),
    "inner",
).drop("_brand_key")
df_features.display()

### Product-Level Features

In [0]:
# Per-product aggregates (event volume and mean price), computed over the
# brand-enriched events and then attached back to every event row.
events_by_product_id = df_features.groupBy("product_id")
product_features = events_by_product_id.agg(
    count("*").alias("product_event_count"),
    avg("price").alias("product_avg_price"),
)

df_features = df_features.join(product_features, on="product_id")
df_features.display()

### Feature Bucketing (ML Friendly)

In [0]:
# Bucket the brand event volume into a categorical feature:
#   fewer than 5 events -> "low", fewer than 20 -> "medium", otherwise "high".
# The output column is called user_activity_bucket although it is derived
# from brand_event_count.
brand_volume = col("brand_event_count")
activity_bucket = (
    when(brand_volume < 5, "low")
    .when(brand_volume < 20, "medium")
    .otherwise("high")
)

df_features = df_features.withColumn("user_activity_bucket", activity_bucket)
df_features.display()

## Create ML-Ready Delta Table

In [0]:
# Columns that make up the ML-ready feature set, in output order.
ml_feature_columns = [
    "product_id",
    "event_type",
    "price",
    "hour_of_day",
    "day_of_week",
    "is_weekend",
    "brand_event_count",
    "product_event_count",
    "product_avg_price",
    "user_activity_bucket",
]

# Project the engineered DataFrame down to just those feature columns.
ml_df = df_features.select(*ml_feature_columns)

## Write as Delta Table

In [0]:
# Persist the feature set as a managed Delta table, replacing any existing
# contents of the table on each run.
ml_df.write.saveAsTable(
    "ecommerce_catalog.gold.ml_features_events",
    format="delta",
    mode="overwrite",
)