In [0]:
# Show me total revenue by category
#
# Revenue is the sum of `price` over purchase events only. Summing over every
# row would also add the price of products that were merely viewed, which
# inflates the totals.
# NOTE(review): assumes the bronze table exposes the same `event_type` column
# (with a "purchase" value) that the silver table uses below — confirm.

df = spark.table("ecommerce.bronze.events")
revenue_by_category = (
    # SQL-string predicate so this cell does not depend on an import made in a later cell.
    df.filter("event_type = 'purchase'")
      .groupBy("category_id", "category_code")
      .agg({"price": "sum"})
      # The dict-style agg names its output "sum(price)"; give it a readable name.
      .withColumnRenamed("sum(price)", "total_revenue")
)
display(revenue_by_category)

In [0]:
# Which products have the highest conversion rate?
#
# conversion_rate = (distinct sessions with a purchase of the product)
#                 / (distinct sessions with a view of the product)

from pyspark.sql import functions as F

events_df = spark.table("ecommerce.silver.events")

conversion_df = (
    events_df
    .groupBy("product_id")
    .agg(
        # F.when(...) without .otherwise() yields NULL for non-matching rows,
        # and countDistinct skips NULLs, so each count only sees sessions of
        # the matching event type.
        F.countDistinct(F.when(F.col("event_type") == "view", F.col("user_session"))).alias("views"),
        F.countDistinct(F.when(F.col("event_type") == "purchase", F.col("user_session"))).alias("purchases")
    )
    # Drop products with no views BEFORE dividing: with ANSI SQL mode enabled a
    # zero divisor raises an error, and we should not rely on the optimizer
    # pushing a later filter underneath the division.
    .filter(F.col("views") > 0)
    .withColumn("conversion_rate", F.col("purchases") / F.col("views"))
    .orderBy(F.col("conversion_rate").desc())
)

display(conversion_df)

In [0]:

# What's the trend of daily purchases over time?
#
# One output row per calendar date: the number of distinct sessions that
# contain at least one purchase event on that date, in ascending date order.

daily_purchases_df = (
    events_df
    .where(events_df["event_type"] == "purchase")  # keep purchase events only
    .groupBy(F.to_date(F.col("event_time")).alias("date"))  # truncate timestamp to its date
    .agg(F.countDistinct(F.col("user_session")).alias("daily_purchases"))
    .sort(F.col("date").asc())
)

display(daily_purchases_df)

In [0]:
# Find customers who viewed but never purchased

# Distinct user_ids that have at least one "view" event.
viewed_customers = (
    events_df.where(F.col("event_type") == "view").select("user_id").dropDuplicates()
)

# Distinct user_ids that have at least one "purchase" event.
purchased_customers = (
    events_df.where(F.col("event_type") == "purchase").select("user_id").dropDuplicates()
)

# The anti-join keeps only the viewers that have no matching user_id among the purchasers.
viewed_not_purchased = viewed_customers.join(
    purchased_customers, on="user_id", how="left_anti"
)

display(viewed_not_purchased)

In [0]:
# Simple sentiment analysis or text classification

%pip install torch
%pip install transformers
import mlflow
from transformers import pipeline

# Example: Analyze product review sentiment

classifier = pipeline("sentiment-analysis")
reviews = ["This product is amazing!", "Terrible quality, waste of money"]
results = classifier(reviews)

# Log to MLflow
with mlflow.start_run(run_name= "sentimental_model"):
  mlflow.log_param("model", "distilbert-sentiment")
  mlflow.log_metric("accuracy", 0.95)

In [0]:
# NOTE(review): presumably restarts the notebook's Python process so that freshly
# pip-installed packages become importable. It currently runs after the cell that
# already imported them — confirm the intended position.
%restart_python