# Day 03 of 14 Days Databricks Challenge


In [0]:
# Load the 2019-Nov CSV from the workspace volume. The first line is used as
# the column names, and column types are inferred from the data.
events = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
from pyspark.sql import functions as F

# Five (product_id, brand) pairs with the largest summed `price` across
# their "purchase" events, highest revenue first.
revenue = (
    events
    .where(F.col("event_type") == "purchase")      # keep purchase rows only
    .groupBy("product_id", "brand")
    .agg(F.sum(F.col("price")).alias("revenue"))   # total price per pair
    .sort(F.col("revenue").desc())
    .limit(5)
)

# Print the result without truncating long cell values.
revenue.show(truncate=False)


In [0]:
from pyspark.sql.window import Window

# Running-total frame: within each user_id partition, rows are ordered by
# event_time (ascending) and the frame spans from the first row of the
# partition up to and including the current row.
window_spec = (
    Window.partitionBy("user_id")
    .orderBy("event_time")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)


In [0]:
# Add `cumulative_events`: the count of non-null event_type values over
# `window_spec`, evaluated for every row of `events`.
events_with_running_total = events.withColumn(
    "cumulative_events",
    F.count(F.col("event_type")).over(window_spec),
)

# Preview the first 30 rows without truncating cell values.
events_with_running_total.show(n=30, truncate=False)

In [0]:
# Per-category conversion rate: purchases as a percentage of views.
conversion = (
    events
    # Step 1: number of events for each (category_code, event_type) pair.
    .groupBy("category_code", "event_type")
    .agg(F.count(F.lit(1)).alias("count"))
    # Step 2: spread the "view" and "purchase" counts into their own columns;
    # any other event_type value is left out of the pivot.
    .groupBy("category_code")
    .pivot("event_type", ["view", "purchase"])
    .sum("count")
    # A category with no rows for a pivoted event type yields null; use 0 instead.
    .na.fill(0)
    # Step 3: percentage of views that became purchases; 0 when there are no
    # views, so the division is never evaluated with a zero denominator.
    .withColumn(
        "conversion_rate",
        F.when(
            F.col("view") > 0,
            F.col("purchase") / F.col("view") * 100,
        ).otherwise(F.lit(0)),
    )
)

# Print the per-category table without truncating cell values.
conversion.show(truncate=False)