In [0]:
# Read the e-commerce event CSV from the workspace volume. The first row is
# treated as the header and column types are inferred from the data.
df = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/Volumes/workspace/ecommerce/ecommerce_data")
)

# Print the inferred schema, then preview the first five rows.
df.printSchema()
df.show(5)


root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 00:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 00:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|53

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Five products with the highest total purchase revenue.
# Only "purchase" events contribute; revenue is the sum of `price` per
# (product_id, brand) pair.
top_revenue_products = (
    df.where(F.col("event_type") == "purchase")
    .groupBy("product_id", "brand")
    .agg(F.sum(F.col("price")).alias("revenue"))
    .sort(F.col("revenue").desc())
    .limit(5)
)

top_revenue_products.show()

+----------+-------+--------------------+
|product_id|  brand|             revenue|
+----------+-------+--------------------+
|   1005115|  apple|3.3032381669999924E7|
|   1005105|  apple| 2.168460337000001E7|
|   1004249|  apple|1.3545407540000014E7|
|   1005135|  apple|1.2654328769999998E7|
|   1004767|samsung|1.1004748489999998E7|
+----------+-------+--------------------+



In [0]:
# Per-user window ordered by event time. The frame runs from the first row of
# the user's partition up to and including the current row, so an aggregate
# over it is a running total.
window_spec = (
    Window.partitionBy(F.col("user_id"))
    .orderBy(F.col("event_time"))
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Running number of events each user has generated so far.
df_running = df.withColumn("cumulative_events", F.count("*").over(window_spec))

# Preview the running count alongside the columns that explain it.
df_running.select(
    ["user_id", "event_time", "event_type", "cumulative_events"]
).show(10)

+--------+-------------------+----------+-----------------+
| user_id|         event_time|event_type|cumulative_events|
+--------+-------------------+----------+-----------------+
|65800726|2019-11-27 04:33:16|      view|                1|
|65800726|2019-11-27 04:35:24|      view|                2|
|81255481|2019-11-08 07:44:45|      view|                1|
|81255481|2019-11-21 14:11:26|      view|                2|
|82079354|2019-11-28 04:58:01|      view|                1|
|82079354|2019-11-28 04:58:22|      view|                2|
|82079354|2019-11-28 04:59:29|      view|                3|
|82079354|2019-11-28 04:59:54|      view|                4|
|82079354|2019-11-28 05:00:22|      view|                5|
|82079354|2019-11-28 05:00:47|      view|                6|
+--------+-------------------+----------+-----------------+
only showing top 10 rows


In [0]:
# View-to-purchase conversion rate (in percent) per category.
#
# The pivot produces one row per category_code with a `view` and a `purchase`
# count column. Spark leaves a pivoted count NULL (not 0) when a category has
# no rows for that event type, so the counts are filled with 0 first.
# Without the fill, a category with views but no purchases would get a NULL
# conversion_rate instead of 0, and `view > 0` would evaluate to NULL for
# categories without views.
conversion_rate = (
    df.groupBy("category_code")
      .pivot("event_type", ["view", "purchase"])
      .count()
      .na.fill(0, subset=["view", "purchase"])
      .withColumn(
          "conversion_rate",
          # Guard against division by zero for categories with no views.
          F.when(F.col("view") > 0,
                 F.col("purchase") / F.col("view") * 100)
           .otherwise(0)
      )
)

conversion_rate.show()


+--------------------+-------+--------+------------------+
|       category_code|   view|purchase|   conversion_rate|
+--------------------+-------+--------+------------------+
|furniture.living_...| 632899|    2646|0.4180761859317206|
|      apparel.jumper|  33931|      84|0.2475612271963691|
| stationery.cartrige|  19323|     325|1.6819334471872898|
|       sport.bicycle| 234796|    1374|0.5851888447844086|
|        apparel.sock|   6076|      40|0.6583278472679395|
|appliances.enviro...|   5488|      59|1.0750728862973762|
|          kids.swing|  89026|     812|0.9120930963988049|
|auto.accessories....|   3705|      18|0.4858299595141701|
|auto.accessories....|  89495|    1038|1.1598413319179843|
|electronics.audio...|  73039|     919|1.2582319035036076|
|  electronics.clocks|3267223|   41143| 1.259265131275092|
|electronics.audio...|  95772|    1119|1.1683999498809672|
|appliances.kitche...| 296848|    2420|0.8152320379453457|
|appliances.kitche...| 398155|    5392|1.354246461805075