#### Day 11

Use the Silver layer: every query below reads from `ecommerce_catalog.silver.events`.

Descriptive Statistics

In [0]:
%sql
-- Summary statistics of price over every event that carries a price.
-- STDDEV is the sample standard deviation in Spark SQL (same as STDDEV_SAMP).
WITH priced_events AS (
  SELECT price
  FROM ecommerce_catalog.silver.events
  WHERE price IS NOT NULL
)
SELECT
  COUNT(*)      AS total_rows,
  MIN(price)    AS min_price,
  MAX(price)    AS max_price,
  AVG(price)    AS avg_price,
  STDDEV(price) AS std_price
FROM priced_events;


total_rows,min_price,max_price,avg_price,std_price
42410567,0.0,2574.07,290.3104869706175,358.29990736074024


Hypothesis Test: Weekday vs Weekend Activity (event counts by day type and event type; no formal test statistic is computed in this cell)

In [0]:
%sql
-- Event volume per event type, split into weekend vs. weekday.
-- dayofweek() numbers days 1 = Sunday ... 7 = Saturday, so (1, 7) is the weekend.
WITH labelled_events AS (
  SELECT
    CASE
      WHEN dayofweek(event_time) IN (1, 7) THEN 'weekend'
      ELSE 'weekday'
    END AS day_type,
    event_type
  FROM ecommerce_catalog.silver.events
)
SELECT
  day_type,
  event_type,
  COUNT(*) AS event_count
FROM labelled_events
GROUP BY day_type, event_type
ORDER BY day_type, event_type;


day_type,event_type,event_count
weekday,cart,642376
weekday,purchase,546365
weekday,view,29769981
weekend,cart,253336
weekend,purchase,196402
weekend,view,11002107


Correlation between price and purchase likelihood

In [0]:
%sql
-- Correlation between price and a 0/1 purchase indicator,
-- computed over events that have a non-null price.
SELECT
  corr(
    price,
    CASE WHEN event_type = 'purchase' THEN 1 ELSE 0 END
  ) AS price_purchase_corr
FROM ecommerce_catalog.silver.events
WHERE price IS NOT NULL;


price_purchase_corr
0.0071731960874614


Feature Engineering

In [0]:
%sql
-- Preview of the engineered features on a handful of rows.
-- The LIMIT keeps this cell a cheap preview instead of returning the whole
-- events table; there is no ORDER BY, so which rows appear is not guaranteed.
SELECT
  *,
  hour(event_time)      AS event_hour,   -- hour of day of the event
  dayofweek(event_time) AS day_of_week,  -- 1 = Sunday ... 7 = Saturday
  log(price + 1)        AS price_log     -- natural log; the +1 keeps price = 0 defined
FROM ecommerce_catalog.silver.events
LIMIT 10;


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,event_hour,day_of_week,price_log
2019-10-31T00:45:44.000Z,view,30100017,2053013556110033341,Not_Available,makita,82.55,513941928,ee8a4a8d-93b8-481d-a1db-48b194881349,0,5,4.425445255041403
2019-10-31T00:54:00.000Z,view,2501069,2053013564003713919,appliances.kitchen.oven,asel,45.84,517183027,e4991c52-3a6f-47b5-937a-ad0447210aad,0,5,3.8467375387295166
2019-10-31T00:54:28.000Z,view,26300348,2053013563584283495,Not_Available,Not_Available,596.67,526997599,f155d6f4-5a7e-4162-bd25-2494af1a7dbb,0,5,6.393038762166311
2019-10-31T01:20:02.000Z,view,1005186,2053013555631882655,electronics.smartphone,samsung,771.94,565947512,06a62057-4b01-4cd8-9fee-02e266105e55,1,5,6.650201425911212
2019-10-31T01:22:16.000Z,view,1004856,2053013555631882655,electronics.smartphone,samsung,130.99,565874651,b77fd3bb-8bd4-4b19-a108-bf78525cc6a7,1,5,4.882726162140863
2019-10-31T01:25:04.000Z,view,31501146,2053013558031024687,Not_Available,Not_Available,172.44,548544898,b65f63fd-1387-4eb1-9ed6-0cc21d7ed49a,1,5,5.155831718251282
2019-10-31T01:28:35.000Z,view,1005109,2053013555631882655,electronics.smartphone,apple,1003.78,515021441,7e78574f-6aaa-449f-92f8-2abc9dea6b29,1,5,6.912523891057239
2019-10-31T01:34:21.000Z,view,21405238,2053013561579406073,electronics.clocks,armani,393.83,515789383,b1fc13ec-fb0c-47ed-a94f-b3336c354c7c,1,5,5.978455292514342
2019-10-31T01:47:51.000Z,view,1004835,2053013555631882655,electronics.smartphone,samsung,223.95,565142388,81b8a337-d177-4788-9a06-a595edb674a0,1,5,5.4158781552871815
2019-10-31T01:48:51.000Z,view,4200542,2053013552351936731,appliances.environment.air_conditioner,elenberg,191.77,558080735,0f25a52a-0c2b-4a49-a82e-9ec708b2e910,1,5,5.2614977684095


#### Using PySpark

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


Descriptive stats

In [0]:
# Load the Silver-layer events table; `events` is reused by the cells below.
events = spark.table("ecommerce_catalog.silver.events")

# count / mean / stddev / min / max for the price column only.
events.describe("price").show()


+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          42410567|
|   mean| 290.3104869706537|
| stddev|358.29990736074024|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



Hypothesis Test: Weekday vs Weekend Activity (event counts by day type and event type; no formal test statistic is computed in this cell)

In [0]:
# Flag weekend events: dayofweek() returns 1 for Sunday and 7 for Saturday.
events_with_day = events.withColumn(
    "is_weekend", F.dayofweek(F.col("event_time")).isin(1, 7)
)

# Event counts per (is_weekend, event_type), sorted for a stable display order.
(
    events_with_day
    .groupBy("is_weekend", "event_type")
    .count()
    .orderBy("is_weekend", "event_type")
    .show()
)


+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|      cart|  642376|
|     false|  purchase|  546365|
|     false|      view|29769981|
|      true|      cart|  253336|
|      true|  purchase|  196402|
|      true|      view|11002107|
+----------+----------+--------+



Correlation analysis

In [0]:
# 1 for purchase events, 0 for every other event type.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
events_corr = events.withColumn("is_purchase", purchase_flag)

# Correlation between price and the purchase flag; left as the cell's last
# expression so the notebook displays the value.
events_corr.corr("price", "is_purchase")


0.007173196087461416

Feature Engineering

In [0]:
# Derive time-of-day, day-of-week and log-price columns from the raw events.
features = (
    events
    .withColumn("event_hour", F.hour(F.col("event_time")))
    # dayofweek(): 1 = Sunday ... 7 = Saturday
    .withColumn("day_of_week", F.dayofweek(F.col("event_time")))
    # natural log of (price + 1), so a price of 0 maps to 0
    .withColumn("price_log", F.log(F.lit(1) + F.col("price")))
)


Time since first event per user (in seconds)

In [0]:
# Ordered per-user window. It is no longer needed for the feature below but is
# kept defined in case cells outside this view rely on it (TODO confirm).
window_spec = Window.partitionBy("user_id").orderBy("event_time")

# Partition-only window: without an ORDER BY the frame spans the whole
# partition, so an aggregate over it sees all of a user's events and Spark
# does not have to sort each partition.
user_window = Window.partitionBy("user_id")

# Seconds elapsed since the user's earliest event.
# min() skips NULL timestamps. first() over the ordered window would instead
# pick up a NULL event_time (NULLs sort first in ascending order) and null out
# the feature for every event of that user.
features = features.withColumn(
    "time_since_first_event",
    F.unix_timestamp("event_time")
    - F.min("event_time").over(user_window).cast("long"),
)


Check the derived feature "time_since_first_event" for the first few users

In [0]:
from pyspark.sql.functions import col

# Show the derived column for the first five rows in user_id / event_time order.
(
    features
    .select("user_id", "event_time", "time_since_first_event")
    .sort("user_id", "event_time")
    .show(n=5, truncate=False)
)


+---------+-------------------+----------------------+
|user_id  |event_time         |time_since_first_event|
+---------+-------------------+----------------------+
|33869381 |2019-10-23 20:04:08|0                     |
|64078358 |2019-10-13 00:13:46|0                     |
|183503497|2019-10-02 21:43:00|0                     |
|184265397|2019-10-04 17:44:37|0                     |
|184265397|2019-10-04 17:45:18|41                    |
+---------+-------------------+----------------------+
only showing top 5 rows


For each user_id:

- The first event has time_since_first_event = 0.
- Subsequent events show the seconds elapsed since that user's first event
  (e.g. 41 for the second event of user 184265397 in the output above).