Day 2

In [0]:
# Load data - start with the October file (the smaller dataset)
# header=True takes column names from the first line; inferSchema=True lets
# Spark derive the column types from the data instead of reading all strings.
oct_csv_path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"
df_oct = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(oct_csv_path)
)

In [0]:
# Preview: render the first 10 records as a rich table
first_ten_rows = df_oct.limit(10)
display(first_ten_rows)


event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01T00:00:00.000Z,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01T00:00:00.000Z,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01T00:00:01.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01T00:00:04.000Z,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
2019-10-01T00:00:05.000Z,view,1480613,2053013561092866779,computers.desktop,pulser,908.62,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9
2019-10-01T00:00:08.000Z,view,17300353,2053013553853497655,,creed,380.96,555447699,4fe811e9-91de-46da-90c3-bbd87ed3a65d
2019-10-01T00:00:08.000Z,view,31500053,2053013558031024687,,luminarc,41.16,550978835,6280d577-25c8-4147-99a7-abc6048498d6
2019-10-01T00:00:10.000Z,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.71,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880
2019-10-01T00:00:11.000Z,view,1004545,2053013555631882655,electronics.smartphone,huawei,566.01,537918940,406c46ed-90a4-4787-a43b-59a410c1a5fb


In [0]:
# Plain-text view of 10 records, restricted to a handful of columns
preview_columns = ["event_type", "brand", "product_id", "price"]
df_oct.select(*preview_columns).show(n=10)

+----------+--------+----------+-------+
|event_type|   brand|product_id|  price|
+----------+--------+----------+-------+
|      view|shiseido|  44600062|  35.79|
|      view|    aqua|   3900821|   33.2|
|      view|    NULL|  17200506|  543.1|
|      view|  lenovo|   1307067| 251.74|
|      view|   apple|   1004237|1081.98|
|      view|  pulser|   1480613| 908.62|
|      view|   creed|  17300353| 380.96|
|      view|luminarc|  31500053|  41.16|
|      view|   baden|  28719074| 102.71|
|      view|  huawei|   1004545| 566.01|
+----------+--------+----------+-------+
only showing top 10 rows


In [0]:
# A few more basic operations

# Lazy evaluation: building the filter runs nothing; the work is only
# executed once the count() action is called.
expensive_events = df_oct.where(df_oct["price"] > 100)
expensive_events.count()

27750807

In [0]:
# Number of rows per event type
event_type_counts = df_oct.groupBy("event_type").count()
event_type_counts.show()

+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  742849|
|      cart|  926516|
|      view|40779399|
+----------+--------+



In [0]:
# Five most frequent values of `brand` by row count.
# No null filter is applied here, so a NULL brand group can appear in the result.
brand_counts = df_oct.groupBy("brand").count()
top_brands = brand_counts.sort("count", ascending=False).limit(5)
top_brands.show()

+-------+-------+
|  brand|  count|
+-------+-------+
|   NULL|6113008|
|samsung|5282775|
|  apple|4122554|
| xiaomi|3083763|
| huawei|1111205|
+-------+-------+



In [0]:
# First 5 rows with every column; long cell values are truncated for display
df_oct.show(n=5, truncate=True)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:04|      view|   1004237|2053013555631882655|electr

In [0]:
# Top 10 most viewed products

# Keep only "view" events, then rank product_ids by how many such rows they have.
view_events = df_oct.where(df_oct["event_type"] == "view")
(
    view_events
    .groupBy("product_id")
    .count()
    .sort("count", ascending=False)
    .limit(10)
    .show()
)


+----------+------+
|product_id| count|
+----------+------+
|   1004856|419287|
|   1004767|378777|
|   1005115|327715|
|   1004249|207422|
|   1004833|203018|
|   1005105|197930|
|   1004870|190435|
|   1002544|179249|
|   4804056|179092|
|   5100816|164608|
+----------+------+



In [0]:
# Missing values: number of nulls in each column, reported as a single row

# The Spark aggregate is aliased on import: importing it as a bare `sum` would
# replace Python's built-in sum() for every cell executed after this one.
from pyspark.sql.functions import col, sum as spark_sum

null_counts = df_oct.select([
    # isNull() yields a boolean; cast to int (1 = null, 0 = present) and add up.
    spark_sum(col(c).isNull().cast("int")).alias(c)
    for c in df_oct.columns
])
null_counts.show()


+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     13515609|6113008|    0|      0|           2|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



In [0]:
# Top 10 brands excluding nulls

# Drop rows without a brand first so the NULL group cannot occupy a ranking slot.
branded_events = df_oct.where(df_oct["brand"].isNotNull())
(
    branded_events
    .groupBy("brand")
    .count()
    .sort("count", ascending=False)
    .limit(10)
    .show()
)


+-------+-------+
|  brand|  count|
+-------+-------+
|samsung|5282775|
|  apple|4122554|
| xiaomi|3083763|
| huawei|1111205|
|lucente| 655861|
|     lg| 562404|
|  bosch| 557090|
|   oppo| 482887|
|   sony| 456644|
|   acer| 428153|
+-------+-------+

