In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Load data - Explore the smaller October dataset
df_oct = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv", header=True, inferSchema=True)
     

In [0]:

# Display first 5 records - clean view
display(df_oct.limit(5))
     

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-10-01T00:00:00.000Z,view,44600062,2103807459595387724,,shiseido,35.79,541312140,72d76fde-8bb3-4e00-8c23-a032dfed738c
2019-10-01T00:00:00.000Z,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8
2019-10-01T00:00:01.000Z,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
2019-10-01T00:00:04.000Z,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d


In [0]:
# Load data - November dataset
df_nov = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)

In [0]:
display(df_nov.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
2019-11-01T00:00:01.000Z,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
2019-11-01T00:00:01.000Z,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [0]:
# Check null values in df_oct

from pyspark.sql.functions import col, sum

df_oct.select([
    sum(col(c).isNull().cast("int")).alias(c)
    for c in df_oct.columns
]).show()

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     13515609|6113008|    0|      0|           2|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



In [0]:
# Let's select relevant columns only before performing join as Smaller DataFrames = faster joins
df_oct_small = df_oct.select('event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id')
df_nov_small = df_nov.select('event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id')

     

In [0]:
# Keep all October records and matching November records based on user_id
df_joined = df_oct_small.join(df_nov_small, on=['user_id'], how='left')

df_joined

DataFrame[user_id: int, event_time: timestamp, event_type: string, product_id: int, category_id: bigint, category_code: string, brand: string, price: double, event_time: timestamp, event_type: string, product_id: int, category_id: bigint, category_code: string, brand: string, price: double]

In [0]:
print(f"Number of unique user_ids: {df_oct.select("user_id").distinct().count()}")
print(f"Number of unique user_ids: {df_joined.select("user_id").distinct().count()}")

Number of unique user_ids: 3022290
Number of unique user_ids: 3022290


In [0]:
from pyspark.sql.window import Window # Imports Spark’s window specification tool

user_time_window = Window \
    .partitionBy("user_id") \
    .orderBy("event_time")

from pyspark.sql.functions import sum # Imports Spark’s optimized aggregation function.

In [0]:
df_oct_running = df_oct.withColumn(
    "running_total_price",
    sum("price").over(user_time_window)
)

In [0]:
display(df_oct_running.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,running_total_price
2019-10-09T10:30:19.000Z,view,17301541,2053013553853497655,,,162.17,205053188,e1eadbc6-aef5-4cff-bb1f-07d2b983a26e,162.17
2019-10-09T10:30:44.000Z,view,17301541,2053013553853497655,,,162.17,205053188,e1eadbc6-aef5-4cff-bb1f-07d2b983a26e,324.34
2019-10-07T06:23:01.000Z,view,16200119,2053013556344914381,kids.fmcg.diapers,moony,18.47,222907508,cb653adc-46a2-4d90-9e34-5bdfb2be30ce,18.47
2019-10-07T06:26:23.000Z,view,16200162,2053013556344914381,kids.fmcg.diapers,moony,18.47,222907508,cb653adc-46a2-4d90-9e34-5bdfb2be30ce,36.94
2019-10-08T14:29:09.000Z,view,6200883,2053013552293216471,appliances.environment.air_heater,elenberg,46.31,244673419,e2f0524c-bfc4-4c69-b93a-56f983027af3,46.31


In [0]:
df_oct.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
# Handling NULL values in 'brand' is null, return "Not_Available"
     

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


In [0]:

def normalize_brand(brand):
    if brand is None:
        return "Not_Available"
    return brand.lower()

In [0]:
#Translate Python logic to Spark compatible UDF
brand_udf = udf(normalize_brand, StringType()) # StringType() defines the output schema


In [0]:
df_oct_brand = df_oct.withColumn(
    "normalized_brand",
    brand_udf("brand")
)


In [0]:
# Verify if UDF has been implemented
display(
    df_oct_brand.filter(df_oct_brand.normalized_brand == "Not_Available").limit(10)
)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,normalized_brand
2019-10-01T00:00:01.000Z,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,Not_Available
2019-10-01T00:00:17.000Z,view,23100006,2053013561638126333,,,357.79,513642368,17566c27-0a8f-4506-9f30-c6a2ccbf583b,Not_Available
2019-10-01T00:00:24.000Z,view,34700031,2061717937420501730,,,151.87,539512263,f27a45f8-fb98-459a-96a6-45271f56a987,Not_Available
2019-10-01T00:00:26.000Z,view,13500046,2053013557099889147,furniture.bedroom.bed,,60.75,555446365,7f0062d8-ead0-4e0a-96f6-43a0b79a2fc4,Not_Available
2019-10-01T00:00:27.000Z,view,31501072,2053013558031024687,,,165.64,550978835,6280d577-25c8-4147-99a7-abc6048498d6,Not_Available
2019-10-01T00:00:28.000Z,view,28600026,2053013558282682943,,,399.73,555447224,889da81c-2cfc-4df6-a038-ed436c79ee80,Not_Available
2019-10-01T00:00:28.000Z,view,26200591,2053013563693335403,,,203.35,548449430,99617d1c-1b5a-42f8-99f1-42ad83a6155f,Not_Available
2019-10-01T00:00:34.000Z,view,26200591,2053013563693335403,,,203.35,555447748,b50d1ae8-1948-4517-8460-09b7601ceef6,Not_Available
2019-10-01T00:00:44.000Z,view,23100006,2053013561638126333,,,357.79,513642368,17566c27-0a8f-4506-9f30-c6a2ccbf583b,Not_Available
2019-10-01T00:01:09.000Z,view,15100337,2053013557024391671,,,257.15,519107250,566511c2-e2e3-422b-b695-cf8e6e792ca8,Not_Available
