In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Load the October and November 2019 data into DataFrames

In [0]:
# Read each month's CSV file, treating the first row as the header and
# letting Spark infer the column types from the data.
df_oct = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)
df_nov = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


# Find products that appeared in both October and November.

In [0]:
# Products present in both months: one row per product_id.
#
# Each month is reduced to its distinct product ids *before* the join.
# Joining the raw event rows on product_id would pair every October event
# with every November event of the same product (a many-to-many blow-up) and
# carry two copies of every non-key column, without ever yielding a list of
# products.
df_joined = (
    df_oct.select("product_id").distinct()
    .join(
        df_nov.select("product_id").distinct(),
        on="product_id",
        how="inner",  # keep only ids found on both sides
    )
)

df_joined.display()

# Detect price changes for the same product across months.

In [0]:
# Pair up the October and November prices of every product seen in both months.
#
# Each side is collapsed to its distinct (product_id, price) pairs before the
# join; joining the raw event rows would emit one output row per
# (October event, November event) combination of the same product, i.e. the
# same price pairs repeated many times over.
# A product that has more than one distinct price within a month still
# yields one row per (oct_price, nov_price) combination.
df_price_compare = (
    df_oct.select(
        col("product_id"),
        col("price").alias("oct_price"),
    ).distinct()
    .join(
        df_nov.select(
            col("product_id"),
            col("price").alias("nov_price"),
        ).distinct(),
        on="product_id",
        how="inner",
    )
)

# A price change is a pair whose two prices differ; show a sample of those
# instead of arbitrary (mostly unchanged) pairs.
df_price_compare.filter(col("oct_price") != col("nov_price")).show(10)


# Combine both months into a single DataFrame for analysis.

In [0]:
# Stack the two months into one DataFrame.
# unionByName aligns columns by name and fails if the column names differ,
# whereas union() aligns purely by position and would silently mix up columns
# if the two files ever had a different column order.
df_all = df_oct.unionByName(df_nov)
df_all.display()

# Count the number of events per day

In [0]:
# Derive a calendar date from event_time, count the rows for each date, and
# display the counts ordered by date.
(
    df_all
    .withColumn("event_date", to_date(col("event_time")))
    .groupBy("event_date")
    .count()
    .sort("event_date")
    .display()
)

# Count how many view and purchase events each user generated

In [0]:
# Per-user tally of "view" and "purchase" events: a row contributes 1 to a
# counter when its event_type matches and 0 otherwise. Only 10 users are shown.
# Note: `sum` here is the Spark column aggregate brought in by the
# star-import of pyspark.sql.functions, not Python's builtin.
(
    df_all
    .groupBy("user_id")
    .agg(
        *[
            sum(when(col("event_type") == kind, 1).otherwise(0)).alias(label)
            for label, kind in (("views", "view"), ("purchases", "purchase"))
        ]
    )
    .show(10)
)

# Find the first event performed by each user.

In [0]:
# Number each user's events in ascending event_time order; the row numbered 1
# is that user's first event. The helper column "rn" is removed before display.
# NOTE(review): events of one user that share the same event_time are not
# tie-broken here, so which of them gets rn == 1 may vary between runs.
window_spec = Window.partitionBy("user_id").orderBy(col("event_time"))

(
    df_all
    .withColumn("rn", row_number().over(window_spec))
    .where(col("rn") == 1)
    .drop("rn")
    .show()
)