In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Load the October and November 2019 event exports. Each file has a header
# row, and column types are inferred by Spark (inference costs an extra pass
# over the data compared with supplying an explicit schema).
oct_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

nov_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)


# Per-brand monthly summaries: row count and summed price, one frame per month.
# NOTE(review): "revenue" here is the sum of `price` over every row of the
# month; if the files mix purchase and non-purchase events, filter before
# aggregating -- confirm against the data.
oct_brand = (
    oct_df
    .groupBy("brand")
    .agg(
        F.count("*").alias("oct_events"),
        F.sum(F.col("price")).alias("oct_revenue"),
    )
)

nov_brand = (
    nov_df
    .groupBy("brand")
    .agg(
        F.count("*").alias("nov_events"),
        F.sum(F.col("price")).alias("nov_revenue"),
    )
)



JOINS

In [0]:
# Inner join: keep only the brands that appear in both monthly summaries.
inner_join = oct_brand.join(nov_brand, on="brand", how="inner")

inner_join.show(5)


In [0]:
# Left join: every October brand is kept; the November columns are null
# for brands that have no November row.
left_join = oct_brand.join(nov_brand, on="brand", how="left")

left_join.show(5)


In [0]:
# Right join: every November brand is kept; the October columns are null
# for brands that have no October row.
right_join = oct_brand.join(nov_brand, on="brand", how="right")

right_join.show(5)


In [0]:
# Full outer join: brands from either month are kept, with nulls filling the
# side that has no matching row.
# NOTE(review): an equi-join never matches null keys, so if `brand` contains
# nulls each month's null-brand group shows up as its own row -- confirm
# whether that is intended.
full_join = oct_brand.join(nov_brand, on="brand", how="outer")

full_join.show(5)


WINDOW FUNCTIONS

In [0]:
# Running total of `price` per user, accumulated in event_time order.
# No frame is given, so Spark applies its default for an ordered window
# (range from unbounded preceding to the current row): rows of one user that
# share the same event_time all receive the same cumulative value.
user_window = Window.partitionBy("user_id").orderBy("event_time")

oct_running_spend = oct_df.withColumn(
    "running_user_spend",
    F.sum(F.col("price")).over(user_window),
)

oct_running_spend.show(5)


In [0]:
# Rank events inside each brand from the highest price down. rank() gives
# tied prices the same rank and leaves gaps after the tie.
brand_rank_window = Window.partitionBy("brand").orderBy(F.col("price").desc())

oct_ranked = oct_df.withColumn("price_rank", F.rank().over(brand_rank_window))

oct_ranked.show(5)

In [0]:
# Dense rank of price within each brand, lowest price first. Tied prices share
# a rank and, unlike rank(), the next distinct price gets the next integer.
w = Window.partitionBy("brand").orderBy(F.col("price").asc())

oct_df.withColumn("dense_price_rank", F.dense_rank().over(w)).show(5)


In [0]:
# Sequential row number within each brand, lowest price first.
# The window orders by price only, so the numbering among rows with an equal
# price is arbitrary and may differ between runs.
w = Window.partitionBy("brand").orderBy(F.col("price").asc())

oct_df.withColumn("row_num", F.row_number().over(w)).show(5)


In [0]:
# Price of the same user's previous event (by event_time); null on the
# user's first row because there is nothing to look back to.
w_time = Window.partitionBy("user_id").orderBy("event_time")

oct_df.withColumn(
    "prev_price",
    F.lag(F.col("price"), offset=1).over(w_time),
).show(5)


In [0]:
# Price of the same user's next event (by event_time); null on the user's
# last row because there is nothing to look ahead to.
w_time = Window.partitionBy("user_id").orderBy("event_time")

oct_df.withColumn(
    "next_price",
    F.lead(F.col("price"), offset=1).over(w_time),
).show(5)


DERIVED FEATURES

In [0]:
from pyspark.sql import functions as F

# Binary flag: 1 when price is at least 1000, otherwise 0. A null price does
# not satisfy the condition, so it falls through to 0 as well.
# The result is bound back to `oct_df`, so later cells see the extra column.
is_high_value = F.col("price") >= 1000

oct_df = oct_df.withColumn(
    "is_high_value_txn",
    F.when(is_high_value, F.lit(1)).otherwise(F.lit(0)),
)

oct_df.show(5)

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Share of each event's price in its brand's total price.
# The window has no ordering, so the sum covers the whole brand partition
# and every row of a brand carries the same `brand_total_revenue`.
brand_window = Window.partitionBy("brand")

brand_total = F.col("brand_total_revenue")

events_features = (
    oct_df
    .withColumn("brand_total_revenue", F.sum("price").over(brand_window))
    .withColumn(
        "revenue_pct_of_brand",
        # Guard the division: a brand whose prices sum to 0 would raise a
        # divide-by-zero error when ANSI mode is enabled. With the guard the
        # result is null for a zero (or null) total, which is what the
        # unguarded division already produced with ANSI mode off.
        # The value is a fraction in [0, 1] rounded to 6 places, not a
        # percentage multiplied by 100.
        F.when(brand_total != 0, F.round(F.col("price") / brand_total, 6)),
    )
)

events_features.show(5)

USER DEFINED FUNCTIONS (UDF)

In [0]:
from pyspark.sql.types import StringType
from pyspark.sql import functions as F

def price_category(price):
    """Map a numeric price to a coarse label.

    Args:
        price: The price to classify, or None when it is missing.

    Returns:
        "Unknown" for None or NaN, "Cheap" below 500, "Moderate" from 500
        through 2000 inclusive, and "Expensive" above 2000.
    """
    # NaN is the only value that is not equal to itself. Without this check a
    # NaN price fails both comparisons below and would be labelled
    # "Expensive".
    if price is None or price != price:
        return "Unknown"
    if price < 500:
        return "Cheap"
    if price <= 2000:
        return "Moderate"
    return "Expensive"

# Wrap the Python function as a Spark UDF returning a string, then use it to
# derive a `price_category` column from `price`.
price_category_udf = F.udf(price_category, returnType=StringType())

oct_df_with_category = oct_df.withColumn(
    "price_category", price_category_udf(oct_df["price"])
)

oct_df_with_category.show(5)
