In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    current_date,
    datediff,
    dayofmonth,
    floor,
    month,
    months_between,
    to_date,
    year,
)

# Question answered by this cell: in which state did customers aged 20-30
# buy the most TVs during the first ten days of September?

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.appName("AnalyticsTask").getOrCreate()

base_path = "/home/jovyan/data"

# Sales records and enriched customer profiles, both stored as parquet.
df_purchases = spark.read.parquet(f"{base_path}/silver/sales/")

df_profiles = spark.read.parquet(f"{base_path}/gold/user_profiles_enriched/")

# Age in completed calendar years. months_between() / 12 is exact on
# birthdays, whereas dividing a day count by 365.25 can be off by one
# around the birthday and misclassify customers at the 20/30 boundaries.
# NOTE(review): age is measured as of the run date, not the purchase date,
# so the result can drift between runs -- confirm which is intended.
df_profiles = df_profiles.withColumn(
    "age",
    floor(
        months_between(current_date(), to_date(col("birth_date"))) / 12
    ).cast("integer"),
)

# Inner join: purchases without a matching profile (and vice versa) are dropped.
df_joined = df_purchases.join(
    df_profiles,
    on="client_id",
    how="inner",
)

# TV purchases by customers aged 20-30 (inclusive) made on September 1-10.
# NOTE(review): there is no year filter, so September of every year present
# in the data is included -- confirm this matches the question being asked.
df_filtered = df_joined.filter(
    (col("product_name") == "TV")
    & col("age").between(20, 30)
    & (month(col("purchase_date")) == 9)
    & (dayofmonth(col("purchase_date")) <= 10)
)

# Purchases per state, most first. The secondary sort on state makes the
# top row deterministic when two states have the same count.
df_result = (
    df_filtered
    .groupBy("state")
    .count()
    .orderBy(col("count").desc(), col("state").asc())
)

# Print only the leading state.
df_result.show(1)

+-----+-----+
|state|count|
+-----+-----+
| Iowa|  374|
+-----+-----+
only showing top 1 row

