In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session if there is one; otherwise create it.
spark = (
    SparkSession.builder
    .appName("ad_campaign_analysis")
    .getOrCreate()
)

# Location of the raw ad-campaign event data.
ad_path = "/tmp/spark_assignment_data/ad_campaigns_data.json"

# The "multiline" option lets a single JSON record span several lines of the file.
ad_df = (
    spark.read
    .option("multiline", "true")
    .json(ad_path)
)

# Inspect the inferred schema and preview up to 10 rows.
ad_df.printSchema()
ad_df.show(10)

24/08/01 16:00:14 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

root
 |-- campaign_country: string (nullable = true)
 |-- campaign_id: string (nullable = true)
 |-- campaign_name: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- event_time: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- os_type: string (nullable = true)
 |-- place_id: string (nullable = true)
 |-- user_id: string (nullable = true)



                                                                                

+----------------+-----------+--------------------+-----------+--------------------+----------+-------+---------+-------------------+
|campaign_country|campaign_id|       campaign_name|device_type|          event_time|event_type|os_type| place_id|            user_id|
+----------------+-----------+--------------------+-----------+--------------------+----------+-------+---------+-------------------+
|             USA|    ABCDFAE|Food category tar...|      apple|2018-10-12T13:10:...|impression|    ios|CASSBB-11|1264374214654454321|
|             USA|    ABCDFAE|Food category tar...|   MOTOROLA|2018-10-12T13:09:...|impression|android|CADGBD-13|1674374214654454321|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12T13:10:...|  video ad|android|BADGBA-12|   5747421465445443|
|             USA|    ABCDFAE|Food category tar...|    SAMSUNG|2018-10-12T13:10:...|     click|android|CASSBB-11|1864374214654454132|
+----------------+-----------+--------------------+-----------

In [3]:
# Location of the user-profile data (described as an HDFS path by the notebook author).
user_profile_path = "/tmp/spark_assignment_data/user_profile_data.json"

# Load the user profiles; "multiline" allows JSON records that span several lines.
up_df = (
    spark.read
    .option("multiline", "true")
    .json(user_profile_path)
)

# Preview the loaded rows.
up_df.show()

+---------+--------------------+-------+------+-------------------+
|age_group|            category|country|gender|            user_id|
+---------+--------------------+-------+------+-------------------+
|    18-25|  [shopper, student]|    USA|  male|1264374214654454321|
|    25-50|            [parent]|    USA|female|1674374214654454321|
|    25-50|[shopper, parent,...|    USA|  male|   5747421465445443|
|      50+|      [professional]|    USA|  male|1864374214654454132|
|    18-25|  [shopper, student]|    USA|female|  14537421465445443|
|      50+|[shopper, profess...|    USA|female|  25547421465445443|
+---------+--------------------+-------+------+-------------------+



In [3]:
# Location of the store data; each record carries a store_name and its place_ids.
store_file_path = "/tmp/spark_assignment_data/store_data.json"

# Load the stores; "multiline" allows JSON records that span several lines.
st_df = (
    spark.read
    .option("multiline", "true")
    .json(store_file_path)
)

# Preview the loaded rows.
st_df.show()

+--------------------+-------------+
|           place_ids|   store_name|
+--------------------+-------------+
|[CASSBB-11, CADGB...|     McDonald|
|         [CASSBB-11]|   BurgerKing|
|[BADGBA-13, CASSB...|        Macys|
|         [BADGBA-12]|shoppers stop|
+--------------------+-------------+



In [5]:
# Derive `date` and `hour` columns from event_time of the ad campaign dataframe.

# event_time is loaded as a string, so convert it to a timestamp first.
ad_df1 = ad_df.withColumn("event_time", F.col("event_time").cast("timestamp"))

# NOTE(review): the extracted date/hour presumably follow the Spark session
# time zone -- confirm this matches the reporting time zone.
ad_df2 = (
    ad_df1
    .withColumn("date", F.to_date("event_time"))
    .withColumn("hour", F.hour("event_time"))
)


In [9]:
# Analyse data for each campaign_id, date, hour and os_type: one row per
# combination, with an `event` struct holding the count of each event type.

# Event types to report, in the field order of the output struct. Handing this
# list to pivot() guarantees that every column exists even when a type never
# occurs in the data (otherwise the select below fails on the missing column)
# and spares Spark the extra job it would run to discover the distinct values.
event_types = ["impression", "click", "video ad"]

ad_df3 = (
    ad_df2
    .groupBy("campaign_id", "date", "hour", "os_type")
    .pivot("event_type", event_types)
    # One counter per (group, event type) cell.
    .agg(F.count("event_type"))
    # Cells without any event are null after the pivot. Zero only the counters:
    # an unrestricted fillna(0) would also rewrite a null `hour` key to 0.
    .fillna(0, subset=event_types)
    .select(
        "campaign_id",
        "date",
        "hour",
        "os_type",
        F.struct(*[F.col(name) for name in event_types]).alias("event"),
    )
)

ad_df3.show()

                

+-----------+----------+----+-------+---------+
|campaign_id|      date|hour|os_type|    event|
+-----------+----------+----+-------+---------+
|    ABCDFAE|2018-10-12|  13|android|{1, 1, 1}|
|    ABCDFAE|2018-10-12|  13|    ios|{1, 0, 0}|
+-----------+----------+----+-------+---------+



In [10]:
# Persist the per-OS event counts as JSON (HDFS, per the notebook author),
# replacing whatever an earlier run left in the target directory.
ad_df3.write.mode("overwrite").json("/tmp/output_data/market_analysis1/")

                                                                                

In [19]:
# Analyse data for each campaign_id, date, hour and store_name: one row per
# combination, with an `event` struct holding the count of each event type.

# Event types to report, in the field order of the output struct. Handing this
# list to pivot() guarantees that every column exists even when a type never
# occurs in the data (otherwise the select below fails on the missing column)
# and spares Spark the extra job it would run to discover the distinct values.
event_types = ["click", "impression", "video ad"]

# STEP1: turn each element of place_ids into its own row, then drop the array column.
st_explode_df = st_df.withColumn("place_id", F.explode("place_ids")).drop("place_ids")

# STEP2: attach store names to the campaign events. The inner join keeps only
# events whose place_id belongs to a store; an event at a place listed under
# several stores yields one row per store.
campaign_store_join_df = (
    ad_df2
    .join(st_explode_df, on="place_id", how="inner")
    .select("campaign_id", "date", "hour", "event_type", "store_name")
)

market_analysis2 = (
    campaign_store_join_df
    .groupBy("campaign_id", "date", "hour", "store_name")
    .pivot("event_type", event_types)
    # One counter per (group, event type) cell.
    .agg(F.count("event_type"))
    # Cells without any event are null after the pivot. Zero only the counters:
    # an unrestricted fillna(0) would also rewrite a null `hour` key to 0.
    .fillna(0, subset=event_types)
    .select(
        "campaign_id",
        "date",
        "hour",
        "store_name",
        F.struct(*[F.col(name) for name in event_types]).alias("event"),
    )
)

# STEP3: write the output as JSON (HDFS, per the notebook author), replacing any
# earlier result. The directory name was previously misspelled "market_anlysis2";
# it now follows the market_analysis1 / market_analysis3 naming.
market_analysis2.write.json("/tmp/output_data/market_analysis2/", mode="overwrite")
            

In [10]:
# Analyse data for each campaign_id, date, hour and gender: one row per
# combination, with an `event` struct holding the count of each event type.

# Event types to report, in the field order of the output struct. Handing this
# list to pivot() guarantees that every column exists even when a type never
# occurs in the data (otherwise the select below fails on the missing column)
# and spares Spark the extra job it would run to discover the distinct values.
event_types = ["click", "impression", "video ad"]

# STEP1: attach the user's gender to each campaign event. The inner join keeps
# only events whose user_id has a profile.
campaign_user_df = (
    ad_df2
    .join(up_df, on="user_id", how="inner")
    .select("campaign_id", "date", "hour", "event_type", "gender")
)

# STEP2: count events per group, one column per event type.
market_analysis3 = (
    campaign_user_df
    .groupBy("campaign_id", "date", "hour", "gender")
    .pivot("event_type", event_types)
    # One counter per (group, event type) cell.
    .agg(F.count("event_type"))
    # Cells without any event are null after the pivot. Zero only the counters:
    # an unrestricted fillna(0) would also rewrite a null `hour` key to 0.
    .fillna(0, subset=event_types)
    .select(
        "campaign_id",
        "date",
        "hour",
        "gender",
        F.struct(*[F.col(name) for name in event_types]).alias("event"),
    )
)

# STEP3: write the output as JSON (HDFS, per the notebook author), replacing any earlier result.
market_analysis3.write.json("/tmp/output_data/market_analysis3/", mode="overwrite")

                                                                                