In [0]:
from pyspark.sql import functions as F
import datetime
import requests
import json

In [0]:
# Load the silver-layer Delta data for Google media, Meta media and internal data

# Source locations
google_media_path = '/mnt/capstone/silver/google/'
meta_media_path = '/mnt/capstone/silver/meta/'
internal_path = '/mnt/capstone/silver/internal/'

# Read each location into its own DataFrame
google_media_df = spark.read.load(google_media_path, format='delta')
meta_media_df = spark.read.load(meta_media_path, format='delta')
internal_df = spark.read.load(internal_path, format='delta')

# Preview the three inputs: Google first, then Meta, then internal
for loaded_df in (google_media_df, meta_media_df, internal_df):
    display(loaded_df)

In [0]:
# Cast date_day to a date in each input; it is one of the join keys used in the merge
google_media_df = google_media_df.withColumn('date_day', F.to_date(F.col('date_day')))
meta_media_df = meta_media_df.withColumn('date_day', F.to_date(F.col('date_day')))
internal_df = internal_df.withColumn('date_day', F.to_date(F.col('date_day')))


In [0]:
# Left-join the Google and Meta media data onto the internal data so that every
# internal row is kept, matching on organization and day
join_keys = ['organization_id', 'date_day']
df = internal_df.join(google_media_df, on=join_keys, how='left')
df = df.join(meta_media_df, on=join_keys, how='left')

# Show the column names of the merged DataFrame
print(df.columns)

['organization_id', 'date_day', 'new_customers', 'google_paid_search_spend', 'google_shopping_spend', 'google_pmax_spend', 'google_display_spend', 'google_video_spend', 'google_paid_search_clicks', 'google_shopping_clicks', 'google_pmax_clicks', 'google_display_clicks', 'google_video_clicks', 'google_paid_search_impressions', 'google_shopping_impressions', 'google_pmax_impressions', 'google_display_impressions', 'google_video_impressions', 'meta_facebook_spend', 'meta_instagram_spend', 'meta_other_spend', 'meta_facebook_clicks', 'meta_instagram_clicks', 'meta_other_clicks', 'meta_facebook_impressions', 'meta_instagram_impressions', 'meta_other_impressions']


In [0]:
# Derive a click-through-rate (CTR) column per channel: clicks / impressions, rounded to 4 places
# Each entry is (clicks column, impressions column, CTR column to create)
channels = [
    ('google_paid_search_clicks', 'google_paid_search_impressions', 'google_paid_search_ctr'),
    ('google_shopping_clicks', 'google_shopping_impressions', 'google_shopping_ctr'),
    ('google_pmax_clicks', 'google_pmax_impressions', 'google_pmax_ctr'),
    ('meta_facebook_clicks', 'meta_facebook_impressions', 'meta_facebook_ctr'),
    ('meta_instagram_clicks', 'meta_instagram_impressions', 'meta_instagram_ctr'),
    ('meta_other_clicks', 'meta_other_impressions', 'meta_other_ctr'),
    ('google_display_clicks', 'google_display_impressions', 'google_display_ctr'),
    ('google_video_clicks', 'google_video_impressions', 'google_video_ctr'),
]

# Names of the CTR columns, in the same order as the channels above
ctr_vars = [ctr_name for _, _, ctr_name in channels]

for clicks_name, impressions_name, ctr_name in channels:
    impressions = F.col(impressions_name)
    raw_ctr = F.col(clicks_name) / impressions
    # Only divide when impressions are above zero; every other row gets 0
    guarded_ctr = F.when(impressions > 0, raw_ctr).otherwise(0)
    df = df.withColumn(ctr_name, F.round(guarded_ctr, 4))

df.select(ctr_vars).show()

+----------------------+-------------------+---------------+-----------------+------------------+--------------+------------------+----------------+
|google_paid_search_ctr|google_shopping_ctr|google_pmax_ctr|meta_facebook_ctr|meta_instagram_ctr|meta_other_ctr|google_display_ctr|google_video_ctr|
+----------------------+-------------------+---------------+-----------------+------------------+--------------+------------------+----------------+
|                   0.0|             0.0127|            0.0|           0.0118|               0.0|           0.0|               0.0|             0.0|
|                   0.0|             0.0153|            0.0|            0.038|               0.0|           0.0|               0.0|             0.0|
|                   0.0|             0.0186|            0.0|           0.0134|               0.0|           0.0|               0.0|             0.0|
|                   0.0|             0.0109|            0.0|           0.0245|               0.0|         

## Make Holiday and Weekend Variables

In [0]:
# Read the cleaned public-holiday file (one JSON record per line) into a set of dates
holidays_path = '/dbfs/mnt/capstone/silver/holiday_cleaned/cleaned_public_holidays.json'

all_public_holidays = set()

with open(holidays_path, 'r') as holidays_file:
    # Each record's "holiday_date" is an ISO date string
    all_public_holidays.update(
        datetime.date.fromisoformat(json.loads(raw_line)["holiday_date"])
        for raw_line in holidays_file
    )

print(sorted(all_public_holidays))


[datetime.date(2020, 1, 1), datetime.date(2020, 1, 20), datetime.date(2020, 2, 17), datetime.date(2020, 5, 25), datetime.date(2020, 7, 3), datetime.date(2020, 9, 7), datetime.date(2020, 10, 12), datetime.date(2020, 11, 11), datetime.date(2020, 11, 26), datetime.date(2020, 12, 25), datetime.date(2021, 1, 1), datetime.date(2021, 1, 18), datetime.date(2021, 2, 15), datetime.date(2021, 5, 31), datetime.date(2021, 6, 18), datetime.date(2021, 7, 5), datetime.date(2021, 9, 6), datetime.date(2021, 10, 11), datetime.date(2021, 11, 11), datetime.date(2021, 11, 25), datetime.date(2021, 12, 24), datetime.date(2021, 12, 31), datetime.date(2022, 1, 17), datetime.date(2022, 2, 21), datetime.date(2022, 5, 30), datetime.date(2022, 6, 20), datetime.date(2022, 7, 4), datetime.date(2022, 9, 5), datetime.date(2022, 10, 10), datetime.date(2022, 11, 11), datetime.date(2022, 11, 24), datetime.date(2022, 12, 26), datetime.date(2023, 1, 2), datetime.date(2023, 1, 16), datetime.date(2023, 2, 20), datetime.date(2

In [0]:
# Add fixed-date holidays to the public holidays loaded from the cleaned JSON file

# Rebuild the set from the file on every run: all_public_holidays is rebound to a
# sorted list at the end of this cell, so re-running the cell needs a fresh set.
holidays_path = '/dbfs/mnt/capstone/silver/holiday_cleaned/cleaned_public_holidays.json'

all_public_holidays = set()

with open(holidays_path, 'r') as f:
    for line in f:
        record = json.loads(line)
        extracted_date = record["holiday_date"]
        all_public_holidays.add(datetime.date.fromisoformat(extracted_date))

# Get the first and last date_day in a single aggregation (one Spark job instead of two)
date_bounds = internal_df.agg(
    F.min('date_day').alias('min_day'),
    F.max('date_day').alias('max_day')
).collect()[0]

# min/max come back as None when internal_df has no non-null date_day values;
# fail with a clear message instead of an AttributeError on None.year
if date_bounds['min_day'] is None or date_bounds['max_day'] is None:
    raise ValueError('internal_df has no date_day values; cannot determine the holiday year range')

start_year = date_bounds['min_day'].year
end_year = date_bounds['max_day'].year

# Holidays that fall on the same calendar date every year
# NOTE(review): every entry is added for every year in range, so Juneteenth is also
# added for years where the source file has no June entry - confirm this is intended.
fixed_holidays = [
    {'month': 1, 'day': 1, 'name': "New Year's Day"},
    {'month': 6, 'day': 19, 'name': 'Juneteenth'},
    {'month': 7, 'day': 4, 'name': 'Independence Day'},
    {'month': 11, 'day': 11, 'name': 'Veterans Day'},
    {'month': 12, 'day': 25, 'name': 'Christmas Day'}
]

# Add each fixed holiday for every year covered by the data (inclusive of end_year)
for year in range(start_year, end_year + 1):
    for h in fixed_holidays:
        all_public_holidays.add(datetime.date(year, h['month'], h['day']))

# Convert the set to a date-ordered list
all_public_holidays = sorted(all_public_holidays)

print(all_public_holidays)

[datetime.date(2020, 1, 1), datetime.date(2020, 1, 20), datetime.date(2020, 2, 17), datetime.date(2020, 5, 25), datetime.date(2020, 6, 19), datetime.date(2020, 7, 3), datetime.date(2020, 7, 4), datetime.date(2020, 9, 7), datetime.date(2020, 10, 12), datetime.date(2020, 11, 11), datetime.date(2020, 11, 26), datetime.date(2020, 12, 25), datetime.date(2021, 1, 1), datetime.date(2021, 1, 18), datetime.date(2021, 2, 15), datetime.date(2021, 5, 31), datetime.date(2021, 6, 18), datetime.date(2021, 6, 19), datetime.date(2021, 7, 4), datetime.date(2021, 7, 5), datetime.date(2021, 9, 6), datetime.date(2021, 10, 11), datetime.date(2021, 11, 11), datetime.date(2021, 11, 25), datetime.date(2021, 12, 24), datetime.date(2021, 12, 25), datetime.date(2021, 12, 31), datetime.date(2022, 1, 1), datetime.date(2022, 1, 17), datetime.date(2022, 2, 21), datetime.date(2022, 5, 30), datetime.date(2022, 6, 19), datetime.date(2022, 6, 20), datetime.date(2022, 7, 4), datetime.date(2022, 9, 5), datetime.date(2022, 

In [0]:
# Flag each row: 1 when date_day is in all_public_holidays, 0 otherwise
holiday_match = F.to_date(F.col('date_day')).isin(all_public_holidays)
df = df.withColumn('is_public_holiday', F.when(holiday_match, 1).otherwise(0))

print('Public Holiday Flag Created Successfully.')

Public Holiday Flag Created Successfully.


In [0]:
# Flag each row: 1 when date_day is a Saturday or Sunday, 0 otherwise

# Shift Spark's dayofweek so the index runs 0=Monday ... 5=Saturday, 6=Sunday
weekday_index = (F.dayofweek(F.col("date_day")) + 5) % 7

# Indexes 5 and 6 are the weekend
df = df.withColumn('is_weekend', F.when(weekday_index >= 5, 1).otherwise(0))

print("Weekend Flag Created Successfully.")
print(df[['date_day', 'is_public_holiday', 'is_weekend']].head()) # First row with both new flags

Weekend Flag Created Successfully.
Row(date_day=datetime.date(2020, 6, 14), is_public_holiday=0, is_weekend=1)


In [0]:
# Add the full month name of date_day (e.g. 'June') as the 'month' column
month_name = F.date_format('date_day', 'MMMM')
df = df.withColumn('month', month_name)

# Preview the first 10 rows of the date next to its month name
df.select('date_day', 'month').head(10)


[Row(date_day=datetime.date(2020, 6, 14), month='June'),
 Row(date_day=datetime.date(2020, 6, 15), month='June'),
 Row(date_day=datetime.date(2020, 6, 20), month='June'),
 Row(date_day=datetime.date(2020, 6, 17), month='June'),
 Row(date_day=datetime.date(2020, 6, 19), month='June'),
 Row(date_day=datetime.date(2020, 6, 22), month='June'),
 Row(date_day=datetime.date(2020, 6, 18), month='June'),
 Row(date_day=datetime.date(2020, 6, 21), month='June'),
 Row(date_day=datetime.date(2020, 6, 23), month='June'),
 Row(date_day=datetime.date(2020, 6, 16), month='June')]

In [0]:
# List every column currently on df
df.columns

['organization_id',
 'date_day',
 'new_customers',
 'google_paid_search_spend',
 'google_shopping_spend',
 'google_pmax_spend',
 'google_display_spend',
 'google_video_spend',
 'google_paid_search_clicks',
 'google_shopping_clicks',
 'google_pmax_clicks',
 'google_display_clicks',
 'google_video_clicks',
 'google_paid_search_impressions',
 'google_shopping_impressions',
 'google_pmax_impressions',
 'google_display_impressions',
 'google_video_impressions',
 'meta_facebook_spend',
 'meta_instagram_spend',
 'meta_other_spend',
 'meta_facebook_clicks',
 'meta_instagram_clicks',
 'meta_other_clicks',
 'meta_facebook_impressions',
 'meta_instagram_impressions',
 'meta_other_impressions',
 'google_paid_search_ctr',
 'google_shopping_ctr',
 'google_pmax_ctr',
 'meta_facebook_ctr',
 'meta_instagram_ctr',
 'meta_other_ctr',
 'google_display_ctr',
 'google_video_ctr',
 'is_public_holiday',
 'is_weekend',
 'month']

In [0]:
# Row counts for each value of the two flag columns
for flag_col in ('is_public_holiday', 'is_weekend'):
    df.groupBy(flag_col).count().show()

+-----------------+-----+
|is_public_holiday|count|
+-----------------+-----+
|                1|  302|
|                0| 8142|
+-----------------+-----+

+----------+-----+
|is_weekend|count|
+----------+-----+
|         1| 2412|
|         0| 6032|
+----------+-----+



In [0]:
# Evaluates to a Column reference only; no row values are displayed
df['month']

Column<'month'>

In [0]:
# Add the numeric month of date_day, then count rows per month in calendar order
month_number = F.month(F.col('date_day'))
df = df.withColumn('month_num', month_number)

rows_per_month = df.groupBy('month_num', 'month').count()
rows_per_month.orderBy("month_num").show()

+---------+---------+-----+
|month_num|    month|count|
+---------+---------+-----+
|        1|  January|  744|
|        2| February|  703|
|        3|    March|  775|
|        4|    April|  750|
|        5|      May|  726|
|        6|     June|  599|
|        7|     July|  626|
|        8|   August|  674|
|        9|September|  690|
|       10|  October|  740|
|       11| November|  704|
|       12| December|  713|
+---------+---------+-----+



In [0]:
# Write df to the gold path as Delta, replacing whatever was saved there before.
# overwriteSchema lets the overwrite replace the stored schema as well; without it
# Delta rejects the write when df's columns differ from those saved by an earlier run.
gold_path = '/mnt/capstone/gold/digital_media/'
(df.write
   .format("delta")
   .mode("overwrite")
   .option("overwriteSchema", "true")
   .save(gold_path))

In [0]:
# Register the saved Delta data as the table `digital_media`.
# Because a LOCATION is given, this is an external (unmanaged) table: it points at
# the files at that path rather than storing its own copy.
# IF NOT EXISTS leaves an already-registered table untouched.
spark.sql("""
CREATE TABLE IF NOT EXISTS digital_media
USING DELTA
LOCATION 'dbfs:/mnt/capstone/gold/digital_media'
""")


DataFrame[]

In [0]:
# Two checks: list the registered tables, then count the rows in digital_media
# (the count column is aliased as `total row count`)
verification_queries = (
    "SHOW TABLES",
    "SELECT COUNT(*) AS `total row count` FROM digital_media",
)
for query in verification_queries:
    spark.sql(query).show()


+--------+-------------+-----------+
|database|    tableName|isTemporary|
+--------+-------------+-----------+
| default|digital_media|      false|
+--------+-------------+-----------+

+---------------+
|total row count|
+---------------+
|           8444|
+---------------+

