In [0]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from scipy.special import logit

In [0]:
# Read the gold-layer digital media Delta table into a Spark DataFrame
path = '/mnt/capstone/gold/digital_media/'

delta_reader = spark.read.format('delta')
spark_df = delta_reader.load(path)
display(spark_df)

In [0]:
# Collect the Spark DataFrame into a pandas DataFrame and print its column summary
df = spark_df.toPandas()
df.info()

In [0]:
# Convert 'date_day' to datetime
df['date_day'] = pd.to_datetime(df['date_day'])

# Confirm the conversion. `Series.info` has to be *called*: printing the bound
# method only shows the method's repr, not the dtype / non-null summary.
df['date_day'].info()

## Check and Evaluate Outliers

In [0]:
# Box plot of the daily new-customer counts
fig, ax = plt.subplots()
sns.boxplot(x=df['new_customers'], ax=ax)
ax.set_title('New Customers Distribution')
plt.show()

# Summary statistics for the same column
new_customer_stats = df['new_customers'].describe()
print(new_customer_stats)

- On average there are about 73 new customers a day.
- The mean is much higher than the median, indicating a right-skewed distribution
    - Violates the linear regression normality assumption
- The standard deviation (1,249 new customers) is huge compared to the mean.
     - This could represent extreme variability in customer acquisitions between organizations or media spend.
- The standard deviation is much higher than the mean, indicating overdispersion
    - Violates the linear regression homoscedasticity assumption
    - Overdispersion meets the negative binomial regression assumption
- The minimum is zero new customers, which makes sense given that some days are no-media/no-spend days. 75% of rows had 76 or fewer new customers, meaning that most of the data have low counts.
- Max new customers is extremely high - most likely an input issue, and outliers must be filtered out to minimize coefficient inflation.

In [0]:
# Quartiles (Q1, Q3) of the target
q1, q3 = np.percentile(df['new_customers'], [25, 75])

# Interquartile range
iqr = q3 - q1

# Upper fence: 3 x IQR above Q3
upper = q3 + (3 * iqr)
print('Upper Limit:', upper)

# Count rows strictly above the fence with one vectorized comparison
# (replaces a row-by-row Python loop)
upper_count = int((df['new_customers'] > upper).sum())
print('Upper Outliers:', upper_count)

# Distance between the largest observed value and the fence
range_val = df['new_customers'].max() - upper
print('Upper Outlier Range:', range_val)

In [0]:
# Filter out extreme outliers.
# - `<=` keeps rows that sit exactly on the fence: the outlier count only treats
#   values strictly above `upper` as outliers, so the filter should agree with it.
# - `.copy()` makes `df` an independent frame, so the column assignments made in
#   later cells (e.g. the log-transformed columns) do not trigger
#   SettingWithCopyWarning on a slice of the original frame.
df = df[df['new_customers'] <= upper].copy()

## Outliers for Quantitative Variables
Outliers are not checked for the quantitative variables because they are mainly spends and click-through rates based on impressions. They are daily spends and are limited by the number of deployed days. We will keep them as is, outliers included.

In [0]:
# Summary statistics of the target as it stands at this point in the notebook
df['new_customers'].describe()

In [0]:
# Spend columns, one per media placement
spend_cols = [
    'google_paid_search_spend',
    'google_shopping_spend',
    'google_pmax_spend',
    'google_display_spend',
    'google_video_spend',
    'meta_facebook_spend',
    'meta_instagram_spend',
    'meta_other_spend',
]

# Summary statistics for every spend column
df[spend_cols].describe()


### Spend Descriptive Stats Overview
- Every spend variable is right-skewed, with a mean higher than its median.
- There are huge maximum values (probably promotional campaign days)
- There are many zero-spend days (determined to be the non-marketing days)
- Meta Facebook has the highest average spend compared to the other placements
- Google Display and Meta Other spend values are negligible
- Half of the dataset does not have Google PMax, Google Video, or Google Display spend
    - Combining the variables with many zero values could help with coefficient stability
- Google Paid Search and Shopping have a consistent presence across the dataset
- PMax has a high mean spend value, so it looks like it was used for testing or a specific campaign


In [0]:
# Impression columns, one per media placement
impressions_cols = [
    'google_paid_search_impressions',
    'google_shopping_impressions',
    'google_pmax_impressions',
    'google_display_impressions',
    'google_video_impressions',
    'meta_facebook_impressions',
    'meta_instagram_impressions',
    'meta_other_impressions',
]

# Summary statistics for every impressions column
df[impressions_cols].describe()

## Impressions Descriptive Stats Overview
- Impressions follow the same pattern as spend (makes sense since impressions are tied to spend)
- Also right-skewed - mean is more than median
- Google Paid Search and Shopping are consistently active
- PMax has a lot of zero values; half of the data doesn't have PMax
    - But of those rows that have PMax we see a high number of impressions

In [0]:
# Click columns, one per media placement
clicks_cols = [
    'google_paid_search_clicks',
    'google_shopping_clicks',
    'google_pmax_clicks',
    'google_display_clicks',
    'google_video_clicks',
    'meta_facebook_clicks',
    'meta_instagram_clicks',
    'meta_other_clicks',
]

# Summary statistics for every clicks column
df[clicks_cols].describe()

## Clicks Descriptive Stats Overview
- Clicks follow the same pattern as spend and impressions (makes sense since clicks are tied to spend)
- Also right-skewed - mean is more than median
- Facebook has the highest clicks, followed by google pmax
- Low engagement (clicks) for Google Video, Google Display
- Google Shopping has very low clicks relative to its impressions

In [0]:
# Click-through-rate columns, one per media placement
ctr_cols = [
    'google_paid_search_ctr',
    'google_shopping_ctr',
    'google_pmax_ctr',
    'google_display_ctr',
    'google_video_ctr',
    'meta_facebook_ctr',
    'meta_instagram_ctr',
    'meta_other_ctr',
]

# Summary statistics with CTR scaled by 100 (percent form)
(df[ctr_cols] * 100).describe()

## CTR Descriptive Stats Overview
- Best performing: Google Paid Search (highest) and Meta Facebook (had high impressions and clicks; but low CTR)
- Worst performing: Google Display and Google Video
- High impressions but low efficiency:
    - Google Shopping and PMax
- Google PMax, Google Display, Google Video, Meta Instagram, and Meta Other barely have any performance


- First purchases are log-transformed to see if we can normalize the data for the linear regression model, but the result is bimodal. There are two sub-groups, but we don't know what is causing them. Going to do bivariate analysis to find out.

In [0]:
# Raw-count histogram of the target
fig, ax = plt.subplots()
ax.hist(df['new_customers'], edgecolor='white')
ax.set_title('Histogram of New Customers')
ax.set_xlabel('New Customers Count')
ax.set_ylabel('Frequency')
plt.show()

In [0]:
# np.log1p computes log(1 + x), so zero-customer days map to 0 rather than -inf
df['new_customers_log'] = np.log1p(df['new_customers'])

# Histogram of the log-transformed target
fig, ax = plt.subplots()
ax.hist(df['new_customers_log'], bins=30, edgecolor='white')
ax.set_title('Histogram of New Customers (Log)')
ax.set_xlabel('New Customers (Log)')
ax.set_ylabel('Frequency')
plt.show()

%md
- First purchases are log-transformed to see if we can normalize the data for the linear regression model, but the result is bimodal. There are two sub-groups, but we don't know what is causing them. Going to do bivariate analysis to find out.

In [0]:

# Keep only the rows whose spend, summed across all spend columns, is above zero
total_spend = df[spend_cols].sum(axis=1)
media_on = df[total_spend > 0]

fig, ax = plt.subplots(figsize=(16, 8))

# Histogram of the logged target on those rows
counts, bins, patches = ax.hist(media_on['new_customers_log'], bins=30, edgecolor='white')

# Midpoint of every bin
bin_mids = (bins[:-1] + bins[1:]) * 0.5

# Write each non-empty bar's bin midpoint (log scale) just above the bar so the
# peak locations can be read off the chart
label_offset = max(counts) * 0.01
for x_mid, y in zip(bin_mids, counts):
    if y <= 0:
        continue
    ax.text(x_mid, y + label_offset, f"{x_mid:.2f}",
            ha='center', va='bottom', fontsize=8, rotation=0)

ax.set_title('Histogram of New Customers (Log Value Labeled)')
ax.set_xlabel('New Customers (Log)')
ax.set_ylabel('Frequency')
plt.show()


In [0]:
# Convert the two log-scale peak values back to raw counts
# (np.expm1 is the inverse of np.log1p)
peak_one, peak_two = np.expm1(2.25), np.expm1(4.37)

for ordinal, raw_value in (('one', peak_one), ('two', peak_two)):
    print(f'Peak {ordinal} new customers is {raw_value:.0f}')


## Univariate for Continuous Variables

### Spend
Filter out zero-spend days to look at the active media dates only.

In [0]:
# Raw-value histograms for each spend column, non-zero days only

fig, axes = plt.subplots(3, 3, figsize=(15, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, spend_cols):
    # Drop the zero / no-spend rows for this column
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

All are right-skewed, and on some days the non-zero spend is very small.

In [0]:
# Smallest non-zero value of each spend column
for var in spend_cols:
    positive_spend = df.loc[df[var] > 0, var]
    print(f'{var}: ${positive_spend.min()}')

Confirms that some days, literal pennies are spent.

In [0]:
# Raw-value histograms for each impressions column, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, impressions_cols):
    # Drop rows where this column is zero
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

In [0]:
# Raw-value histograms for each clicks column, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, clicks_cols):
    # Drop rows where this column is zero
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

In [0]:
# Histograms for each CTR column in percent, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, ctr_cols):
    # Drop rows where this column is zero, then scale by 100 to get percent
    nonzero_pct = df.loc[df[var] > 0, var] * 100

    ax.set_title(f"{var.replace('_', ' ').title()} (%) Distribution")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero_pct.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

## Log Transforming All Continuous Variables
All variables are extremely right skewed even with non-zero days filtered. Hoping that log-transforming them will help normalize the extreme values

In [0]:
# Every continuous media column: spend, impressions, clicks and CTR
cont_cols = spend_cols + impressions_cols + clicks_cols + ctr_cols

# Add a `<col>_log` companion for each column and print its summary.
# np.log1p computes log(1 + x), so zeros stay 0 instead of becoming -inf.
for col in cont_cols:
    log_col = f'{col}_log'
    df[log_col] = np.log1p(df[col])
    print(df[log_col].describe())



In [0]:
# Names of the log-transformed columns, grouped by metric
impressions_log_cols = [f'{col}_log' for col in impressions_cols]
clicks_log_cols = [f'{col}_log' for col in clicks_cols]
spend_log_cols = [f'{col}_log' for col in spend_cols]
ctr_log_cols = [f'{col}_log' for col in ctr_cols]

# Show all four log-column lists. The first argument must be
# `impressions_log_cols`; printing `impressions_cols` would show the raw names.
print(impressions_log_cols, clicks_log_cols, spend_log_cols, ctr_log_cols)

In [0]:
# NOTE(review): disabled cell - every line below is commented out, so nothing
# here executes. Presumably an earlier zero-spend exploration; confirm whether
# it is still needed or can be deleted.
# # Define spend columns
# spend_cols = [
#     'google_paid_search_spend', 'google_shopping_spend', 'google_pmax_spend',
#     'google_display_spend', 'google_video_spend', 'meta_facebook_spend',
#     'meta_instagram_spend', 'meta_other_spend'
# ]

# # Create zero-spend flags for each channel individually
# for col in spend_cols:
#     df[f'{col}_zero_flag'] = (df[col] == 0).astype(int)

# # Print zero-spend days per channel
# print("Zero-spend days per channel:")
# for col in spend_cols:
#     zero_count = df[f'{col}_zero_flag'].sum()
#     print(f"{col}: {zero_count} days ({zero_count / len(df) * 100:.2f}%)")

# # Create a total zero-spend flag across all channels
# df['no_spend_day'] = (df[spend_cols].sum(axis=1) == 0).astype(int)

# # Filter for zero-spend days where new_customers > 0
# zero_spend_with_customers = df[(df['no_spend_day'] == 1) & (df['new_customers'] > 0)]

# # Show counts and proportions
# total_zero_spend = df['no_spend_day'].sum()
# rows_with_customers = len(zero_spend_with_customers)

# print("\nOverall zero-spend days across all channels:")
# print(f"Zero-spend days with new customers: {rows_with_customers}")
# print(f"Total zero-spend days: {total_zero_spend}")
# print(f"Percentage of zero-spend days with customers: {rows_with_customers / total_zero_spend:.2%}")


## Univariate Log-Transformed Spend

In [0]:
# List the log-transformed spend columns, then show their summary statistics
print(spend_log_cols)
df[spend_log_cols].describe()

In [0]:
# Histograms of the log-transformed spend columns, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, spend_log_cols):
    # Drop rows where the logged value is zero
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution (Logged)")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

In [0]:
# Histograms of the log-transformed impressions columns, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, impressions_log_cols):
    # Drop rows where the logged value is zero
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution (Logged)")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

In [0]:
# Histograms of the log-transformed clicks columns, non-zero rows only
fig, axes = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace=.5, wspace=.5)

for ax, var in zip(axes.flat, clicks_log_cols):
    # Drop rows where the logged value is zero
    nonzero = df.loc[df[var] > 0, var]

    ax.set_title(f"{var.replace('_', ' ').title()} Distribution (Logged)")
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)

    nonzero.hist(ax=ax, edgecolor='white')

# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2, 2].remove()
plt.show()

In [0]:
# Histograms of the log-transformed CTR columns, non-zero rows only
fig, (axes) = plt.subplots(3, 3, figsize=(16, 10))
fig.subplots_adjust(hspace= .5, wspace=.5)


# Iterate the *_log columns: looping over the raw `ctr_cols` would re-plot the
# untransformed CTR values under a "(Logged)" title.
for var, ax in zip(ctr_log_cols, axes.flat): 
    var_title = var.replace('_', ' ').title()

    # Drop rows where the logged value is zero
    data = df[df[var] > 0][var]
    
    ax.set_title(f'{var_title} Distribution (Logged)')
    ax.set_ylabel('Frequency')
    ax.set_xlabel(var)
    
    data.hist(ax=ax, edgecolor='white')
    
# Eight columns on a 3x3 grid: remove the unused ninth panel
axes[2,2].remove()
plt.show() 

Confirmed that log transforming normalizes the distribution of spend, impressions, clicks, and CTR. Let's do bivariate analysis with new_customers.

In [0]:
# Add one binary flag per spend column: `<placement>_no_spend_day` is 1 when
# that column's spend equals 0 and 0 otherwise

for spend_col in spend_cols:
    flag_col = spend_col.replace('_spend', '_no_spend_day')
    df[flag_col] = df[spend_col].eq(0).astype(int)

# Collect every flag column now present in the frame
no_spend_cols = [name for name in df.columns if '_no_spend_day' in name]
display(df[no_spend_cols].head(5))

In [0]:
# get unique organizations ids
orgs = df['organization_id'].unique()
print(orgs)


In [0]:
# Drop any notebook-level variable named `dict` so the built-in is visible again.
# A bare `del dict` raises NameError on a fresh kernel (there is no global called
# `dict` to delete), which breaks Restart & Run All; pop() is a no-op in that case.
globals().pop('dict', None)

In [0]:

# Organization ids, in the order their aliases are assigned
orgs = ['09113a73f8618b48b3cf53b24cd78d2f', '2b15eedfc4faa6293dc4b3bf8c1c9c1f',
 '4a762f02ca755b22d37393e8dbeab1a6', '560f5cf4dce8824a907c84162e553de0',
 '6be433bd50b2da88aa18f31056a71ea2', '70f3b9fb9a8f232814ce705d7641ee4f',
 '8c7d01375c3eca8e3ecfe8f1872b3136', 'ff5c2c32d7f1c21594c518fc6a6d4fce']

# One single-letter alias per organization id (kept under its own name so that
# `org_alias` only ever refers to the mapped Series below)
alias_labels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']

# organization_id -> alias lookup, e.g. '09113a73f8618b48b3cf53b24cd78d2f' -> 'A'
org_dict = dict(zip(orgs, alias_labels))

# Attach the alias to every row
org_alias = df['organization_id'].map(org_dict)
df['org_alias'] = org_alias

# Ids missing from `orgs` map to NaN, and groupby drops NaN keys by default, so
# those rows would silently vanish from the summaries below. Surface them instead.
unmapped_ids = df.loc[org_alias.isna(), 'organization_id'].dropna().unique()
if len(unmapped_ids) > 0:
    print('WARNING - organization ids without an alias:', list(unmapped_ids))

# Row counts per (alias, id) pair and per alias
org_map = df.groupby(['org_alias', 'organization_id']).count().reset_index()
org_count = df.groupby(['org_alias']).count().reset_index()

print('Organization Map:')
print(org_map[['org_alias', 'organization_id']])
print('Organization Counts:')
print(org_count[['org_alias', 'organization_id']])



In [0]:
# Categorical variables to profile
cat_vars = ['org_alias', 'month', 'is_weekend', 'is_public_holiday'] + no_spend_cols

# Univariate for Categorical; By Percentage Frequency


def get_binary_labels(var):
    """Return (labels, counts) for a 0/1 column of `df`.

    `labels` is [label for value 0, label for value 1] and `counts` is
    [number of rows equal to 0, number of rows equal to 1].
    """
    named_labels = {
        'is_weekend': ['Weekday', 'Weekend'],
        'is_public_holiday': ['Not Public Holiday', 'Public Holiday'],
    }
    # Any other variable is treated as a no-spend flag
    labels = named_labels.get(var, ['Spend Day', 'No Spend Day'])
    count = [int((df[var] == value).sum()) for value in (0, 1)]

    return labels, count


# Columns handled as 0/1 flags
binary_vars = ['is_weekend', 'is_public_holiday'] + no_spend_cols

# One pie chart per variable on a 4x3 grid
fig, axes = plt.subplots(4, 3, figsize=(18, 18))
fig.subplots_adjust(hspace=.25, wspace=.25)

for var, ax in zip(cat_vars, axes.flat):

    if var in binary_vars:
        # Labels are ordered to match the 0 / 1 values
        labels, count = get_binary_labels(var)
    else:
        # Labels come straight from the observed values
        freq = df[var].value_counts()
        labels, count = freq.index.tolist(), freq.values.tolist()

    ax.set_title(var.replace('_', ' ').title())

    ax.pie(count,
           labels=labels,
           autopct='%1.1f%%',
           wedgeprops={'edgecolor': 'white', 'linewidth': 1})
plt.show()



In [0]:
# Share of spend days (> 0) versus no-spend days (== 0) for each spend column

spend_vars = (
    'meta_facebook_spend', 'meta_instagram_spend', 'meta_other_spend',
    'google_paid_search_spend', 'google_shopping_spend', 'google_pmax_spend',
    'google_display_spend', 'google_video_spend',
)

for var in spend_vars:
    count_spend_day = (df[var] > 0).sum()
    count_no_spend_day = (df[var] == 0).sum()
    total_days = count_spend_day + count_no_spend_day

    # Percentages of spend and no-spend days, rounded to two decimals
    spend_day_pct = round(count_spend_day / total_days * 100, 2)
    no_spend_day_pct = round(count_no_spend_day / total_days * 100, 2)

    print(f'Percentage of Spend Days for {var}: {spend_day_pct}%')
    print(f'Percentage of No Spend Days for {var}: {no_spend_day_pct}%')

In [0]:
# Bivariate: daily new customers against each categorical variable

# Calendar order for the month axis
month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# Size the grid from cat_vars so that no variable is silently dropped: zipping a
# fixed 2x2 grid against cat_vars only ever draws the first four entries.
n_cols = 2
n_rows = max(1, int(np.ceil(len(cat_vars) / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
fig.subplots_adjust(hspace=.5, wspace=.25)

for var, ax in zip(cat_vars, axes.flat):

    var_title = var.replace('_', ' ').title()

    # month / weekend / holiday are relabelled on df itself because the
    # descriptive-stats cell that follows groups on these same columns.
    if var == 'month':
        df['month'] = pd.Categorical(df['month'], categories=month_order, ordered=True)
        df = df.sort_values('month')

    elif var == 'is_weekend':
        df['is_weekend'] = df['is_weekend'].replace({0: 'Weekday', 1: 'Weekend'})

    elif var == 'is_public_holiday':
        df['is_public_holiday'] = df['is_public_holiday'].replace({0: 'Non-Holiday', 1: 'Holiday'})

    if '_no_spend_day' in var:
        # The flag is 1 when the placement had zero spend, so 1 -> 'No Spend Day'.
        # Labels go on a plotting-only Series; the 0/1 flag column stays numeric.
        x_values = df[var].map({0: 'Spend Day', 1: 'No Spend Day'})
    else:
        x_values = df[var]

    ax.tick_params(axis='x', labelrotation=90)

    # Draw boxplot
    sns.boxplot(x=x_values, y=df['new_customers'], color="pink", ax=ax)

    # Set title and axis labels after drawing so they take precedence over any
    # labels seaborn applies. set_xlabel is the axis label; set_label would only
    # name the Axes artist.
    ax.set_title(f'Daily First Purchases by {var_title}')
    ax.set_ylabel("New Customers")
    ax.set_xlabel(var)

# Remove any grid cells left over when len(cat_vars) is not a multiple of n_cols
for unused_ax in axes.flat[len(cat_vars):]:
    unused_ax.remove()

# Show figure
plt.show()



In [0]:
# Descriptive statistics of new_customers within each level of every categorical variable
for var in cat_vars:
    target_by_level = df.groupby(var)['new_customers']
    desc_stats = target_by_level.describe().reset_index()

    # Render the per-level summary table
    display(desc_stats)