### this notebook (analysis.ipynb) contains R code that analyzes the data from the simulation 
### it answers the following questions
### 1. what is the effect of different ad campaigns on purchase probability and sales?
### 2. how does ad type affect funnel progression?
### 3. how does ad type affect purchase probability and sales for users in the purchase stage?
### 4. what are the ROIs of different ad campaigns?
### 5. how effective is the full-funnel campaign when funnel stage is predicted with high, medium, and low accuracy?

In [1]:
suppressMessages(library(tidyverse))

In [2]:
# Load the user-visit log and order it by campaign, then user, then visit.
df <- read.csv("data.csv") %>%
  arrange(campaign_type, user_id, visit)

In [3]:
# Make campaign_type a factor whose reference (first) level is "control", so
# model coefficients are contrasts against the control group.
df$campaign_type <- relevel(as.factor(df$campaign_type), ref = "control")

In [4]:
# Make ad_type a factor whose reference (first) level is "none", so model
# coefficients are contrasts against showing no ad.
df$ad_type <- relevel(as.factor(df$ad_type), ref = "none")

In [5]:
# Show the first rows to check column names and types.
df %>% head()

Unnamed: 0_level_0,user_id,current_funnel_stage,next_funnel_stage,ad_type,purchase,sales,date,campaign_type,visit
Unnamed: 0_level_1,<int>,<chr>,<chr>,<fct>,<int>,<int>,<chr>,<fct>,<int>
1,30001,awareness,awareness,branding,0,0,2025-04-26,brandformance,1
2,30001,awareness,consideration,branding,0,0,2025-04-30,brandformance,2
3,30001,consideration,consideration,performance,0,0,2025-05-05,brandformance,3
4,30001,consideration,consideration,performance,0,0,2025-05-07,brandformance,4
5,30002,awareness,consideration,branding,0,0,2025-05-17,brandformance,1
6,30002,consideration,consideration,branding,0,0,2025-05-24,brandformance,2


In [6]:
# Distinct funnel stages present in the data.
df$current_funnel_stage %>%
  as.character() %>%
  unique()

In [7]:
# Distinct ad types present in the data.
df$ad_type %>%
  as.character() %>%
  unique()

In [8]:
# Distinct campaign types present in the data.
df$campaign_type %>%
  as.character() %>%
  unique()

#### 1. overall effect of campaign on purchase probability and sales

In [9]:
# Purchase probability as the outcome (all campaigns).
# The visit-level log is collapsed to one row per user, with `purchase` equal
# to the user's total over all visits, then regressed on campaign type.
# "control" is the reference level of campaign_type, so the intercept is the
# control-group mean (no ads) and each coefficient is that campaign's
# difference from control.
# Users are keyed on (user_id, campaign_type) rather than user_id alone, so an
# id that appeared under two campaigns would not be silently merged into one
# row; summarise() also returns an ungrouped table directly.
model <- lm(
  purchase ~ campaign_type,
  data = df %>%
    group_by(user_id, campaign_type) %>%
    summarise(purchase = sum(purchase), .groups = "drop")
)
summary(model)


Call:
lm(formula = purchase ~ campaign_type, data = df %>% group_by(user_id) %>% 
    mutate(purchase = sum(purchase)) %>% distinct(user_id, .keep_all = T))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1058 -0.0836 -0.0639 -0.0386  0.9614 

Coefficients:
                            Estimate Std. Error t value Pr(>|t|)    
(Intercept)                 0.038700   0.002471  15.659  < 2e-16 ***
campaign_typebrandformance  0.025200   0.003495   7.210 5.66e-13 ***
campaign_typebranding      -0.000100   0.003495  -0.029    0.977    
campaign_typefull_funnel    0.067100   0.003495  19.199  < 2e-16 ***
campaign_typeperformance    0.044900   0.003495  12.847  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2471 on 49995 degrees of freedom
Multiple R-squared:  0.01099,	Adjusted R-squared:  0.01091 
F-statistic: 138.9 on 4 and 49995 DF,  p-value: < 2.2e-16


In [10]:
# Purchase probability as the outcome (directly comparing any two campaigns;
# here brandformance vs. full_funnel).
# Only the two campaigns of interest are kept, then the visit-level rows are
# collapsed to one row per user with `purchase` summed over visits.
# Users are keyed on (user_id, campaign_type) rather than user_id alone, so an
# id that appeared under two campaigns would not be silently merged into one
# row; summarise() also returns an ungrouped table directly.
model <- lm(
  purchase ~ campaign_type,
  data = df %>%
    filter(campaign_type %in% c("brandformance", "full_funnel")) %>%
    group_by(user_id, campaign_type) %>%
    summarise(purchase = sum(purchase), .groups = "drop")
)
summary(model)


Call:
lm(formula = purchase ~ campaign_type, data = df %>% group_by(user_id) %>% 
    mutate(purchase = sum(purchase)) %>% distinct(user_id, .keep_all = T) %>% 
    filter(campaign_type %in% c("brandformance", "full_funnel")))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1058 -0.1058 -0.0639 -0.0639  0.9361 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)              0.063900   0.002779   23.00   <2e-16 ***
campaign_typefull_funnel 0.041900   0.003930   10.66   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2779 on 19998 degrees of freedom
Multiple R-squared:  0.005652,	Adjusted R-squared:  0.005603 
F-statistic: 113.7 on 1 and 19998 DF,  p-value: < 2.2e-16


In [11]:
# Sales as the outcome (all campaigns).
# The visit-level log is collapsed to one row per user, with `sales` equal to
# the user's total over all visits, then regressed on campaign type.
# "control" is the reference level of campaign_type, so the intercept is the
# average sales per user without any ad and each coefficient is that
# campaign's difference from control.
# Users are keyed on (user_id, campaign_type) rather than user_id alone, so an
# id that appeared under two campaigns would not be silently merged into one
# row; summarise() also returns an ungrouped table directly.
model <- lm(
  sales ~ campaign_type,
  data = df %>%
    group_by(user_id, campaign_type) %>%
    summarise(sales = sum(sales), .groups = "drop")
)
summary(model)


Call:
lm(formula = sales ~ campaign_type, data = df %>% group_by(user_id) %>% 
    mutate(sales = sum(sales)) %>% distinct(user_id, .keep_all = T))

Residuals:
   Min     1Q Median     3Q    Max 
-10.58  -8.36  -6.39  -3.86  96.14 

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)    
(Intercept)                  3.8700     0.2471  15.659  < 2e-16 ***
campaign_typebrandformance   2.5200     0.3495   7.210 5.66e-13 ***
campaign_typebranding       -0.0100     0.3495  -0.029    0.977    
campaign_typefull_funnel     6.7100     0.3495  19.199  < 2e-16 ***
campaign_typeperformance     4.4900     0.3495  12.847  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 24.71 on 49995 degrees of freedom
Multiple R-squared:  0.01099,	Adjusted R-squared:  0.01091 
F-statistic: 138.9 on 4 and 49995 DF,  p-value: < 2.2e-16


In [12]:
# Sales as the outcome (directly comparing any two campaigns; here
# brandformance vs. full_funnel).
# Only the two campaigns of interest are kept, then the visit-level rows are
# collapsed to one row per user with `sales` summed over visits.
# Users are keyed on (user_id, campaign_type) rather than user_id alone, so an
# id that appeared under two campaigns would not be silently merged into one
# row; summarise() also returns an ungrouped table directly.
model <- lm(
  sales ~ campaign_type,
  data = df %>%
    filter(campaign_type %in% c("brandformance", "full_funnel")) %>%
    group_by(user_id, campaign_type) %>%
    summarise(sales = sum(sales), .groups = "drop")
)
summary(model)


Call:
lm(formula = sales ~ campaign_type, data = df %>% group_by(user_id) %>% 
    mutate(sales = sum(sales)) %>% distinct(user_id, .keep_all = T) %>% 
    filter(campaign_type %in% c("brandformance", "full_funnel")))

Residuals:
   Min     1Q Median     3Q    Max 
-10.58 -10.58  -6.39  -6.39  93.61 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)                6.3900     0.2779   23.00   <2e-16 ***
campaign_typefull_funnel   4.1900     0.3930   10.66   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 27.79 on 19998 degrees of freedom
Multiple R-squared:  0.005652,	Adjusted R-squared:  0.005603 
F-statistic: 113.7 on 1 and 19998 DF,  p-value: < 2.2e-16


#### 2. effect of ad type on funnel progression

In [13]:
# Progression indicator per visit: 1 when the next funnel stage differs from
# the current one, 0 when it is unchanged.
df$stage_progress <- as.numeric(df$current_funnel_stage != df$next_funnel_stage)

In [14]:
# Among visits that start in the awareness stage, regress the progression
# indicator on ad type ("none" is the reference level).
model <- lm(
  stage_progress ~ ad_type,
  data = df %>% filter(current_funnel_stage == "awareness")
)
summary(model)


Call:
lm(formula = stage_progress ~ ad_type, data = filter(df, current_funnel_stage == 
    "awareness"))

Residuals:
     Min       1Q   Median       3Q      Max 
-0.49890 -0.09881 -0.09847  0.50110  0.90153 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)        0.098466   0.002732  36.039   <2e-16 ***
ad_typebranding    0.400434   0.003516 113.896   <2e-16 ***
ad_typeperformance 0.000340   0.003734   0.091    0.927    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3946 on 76700 degrees of freedom
Multiple R-squared:  0.1998,	Adjusted R-squared:  0.1998 
F-statistic:  9577 on 2 and 76700 DF,  p-value: < 2.2e-16


In [15]:
# Among visits that start in the consideration stage, regress the progression
# indicator on ad type ("none" is the reference level).
model <- lm(
  stage_progress ~ ad_type,
  data = df %>% filter(current_funnel_stage == "consideration")
)
summary(model)


Call:
lm(formula = stage_progress ~ ad_type, data = filter(df, current_funnel_stage == 
    "consideration"))

Residuals:
     Min       1Q   Median       3Q      Max 
-0.10049 -0.10049 -0.01118 -0.00959  0.99041 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)         0.011183   0.001857   6.024 1.71e-09 ***
ad_typebranding    -0.001596   0.002231  -0.716    0.474    
ad_typeperformance  0.089306   0.002114  42.235  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2255 on 97682 degrees of freedom
Multiple R-squared:  0.03863,	Adjusted R-squared:  0.03861 
F-statistic:  1962 on 2 and 97682 DF,  p-value: < 2.2e-16


#### 3. effect of ad type on purchase probability and sales

In [16]:
# Purchase probability as the outcome: visit-level regression of `purchase` on
# ad type, restricted to visits in the purchase stage.
model <- lm(
  purchase ~ ad_type,
  data = df %>% filter(current_funnel_stage == "purchase")
)
summary(model)


Call:
lm(formula = purchase ~ ad_type, data = filter(df, current_funnel_stage == 
    "purchase"))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1968 -0.1968 -0.1038 -0.1025  0.8975 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)         0.103781   0.005892  17.613   <2e-16 ***
ad_typebranding    -0.001245   0.007627  -0.163     0.87    
ad_typeperformance  0.093073   0.006749  13.790   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.3598 on 21199 degrees of freedom
Multiple R-squared:  0.01645,	Adjusted R-squared:  0.01635 
F-statistic: 177.3 on 2 and 21199 DF,  p-value: < 2.2e-16


In [17]:
# Sales as the outcome: visit-level regression of `sales` on ad type,
# restricted to visits in the purchase stage.
model <- lm(
  sales ~ ad_type,
  data = df %>% filter(current_funnel_stage == "purchase")
)
summary(model)


Call:
lm(formula = sales ~ ad_type, data = filter(df, current_funnel_stage == 
    "purchase"))

Residuals:
   Min     1Q Median     3Q    Max 
-19.68 -19.68 -10.38 -10.25  89.75 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)         10.3781     0.5892  17.613   <2e-16 ***
ad_typebranding     -0.1245     0.7627  -0.163     0.87    
ad_typeperformance   9.3073     0.6749  13.790   <2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 35.98 on 21199 degrees of freedom
Multiple R-squared:  0.01645,	Adjusted R-squared:  0.01635 
F-statistic: 177.3 on 2 and 21199 DF,  p-value: < 2.2e-16


#### 4. ROI

In [18]:
# Pricing assumptions used by the cost and ROI cells; both can be changed.
cpa <- 10  # performance ads: cost per action (purchase)
cpm <- 30  # branding ads: cost per thousand impressions

In [19]:
# Cost of the branding campaign.
# Branding ads are billed per impression (cpm is the price of 1000), so only
# visits on which a branding ad was actually shown are counted. This is the
# same rule the brandformance and full-funnel cost cells use; counting every
# row of the campaign would overstate the cost if any visit had no branding ad.
# This campaign buys no performance ads, so that component is zero.
branding_ad_cost <- cpm / 1000 * nrow(
  df %>% filter(campaign_type == "branding", ad_type == "branding")
)
performance_ad_cost <- 0
cost_branding <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_branding))

[1] "branding ad cost: 1181.82"
[1] "performance ad cost: 0"
[1] "total cost: 1181.82"


In [20]:
# Cost of the performance campaign.
# No branding impressions are bought; every purchase recorded in this
# campaign is billed at cpa.
branding_ad_cost <- 0
performance_ad_cost <- cpa * sum(
  df %>% filter(campaign_type == "performance") %>% pull(purchase)
)
cost_performance <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_performance))

[1] "branding ad cost: 0"
[1] "performance ad cost: 8360"
[1] "total cost: 8360"


In [21]:
# Cost of the brandformance campaign.
# Branding component: visits in this campaign that showed a branding ad,
# priced per thousand (cpm). Performance component: every purchase recorded
# in this campaign, billed at cpa.
branding_ad_cost <- cpm / 1000 * nrow(
  df %>% filter(campaign_type == "brandformance", ad_type == "branding")
)
performance_ad_cost <- cpa * sum(
  df %>% filter(campaign_type == "brandformance") %>% pull(purchase)
)
cost_brandformance <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_brandformance))

[1] "branding ad cost: 597.27"
[1] "performance ad cost: 6390"
[1] "total cost: 6987.27"


In [22]:
# Cost of the full-funnel campaign.
# Branding component: visits in this campaign that showed a branding ad,
# priced per thousand (cpm). Performance component: every purchase recorded
# in this campaign, billed at cpa.
branding_ad_cost <- cpm / 1000 * nrow(
  df %>% filter(campaign_type == "full_funnel", ad_type == "branding")
)
performance_ad_cost <- cpa * sum(
  df %>% filter(campaign_type == "full_funnel") %>% pull(purchase)
)
cost_full_funnel <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_full_funnel))

[1] "branding ad cost: 338.91"
[1] "performance ad cost: 10580"
[1] "total cost: 10918.91"


In [23]:
# ROI of the branding campaign: (incremental sales - cost) / cost, where
# incremental sales is total branding-campaign sales minus total control sales.
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_branding <- (
  (sum(pull(filter(df, campaign_type == "branding"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_branding
) / cost_branding
print(paste("branding campaign ROI:", roi_branding))

[1] "branding campaign ROI: -1.08461525443807"


In [24]:
# ROI of the performance campaign: (incremental sales - cost) / cost, where
# incremental sales is total performance-campaign sales minus total control
# sales.
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_performance <- (
  (sum(pull(filter(df, campaign_type == "performance"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_performance
) / cost_performance
print(paste("performance campaign ROI:", roi_performance))

[1] "performance campaign ROI: 4.37081339712919"


In [25]:
# ROI of the brandformance campaign: (incremental sales - cost) / cost, where
# incremental sales is total brandformance-campaign sales minus total control
# sales.
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_brandformance <- (
  (sum(pull(filter(df, campaign_type == "brandformance"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_brandformance
) / cost_brandformance
print(paste("brandformance campaign ROI:", roi_brandformance))

[1] "brandformance campaign ROI: 2.60655878476143"


In [26]:
# ROI of the full-funnel campaign: (incremental sales - cost) / cost, where
# incremental sales is total full-funnel-campaign sales minus total control
# sales.
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_full_funnel <- (
  (sum(pull(filter(df, campaign_type == "full_funnel"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_full_funnel
) / cost_full_funnel
print(paste("full-funnel campaign ROI:", roi_full_funnel))

[1] "full-funnel campaign ROI: 5.14530204938039"


#### 5. adding a full_funnel group where funnel stages are predicted instead of known

##### high prediction accuracy (90%)

In [27]:
# Load the high-accuracy predicted-stage data.
df_predicted <- read.csv(file = "data_predicted_high.csv")

In [28]:
# Progression indicator per visit: 1 when the next funnel stage differs from
# the current one, 0 when it is unchanged.
df_predicted$stage_progress <- as.numeric(
  df_predicted$current_funnel_stage != df_predicted$next_funnel_stage
)

In [29]:
# Stack the predicted-stage rows on top of the original campaigns and re-sort.
# The original data has no predicted-stage columns, so they are filled with
# the actual stages to give both tables the same columns before row-binding.
df_predicted <- df %>%
  mutate(
    current_funnel_stage_predicted = current_funnel_stage,
    next_funnel_stage_predicted = next_funnel_stage
  ) %>%
  rbind(df_predicted, .) %>%
  arrange(campaign_type, user_id, visit)

In [30]:
# Make campaign_type a factor whose reference (first) level is "control", so
# model coefficients are contrasts against the control group.
df_predicted$campaign_type <- relevel(
  as.factor(df_predicted$campaign_type),
  ref = "control"
)

In [31]:
# Make ad_type a factor whose reference (first) level is "none", so model
# coefficients are contrasts against showing no ad.
df_predicted$ad_type <- relevel(as.factor(df_predicted$ad_type), ref = "none")

In [32]:
# Purchase probability as the outcome (all campaigns, including the
# predicted-stage full-funnel group).
# The visit-level rows are collapsed to one row per user, with `purchase`
# summed over visits, then regressed on campaign type. "control" is the
# reference level, so the intercept is the average without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  purchase ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(purchase = sum(purchase), .groups = "drop")
)
summary(model)


Call:
lm(formula = purchase ~ campaign_type, data = df_predicted %>% 
    group_by(user_id) %>% mutate(purchase = sum(purchase)) %>% 
    distinct(user_id, .keep_all = T))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1058 -0.1026 -0.0639 -0.0387  0.9614 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         0.038700   0.002574  15.036  < 2e-16 ***
campaign_typebrandformance          0.025200   0.003640   6.923 4.46e-12 ***
campaign_typebranding              -0.000100   0.003640  -0.027    0.978    
campaign_typefull_funnel            0.067100   0.003640  18.435  < 2e-16 ***
campaign_typefull_funnel_predicted  0.063900   0.003640  17.556  < 2e-16 ***
campaign_typeperformance            0.044900   0.003640  12.336  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2574 on 59994 degrees of freedom
Multiple R-squared:  0.0112,	Adjusted R-squared:  0

In [33]:
# Sales as the outcome (all campaigns, including the predicted-stage
# full-funnel group).
# The visit-level rows are collapsed to one row per user, with `sales` summed
# over visits, then regressed on campaign type. "control" is the reference
# level, so the intercept is the average sales per user without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  sales ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(sales = sum(sales), .groups = "drop")
)
summary(model)


Call:
lm(formula = sales ~ campaign_type, data = df_predicted %>% group_by(user_id) %>% 
    mutate(sales = sum(sales)) %>% distinct(user_id, .keep_all = T))

Residuals:
   Min     1Q Median     3Q    Max 
-10.58 -10.26  -6.39  -3.87  96.14 

Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          3.8700     0.2574  15.036  < 2e-16 ***
campaign_typebrandformance           2.5200     0.3640   6.923 4.46e-12 ***
campaign_typebranding               -0.0100     0.3640  -0.027    0.978    
campaign_typefull_funnel             6.7100     0.3640  18.435  < 2e-16 ***
campaign_typefull_funnel_predicted   6.3900     0.3640  17.556  < 2e-16 ***
campaign_typeperformance             4.4900     0.3640  12.336  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 25.74 on 59994 degrees of freedom
Multiple R-squared:  0.0112,	Adjusted R-squared:  0.01112 
F-statistic:   136 on 5 

In [34]:
# Cost of the predicted full-funnel campaign.
# Branding component: visits in this campaign that showed a branding ad,
# priced per thousand (cpm). Performance component: every purchase recorded
# in this campaign, billed at cpa.
branding_ad_cost <- cpm / 1000 * nrow(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted", ad_type == "branding")
)
performance_ad_cost <- cpa * sum(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted") %>%
    pull(purchase)
)
cost_full_funnel_predicted <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_full_funnel_predicted))

[1] "branding ad cost: 365.67"
[1] "performance ad cost: 10260"
[1] "total cost: 10625.67"


In [35]:
# ROI of the predicted full-funnel campaign: (incremental sales - cost) / cost,
# where incremental sales is total predicted-full-funnel sales minus total
# control sales (control rows are read from the original df).
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_full_funnel_predicted <- (
  (sum(pull(filter(df_predicted, campaign_type == "full_funnel_predicted"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_full_funnel_predicted
) / cost_full_funnel_predicted
print(paste("predicted full-funnel campaign ROI:", roi_full_funnel_predicted))

[1] "predicted full-funnel campaign ROI: 5.01373842778855"


##### medium prediction accuracy (70%)

In [36]:
# Load the medium-accuracy predicted-stage data.
df_predicted <- read.csv(file = "data_predicted_medium.csv")

In [37]:
# Progression indicator per visit: 1 when the next funnel stage differs from
# the current one, 0 when it is unchanged.
df_predicted$stage_progress <- as.numeric(
  df_predicted$current_funnel_stage != df_predicted$next_funnel_stage
)

In [38]:
# Stack the predicted-stage rows on top of the original campaigns and re-sort.
# The original data has no predicted-stage columns, so they are filled with
# the actual stages to give both tables the same columns before row-binding.
df_predicted <- df %>%
  mutate(
    current_funnel_stage_predicted = current_funnel_stage,
    next_funnel_stage_predicted = next_funnel_stage
  ) %>%
  rbind(df_predicted, .) %>%
  arrange(campaign_type, user_id, visit)

In [39]:
# Make campaign_type a factor whose reference (first) level is "control", so
# model coefficients are contrasts against the control group.
df_predicted$campaign_type <- relevel(
  as.factor(df_predicted$campaign_type),
  ref = "control"
)

In [40]:
# Make ad_type a factor whose reference (first) level is "none", so model
# coefficients are contrasts against showing no ad.
df_predicted$ad_type <- relevel(as.factor(df_predicted$ad_type), ref = "none")

In [41]:
# Purchase probability as the outcome (all campaigns, including the
# predicted-stage full-funnel group).
# The visit-level rows are collapsed to one row per user, with `purchase`
# summed over visits, then regressed on campaign type. "control" is the
# reference level, so the intercept is the average without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  purchase ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(purchase = sum(purchase), .groups = "drop")
)
summary(model)


Call:
lm(formula = purchase ~ campaign_type, data = df_predicted %>% 
    group_by(user_id) %>% mutate(purchase = sum(purchase)) %>% 
    distinct(user_id, .keep_all = T))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1058 -0.0896 -0.0639 -0.0387  0.9614 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         0.038700   0.002540  15.239  < 2e-16 ***
campaign_typebrandformance          0.025200   0.003592   7.017  2.3e-12 ***
campaign_typebranding              -0.000100   0.003592  -0.028    0.978    
campaign_typefull_funnel            0.067100   0.003592  18.683  < 2e-16 ***
campaign_typefull_funnel_predicted  0.050900   0.003592  14.173  < 2e-16 ***
campaign_typeperformance            0.044900   0.003592  12.502  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.254 on 59994 degrees of freedom
Multiple R-squared:  0.009862,	Adjusted R-squared:  

In [42]:
# Sales as the outcome (all campaigns, including the predicted-stage
# full-funnel group).
# The visit-level rows are collapsed to one row per user, with `sales` summed
# over visits, then regressed on campaign type. "control" is the reference
# level, so the intercept is the average sales per user without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  sales ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(sales = sum(sales), .groups = "drop")
)
summary(model)


Call:
lm(formula = sales ~ campaign_type, data = df_predicted %>% group_by(user_id) %>% 
    mutate(sales = sum(sales)) %>% distinct(user_id, .keep_all = T))

Residuals:
   Min     1Q Median     3Q    Max 
-10.58  -8.96  -6.39  -3.87  96.14 

Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          3.8700     0.2540  15.239  < 2e-16 ***
campaign_typebrandformance           2.5200     0.3591   7.017  2.3e-12 ***
campaign_typebranding               -0.0100     0.3591  -0.028    0.978    
campaign_typefull_funnel             6.7100     0.3591  18.683  < 2e-16 ***
campaign_typefull_funnel_predicted   5.0900     0.3591  14.173  < 2e-16 ***
campaign_typeperformance             4.4900     0.3591  12.502  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 25.4 on 59994 degrees of freedom
Multiple R-squared:  0.009862,	Adjusted R-squared:  0.009779 
F-statistic: 119.5 on 

In [43]:
# Cost of the predicted full-funnel campaign.
# Branding component: visits in this campaign that showed a branding ad,
# priced per thousand (cpm). Performance component: every purchase recorded
# in this campaign, billed at cpa.
branding_ad_cost <- cpm / 1000 * nrow(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted", ad_type == "branding")
)
performance_ad_cost <- cpa * sum(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted") %>%
    pull(purchase)
)
cost_full_funnel_predicted <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_full_funnel_predicted))

[1] "branding ad cost: 393.99"
[1] "performance ad cost: 8960"
[1] "total cost: 9353.99"


In [44]:
# ROI of the predicted full-funnel campaign: (incremental sales - cost) / cost,
# where incremental sales is total predicted-full-funnel sales minus total
# control sales (control rows are read from the original df).
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_full_funnel_predicted <- (
  (sum(pull(filter(df_predicted, campaign_type == "full_funnel_predicted"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_full_funnel_predicted
) / cost_full_funnel_predicted
print(paste("predicted full-funnel campaign ROI:", roi_full_funnel_predicted))

[1] "predicted full-funnel campaign ROI: 4.44152816071003"


##### low prediction accuracy (50%)

In [45]:
# Load the low-accuracy predicted-stage data.
df_predicted <- read.csv(file = "data_predicted_low.csv")

In [46]:
# Progression indicator per visit: 1 when the next funnel stage differs from
# the current one, 0 when it is unchanged.
df_predicted$stage_progress <- as.numeric(
  df_predicted$current_funnel_stage != df_predicted$next_funnel_stage
)

In [47]:
# Stack the predicted-stage rows on top of the original campaigns and re-sort.
# The original data has no predicted-stage columns, so they are filled with
# the actual stages to give both tables the same columns before row-binding.
df_predicted <- df %>%
  mutate(
    current_funnel_stage_predicted = current_funnel_stage,
    next_funnel_stage_predicted = next_funnel_stage
  ) %>%
  rbind(df_predicted, .) %>%
  arrange(campaign_type, user_id, visit)

In [48]:
# Make campaign_type a factor whose reference (first) level is "control", so
# model coefficients are contrasts against the control group.
df_predicted$campaign_type <- relevel(
  as.factor(df_predicted$campaign_type),
  ref = "control"
)

In [49]:
# Make ad_type a factor whose reference (first) level is "none", so model
# coefficients are contrasts against showing no ad.
df_predicted$ad_type <- relevel(as.factor(df_predicted$ad_type), ref = "none")

In [50]:
# Purchase probability as the outcome (all campaigns, including the
# predicted-stage full-funnel group).
# The visit-level rows are collapsed to one row per user, with `purchase`
# summed over visits, then regressed on campaign type. "control" is the
# reference level, so the intercept is the average without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  purchase ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(purchase = sum(purchase), .groups = "drop")
)
summary(model)


Call:
lm(formula = purchase ~ campaign_type, data = df_predicted %>% 
    group_by(user_id) %>% mutate(purchase = sum(purchase)) %>% 
    distinct(user_id, .keep_all = T))

Residuals:
    Min      1Q  Median      3Q     Max 
-0.1058 -0.0837 -0.0639 -0.0387  0.9614 

Coefficients:
                                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)                         0.038700   0.002524  15.336  < 2e-16 ***
campaign_typebrandformance          0.025200   0.003569   7.061 1.67e-12 ***
campaign_typebranding              -0.000100   0.003569  -0.028    0.978    
campaign_typefull_funnel            0.067100   0.003569  18.802  < 2e-16 ***
campaign_typefull_funnel_predicted  0.045000   0.003569  12.609  < 2e-16 ***
campaign_typeperformance            0.044900   0.003569  12.581  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2523 on 59994 degrees of freedom
Multiple R-squared:  0.009468,	Adjusted R-squared: 

In [51]:
# Sales as the outcome (all campaigns, including the predicted-stage
# full-funnel group).
# The visit-level rows are collapsed to one row per user, with `sales` summed
# over visits, then regressed on campaign type. "control" is the reference
# level, so the intercept is the average sales per user without any ad.
# Users are keyed on (user_id, campaign_type) rather than user_id alone: the
# rows come from two stacked files, and an id present in both would otherwise
# be silently merged into one row. summarise() also returns an ungrouped
# table directly.
model <- lm(
  sales ~ campaign_type,
  data = df_predicted %>%
    group_by(user_id, campaign_type) %>%
    summarise(sales = sum(sales), .groups = "drop")
)
summary(model)


Call:
lm(formula = sales ~ campaign_type, data = df_predicted %>% group_by(user_id) %>% 
    mutate(sales = sum(sales)) %>% distinct(user_id, .keep_all = T))

Residuals:
   Min     1Q Median     3Q    Max 
-10.58  -8.37  -6.39  -3.87  96.14 

Coefficients:
                                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)                          3.8700     0.2524  15.336  < 2e-16 ***
campaign_typebrandformance           2.5200     0.3569   7.061 1.67e-12 ***
campaign_typebranding               -0.0100     0.3569  -0.028    0.978    
campaign_typefull_funnel             6.7100     0.3569  18.802  < 2e-16 ***
campaign_typefull_funnel_predicted   4.5000     0.3569  12.609  < 2e-16 ***
campaign_typeperformance             4.4900     0.3569  12.581  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 25.23 on 59994 degrees of freedom
Multiple R-squared:  0.009468,	Adjusted R-squared:  0.009385 
F-statistic: 114.7 on

In [52]:
# Cost of the predicted full-funnel campaign.
# Branding component: visits in this campaign that showed a branding ad,
# priced per thousand (cpm). Performance component: every purchase recorded
# in this campaign, billed at cpa.
branding_ad_cost <- cpm / 1000 * nrow(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted", ad_type == "branding")
)
performance_ad_cost <- cpa * sum(
  df_predicted %>%
    filter(campaign_type == "full_funnel_predicted") %>%
    pull(purchase)
)
cost_full_funnel_predicted <- branding_ad_cost + performance_ad_cost

print(paste("branding ad cost:", branding_ad_cost))
print(paste("performance ad cost:", performance_ad_cost))
print(paste("total cost:", cost_full_funnel_predicted))

[1] "branding ad cost: 404.07"
[1] "performance ad cost: 8370"
[1] "total cost: 8774.07"


In [53]:
# ROI of the predicted full-funnel campaign: (incremental sales - cost) / cost,
# where incremental sales is total predicted-full-funnel sales minus total
# control sales (control rows are read from the original df).
# NOTE(review): comparing raw totals presumes both groups have the same
# number of users -- confirm against the simulation.
roi_full_funnel_predicted <- (
  (sum(pull(filter(df_predicted, campaign_type == "full_funnel_predicted"), sales)) -
     sum(pull(filter(df, campaign_type == "control"), sales))) -
    cost_full_funnel_predicted
) / cost_full_funnel_predicted
print(paste("predicted full-funnel campaign ROI:", roi_full_funnel_predicted))

[1] "predicted full-funnel campaign ROI: 4.12874868789513"
