### this notebook (simulation.ipynb) contains Python code that simulates an experiment comparing the performance of different ad campaigns
### the simulated data is then used for analysis in R (analysis.ipynb)
### instructors should have access to both the simulation and analysis notebooks and can change the parameters of the simulation
### students should have access only to the simulated data, which they use to perform the analysis
### the analysis notebook can be shared with students afterwards as the solution

#### setup:
#### there are 3 funnel stages: awareness, consideration, and purchase
#### there are 2 ad types: branding and performance
#### each user has up to 4 visits and is shown an ad in each visit (except in the control group, where no ad is shown)
#### users can progress in the funnel between visits and make a purchase only when they are in the purchase stage
#### when a user makes a purchase they stop visiting

#### the function simulates a randomized experiment with five conditions as below:
#### 1. control group where no ad is shown
#### 2. branding group where users only see branding ad regardless of their funnel stages
#### 3. performance group where users only see performance ad regardless of their funnel stages
#### 4. brandformance group where users see branding ad in the first (up to) two visits and performance ad in the next (up to) two visits
#### 5. full-funnel group where users see branding ad when they are in awareness stage and performance ad when they are in consideration and purchase stage

In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from collections import Counter

In [2]:
# logic for funnel transition:
# without any ad, users progress in the funnel with some baseline probability
# a branding ad moves users from the awareness stage to the consideration stage with a higher probability
# a performance ad moves users from the consideration stage to the purchase stage with a higher probability
# the transition probabilities can be changed
# a branding ad has no effect beyond the baseline for users in the consideration stage, and a performance ad has no effect beyond the baseline for users in the awareness stage
def next_funnel_stage(current_stage, ad_type):
    """Sample the funnel stage a user is in after one visit.

    A user either stays in ``current_stage`` or advances exactly one stage
    (awareness -> consideration -> purchase).  The chance of advancing is a
    baseline probability that is raised only by the ad matching the stage:
    branding for awareness, performance for consideration.

    Args:
        current_stage: "awareness" or "consideration"; any other value is
            treated as the final stage.
        ad_type: "branding" or "performance"; any other value (e.g. "none")
            gets the baseline probability.

    Returns:
        The name of the stage after this visit.  For a stage other than
        awareness/consideration this is always "purchase" and no random
        number is drawn.
    """
    if current_stage == "awareness":
        advanced_stage = "consideration"
        # only branding lifts awareness users above the 0.1 baseline
        advance_probability = 0.5 if ad_type == "branding" else 0.1
    elif current_stage == "consideration":
        advanced_stage = "purchase"
        # only performance lifts consideration users above the 0.01 baseline
        advance_probability = 0.1 if ad_type == "performance" else 0.01
    else:
        # purchase is the last stage: users stay there
        return "purchase"

    # exactly one uniform draw decides whether the user advances
    if random.random() < advance_probability:
        return advanced_stage
    return current_stage

# logic for purchase decision
# users can purchase with some probability only when they are in purchase stage
# performance ad increases that probability, branding ad does not 
def purchase_decision(current_stage, ad_type, price):
    """Sample whether a user buys during the current visit.

    Only users in the "purchase" stage can buy.  Their purchase probability
    is 0.1, raised to 0.2 when the ad shown is a performance ad.

    Args:
        current_stage: the user's funnel stage during this visit.
        ad_type: the ad shown during this visit.
        price: the sales amount recorded when a purchase happens.

    Returns:
        ``(1, price)`` if the user buys, otherwise ``(0, 0)``.  No random
        number is drawn for users outside the purchase stage.
    """
    # users outside the purchase stage never buy
    if current_stage != "purchase":
        return (0, 0)

    # branding and no ad share the same baseline; only performance doubles it
    purchase_probability = 0.2 if ad_type == "performance" else 0.1
    bought = random.random() < purchase_probability
    if bought:
        return (1, price)
    return (0, 0)

In [3]:
# the five experimental conditions that simulate() knows how to generate
_CAMPAIGN_TYPES = ("control", "branding", "performance", "brandformance", "full_funnel")

# each user visits at most this many times
_MAX_VISITS = 4

# column order of the table returned by simulate(); also used for empty results
_SIMULATE_COLUMNS = [
    "user_id",
    "current_funnel_stage",
    "next_funnel_stage",
    "ad_type",
    "purchase",
    "sales",
    "date",
    "campaign_type",
    "visit",
]


def _ad_for_visit(campaign_type, visit, current_stage):
    """Return the ad ("none", "branding" or "performance") shown at one visit."""
    if campaign_type == "control":
        return "none"
    if campaign_type in ("branding", "performance"):
        # single-ad campaigns show the same ad regardless of funnel stage
        return campaign_type
    if campaign_type == "brandformance":
        # branding in visits 1-2, performance in visits 3-4, regardless of stage
        return "branding" if visit <= 2 else "performance"
    # full_funnel: target on the user's true funnel stage
    return "branding" if current_stage == "awareness" else "performance"


def simulate(campaign_type,
             initial_user_id,
             num_users,  # number of users in each group
             initial_weights=(0.6, 0.3, 0.1),  # the distribution of initial funnel stages
             price=100,
             start_date=None):
    """Simulate the visits of one experimental group.

    Every user starts in a randomly drawn funnel stage on a randomly drawn
    date and then makes up to 4 visits.  In each visit the user is shown the
    ad dictated by ``campaign_type``, may move to the next funnel stage
    (``next_funnel_stage``) and may purchase (``purchase_decision``); both
    helpers are defined elsewhere in this notebook.  A user who purchases
    stops visiting.

    Args:
        campaign_type: one of "control", "branding", "performance",
            "brandformance" or "full_funnel".
        initial_user_id: id of the first user; users get consecutive ids.
        num_users: number of users in the group.
        initial_weights: weights of the initial funnel stages, in the order
            awareness, consideration, purchase.
        price: sales amount recorded for each purchase.
        start_date: earliest possible date of a user's starting point.
            Defaults to 30 days before today; pass a fixed date to make the
            ``date`` column reproducible across days.

    Returns:
        A DataFrame with one row per user visit and the columns listed in
        ``_SIMULATE_COLUMNS`` (also present when there are no rows).

    Raises:
        ValueError: if ``campaign_type`` is not a known campaign.  Returning
            nothing here would let ``pd.concat`` silently drop the group.
    """
    if campaign_type not in _CAMPAIGN_TYPES:
        raise ValueError(
            f"unknown campaign_type {campaign_type!r}; expected one of {_CAMPAIGN_TYPES}"
        )

    funnel_stages = ["awareness", "consideration", "purchase"]
    if start_date is None:
        start_date = datetime.now().date() - timedelta(days=30)

    # starting point of every user: initial funnel stage and a date within 30 days of start_date
    active_users = []
    for user_id in range(initial_user_id, initial_user_id + num_users):
        initial_stage = random.choices(funnel_stages, weights=initial_weights)[0]
        initial_date = start_date + timedelta(days=random.randint(0, 30))
        active_users.append({
            "user_id": user_id,
            "next_funnel_stage": initial_stage,
            "date": initial_date,
        })

    all_visits = []
    for visit in range(1, _MAX_VISITS + 1):
        still_active = []
        for user in active_users:
            # the stage reached after the previous visit is the stage during this one
            current_stage = user["next_funnel_stage"]
            ad_type = _ad_for_visit(campaign_type, visit, current_stage)

            # the random draws are made in a fixed order (transition, purchase, date)
            # so that a given random.seed() always produces the same outcomes
            new_stage = next_funnel_stage(current_stage, ad_type)
            new_purchase, new_sales = purchase_decision(current_stage, ad_type, price)
            # visits of the same user are 1 to 7 days apart
            new_date = user["date"] + timedelta(days=random.randint(1, 7))

            visit_record = {
                "user_id": user["user_id"],
                "current_funnel_stage": current_stage,
                "next_funnel_stage": new_stage,
                "ad_type": ad_type,
                "purchase": new_purchase,
                "sales": new_sales,
                "date": new_date,
                "campaign_type": campaign_type,
                "visit": visit,
            }
            all_visits.append(visit_record)

            # continue only if no purchase was made
            if new_purchase == 0:
                still_active.append(visit_record)
        active_users = still_active

    return pd.DataFrame(all_visits, columns=_SIMULATE_COLUMNS)

In [4]:
# fix the seed of Python's random module, which supplies every random draw in simulate()
random.seed(10)

# number of users in each group
n = 10000

# the five experimental conditions, in the order in which they are simulated;
# the i-th group (counting from 0) starts at user id 1 + i*n, so ids never overlap across groups
campaign_types = ['control', 'branding', 'performance', 'brandformance', 'full_funnel']

df_control, df_brand, df_performance, df_brandformance, df_full_funnel = [
    simulate(campaign_type = campaign, initial_user_id = 1 + group*n, num_users = n)
    for group, campaign in enumerate(campaign_types)
]

In [5]:
# stack the five groups into one table and renumber the rows from 0
group_frames = [df_control, df_brand, df_performance, df_brandformance, df_full_funnel]
df = pd.concat(group_frames, ignore_index=True)

In [6]:
# display a preview of the combined data (one row per user visit)
df

Unnamed: 0,user_id,current_funnel_stage,next_funnel_stage,ad_type,purchase,sales,date,campaign_type,visit
0,1,awareness,awareness,none,0,0,2025-05-01,control,1
1,2,awareness,awareness,none,0,0,2025-04-14,control,1
2,3,awareness,awareness,none,0,0,2025-05-11,control,1
3,4,awareness,awareness,none,0,0,2025-04-23,control,1
4,5,consideration,consideration,none,0,0,2025-04-24,control,1
...,...,...,...,...,...,...,...,...,...
195585,49995,consideration,consideration,performance,0,0,2025-05-17,full_funnel,4
195586,49996,purchase,purchase,performance,1,100,2025-05-17,full_funnel,4
195587,49997,awareness,consideration,branding,0,0,2025-05-04,full_funnel,4
195588,49998,consideration,consideration,performance,0,0,2025-05-03,full_funnel,4


In [7]:
# save the combined data of the five groups; index = False leaves the row index out of the file
df.to_csv('data.csv', index = False)

#### now suppose we do not know the true funnel stage and have to target ads in the full-funnel group using a predicted funnel stage
#### the predicted funnel stage equals the true funnel stage with some probability (the prediction accuracy); otherwise it is sampled uniformly at random from the other two stages
#### funnel progression and purchase decisions are based on the true funnel stage, while the ad type is targeted based on the predicted funnel stage
#### in the analysis, this is added as condition 6: a predicted full-funnel group where users see a branding ad when they are in the predicted awareness stage and a performance ad when they are in the predicted consideration or purchase stage

In [8]:
# each user in the predicted full-funnel group visits at most this many times
_PREDICTED_MAX_VISITS = 4

# column order of the table returned by simulate_predicted(); also used for empty results
_PREDICTED_COLUMNS = [
    "user_id",
    "current_funnel_stage",
    "current_funnel_stage_predicted",
    "next_funnel_stage",
    "next_funnel_stage_predicted",
    "ad_type",
    "purchase",
    "sales",
    "date",
    "campaign_type",
    "visit",
]


def _predict_stage(true_stage, funnel_stages, accuracy):
    """Return true_stage with probability accuracy, else one of the other stages at random."""
    if random.random() < accuracy:
        return true_stage
    return random.choice([stage for stage in funnel_stages if stage != true_stage])


def simulate_predicted(initial_user_id,
                       num_users,  # number of users in each group
                       initial_weights=(0.6, 0.3, 0.1),  # the distribution of initial funnel stages
                       accuracy=0.9,  # accuracy of funnel stage prediction
                       price=100,
                       start_date=None):
    """Simulate a full-funnel group that is targeted on a predicted funnel stage.

    Users see a branding ad when their *predicted* stage is awareness and a
    performance ad otherwise, while funnel progression
    (``next_funnel_stage``) and purchases (``purchase_decision``) follow the
    *true* stage; both helpers are defined elsewhere in this notebook.  Each
    user makes up to 4 visits and stops visiting after a purchase.

    Args:
        initial_user_id: id of the first user; users get consecutive ids.
        num_users: number of users in the group.
        initial_weights: weights of the initial funnel stages, in the order
            awareness, consideration, purchase.
        accuracy: probability (between 0 and 1) that a predicted stage equals
            the true stage.
        price: sales amount recorded for each purchase.
        start_date: earliest possible date of a user's starting point.
            Defaults to 30 days before today; pass a fixed date to make the
            ``date`` column reproducible across days.

    Returns:
        A DataFrame with one row per user visit and the columns listed in
        ``_PREDICTED_COLUMNS`` (also present when there are no rows).  The
        ``campaign_type`` column is always "full_funnel_predicted".

    Raises:
        ValueError: if ``accuracy`` is not a probability in [0, 1].
    """
    if not 0 <= accuracy <= 1:
        raise ValueError(f"accuracy must be between 0 and 1, got {accuracy!r}")

    funnel_stages = ["awareness", "consideration", "purchase"]
    if start_date is None:
        start_date = datetime.now().date() - timedelta(days=30)

    # starting point of every user: initial funnel stage and a date within 30 days of start_date
    active_users = []
    for user_id in range(initial_user_id, initial_user_id + num_users):
        initial_stage = random.choices(funnel_stages, weights=initial_weights)[0]
        initial_date = start_date + timedelta(days=random.randint(0, 30))
        active_users.append({
            "user_id": user_id,
            "next_funnel_stage": initial_stage,
            "date": initial_date,
        })

    all_visits = []
    for visit in range(1, _PREDICTED_MAX_VISITS + 1):
        still_active = []
        for user in active_users:
            # the true stage reached after the previous visit is the stage during this one
            current_stage = user["next_funnel_stage"]

            # the random draws below are made in a fixed order (prediction, transition,
            # next prediction, purchase, date) so that seeded runs are reproducible
            current_stage_predicted = _predict_stage(current_stage, funnel_stages, accuracy)
            # the ad is targeted with the predicted stage ...
            ad_type = "branding" if current_stage_predicted == "awareness" else "performance"
            # ... but the funnel transition and the purchase depend on the true stage
            new_stage = next_funnel_stage(current_stage, ad_type)
            # NOTE(review): this prediction is only recorded; the next visit draws a fresh
            # prediction of the same true stage, so the two columns can disagree
            new_stage_predicted = _predict_stage(new_stage, funnel_stages, accuracy)
            new_purchase, new_sales = purchase_decision(current_stage, ad_type, price)
            # visits of the same user are 1 to 7 days apart
            new_date = user["date"] + timedelta(days=random.randint(1, 7))

            visit_record = {
                "user_id": user["user_id"],
                "current_funnel_stage": current_stage,
                "current_funnel_stage_predicted": current_stage_predicted,
                "next_funnel_stage": new_stage,
                "next_funnel_stage_predicted": new_stage_predicted,
                "ad_type": ad_type,
                "purchase": new_purchase,
                "sales": new_sales,
                "date": new_date,
                "campaign_type": "full_funnel_predicted",
                "visit": visit,
            }
            all_visits.append(visit_record)

            # continue only if no purchase was made
            if new_purchase == 0:
                still_active.append(visit_record)
        active_users = still_active

    return pd.DataFrame(all_visits, columns=_PREDICTED_COLUMNS)

In [9]:
# fix the seed of Python's random module before the three runs below
random.seed(10)

# number of users in each group
n = 10000

# all three runs use the same block of user ids, right after the five groups simulated earlier
# (presumably because each file is an alternative version of condition 6 - verify in the analysis)
first_user_id = 1 + 5*n

# simulate data with 90% (high), 70% (medium) and 50% (low) prediction accuracy, in that order
df_predicted_high, df_predicted_medium, df_predicted_low = [
    simulate_predicted(initial_user_id = first_user_id, num_users = n, accuracy = accuracy_level)
    for accuracy_level in (0.9, 0.7, 0.5)
]

In [10]:
# write each predicted full-funnel data set to its own file, without the row index
for predicted_frame, csv_path in (
    (df_predicted_high, 'data_predicted_high.csv'),
    (df_predicted_medium, 'data_predicted_medium.csv'),
    (df_predicted_low, 'data_predicted_low.csv'),
):
    predicted_frame.to_csv(csv_path, index = False)