In [None]:
# Install missing packages
%pip install pandas numpy

import pandas as pd
import numpy as np
import os
import random
import base64

from pathlib import Path

base_path = Path.cwd().parent

In [8]:
print(base_path)

c:\Users\Indugu Rao\myrepos\google-gemini-agents


In [9]:
# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

In [10]:
# Define the date range for event_date
date_range = pd.date_range(start="2025-01-01", end="2025-03-31", freq="D")

# Define possible values for categorical columns
campaign_products = ["Auto Insurance", "Homeowners", "Renters", "Credit Card", "Deposits", "Consumer Loans"]
campaign_cosas = {"Auto Insurance": "P&C", "Homeowners": "P&C", "Renters": "P&C",
                  "Credit Card": "Bank", "Deposits": "Bank", "Consumer Loans": "Bank"}
conversion_channels = ["Internet", "Mobile", "Offline"]

num_camps = 2 # number of campaigns for each product / cosa

campaign_names = []

for prod in campaign_products:
    for i in range(1, num_camps + 1):
        # Create campaign names for each product 
        campaign_names.append(f"{campaign_cosas[prod]}_{prod}_Campaign_{i}") # TODO: Create campaign names for each campaign product, 2 per

In [11]:
campaign_names

['P&C_Auto Insurance_Campaign_1',
 'P&C_Auto Insurance_Campaign_2',
 'P&C_Homeowners_Campaign_1',
 'P&C_Homeowners_Campaign_2',
 'P&C_Renters_Campaign_1',
 'P&C_Renters_Campaign_2',
 'Bank_Credit Card_Campaign_1',
 'Bank_Credit Card_Campaign_2',
 'Bank_Deposits_Campaign_1',
 'Bank_Deposits_Campaign_2',
 'Bank_Consumer Loans_Campaign_1',
 'Bank_Consumer Loans_Campaign_2']

In [12]:
#Tables
# campaign taxonomy
# channel peformance
# --- One table for each channel - Paid Search, Paid Display, Paid Social
#   - Columns - event_date, campaign_product, campaign_cosa, converted_product, converted_cosa, campaign_nm, conversion_channel_nm, spend_amt, 
#   -         - quote_start_qty, quote_complete_qty, app_start_qty, app_complete_qty, prod_acq_qty, impression_qty, click_qty, campaign_funding_source      

# Function to generate a single dataset
def generate_dataset():
    data = []
    for date in date_range:
        for campaign in campaign_names:
            for camp_prod in campaign_products:
                for conv_prod in campaign_products:
                    for chnl in conversion_channels:

                        # Randomly select campaign_product and ensure campaign_cosa matches
                        campaign_product = camp_prod #random.choice(campaign_products)
                        campaign_cosa = campaign_cosas[campaign_product]
                        campaign_funding_source = f"{campaign_cosa} {campaign_product}"
                        
                        # Randomly select converted_product and ensure converted_cosa matches
                        converted_product = conv_prod #random.choice(campaign_products)
                        converted_cosa = campaign_cosas[converted_product]
                        
                        # Generate random values for other columns
                        conversion_channel = chnl # random.choice(conversion_channels)

                        # TODO: Create a function that generates the conversion quantities based on the impression / click qty (???? -- Future work if want output to look reasonable...)
                        # TODO: Create a function that also changes conversion quantities based on the conversion channel and product - to weight some products 
                        #         towards different conversion channels.
                        # TODO: Create a function that generates the impressions/clicks based on spend 
                        
                        impression_qty = random.randint(1000, 20000)
                        click_qty = random.randint(2000, 40000)
                        spend_amt = round(random.uniform(1000, 10000), 2)
                        quote_start_qty = random.randint(10, 600)
                        quote_complete_qty = random.randint(8, min(580, quote_start_qty))
                        app_start_qty = random.randint(5, min(490, quote_complete_qty))
                        app_complete_qty = random.randint(4, min(400, app_start_qty))
                        prod_acq_qty = random.randint(0, min(250, app_complete_qty))
                    
                        # Append the row to the dataset
                        data.append([
                            date, campaign_product, campaign_cosa, converted_product, converted_cosa,
                            campaign, conversion_channel, spend_amt, quote_start_qty, quote_complete_qty,
                            app_start_qty, app_complete_qty, prod_acq_qty, impression_qty, click_qty,
                            campaign_funding_source
                        ])
        
    # Create a DataFrame
    columns = [
        "event_date", "campaign_product", "campaign_cosa", "converted_product", "converted_cosa",
        "campaign_nm", "conversion_channel_nm", "spend_amt", "quote_start_qty", "quote_complete_qty",
        "app_start_qty", "app_complete_qty", "prod_acq_qty", "impression_qty", "click_qty",
        "campaign_funding_source"
    ]
    return pd.DataFrame(data, columns=columns)



In [13]:
# Generate the three datasets
Paid_Display = generate_dataset()
Paid_Social = generate_dataset()
Paid_Search = generate_dataset()

# Example: Display the first few rows of one dataset
print(Paid_Display.head())

  event_date campaign_product campaign_cosa converted_product converted_cosa  \
0 2025-01-01   Auto Insurance           P&C    Auto Insurance            P&C   
1 2025-01-01   Auto Insurance           P&C    Auto Insurance            P&C   
2 2025-01-01   Auto Insurance           P&C    Auto Insurance            P&C   
3 2025-01-01   Auto Insurance           P&C        Homeowners            P&C   
4 2025-01-01   Auto Insurance           P&C        Homeowners            P&C   

                     campaign_nm conversion_channel_nm  spend_amt  \
0  P&C_Auto Insurance_Campaign_1              Internet    7673.95   
1  P&C_Auto Insurance_Campaign_1                Mobile    6314.43   
2  P&C_Auto Insurance_Campaign_1               Offline    6051.21   
3  P&C_Auto Insurance_Campaign_1              Internet    7829.27   
4  P&C_Auto Insurance_Campaign_1                Mobile    1919.89   

   quote_start_qty  quote_complete_qty  app_start_qty  app_complete_qty  \
0              260           

In [14]:
len(Paid_Display)

116640

In [19]:
Paid_Display.to_csv(str(base_path / "data/Paid_Display_toy_data.csv"), index=False)
Paid_Social.to_csv(str(base_path / "data/Paid_Social_toy_data.csv"), index=False)
Paid_Search.to_csv(str(base_path / "data/Paid_Search_toy_data.csv"), index=False)

In [20]:
#generate toy member data file / table

# Function to generate member data
def generate_member_data(num_members=1000):
    member_data = []
    for i in range(num_members):
        #Generate a random member number - Base64 hashed
        random_number = random.randint(10000000, 99999999)
        # Convert to Base64
        member_id = base64.b64encode(str(random_number).encode()).decode()
        age_grp = random.choice(["18-24", "25-34", "35-44", "45-54", "55-64", "65+"])
        marital_status = random.choice(["Single", "Married", "Divorced", "Widowed"])
        military_status = random.choice(["Active", "Separated","Retired","Spouse","Child", "None"])
        member_status = random.choice(["Active", "Inactive"])
        
        if member_status == "Inactive":
            active_pnc = False
            active_bank = False
            active_life = False
        else: 
            active_pnc = random.choice([True, False])
            active_bank = random.choice([True, False])
            active_life = random.choice([True, False])
        if active_pnc:
            active_auto_insurance = random.choice([True, False])
            active_homeowners = random.choice([True, False])
            if active_homeowners:
                active_renters = False
            else:
                active_renters = random.choice([True, False])
        else:
            active_auto_insurance = False
            active_homeowners = False
            active_renters = False
        if active_bank:
            active_credit_card = random.choice([True, False])
            active_deposits = random.choice([True, False])
        else:
            active_credit_card = False
            active_deposits = False
        if active_life:
            active_life_insurance = random.choice([True, False])
        else:
            active_life_insurance = False

        member_data.append([member_id, age_grp, marital_status, military_status, member_status,
                            active_pnc, active_bank, active_life, active_credit_card,
                            active_deposits, active_auto_insurance, active_homeowners,
                            active_renters, active_life_insurance])
    
    columns = ["member_id", "age_grp", "marital_status", "military_status", "member_status",
               "active_pnc", "active_bank", "active_life", "active_credit_card",
               "active_deposits", "active_auto_insurance", "active_homeowners",
               "active_renters", "active_life_insurance"]
    return pd.DataFrame(member_data, columns=columns)

In [21]:
mbr_data = generate_member_data(100)

In [22]:
mbr_data.head()

Unnamed: 0,member_id,age_grp,marital_status,military_status,member_status,active_pnc,active_bank,active_life,active_credit_card,active_deposits,active_auto_insurance,active_homeowners,active_renters,active_life_insurance
0,Mjc1MzY4MDM=,18-24,Widowed,Child,Inactive,False,False,False,False,False,False,False,False,False
1,ODIyODM5MDg=,55-64,Married,,Inactive,False,False,False,False,False,False,False,False,False
2,NzM0ODE3MTU=,25-34,Single,Separated,Active,False,True,True,True,False,False,False,False,True
3,ODE1OTY2OTc=,35-44,Widowed,,Active,True,True,True,False,False,False,False,False,False
4,ODkzMzYzMzc=,65+,Married,Retired,Inactive,False,False,False,False,False,False,False,False,False


In [23]:
mbr_data.to_csv(str(base_path / "data/mbr_toy_data.csv"), index=False)

In [None]:
import pandas as pd
import random
import base64
from datetime import datetime, timedelta

# Generate Member Detail File
def generate_member_detail(num_members=1000):
    us_states = [
        "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida",
        "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
        "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska",
        "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio",
        "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
        "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
    ]
    member_data = []
    for i in range(num_members):
        member_number = base64.b64encode(str(random.randint(10000000, 99999999)).encode()).decode()
        age_group = random.choice(["18-24", "25-34", "35-44", "45-54", "55-64", "65+"])
        marital_status = random.choice(["Single", "Married", "Divorced", "Widowed"])
        state = random.choice(us_states)
        member_data.append([member_number, age_group, marital_status, state])
    
    columns = ["member_number", "age_group", "marital_status", "state"]
    return pd.DataFrame(member_data, columns=columns)

# Generate Paid Channel Campaign Fact Table
def generate_campaign_fact_table(num_records=1000):
    date_range = pd.date_range(start="2025-01-01", end="2025-03-31", freq="D")
    campaign_products = ["Auto Insurance", "Homeowners", "Renters", "Credit Card", "Deposits", "Consumer Loans"]
    campaign_cosas = {
        "Auto Insurance": "P&C", "Homeowners": "P&C", "Renters": "P&C",
        "Credit Card": "Bank", "Deposits": "Bank", "Consumer Loans": "Bank"
    }
    conversion_channels = ["Internet", "Mobile", "Offline"]
    campaign_names = [f"{cosa}_{prod}_Campaign_{i}" for prod, cosa in campaign_cosas.items() for i in range(1, 3)]
    
    data = []
    for _ in range(num_records):
        event_date = random.choice(date_range)
        member_number = base64.b64encode(str(random.randint(10000000, 99999999)).encode()).decode()
        campaign_product = random.choice(campaign_products)
        campaign_cosa = campaign_cosas[campaign_product]
        converted_product = random.choice(campaign_products)
        converted_cosa = campaign_cosas[converted_product]
        campaign_nm = random.choice(campaign_names)
        conversion_channel_nm = random.choice(conversion_channels)
        spend_amt = round(random.uniform(1000, 10000), 2)
        quote_start_qty = random.randint(10, 600)
        quote_complete_qty = random.randint(8, min(580, quote_start_qty))
        app_start_qty = random.randint(5, min(490, quote_complete_qty))
        app_complete_qty = random.randint(4, min(400, app_start_qty))
        prod_acq_qty = random.randint(0, min(250, app_complete_qty))
        impression_qty = random.randint(1000, 20000)
        click_qty = random.randint(2000, 40000)
        campaign_funding_source = f"{campaign_cosa} {campaign_product}"
        
        data.append([
            event_date, member_number, campaign_product, campaign_cosa, converted_product, converted_cosa,
            campaign_nm, conversion_channel_nm, spend_amt, quote_start_qty, quote_complete_qty, app_start_qty,
            app_complete_qty, prod_acq_qty, impression_qty, click_qty, campaign_funding_source
        ])
    
    columns = [
        "event_date", "member_number", "campaign_product", "campaign_cosa", "converted_product", "converted_cosa",
        "campaign_nm", "conversion_channel_nm", "spend_amt", "quote_start_qty", "quote_complete_qty", "app_start_qty",
        "app_complete_qty", "prod_acq_qty", "impression_qty", "click_qty", "campaign_funding_source"
    ]
    return pd.DataFrame(data, columns=columns)

# Generate and Save Files
if __name__ == "__main__":
    # Generate member detail file
    member_df = generate_member_detail(1000)
    member_df.to_csv(base_path / "data/member_detail.csv", index=False)
    print("Member detail file created: member_detail.csv")
    
    # Generate campaign fact table
    campaign_df = generate_campaign_fact_table(1000)
    campaign_df.to_csv(base_path / "data/campaign_fact_table.csv", index=False)
    print("Campaign fact table file created: campaign_fact_table.csv")

Member detail file created: member_detail.csv
Campaign fact table file created: campaign_fact_table.csv
