In [1]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Initialize Faker
fake = Faker()

# Seed every random source used below so that Restart Kernel -> Run All
# regenerates the same dataset. The helper functions draw from the stdlib
# `random` module, and Faker keeps its own generator, so both are seeded.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

# Parameters
num_customers = 500000  # Number of customers
email_styles = ['style_1', 'style_2', 'style_3']
marital_statuses = ['single', 'married', 'divorced']
education_levels = ['primary', 'secondary', 'tertiary']
start_date = datetime(2022, 1, 1)  # Earliest possible send date

# Helper functions to generate fake data
def random_age(min_age=25, max_age=70):
    """Draw a random customer age.

    Args:
        min_age: Youngest age that can be returned (inclusive). Defaults to 25.
        max_age: Oldest age that can be returned (inclusive). Defaults to 70.

    Returns:
        An integer age between ``min_age`` and ``max_age``, bounds included.
    """
    return random.randint(min_age, max_age)

def random_click_purchase(click_chance=50, purchase_chance=20):
    """Simulate whether one email was clicked and whether it led to a purchase.

    A purchase can only happen after a click, so the purchase draw is skipped
    entirely when the email was not clicked.

    Args:
        click_chance: Percent chance (0-100) that the email is clicked.
            Defaults to 50.
        purchase_chance: Percent chance (0-100) of a purchase, given a click.
            Defaults to 20 (the value the code has always used; an earlier
            comment incorrectly said 30).

    Returns:
        A ``(click, purchase)`` tuple of booleans.
    """
    click = fake.boolean(chance_of_getting_true=click_chance)
    if not click:
        # No click means no purchase, and no second random draw.
        return click, False
    purchase = fake.boolean(chance_of_getting_true=purchase_chance)
    return click, purchase

def random_date(start_date, max_offset_days=365):
    """Return ``start_date`` moved forward by a random whole number of days.

    Args:
        start_date: Earliest possible result; any value that supports adding
            a ``timedelta`` (the notebook passes ``datetime`` objects).
        max_offset_days: Largest offset in days (inclusive). Defaults to 365.

    Returns:
        ``start_date`` plus between 0 and ``max_offset_days`` days, bounds
        included.
    """
    offset_days = random.randint(0, max_offset_days)
    return start_date + timedelta(days=offset_days)

# Generate dataset: one row per customer.
# The random draws are made in a fixed order for every customer
# (age, email style, marital status, education, click/purchase, dates).
column_names = [
    'customer_no', 'age', 'marital_status', 'education', 'email_style',
    'send_date', 'clicked', 'click_date', 'purchased', 'purchase_date',
]

data = []
for customer_no in range(1, num_customers + 1):
    age = random_age()
    email_style = random.choice(email_styles)
    marital = random.choice(marital_statuses)
    education = random.choice(education_levels)
    click, purchase = random_click_purchase()

    send_date = random_date(start_date)

    # A click date exists only for clicked emails, and a purchase date only
    # for purchases (which in turn only happen after a click).
    click_date = None
    purchase_date = None
    if click:
        click_date = random_date(send_date)
    if purchase:
        purchase_date = random_date(click_date)

    data.append([
        customer_no, age, marital, education, email_style,
        send_date, click, click_date, purchase, purchase_date,
    ])

# Create DataFrame
df = pd.DataFrame(data, columns=column_names)

# Preview the dataset
df.head(10)


Unnamed: 0,customer_no,age,marital_status,education,email_style,send_date,clicked,click_date,purchased,purchase_date
0,1,27,divorced,tertiary,style_1,2022-01-07,True,2022-09-02,False,NaT
1,2,67,married,tertiary,style_3,2022-10-17,False,NaT,False,NaT
2,3,25,married,tertiary,style_2,2022-08-28,True,2023-02-11,False,NaT
3,4,65,single,tertiary,style_1,2022-12-23,False,NaT,False,NaT
4,5,49,married,secondary,style_1,2022-09-17,True,2023-07-05,False,NaT
5,6,48,divorced,primary,style_2,2022-05-04,True,2023-04-25,False,NaT
6,7,54,single,primary,style_1,2022-01-20,False,NaT,False,NaT
7,8,38,married,tertiary,style_2,2022-07-07,False,NaT,False,NaT
8,9,58,single,secondary,style_1,2022-09-04,False,NaT,False,NaT
9,10,46,married,tertiary,style_2,2022-04-03,False,NaT,False,NaT


In [2]:
# Number of customers per education level
df['education'].value_counts()

education
primary      166858
tertiary     166613
secondary    166529
Name: count, dtype: int64

In [3]:
# Number of customers per marital status
df['marital_status'].value_counts()

marital_status
single      166816
married     166608
divorced    166576
Name: count, dtype: int64

In [4]:
# Number of customers who did / did not purchase
df['purchased'].value_counts()

purchased
False    449981
True      50019
Name: count, dtype: int64

In [5]:
# Helper function to calculate CTR and purchase rates by age, marital status, education, and email style
def calculate_performance(df):
    """Aggregate send, click and purchase totals per segment and email style.

    Rows are grouped by 'age', 'marital_status', 'education' and
    'email_style'.

    Args:
        df: Customer-level frame containing the grouping columns plus
            'customer_no', 'clicked' and 'purchased'.

    Returns:
        A frame with one row per group holding 'total_sent' (count of
        'customer_no'), 'total_clicks' and 'total_purchases' (sums), and the
        derived ratios 'CTR' and 'PurchaseRate' (both relative to
        'total_sent').
    """
    group_keys = ['age', 'marital_status', 'education', 'email_style']

    totals = (
        df.groupby(group_keys)
        .agg(
            total_sent=('customer_no', 'count'),
            total_clicks=('clicked', 'sum'),
            total_purchases=('purchased', 'sum'),
        )
        .reset_index()
    )

    # Both rates share the same denominator: emails sent within the group.
    sent = totals['total_sent']
    totals['CTR'] = totals['total_clicks'] / sent
    totals['PurchaseRate'] = totals['total_purchases'] / sent
    return totals

# Calculate initial performance; leaving the frame as the cell's last
# expression renders it with the notebook's rich (truncated) table display.
initial_performance = calculate_performance(df)
initial_performance


      age marital_status  education email_style  total_sent  total_clicks  \
0      25       divorced    primary     style_1         382           183   
1      25       divorced    primary     style_2         424           193   
2      25       divorced    primary     style_3         421           219   
3      25       divorced  secondary     style_1         405           197   
4      25       divorced  secondary     style_2         386           183   
...   ...            ...        ...         ...         ...           ...   
1237   70         single  secondary     style_2         426           223   
1238   70         single  secondary     style_3         388           202   
1239   70         single   tertiary     style_1         412           200   
1240   70         single   tertiary     style_2         395           194   
1241   70         single   tertiary     style_3         397           196   

      total_purchases       CTR  PurchaseRate  
0                  40  0.47

In [6]:
# Function to get best-performing email style based on CTR and Purchase Rate
def get_best_email_styles(df):
    """Find the top email style per segment, by CTR and by purchase rate.

    A segment is one combination of 'age', 'marital_status' and 'education'.

    Args:
        df: Customer-level frame accepted by ``calculate_performance``.

    Returns:
        A frame with one row per segment and the columns
        'best_email_style_ctr', 'best_ctr',
        'best_email_style_purchase_rate' and 'best_purchase_rate' next to the
        three segment columns.
    """
    segment_keys = ['age', 'marital_status', 'education']
    performance = calculate_performance(df)

    def top_style_per_segment(metric, style_label, metric_label):
        # Row label of the highest-scoring email style inside every segment.
        winners = performance.groupby(segment_keys)[metric].idxmax()
        selected = performance.loc[winners]
        selected = selected[segment_keys + ['email_style', metric]]
        return selected.rename(columns={'email_style': style_label, metric: metric_label})

    by_ctr = top_style_per_segment('CTR', 'best_email_style_ctr', 'best_ctr')
    by_purchase_rate = top_style_per_segment(
        'PurchaseRate', 'best_email_style_purchase_rate', 'best_purchase_rate'
    )

    # Put both winners side by side, one row per segment.
    return pd.merge(by_ctr, by_purchase_rate, on=segment_keys)

# Calculate and display the best email styles (rich display via the cell's
# last expression instead of print)
best_email_styles = get_best_email_styles(df)
best_email_styles


     age marital_status  education best_email_style_ctr  best_ctr  \
0     25       divorced    primary              style_3  0.520190   
1     25       divorced  secondary              style_1  0.486420   
2     25       divorced   tertiary              style_1  0.516667   
3     25        married    primary              style_3  0.526761   
4     25        married  secondary              style_2  0.560096   
..   ...            ...        ...                  ...       ...   
409   70        married  secondary              style_3  0.497423   
410   70        married   tertiary              style_3  0.494924   
411   70         single    primary              style_1  0.543237   
412   70         single  secondary              style_2  0.523474   
413   70         single   tertiary              style_3  0.493703   

    best_email_style_purchase_rate  best_purchase_rate  
0                          style_1            0.104712  
1                          style_3            0.101449  


In [7]:

# Dynamic adjustment function: Increase email sends for best-performing email style by age, marital status, and education
def adjust_campaign(df, performance):
    """Assign each customer the email style with the highest CTR in their segment.

    A segment is one combination of 'age', 'marital_status' and 'education'.
    Customers whose segment has no entry in ``performance`` keep their
    current 'email_style'.

    Args:
        df: Customer-level frame with 'age', 'marital_status', 'education'
            and 'email_style' columns. It is modified in place: an
            'adjusted_email_style' column is added (or overwritten).
        performance: Frame with 'age', 'marital_status', 'education',
            'email_style' and 'CTR' columns, one row per segment and style.

    Returns:
        The same ``df`` object, now carrying 'adjusted_email_style'.
    """
    segment_cols = ['age', 'marital_status', 'education']

    # Rows without a defined CTR cannot win; dropping them keeps idxmax
    # well-defined for every remaining segment.
    ranked = performance.dropna(subset=['CTR'])
    winners = ranked.groupby(segment_cols)['CTR'].idxmax()
    best_styles = ranked.loc[winners, segment_cols + ['email_style']].rename(
        columns={'email_style': 'adjusted_email_style'}
    )

    if best_styles.empty:
        # Nothing to adjust: every customer keeps the current style.
        df['adjusted_email_style'] = df['email_style']
        return df

    # A left merge keeps one output row per customer, in the original order;
    # many_to_one asserts that each segment has exactly one winning style.
    matched = df[segment_cols].merge(
        best_styles, on=segment_cols, how='left', validate='many_to_one'
    )

    # The merge resets the index, so re-attach the customer index by position
    # before falling back to the current style for unmatched segments.
    adjusted = pd.Series(matched['adjusted_email_style'].to_numpy(), index=df.index)
    df['adjusted_email_style'] = adjusted.where(adjusted.notna(), df['email_style'])
    return df

# Apply adjustments and recalculate performance
adjusted_df = adjust_campaign(df.copy(), initial_performance)

# calculate_performance groups on 'email_style', while adjust_campaign only
# adds 'adjusted_email_style'. Swap the adjusted assignment into the grouping
# column; otherwise the result is identical to initial_performance.
# NOTE(review): 'clicked'/'purchased' are the originally simulated outcomes,
# so these metrics are historical results re-labelled with the new style,
# not a fresh simulation of the adjusted campaign.
adjusted_performance = calculate_performance(
    adjusted_df.assign(email_style=adjusted_df['adjusted_email_style'])
)

# Compare pre- and post-adjustment metrics
#print("\nAdjusted performance metrics:\n", adjusted_performance)
