# CLV Event Generation

This script generates data for the CLV experiment.  

The general idea is that ads and emails can help prevent churn; however, too many of them may instead promote churn.

Churn is also affected by fixed effects associated with each user; in this simple case each user is assigned one of 3 region 
categories and one of 2 demographic categories.  


In [1]:
import numpy as np
import pandas as pd

#
# main method, which generates churn
#
def generate_synthetic_clv_data(n_customers=100, n_weeks=12, seed=101, verbose=True):
    """Simulate weekly marketing exposure, churn and purchases for a customer panel.

    Each customer gets a binary ``demo`` flag and a ``region`` in {0, 1, 2}.
    Weekly email and ad counts are Poisson draws.  Exposure from previous
    weeks carries over with geometric decay; the carry-over has a saturating
    positive effect and a quadratic penalty on the total, so very heavy
    exposure lowers ``effectiveness``.  Higher effectiveness lowers the churn
    logit.  Churn is then simulated week by week; once a customer churns,
    their later emails, ads, effectiveness and purchases are set to zero.

    Args:
        n_customers: Number of simulated customers (rows of the 2-D outputs).
        n_weeks: Number of observed weeks (columns of the 2-D outputs).  The
            built-in week-effect profile has 12 entries; shorter horizons use
            its prefix and longer horizons repeat its last value.
        seed: Seed for ``numpy.random.default_rng``.
        verbose: When True (the default), print the sampled ``demo`` and
            ``region`` vectors and the churn-probability matrix.

    Returns:
        A 12-tuple ``(emails, ads, demo, region, churned, purchase,
        customer_ids, week_ids, effectiveness, churn_prob,
        exp(-carry_emails), exp(-carry_ads))``.  ``demo`` and ``region`` have
        length ``n_customers``; ``customer_ids`` and ``week_ids`` have length
        ``n_customers * n_weeks`` in row-major order; every other element has
        shape ``(n_customers, n_weeks)``.
    """
    rng = np.random.default_rng(seed)

    demo   = rng.integers(0, 2, size=n_customers)
    region = rng.integers(0, 3, size=n_customers)

    if verbose:
        print( 'demo: ',   demo )
        print( 'region: ', region )

    decay = 0.7
    gamma_e = 1.2
    gamma_a = 1.0
    delta = 0.015

    emails = rng.poisson(1.5, size=(n_customers, n_weeks))
    ads = rng.poisson(2.5, size=(n_customers, n_weeks))

    # Carry-over at week t only accumulates exposure from weeks < t, so the
    # first week always starts at zero.
    carry_emails = np.zeros_like(emails, dtype=float)
    carry_ads = np.zeros_like(ads, dtype=float)

    for t in range(1, n_weeks):
        carry_emails[:, t] = decay * carry_emails[:, t-1] + emails[:, t-1]
        carry_ads[:, t] = decay * carry_ads[:, t-1] + ads[:, t-1]

    total_exposure = carry_emails + carry_ads

    # Saturating benefit per channel minus a quadratic over-exposure penalty.
    effectiveness = (
        gamma_e * (1 - np.exp(-carry_emails)) +
        gamma_a * (1 - np.exp(-carry_ads)) +
        -1.0 * delta * total_exposure**2
    )

    # smaller intercept should push out the churn week
    intercept = 0.0
    week_profile = np.array(
        [-1.75, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05, 0.05, 0.05, 0.04, 0.03, 0.02]
    )
    # Map every week onto the profile, clamping weeks beyond its end to the
    # last entry, so any n_weeks works (a fixed 12-entry reshape would fail).
    profile_idx = np.minimum(np.arange(n_weeks), len(week_profile) - 1)
    week_intercept = week_profile[profile_idx][None, :]

    base_logit = (
        intercept +
        week_intercept +
        -0.7 * demo[:, None] +
        -0.5 * (region[:, None] == 0).astype(float) +
        -2.5 * effectiveness
    )
    churn_prob = 1 / (1 + np.exp(-base_logit))

    if verbose:
        print( "SHAPE: churn_prob: ", np.shape(churn_prob) )
        print( "churn_prob: ", churn_prob )

    churned = np.zeros_like(churn_prob, dtype=int)

    # One uniform draw per customer-week, stopping at the first churn.  The
    # draws stay scalar and sequential so the random stream (and therefore
    # the purchases drawn afterwards) is reproducible for a given seed.
    for i in range(n_customers):
        for t in range(n_weeks):
            if rng.uniform() < churn_prob[i, t]:
                churned[i, t] = 1
                emails[i, t+1:] = 0
                ads[i, t+1:] = 0
                effectiveness[i, t+1:] = 0
                break

    purchase_mean = (
        0.5 * demo[:, None] +
        0.3 * (region[:, None] == 1).astype(float) +
        1.2 * effectiveness
    )
    # The gamma scale must be strictly positive.
    purchase_mean = np.clip( purchase_mean, 0.01, None )
    purchase = rng.gamma(shape=2.0, scale=purchase_mean)
    purchase = np.round(purchase * 1000).astype(int)

    # Zero out purchases after churn (the churn week itself keeps its value).
    after_churn = (np.cumsum(churned, axis=1) - churned) > 0
    purchase[after_churn] = 0

    customer_ids = np.repeat(np.arange(n_customers), n_weeks)
    week_ids = np.tile(np.arange(n_weeks), n_customers)

    return (
        emails, ads, demo, region, churned, purchase,
        customer_ids, week_ids, effectiveness, churn_prob,
        np.exp(-carry_emails), np.exp(-carry_ads),
    )


In [10]:
# 
#
#

(
    emails, ads, demo, region, churned, purchase,
    customer_ids, week_ids, eff, ch_prob, c_em, c_ads,
) = generate_synthetic_clv_data()

# Long-format table with one row per (customer, week).  The 2-D grids are
# flattened row by row, which matches the ordering of the id vectors.
per_week_columns = {
    "emails": emails,
    "ads": ads,
    "churned": churned,
    "purchase": purchase,
    "eff": eff,
    "ch_prob": ch_prob,
    "c_em": c_em,
    "c_ads": c_ads,
}
df = pd.DataFrame({"customer_id": customer_ids, "week": week_ids})
for column_name, grid in per_week_columns.items():
    df[column_name] = grid.flatten()

# Broadcast the per-customer attributes onto every row of that customer.
customer_index = df["customer_id"].to_numpy()
df["demo"] = demo[customer_index]
df["region"] = region[customer_index]

# Rows on which a churn event was recorded.
churn_rows = df[df['churned'] == 1]
print( "len churned: ", len(churn_rows) )
churn_rows

demo:  [0 1 1 0 0 1 0 1 1 0 0 1 1 1 0 0 0 1 1 0 1 1 0 1 0 0 1 0 1 1 1 1 1 0 0 0 0
 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 0 1
 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1]
region:  [2 1 0 2 2 1 1 2 0 0 0 1 1 2 1 0 2 2 0 0 0 2 0 0 0 0 0 0 2 2 0 2 0 1 2 1 0
 0 2 1 0 1 1 2 2 1 0 2 2 1 0 2 0 2 2 2 2 0 0 2 1 1 1 1 2 1 1 0 1 2 1 1 0 0
 1 2 0 0 0 1 1 2 0 1 2 0 2 1 1 1 1 1 2 0 2 0 2 2 0 1]
SHAPE: churn_prob:  (100, 12)
churn_prob:  [[0.1480472  0.03433023 0.03478968 ... 0.10546363 0.20996677 0.34216965]
 [0.07943855 0.03806994 0.2649404  ... 0.96127304 0.97072746 0.80659739]
 [0.04973651 0.01246984 0.02184616 ... 0.57033468 0.08057733 0.03576028]
 ...
 [0.1480472  0.03304853 0.13458187 ... 0.6771201  0.15772617 0.09649089]
 [0.09534946 0.03466773 0.0348095  ... 0.04205669 0.2037632  0.08597727]
 [0.07943855 0.09870813 0.0200961  ... 0.19667277 0.84201652 0.35417081]]
NOT CHURNED ...
NOT CHURNED ...
len churned:  98


Unnamed: 0,customer_id,week,emails,ads,churned,purchase,eff,ch_prob,c_em,c_ads,demo,region
5,0,5,3,3,1,2059,1.428300,0.030156,0.085418,0.015937,0,2
14,1,2,1,3,1,4796,0.288179,0.264940,0.024724,0.000553,1,1
30,2,6,2,3,1,29,-1.059911,0.817541,0.021041,0.000020,1,0
42,3,6,1,2,1,29,-1.272606,0.962003,0.000588,0.000421,0,2
56,4,8,3,3,1,34,-1.205480,0.955370,0.008490,0.000034,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1130,94,2,2,4,1,5834,1.077339,0.091675,0.049787,0.004517,0,2
1145,95,5,0,2,1,741,-0.025153,0.261707,0.016538,0.000328,1,0
1156,96,4,2,2,1,22,-0.308116,0.725177,0.001281,0.001907,0,2
1173,97,9,1,0,1,20,-0.280227,0.677120,0.020568,0.000135,0,2


In [11]:
# Diagnostic: average churn probability per week.  It is expected to trend
# upwards over the observation window, as it typically does in practice.
df.groupby('week')[['ch_prob']].mean()

Unnamed: 0_level_0,ch_prob
week,Unnamed: 1_level_1
0,0.104011
1,0.085559
2,0.093883
3,0.224982
4,0.284402
5,0.32375
6,0.408257
7,0.473914
8,0.554388
9,0.569256


In [12]:
# Report the largest and smallest value of each exp(-carry) matrix.
for label, values in (('carry em: ', c_em), ('c ads: ', c_ads)):
    print( label, values.max(), '|', values.min() )


carry em:  1.0 | 4.512338641801924e-05
c ads:  1.0 | 9.589615017724404e-08


In [13]:
#
# now, add the censored column and the time to churn column 
# censored ==> did not churn during the observation window
#
customer_id_list = df['customer_id'].sort_values().unique()

# Sentinel churn week stored in churn_week_array for customers that never
# churn inside the observation window.
WEEKS_PER_CUSTOMER = 12

# Position of every row inside its own customer's run of rows (0, 1, 2, ...).
# Working with positions per customer avoids assuming that customer k
# occupies rows [12 * k, 12 * k + 12) of the frame.
row_pos = df.groupby('customer_id').cumcount()

# Position of the first churn event per customer, broadcast to all of that
# customer's rows; NaN when the customer never churns (censored).
first_churn_pos = (
    row_pos.where(df['churned'] == 1)
           .groupby(df['customer_id'])
           .transform('min')
)
is_censored = first_churn_pos.isna()

# Weeks remaining until churn.  This counts down from churn_week at the
# customer's first row to 0 at the churn row; the first row is included (a
# backward fill that stops before index 0 would leave it at 0).  Rows after
# churn and all rows of censored customers stay at 0.
df['time_to_churn'] = (first_churn_pos - row_pos).clip(lower=0).fillna(0).astype(int)
df['censored']      = is_censored.astype(int)
# NOTE(review): censored customers keep churn_week == 0 in the frame, which
# is indistinguishable from churning in week 0 unless 'censored' is checked.
df['churn_week']    = first_churn_pos.fillna(0).astype(int)

#
# keep as array also, for convenience (one entry per customer, ordered like
# customer_id_list)
#
per_customer_churn   = first_churn_pos.groupby(df['customer_id']).min()
churn_observed_array = per_customer_churn.notna().to_numpy(dtype=float)
churn_week_array     = per_customer_churn.fillna(WEEKS_PER_CUSTOMER).to_numpy(dtype=float)

NUM_CENSORED = int(per_customer_churn.isna().sum())

#
print( "NUM_CENSORED: ", NUM_CENSORED )
print( "churn_week_arr: \n", churn_week_array )
#


df.groupby('churn_week').agg( { 'churned': 'sum' } )


NUM_CENSORED:  2
churn_week_arr: 
 [ 5.  2.  6.  6.  8.  2.  1.  7.  3.  2.  5.  3.  0.  7.  3.  7.  1.  5.
  2.  4.  9.  4.  2. 12.  9.  2.  7.  3.  9.  9.  2.  2.  7.  1.  6.  1.
  4.  1.  4.  0.  4.  3.  3.  5.  3. 10.  7.  3.  5.  1.  1.  9. 10.  3.
  9.  3.  3.  4.  8.  6.  0.  7.  8.  4.  5.  1.  1.  2.  7.  4.  5.  6.
  7.  5.  5.  8.  7.  5.  4.  0.  0.  5.  3.  0.  1.  3.  6.  7.  7.  3.
  2.  5.  9.  8.  2.  5.  4.  9. 12.  5.]


Unnamed: 0_level_0,churned
churn_week,Unnamed: 1_level_1
0,6
1,10
2,11
3,14
4,10
5,14
6,6
7,12
8,5
9,8


In [1]:
#
# replace purchase amounts with a model influenced by time, region, demo, and user's idiosyncratic effects
#

import numpy as np
import pandas as pd

def replace_purchase_amounts(df_original, sigma=0.5, seed=42):
    """Return a copy of the frame with 'purchase' redrawn from a log-normal model.

    For every row whose 'churned' value is not 2, the new purchase is
    ``exp(N(mu, sigma))`` where ``mu`` is the sum of a per-customer random
    effect, a fixed demo effect, a fixed region effect and a per-week random
    effect.  Rows with ``churned == 2`` get a purchase of 0.0.

    NOTE(review): only the value 2 is treated as "no purchase"; the caller
    must encode such rows as 2 for them to be zeroed -- verify against the
    code that produces the 'churned' column.

    Args:
        df_original: DataFrame with 'customer_id', 'week', 'churned', 'demo'
            and 'region' columns.  It is not modified.
        sigma: Standard deviation of the normal draw on the log scale.
        seed: Seed passed to ``np.random.seed``.  This reseeds NumPy's global
            legacy generator, which also affects later ``np.random`` calls.

    Returns:
        A copy of ``df_original`` whose 'purchase' column holds the new values.
    """
    np.random.seed(seed)
    df = df_original.copy()

    # Fixed additive effects on the log scale; unknown categories contribute 0.
    demo_effect = {0: -0.2, 1: 0.2}
    region_effect = {0: -0.1, 1: 0.0, 2: 0.1}

    # User-level latent effects (one scalar draw per distinct customer, in
    # order of first appearance).
    user_ids = df['customer_id'].unique()
    user_effect = {uid: np.random.normal(0.0, 0.3) for uid in user_ids}

    # Week-level seasonality (one scalar draw per distinct week).
    week_ids = df['week'].unique()
    week_effect = {w: np.random.normal(0.0, 0.1) for w in week_ids}

    n_rows = len(df)

    def _lookup(values, getter):
        # Per-row dictionary lookup collected into a float array.
        return np.fromiter((getter(v) for v in values), dtype=float, count=n_rows)

    # Same summation order as a row-by-row evaluation: user, demo, region, week.
    mu = (
        _lookup(df['customer_id'], user_effect.__getitem__) +
        _lookup(df['demo'], lambda d: demo_effect.get(d, 0.0)) +
        _lookup(df['region'], lambda r: region_effect.get(r, 0.0)) +
        _lookup(df['week'], week_effect.__getitem__)
    )

    # Draw only for rows that receive a purchase, in row order, so the random
    # stream is consumed exactly as one scalar draw per such row would.
    active = df['churned'].to_numpy() != 2
    new_purchases = np.zeros(n_rows, dtype=float)
    new_purchases[active] = np.exp(np.random.normal(mu[active], sigma))

    df['purchase'] = new_purchases
    return df


In [None]:
# Swap in the modelled purchase amounts, then persist the final dataset.
df = replace_purchase_amounts(df)

output_path = "Data/syn_clv_dataset_blog_test.csv"
df.to_csv(output_path, index=False)