# Generate Fake Data

## In this notebook, we generate fake data that codifies basic behavior that we want to see in our workshop.

Here are the main rules:
1. Churn is mostly decided via tenure which follows an exponential survival curve: $S(t) = \exp(-\lambda t)$
2. The second most important variable is segment. Here's a table describing the different segments:

| Segment        | Lambda | Description (Churn Rationale)                                         |
|----------------|--------|------------------------------------------------------------------------|
| High Roller    | 0.05   | Loyal, consistent, high-value, hard to lose                            |
| Mature         | 0.08   | Steady, routine-driven, gradual churn                                  |
| Value Seeker   | 0.12   | Promo-dependent, churn if value disappears                             |
| Youth          | 0.18   | High experimentation, low loyalty, easy churn                          |
| New User       | 0.25   | Onboarding drop-offs, likely to churn within 1–2 months                |

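3. Data usage affects churn: in the usage simulation below, a segment's base lambda is scaled up when monthly data usage is low (under 100 MB) or has dropped sharply from the previous month, and scaled down when usage is high (over 500 MB) or has grown by more than 20%.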



In [1]:
# Root folder for the workshop files; the final save cell writes its CSVs under f'{src_path}/fake_data'.
# NOTE(review): this is a relative path that looks like a mounted Google Drive location, and no
# mount step is visible in this notebook — confirm the folder exists before running the save cell.
src_path = "drive/MyDrive/dlsu_workshop"

## Generate Profile Table


In [151]:
from datetime import timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset

# ------------------------------
# Helper Configs and Lookups
# ------------------------------

def get_distribution():
    """Return the subscriber mix as one record per (tenure class, segment) pair.

    Each record is a dict with the keys ``tenure_class``, ``segment`` and
    ``pct_subs`` (the share of subscribers assigned to that pair). The shares
    sum to 1.0. Records are emitted in the order laid out below.
    """
    # (tenure_class, [(segment, pct_subs), ...]) in emission order.
    layout = [
        ("01_new", [
            ("01_new_user", 0.12),
            ("02_youth", 0.10),
            ("03_value_seeker", 0.02),
            ("04_mature", 0.01),
            ("05_high_roller", 0.00),
        ]),
        ("02_post_onboarding", [
            ("01_new_user", 0.03),
            ("02_youth", 0.12),
            ("03_value_seeker", 0.05),
            ("04_mature", 0.05),
            ("05_high_roller", 0.00),
        ]),
        ("03_established", [
            ("02_youth", 0.08),
            ("03_value_seeker", 0.07),
            ("04_mature", 0.06),
            ("05_high_roller", 0.05),
            ("01_new_user", 0.00),
        ]),
        ("04_6_to_10_years", [
            ("03_value_seeker", 0.08),
            ("04_mature", 0.05),
            ("05_high_roller", 0.03),
            ("02_youth", 0.00),
            ("01_new_user", 0.00),
        ]),
        ("05_over_10_years", [
            ("03_value_seeker", 0.03),
            ("04_mature", 0.03),
            ("05_high_roller", 0.02),
            ("02_youth", 0.00),
            ("01_new_user", 0.00),
        ]),
    ]

    records = []
    for tenure_class, shares in layout:
        for segment, share in shares:
            records.append({"tenure_class": tenure_class, "segment": segment, "pct_subs": share})
    return records

def get_lambda_map():
    """Return the base lambda (exponential churn rate) for each segment code."""
    segments = ('01_new_user', '02_youth', '03_value_seeker', '04_mature', '05_high_roller')
    rates = (0.25, 0.18, 0.12, 0.08, 0.05)
    return dict(zip(segments, rates))

def get_tenure_ranges():
    """Return the (low, high) tenure bounds, in months, for each tenure class."""
    bounds = [
        ('01_new', 1, 6),
        ('02_post_onboarding', 7, 24),
        ('03_established', 25, 60),
        ('04_6_to_10_years', 61, 120),
        ('05_over_10_years', 121, 200),
    ]
    return {label: (low, high) for label, low, high in bounds}

# ------------------------------
# Simulation Functions
# ------------------------------

def simulate_tenure(tenure_class, tenure_ranges):
    """Draw a tenure uniformly from the inclusive range configured for ``tenure_class``."""
    lower, upper = tenure_ranges[tenure_class]
    # randint excludes its upper bound, so add 1 to make ``upper`` reachable.
    draw = np.random.randint(lower, upper + 1)
    return draw

def simulate_device():
    """Draw one device type using fixed probabilities (60% android, 30% iphone, 10% feature_phone)."""
    device_types = ['android', 'iphone', 'feature_phone']
    weights = [0.6, 0.3, 0.1]
    return np.random.choice(device_types, p=weights)

def simulate_gender():
    """Draw one gender label using fixed probabilities (45% male, 45% female, 10% unknown)."""
    labels = ['male', 'female', 'unknown']
    weights = [0.45, 0.45, 0.10]
    return np.random.choice(labels, p=weights)

def simulate_age(segment, tenure_months):
    """Draw an age that fits both the segment's age band and the subscriber's tenure.

    Args:
        segment: Segment code; must be one of the keys of ``bounds`` below.
        tenure_months: Tenure in months. The subscriber is assumed to have been
            at least 16 when registering, so the minimum age is
            ``16 + tenure_months // 12``.

    Returns:
        An integer age drawn uniformly from the inclusive range
        ``[min_age, max_age]``.

    Raises:
        KeyError: If ``segment`` is not a known segment code.
    """
    bounds = {
        '01_new_user': (18, 40),
        '02_youth': (16, 25),
        '03_value_seeker': (20, 55),
        '04_mature': (30, 65),
        '05_high_roller': (25, 60)
    }
    min_base, max_age = bounds[segment]
    min_required = (tenure_months // 12) + 16
    min_age = max(min_base, min_required)
    # A long tenure can push the minimum past the segment's upper bound
    # (e.g. '02_youth' with 200 months needs age >= 32 but the band ends at 25).
    # randint would then raise ValueError, so widen the band to stay consistent
    # with the tenure instead.
    max_age = max(max_age, min_age)
    return np.random.randint(min_age, max_age + 1)

# ------------------------------
# Main Generator
# ------------------------------
def _classify_tenure_class(tenure):
    """Map a tenure in months to its tenure-class label."""
    if tenure <= 6:
        return '01_new'
    elif tenure <= 24:
        return '02_post_onboarding'
    elif tenure <= 60:
        return '03_established'
    elif tenure <= 120:
        return '04_6_to_10_years'
    else:
        return '05_over_10_years'


def _build_user_month_rows(user_id, segment, first_tenure, registration_date,
                           base_lambda, device_type, gender, age, month_labels):
    """Build one profile record per label in ``month_labels`` for a single user.

    Tenure starts at ``first_tenure`` for the first label and grows by one per
    label; the tenure class is re-derived for every month. All other fields are
    constant across the user's rows.
    """
    reg_date = registration_date.date()
    user_rows = []
    for offset, label in enumerate(month_labels):
        tenure_months = first_tenure + offset
        user_rows.append({
            "user_id": user_id,
            "month": label,
            "segment": segment,
            "tenure_months": tenure_months,
            "tenure_class": _classify_tenure_class(tenure_months),
            "registration_date": reg_date,
            "lambda": base_lambda,
            "device_type": device_type,
            "gender": gender,
            "age": age
        })
    return user_rows


def generate_profiles_with_acquisition(total_initial_subs=10000, monthly_avg_acquisition=1000,
                                       seed=42, start_month='2024-07-01', end_month='2025-06-30'):
    """Generate a monthly customer-profile panel: an initial base plus monthly acquisitions.

    Args:
        total_initial_subs: Size of the subscriber base present in the first month.
            It is split across (tenure class, segment) pairs by ``get_distribution()``.
        monthly_avg_acquisition: Mean of the normal draw (standard deviation 20) for
            the number of new subscribers added in each month.
        seed: Seed passed to ``np.random.seed`` (global NumPy RNG state).
        start_month: First month of the panel (parsed with ``pd.to_datetime``).
        end_month: End of the panel; months are the month-start dates between
            ``start_month`` and ``end_month``.

    Returns:
        DataFrame with one row per (user, month) and the columns ``user_id``,
        ``month`` (formatted ``YYYY_MM``), ``segment``, ``tenure_months``,
        ``tenure_class``, ``registration_date``, ``lambda``, ``device_type``,
        ``gender`` and ``age``. Every user has a row for each month from their
        first month through the last month; churn is not applied here.
    """
    np.random.seed(seed)

    lambda_map = get_lambda_map()
    tenure_ranges = get_tenure_ranges()
    dist = pd.DataFrame(get_distribution())

    start_date = pd.to_datetime(start_month)
    end_date = pd.to_datetime(end_month)
    month_list = pd.date_range(start=start_date, end=end_date, freq='MS')
    # Format each month label once instead of once per generated row.
    month_labels = [month.strftime('%Y_%m') for month in month_list]

    rows = []
    user_counter = 0

    # Step 1: Generate initial base
    initial_counts = (dist['pct_subs'] * total_initial_subs).round().astype(int)

    for segment, tenure_class, n_subs in zip(dist['segment'], dist['tenure_class'], initial_counts):
        for _ in range(int(n_subs)):
            # Draw order (tenure, device, gender, age) is kept fixed so a given
            # seed keeps producing the same users.
            tenure = simulate_tenure(tenure_class, tenure_ranges)
            # NOTE(review): here a user with tenure t in the first month is
            # registered t months before it, whereas acquisitions below have
            # tenure 1 in their own registration month. The two populations
            # therefore disagree by one month on tenure vs. registration_date —
            # confirm whether this is intended.
            registration_date = start_date - DateOffset(months=tenure)

            device_type = simulate_device()
            gender = simulate_gender()
            age = simulate_age(segment, tenure)

            rows.extend(_build_user_month_rows(
                f"U{user_counter:07}", segment, tenure, registration_date,
                lambda_map[segment], device_type, gender, age, month_labels))
            user_counter += 1

    # Step 2: Add monthly acquisitions
    for cohort_index, cohort_month in enumerate(month_list):
        n_new = int(np.random.normal(loc=monthly_avg_acquisition, scale=20))
        n_new = max(n_new, 0)
        cohort_counts = (dist['pct_subs'] * n_new).round().astype(int)
        # A cohort is observed from its registration month to the last month;
        # that is the tail of month_list, so no per-user date_range is needed.
        cohort_labels = month_labels[cohort_index:]

        for segment, n_subs in zip(dist['segment'], cohort_counts):
            for _ in range(int(n_subs)):
                tenure = 1

                device_type = simulate_device()
                gender = simulate_gender()
                age = simulate_age(segment, tenure)

                rows.extend(_build_user_month_rows(
                    f"U{user_counter:07}", segment, tenure, cohort_month,
                    lambda_map[segment], device_type, gender, age, cohort_labels))
                user_counter += 1

    return pd.DataFrame(rows)



# Build the profile panel: 10,000 initial subscribers plus roughly 1,000 new ones per month.
N_subs = 10000
ave_new_subs = 1000
df_profile = generate_profiles_with_acquisition(
    total_initial_subs=N_subs,
    monthly_avg_acquisition=ave_new_subs,
)
df_profile.head()

Unnamed: 0,user_id,month,segment,tenure_months,tenure_class,registration_date,lambda,device_type,gender,age
0,U0000000,2024_07,01_new_user,4,01_new,2024-03-01,0.25,feature_phone,female,38
1,U0000000,2024_08,01_new_user,5,01_new,2024-03-01,0.25,feature_phone,female,38
2,U0000000,2024_09,01_new_user,6,01_new,2024-03-01,0.25,feature_phone,female,38
3,U0000000,2024_10,01_new_user,7,02_post_onboarding,2024-03-01,0.25,feature_phone,female,38
4,U0000000,2024_11,01_new_user,8,02_post_onboarding,2024-03-01,0.25,feature_phone,female,38


In [152]:
# Number of distinct subscribers in the profile table (initial base plus all acquisition cohorts).
df_profile.user_id.nunique()

21930

In [153]:
# Profile rows per month (index) and segment (columns); month/segment pairs with no rows show 0.
(df_profile.groupby(['month', 'segment'])['user_id']
    .count()
    .unstack('segment')
    .fillna(0))

segment,01_new_user,02_youth,03_value_seeker,04_mature,05_high_roller
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024_07,1651,3303,2751,2200,1100
2024_08,1800,3602,3001,2400,1200
2024_09,1949,3899,3249,2599,1300
2024_10,2099,4199,3499,2799,1400
2024_11,2253,4507,3756,3004,1503
2024_12,2402,4804,4004,3204,1603
2025_01,2548,5096,4247,3399,1700
2025_02,2696,5391,4494,3596,1799
2025_03,2851,5702,4754,3803,1903
2025_04,2995,5989,4993,3995,1999


## Generate Usage Summary Table

In [154]:
def simulate_usage_with_lambda_adjustment(profile_df, seed=42, noise_sd=0.01):
    """Simulate monthly usage per profile row and derive a usage-adjusted churn probability.

    For each user's first row, data usage is drawn from a segment-specific normal
    distribution (floored at 0). For later rows, the previous month's usage is
    multiplied by ``1 + delta`` with ``delta`` drawn from a segment-specific
    discrete distribution. The row's base ``lambda`` is then scaled by a
    usage-level modifier and a usage-trend modifier, optionally perturbed by
    Gaussian noise, and converted to ``p_churn = 1 - exp(-lambda_adj)``.

    Args:
        profile_df: DataFrame with at least the columns ``user_id``, ``month``,
            ``segment`` and ``lambda``. Rows are processed sorted by
            (``user_id``, ``month``).
        seed: Seed passed to ``np.random.seed`` (global NumPy RNG state).
        noise_sd: Standard deviation of the additive noise on the adjusted
            lambda; no noise is drawn when this is not positive.

    Returns:
        DataFrame with one row per input row and the columns ``user_id``,
        ``month``, ``data_mb``, ``voice_minutes``, ``sms_count``,
        ``lambda_usage_modifier``, ``usage_delta``, ``lambda_trend_modifier``,
        ``lambda_adj`` and ``p_churn``.

    Raises:
        KeyError: If a row's ``segment`` is not one of the five segment codes.
    """
    np.random.seed(seed)

    # -------------------------------
    # Segment-based data usage rules
    # -------------------------------
    base_usage_mean = {
        '01_new_user': 150,
        '02_youth': 800,
        '03_value_seeker': 400,
        '04_mature': 300,
        '05_high_roller': 1200
    }

    base_usage_std = {
        '01_new_user': 50,
        '02_youth': 200,
        '03_value_seeker': 150,
        '04_mature': 100,
        '05_high_roller': 200
    }

    # Month-over-month relative changes and their probabilities, per segment.
    usage_change_distribution = {
        '01_new_user': [-0.6, -0.3, 0, 0.2],
        '02_youth': [-0.5, -0.2, 0, 0.3],
        '03_value_seeker': [-0.3, 0, 0.2],
        '04_mature': [-0.1, 0, 0.1],
        '05_high_roller': [-0.1, 0, 0.1]
    }

    usage_change_probs = {
        '01_new_user': [0.2, 0.4, 0.3, 0.1],
        '02_youth': [0.3, 0.3, 0.2, 0.2],
        '03_value_seeker': [0.2, 0.5, 0.3],
        '04_mature': [0.1, 0.6, 0.3],
        '05_high_roller': [0.1, 0.6, 0.3]
    }

    # (probability of any usage in the month, scale applied to the Pareto draw).
    # Built once here rather than on every helper call.
    voice_params = {
        '01_new_user': (0.30, 2),
        '02_youth': (0.20, 1.5),
        '03_value_seeker': (0.35, 2.5),
        '04_mature': (0.50, 3.0),
        '05_high_roller': (0.40, 2)
    }

    sms_params = {
        '01_new_user': (0.40, 2),
        '02_youth': (0.60, 1.5),
        '03_value_seeker': (0.50, 2.5),
        '04_mature': (0.45, 3.0),
        '05_high_roller': (0.25, 2)
    }

    def simulate_voice_segment(segment):
        """Return voice minutes: 0 unless the user is active, else a scaled Pareto draw."""
        p_use, scale = voice_params[segment]
        return round(np.random.pareto(a=2.5) * scale) if np.random.rand() < p_use else 0

    def simulate_sms_segment(segment):
        """Return SMS count: 0 unless the user is active, else a scaled Pareto draw."""
        p_use, scale = sms_params[segment]
        return round(np.random.pareto(a=3) * scale) if np.random.rand() < p_use else 0

    # -------------------------------
    # Lambda adjustment rules
    # -------------------------------
    def get_usage_modifier(data_mb):
        """Low usage raises the churn rate, high usage lowers it."""
        if data_mb < 100:
            return 1.2
        elif data_mb <= 500:
            return 1.0
        else:
            return 0.9

    def get_delta_modifier(delta):
        """Falling usage raises the churn rate, growing usage lowers it; NaN (first month) is neutral."""
        if pd.isna(delta):
            return 1.0
        elif delta < -0.5:
            return 1.15
        elif delta < -0.2:
            return 1.1
        elif delta <= 0.2:
            return 1.0
        else:
            return 0.9

    # -------------------------------
    # Main simulation loop
    # -------------------------------
    usage_rows = []
    prev_usage = {}

    profile_df = profile_df.sort_values(['user_id', 'month'])

    for _, row in profile_df.iterrows():
        user_id = row['user_id']
        segment = row['segment']
        month = row['month']
        base_lambda = row['lambda']

        base_mean = base_usage_mean[segment]
        base_std = base_usage_std[segment]
        deltas = usage_change_distribution[segment]
        probs = usage_change_probs[segment]

        if user_id not in prev_usage:
            # First observed month for this user: draw a starting usage level.
            data_mb = np.round(max(0, np.random.normal(base_mean, base_std)), 2)
            prev_data_mb = np.nan
        else:
            prev_data_mb = prev_usage[user_id]
            delta = np.random.choice(deltas, p=probs)
            data_mb = np.round(max(0, prev_data_mb * (1 + delta)), 2)

        prev_usage[user_id] = data_mb

        # Lambda modifiers
        lambda_usage_modifier = get_usage_modifier(data_mb)
        # The small epsilon avoids division by zero when last month's usage was 0.
        # NOTE(review): usage_delta is recomputed from the rounded usage values, so
        # a drawn change that sits exactly on a threshold (-0.5, -0.2, 0.2) may fall
        # on either side of it — confirm whether that is acceptable.
        usage_delta = (data_mb - prev_data_mb) / (prev_data_mb + 1e-6) if not pd.isna(prev_data_mb) else np.nan
        lambda_trend_modifier = get_delta_modifier(usage_delta)

        # Final adjusted lambda
        lambda_adj = base_lambda * lambda_usage_modifier * lambda_trend_modifier
        if noise_sd > 0:
            lambda_adj += np.random.normal(loc=0.0, scale=noise_sd)
        # Noise can push a small lambda below zero, which would give a negative
        # churn probability; a rate cannot be negative, so floor it at 0.
        lambda_adj = max(lambda_adj, 0.0)

        p_churn = 1 - np.exp(-lambda_adj)

        usage_rows.append({
            'user_id': user_id,
            'month': month,
            'data_mb': data_mb,
            'voice_minutes': simulate_voice_segment(segment),
            'sms_count': simulate_sms_segment(segment),
            'lambda_usage_modifier': lambda_usage_modifier,
            'usage_delta': usage_delta,
            'lambda_trend_modifier': lambda_trend_modifier,
            'lambda_adj': lambda_adj,
            'p_churn': p_churn
        })

    return pd.DataFrame(usage_rows)


# Simulate usage and the usage-adjusted churn probability for every profile row
# (function defaults: seed=42, noise_sd=0.01).
df_usage = simulate_usage_with_lambda_adjustment(df_profile)
df_usage.head()

Unnamed: 0,user_id,month,data_mb,voice_minutes,sms_count,lambda_usage_modifier,usage_delta,lambda_trend_modifier,lambda_adj,p_churn
0,U0000000,2024_07,174.84,0,0,1.0,,1.0,0.248617,0.220122
1,U0000000,2024_08,69.94,0,4,1.2,-0.599977,1.15,0.34779,0.293753
2,U0000000,2024_09,69.94,0,0,1.2,0.0,1.0,0.310105,0.26663
3,U0000000,2024_10,48.96,0,0,1.2,-0.299971,1.1,0.312751,0.268568
4,U0000000,2024_11,34.27,0,0,1.2,-0.300041,1.1,0.324377,0.277022


## Simulate Monthly Churn

In [156]:
def simulate_monthly_churn_events_from_profiles(profile_df, usage_df, p_col='p_churn', seed=None):
    """
    Simulates monthly churn using profile and usage tables.
    Merges on ['user_id', 'month'], simulates churn using p_churn.
    Keeps user rows only up to the month they churn.

    Args:
        profile_df: Profile table with ``user_id`` and ``month`` columns.
        usage_df: Usage table with ``user_id``, ``month`` and the ``p_col`` column.
        p_col: Name of the column holding the per-row churn probability.
        seed: Optional seed for ``np.random.seed``. When None (the default) the
            global NumPy RNG state is used as-is, so results depend on whatever
            was drawn before this call.

    Returns:
        DataFrame of the merged rows, ordered by month and then user, with a
        ``churned`` column (0/1). A user's last row is the month in which they
        churned; later months are dropped. An empty merge yields an empty frame
        that still carries the ``churned`` column.
    """
    if seed is not None:
        np.random.seed(seed)

    # Merge the profile and usage data
    df = profile_df.merge(usage_df, on=['user_id', 'month'], how='inner')
    df = df.sort_values(['user_id', 'month']).copy()
    df['churned'] = 0

    # pd.concat raises on an empty list, so return early when nothing matched.
    if df.empty:
        return df.reset_index(drop=True)

    churned_users = set()
    final_records = []

    for month in sorted(df['month'].unique()):
        # Rows for this month, excluding users who churned in an earlier month.
        # A single mask followed by one copy avoids assigning into a filtered view.
        still_active = (df['month'] == month) & ~df['user_id'].isin(churned_users)
        this_month_df = df.loc[still_active].copy()

        # Simulate churn based on p_churn
        this_month_df['churned'] = np.random.binomial(n=1, p=this_month_df[p_col])

        # Record users who churned this month
        churned_now = set(this_month_df.loc[this_month_df['churned'] == 1, 'user_id'])
        churned_users.update(churned_now)

        final_records.append(this_month_df)

    return pd.concat(final_records, ignore_index=True)


# Apply churn: each user keeps rows only up to and including the month they churn.
# NOTE: no seed is set for this step, so the draws continue from the current global
# NumPy RNG state; re-running this cell on its own gives a different result.
df_churn = simulate_monthly_churn_events_from_profiles(df_profile, df_usage)
df_churn.head()

Unnamed: 0,user_id,month,segment,tenure_months,tenure_class,registration_date,lambda,device_type,gender,age,data_mb,voice_minutes,sms_count,lambda_usage_modifier,usage_delta,lambda_trend_modifier,lambda_adj,p_churn,churned
0,U0000000,2024_07,01_new_user,4,01_new,2024-03-01,0.25,feature_phone,female,38,174.84,0,0,1.0,,1.0,0.248617,0.220122,0
1,U0000001,2024_07,01_new_user,2,01_new,2024-05-01,0.25,android,male,38,134.94,0,1,1.0,,1.0,0.236533,0.21064,0
2,U0000002,2024_07,01_new_user,4,01_new,2024-03-01,0.25,android,female,38,218.59,0,0,1.0,,1.0,0.251756,0.222565,0
3,U0000003,2024_07,01_new_user,2,01_new,2024-05-01,0.25,iphone,unknown,19,133.9,0,0,1.0,,1.0,0.258135,0.227509,0
4,U0000004,2024_07,01_new_user,4,01_new,2024-03-01,0.25,android,male,39,143.79,0,0,1.0,,1.0,0.257403,0.226944,0


In [161]:
# Observed churn rate per month (index) and segment (columns); missing pairs show 0.
(df_churn.groupby(['month', 'segment'])['churned']
    .mean()
    .unstack('segment')
    .fillna(0))

segment,01_new_user,02_youth,03_value_seeker,04_mature,05_high_roller
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024_07,0.227741,0.149864,0.11414,0.082727,0.043636
2024_08,0.219101,0.149984,0.109416,0.083408,0.038194
2024_09,0.243458,0.162015,0.123059,0.077957,0.05298
2024_10,0.250906,0.162925,0.101715,0.080602,0.044212
2024_11,0.252803,0.153435,0.110851,0.082858,0.048762
2024_12,0.26644,0.171769,0.113618,0.071553,0.040632
2025_01,0.269861,0.158737,0.121676,0.080499,0.044461
2025_02,0.257221,0.160488,0.100124,0.075868,0.046099
2025_03,0.211511,0.170394,0.118658,0.079137,0.043478
2025_04,0.277457,0.164799,0.107397,0.070946,0.035088


In [162]:
# Observed churn rate per month (index) and tenure class (columns); missing pairs show 0.
(df_churn.groupby(['month', 'tenure_class'])['churned']
    .mean()
    .unstack('tenure_class')
    .fillna(0))

tenure_class,01_new,02_post_onboarding,03_established,04_6_to_10_years,05_over_10_years
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024_07,0.173181,0.1272,0.104231,0.096875,0.08
2024_08,0.144298,0.143993,0.107293,0.087897,0.073203
2024_09,0.149158,0.15618,0.109533,0.098337,0.085948
2024_10,0.128813,0.15861,0.107143,0.091195,0.096681
2024_11,0.130093,0.154108,0.10213,0.10119,0.071429
2024_12,0.127849,0.168482,0.112693,0.081784,0.068404
2025_01,0.130157,0.145788,0.109635,0.097441,0.061538
2025_02,0.121107,0.140789,0.094675,0.068376,0.072695
2025_03,0.12225,0.125118,0.113893,0.08613,0.087687
2025_04,0.124171,0.132386,0.081395,0.068592,0.065476


## Generate Topup and Promo Tables
Note that for this activity, these tables should be inconsequential to the final output. However, they may affect the model accuracy due to **confounding** since these tables are simulated using the usage and profile tables.

In [163]:
def generate_topup_transactions(churn_df, seed=42):
    """Simulate top-up transactions for every (user, month) row of ``churn_df``.

    The number of top-ups in a month depends on that month's ``data_mb``; the
    amount and channel of each top-up are drawn from a segment-specific profile.
    Each top-up is dated on one of the first 28 days of the month.

    Args:
        churn_df: DataFrame with the columns ``user_id``, ``month`` (formatted
            ``YYYY_MM``), ``segment`` and ``data_mb``.
        seed: Seed passed to ``np.random.seed`` (global NumPy RNG state).

    Returns:
        DataFrame with one row per top-up and the columns ``user_id``, ``month``,
        ``topup_date``, ``topup_amount`` and ``channel``. The columns are present
        even when no top-up was generated.

    Raises:
        KeyError: If a row's ``segment`` is not one of the five segment codes.
    """
    np.random.seed(seed)
    output_columns = ["user_id", "month", "topup_date", "topup_amount", "channel"]
    topup_records = []

    segment_topup_profiles = {
        '01_new_user': {"channels": ['app', 'retailer'], "amounts": [20, 50, 100], "weights": [0.5, 0.3, 0.2]},
        '02_youth': {"channels": ['app', 'ussd'], "amounts": [10, 30, 50], "weights": [0.4, 0.4, 0.2]},
        '03_value_seeker': {"channels": ['ussd', 'retailer'], "amounts": [10, 20, 50], "weights": [0.6, 0.3, 0.1]},
        '04_mature': {"channels": ['retailer', 'app'], "amounts": [50, 100, 200], "weights": [0.3, 0.4, 0.3]},
        '05_high_roller': {"channels": ['app'], "amounts": [200, 300, 500], "weights": [0.2, 0.5, 0.3]},
    }

    for _, row in churn_df.iterrows():
        usage = row['data_mb']
        segment = row['segment']
        profile = segment_topup_profiles[segment]

        # Heavier data users top up more often.
        if usage > 1000:
            n_topups = np.random.choice([2, 3])
        elif usage > 500:
            n_topups = np.random.choice([1, 2])
        elif usage > 100:
            n_topups = np.random.choice([0, 1, 2], p=[0.2, 0.6, 0.2])
        else:
            n_topups = np.random.choice([0, 1], p=[0.7, 0.3])

        if n_topups == 0:
            continue

        # The month start is the same for all of this row's top-ups; parse it once.
        year, month = row['month'].split('_')
        month_start = pd.to_datetime(f"{year}-{month}-01")

        for _ in range(n_topups):
            amount = np.random.choice(profile["amounts"], p=profile["weights"])
            channel = np.random.choice(profile["channels"])
            # Offsets 0..27 keep the date inside the month, including February.
            topup_date = month_start + timedelta(days=np.random.randint(0, 28))
            topup_records.append({
                "user_id": row["user_id"],
                "month": row["month"],
                "topup_date": topup_date,
                "topup_amount": amount,
                "channel": channel
            })

    # Naming the columns keeps the schema stable when no record was produced,
    # so downstream merges on ['user_id', 'month'] still work.
    return pd.DataFrame(topup_records, columns=output_columns)

# Generate top-ups from the post-churn table, so only surviving (user, month) rows get transactions.
df_topup = generate_topup_transactions(df_churn)
df_topup.head()

Unnamed: 0,user_id,month,topup_date,topup_amount,channel
0,U0000000,2024_07,2024-07-08,100,app
1,U0000001,2024_07,2024-07-23,20,app
2,U0000003,2024_07,2024-07-03,50,retailer
3,U0000003,2024_07,2024-07-24,20,retailer
4,U0000004,2024_07,2024-07-28,20,retailer


In [167]:
# Total top-up amount per month (index) and channel (columns); missing pairs show 0.
(df_topup.groupby(['month', 'channel'])['topup_amount']
    .sum()
    .unstack('channel')
    .fillna(0))

channel,app,retailer,ussd
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024_07,1122380,183780,98080
2024_08,1138850,178730,85230
2024_09,1156450,174500,79400
2024_10,1181970,168780,74360
2024_11,1229190,167450,68950
2024_12,1244950,166690,65440
2025_01,1283160,162710,61650
2025_02,1320680,172780,57310
2025_03,1344510,183080,55890
2025_04,1370610,172730,56060


In [164]:
def generate_promo_registration(churn_df, seed=42):
    """Simulate promo registrations for every (user, month) row of ``churn_df``.

    Each row joins at least one promo with a probability that starts from a
    segment-specific base and is adjusted by that month's ``data_mb``. A joining
    user registers for one promo (80%) or two (20%), each with a uniformly
    chosen promo type and a date on one of the first 28 days of the month.

    Args:
        churn_df: DataFrame with the columns ``user_id``, ``month`` (formatted
            ``YYYY_MM``), ``segment`` and ``data_mb``.
        seed: Seed passed to ``np.random.seed`` (global NumPy RNG state).

    Returns:
        DataFrame with one row per registration and the columns ``user_id``,
        ``month``, ``registration_date`` (the promo registration date) and
        ``promo_type``. The columns are present even when nobody registered.

    Raises:
        KeyError: If a row's ``segment`` is not one of the five segment codes.
    """
    np.random.seed(seed)
    output_columns = ["user_id", "month", "registration_date", "promo_type"]
    promo_records = []

    promo_types = ['data_booster', 'combo_pack', 'weekend_special', 'social_bundle']
    segment_promo_probs = {
        '01_new_user': 0.3,
        '02_youth': 0.6,
        '03_value_seeker': 0.7,
        '04_mature': 0.3,
        '05_high_roller': 0.2
    }

    for _, row in churn_df.iterrows():
        usage = row['data_mb']
        segment = row['segment']
        base_prob = segment_promo_probs[segment]

        # Heavier data users are more likely to register for a promo.
        if usage > 1000:
            p_join = min(1.0, base_prob + 0.2)
        elif usage > 500:
            p_join = base_prob
        elif usage > 100:
            p_join = base_prob * 0.5
        else:
            p_join = base_prob * 0.25

        joined = np.random.rand() < p_join
        if not joined:
            continue

        n_promos = np.random.choice([1, 2], p=[0.8, 0.2])
        # The month start is the same for all of this row's promos; parse it once.
        year, month = row['month'].split('_')
        month_start = pd.to_datetime(f"{year}-{month}-01")

        for _ in range(n_promos):
            # Offsets 0..27 keep the date inside the month, including February.
            promo_date = month_start + timedelta(days=np.random.randint(0, 28))
            promo_type = np.random.choice(promo_types)
            promo_records.append({
                "user_id": row["user_id"],
                "month": row["month"],
                "registration_date": promo_date,
                "promo_type": promo_type
            })

    # Naming the columns keeps the schema stable when no record was produced,
    # so downstream merges on ['user_id', 'month'] still work.
    return pd.DataFrame(promo_records, columns=output_columns)


# Generate promo registrations from the post-churn table, so only surviving (user, month) rows get promos.
df_promo = generate_promo_registration(df_churn)
df_promo.head()

Unnamed: 0,user_id,month,registration_date,promo_type
0,U0000006,2024_07,2024-07-04,social_bundle
1,U0000006,2024_07,2024-07-24,weekend_special
2,U0000007,2024_07,2024-07-12,combo_pack
3,U0000007,2024_07,2024-07-06,combo_pack
4,U0000023,2024_07,2024-07-21,data_booster


In [168]:
# Number of promo registrations per month (index) and promo type (columns); missing pairs show 0.
(df_promo.groupby(['month', 'promo_type'])['user_id']
    .count()
    .unstack('promo_type')
    .fillna(0))

promo_type,combo_pack,data_booster,social_bundle,weekend_special
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024_07,1288,1283,1239,1216
2024_08,1106,1074,1151,1115
2024_09,1058,1051,1017,1037
2024_10,987,1057,949,1073
2024_11,970,984,992,969
2024_12,988,995,958,953
2025_01,950,918,909,936
2025_02,922,888,859,854
2025_03,947,870,861,915
2025_04,860,944,839,848


## Validation

`df_churn` contains all the valid month and user_id combinations. Other entries should be excluded from the other tables.

In [169]:
# Label table: one row per surviving (user, month) with the simulated churn flag.
# NOTE(review): p_churn (the probability the flag was drawn from) is kept in this table and
# is written to df_churn_label.csv by the save cell — confirm it should be shared with participants.
df_churn_label = df_churn.loc[:, ['user_id', 'month', 'p_churn', 'churned']]
df_churn_label.head()

Unnamed: 0,user_id,month,p_churn,churned
0,U0000000,2024_07,0.220122,0
1,U0000001,2024_07,0.21064,0
2,U0000002,2024_07,0.222565,0
3,U0000003,2024_07,0.227509,0
4,U0000004,2024_07,0.226944,0


In [170]:
# Valid (user_id, month) keys: the pairs that remain after churn; used to filter the other tables.
df_unique_user_months = df_churn.loc[:, ['user_id', 'month']].drop_duplicates()

In [171]:
# Keep only profile rows whose (user_id, month) pair survived the churn simulation.
df_profile_updated = df_profile.merge(df_unique_user_months, on=['user_id', 'month'], how='inner')

# Drop lambda: it is a simulation parameter, not an observable customer attribute.
# (p_churn and churned are not columns of the profile table, so there is nothing else to drop.)
df_profile_updated = df_profile_updated.drop(columns=['lambda'])

df_profile_updated.head()

Unnamed: 0,user_id,month,segment,tenure_months,tenure_class,registration_date,device_type,gender,age
0,U0000000,2024_07,01_new_user,4,01_new,2024-03-01,feature_phone,female,38
1,U0000000,2024_08,01_new_user,5,01_new,2024-03-01,feature_phone,female,38
2,U0000000,2024_09,01_new_user,6,01_new,2024-03-01,feature_phone,female,38
3,U0000000,2024_10,01_new_user,7,02_post_onboarding,2024-03-01,feature_phone,female,38
4,U0000000,2024_11,01_new_user,8,02_post_onboarding,2024-03-01,feature_phone,female,38


In [187]:
# Keep only usage rows whose (user_id, month) pair survived the churn simulation.
df_usage_updated = df_usage.merge(df_unique_user_months, on=['user_id', 'month'], how='inner')
# Remove the simulation internals (lambda modifiers, adjusted lambda and churn probability)
# so that only the observable usage columns remain.
cols_to_drop = ['lambda_usage_modifier', 'usage_delta', 'lambda_trend_modifier',
                'lambda_adj', 'p_churn']
df_usage_updated = df_usage_updated.drop(columns=cols_to_drop)
df_usage_updated.head()

Unnamed: 0,user_id,month,data_mb,voice_minutes,sms_count
0,U0000000,2024_07,174.84,0,0
1,U0000000,2024_08,69.94,0,4
2,U0000000,2024_09,69.94,0,0
3,U0000000,2024_10,48.96,0,0
4,U0000000,2024_11,34.27,0,0


In [173]:
# Restrict top-ups to valid (user_id, month) pairs. df_topup was generated from df_churn,
# so this inner merge is expected to keep every row; it acts as a safeguard.
df_topup_updated = df_topup.merge(df_unique_user_months, on=['user_id', 'month'], how='inner')
df_topup_updated.head()

Unnamed: 0,user_id,month,topup_date,topup_amount,channel
0,U0000000,2024_07,2024-07-08,100,app
1,U0000001,2024_07,2024-07-23,20,app
2,U0000003,2024_07,2024-07-03,50,retailer
3,U0000003,2024_07,2024-07-24,20,retailer
4,U0000004,2024_07,2024-07-28,20,retailer


In [174]:
# Restrict promo registrations to valid (user_id, month) pairs. df_promo was generated from
# df_churn, so this inner merge is expected to keep every row; it acts as a safeguard.
df_promo_updated = df_promo.merge(df_unique_user_months, on=['user_id', 'month'], how='inner')
df_promo_updated.head()

Unnamed: 0,user_id,month,registration_date,promo_type
0,U0000006,2024_07,2024-07-04,social_bundle
1,U0000006,2024_07,2024-07-24,weekend_special
2,U0000007,2024_07,2024-07-12,combo_pack
3,U0000007,2024_07,2024-07-06,combo_pack
4,U0000023,2024_07,2024-07-21,data_booster


In [175]:
# Check the number of rows of all updated tables. The churn, profile and usage tables hold one
# row per (user_id, month) and should have equal lengths; the top-up and promo tables are
# transaction-level, so their lengths are expected to differ.
len(df_churn), len(df_profile_updated), len(df_usage_updated), len(df_topup_updated), len(df_promo_updated)

(115950, 115950, 115950, 146799, 46764)

The numbers above show that the churn, profile and usage tables have the same number of rows, i.e. every (user_id, month) pair has a corresponding profile row, usage row and churn tag for all included months.

In [188]:
# Save the updated tables as CSV files under <src_path>/fake_data.
output_dir = Path(src_path) / 'fake_data'
# to_csv does not create missing folders, so make sure the target directory exists first.
output_dir.mkdir(parents=True, exist_ok=True)

df_churn_label.to_csv(output_dir / 'df_churn_label.csv', index=False)
df_profile_updated.to_csv(output_dir / 'customer_profile.csv', index=False)
df_usage_updated.to_csv(output_dir / 'usage_summary.csv', index=False)
df_topup_updated.to_csv(output_dir / 'topup_transactions.csv', index=False)
df_promo_updated.to_csv(output_dir / 'promo_registration.csv', index=False)

In [186]:
# Number of users who churned in each month.
df_churn_label.groupby('month').churned.sum()

Unnamed: 0_level_0,churned
month,Unnamed: 1_level_1
2024_07,1415
2024_08,1301
2024_09,1346
2024_10,1225
2024_11,1186
2024_12,1172
2025_01,1141
2025_02,1040
2025_03,1068
2025_04,1022
