In [1]:
import pandas as pd
import numpy as np
from datetime import date, timedelta

# Constants that drive the data simulation.
# The date range spans at least one full year of data; it is kept short
# because a longer range makes the downstream pipeline too slow to run.
START_DATE = date(2024, 1, 1)
END_DATE = date(2025, 11, 30)
N_CUSTOMERS = 100
TARGET_CHURN_RATE = 0.15
CHURN_WINDOW_DAYS = 90    # Prediction window
RISK_INJECTION_DAYS = 30  # Window before churn in which high-risk events are "injected"

# Event alphabet: every event code the simulator can emit, mapped to a
# relative churn-risk weight. These values exist only to simulate data.
# The label in parentheses on each line is the author's qualitative rating.
EVENT_ALPHABET = {
    "DUL": 0.05,  # Device Usage Low (Low/neutral)
    "DUH": 0.05,  # Device Usage High (Low/Neutral)
    "SCT": 0.15,  # Support Call - Technical (Moderate)
    "SCB": 0.25,  # Support Call - Billing (high)
    "CM": 0.05,   # Contact - Marketing (Low)
    "CS": 0.05,   # Contact - Sales (low)
    "EE": 0.20,   # Error Encountered (Moderate/High)
    "CW": 0.25,   # Cancellation Warning (very high)
}

# Codes in dictionary order; positional weight arrays must follow this order.
EVENT_CODES = list(EVENT_ALPHABET)

# The raw weights do not sum to 1, so rescale them into a valid sampling
# distribution.
EVENT_CHURN_PROBS = np.array([EVENT_ALPHABET[code] for code in EVENT_CODES])
EVENT_CHURN_PROBS = EVENT_CHURN_PROBS / EVENT_CHURN_PROBS.sum()


def generate_customer_base(n_customers, start_date, end_date, target_churn_rate):
    """Generate the initial customer base with subscription and churn dates.

    All randomness comes from NumPy's global random state, so seed it
    (``np.random.seed``) before calling to get reproducible output.

    Args:
        n_customers: Number of customers to create; ids run 1..n_customers.
        start_date: First day of the simulation (``datetime.date``).
        end_date: Last day of the simulation; no churn date falls after it.
        target_churn_rate: Probability that any single customer is a churner.

    Returns:
        DataFrame with columns ``customer_id``, ``subscription_date``,
        ``is_churner``, ``churn_date`` (``None`` for non-churners) and
        ``initial_grade``.
    """
    # Subscription dates fall within the first 365 days after start_date
    # (offsets 0..364, because randint's upper bound is exclusive).
    customers = pd.DataFrame({
        'customer_id': range(1, n_customers + 1),
        'subscription_date': [
            start_date + timedelta(days=np.random.randint(0, 365))
            for _ in range(n_customers)
        ],
        'is_churner': np.random.rand(n_customers) < target_churn_rate,
    })

    # Churn dates are drawn from [start_date + 365 days, end_date). The lower
    # bound is later than every possible subscription date, so a churner
    # always churns after subscribing.
    earliest_churn = start_date + timedelta(days=365)
    churn_span_days = (end_date - earliest_churn).days

    churn_dates = []
    for is_churner in customers['is_churner']:
        if not is_churner:
            churn_dates.append(None)
        elif churn_span_days > 0:
            offset = np.random.randint(0, churn_span_days)
            churn_dates.append(earliest_churn + timedelta(days=offset))
        else:
            # Simulation window too short for the normal range: clamp the
            # churn date to the last simulated day so it stays in bounds.
            churn_dates.append(end_date)

    customers['churn_date'] = churn_dates

    # Assign a simple initial grade (A is low risk, F is high risk).
    grades = ['A', 'B', 'C', 'D', 'F']
    customers['initial_grade'] = np.random.choice(
        grades, size=n_customers, p=[0.3, 0.25, 0.2, 0.15, 0.1]
    )

    return customers

def generate_events(customer_base, start_date, end_date):
    """Simulate the daily event stream for every customer in the base.

    Each customer is walked day by day from their subscription date up to
    ``end_date`` (or their churn date, whichever comes first). On each day an
    event may be emitted; for churners the event mix shifts towards the
    high-risk codes during the last RISK_INJECTION_DAYS days before churn.

    Args:
        customer_base: DataFrame with ``customer_id``, ``subscription_date``,
            ``churn_date`` and ``is_churner`` columns.
        start_date: Not used by this function; kept in the signature.
        end_date: Last day (inclusive) for which events are generated.

    Returns:
        DataFrame with one row per event and columns ``customer_id``,
        ``event_date`` and ``event_code``.
    """
    # Baseline sampling weights, positionally aligned with EVENT_CODES
    # (low-risk events are favoured).
    baseline_weights = (0.2, 0.2, 0.1, 0.05, 0.15, 0.15, 0.1, 0.05)
    # Positions of the high-risk codes: SCB (3), EE (6), CW (7).
    risky_positions = [3, 6, 7]
    one_day = timedelta(days=1)

    records = []

    for _, customer in customer_base.iterrows():
        churn_date = customer['churn_date']
        is_churner = customer['is_churner']

        # Chance that any event at all happens on a given day; churners are
        # slightly less active than non-churners.
        daily_event_prob = 0.5 if is_churner else 0.6

        day = customer['subscription_date']
        # Stop at end_date, and never emit events after the churn date.
        while day <= end_date and (not churn_date or day <= churn_date):
            if np.random.rand() < daily_event_prob:
                weights = np.array(baseline_weights)

                # Churn pattern injection.
                if is_churner and churn_date:
                    days_left = (churn_date - day).days
                    if 0 <= days_left <= RISK_INJECTION_DAYS:
                        # Boost grows linearly from 1x (30 days out) to 3x
                        # (on the churn day itself).
                        boost = 1 + (RISK_INJECTION_DAYS - days_left) / RISK_INJECTION_DAYS * 2
                        weights[risky_positions] *= boost
                        # Re-normalise so the weights are a valid distribution.
                        weights /= weights.sum()

                records.append({
                    'customer_id': customer['customer_id'],
                    'event_date': day,
                    'event_code': np.random.choice(EVENT_CODES, p=weights),
                })

            day += one_day

    return pd.DataFrame(records)

# MAIN

if __name__ == "__main__":
    # Fixed seed so that repeated runs generate the same dataset.
    np.random.seed(42)

    def check_churn(row):
        """Return 1 if the event is 0..CHURN_WINDOW_DAYS days before churn, else 0."""
        churn_date = row['churn_date']
        if pd.isna(churn_date):
            # Customer never churns.
            return 0
        gap_days = (churn_date - row['event_date']).days
        return int(0 <= gap_days <= CHURN_WINDOW_DAYS)

    print("1. Generating customer base...")
    customer_base = generate_customer_base(N_CUSTOMERS, START_DATE, END_DATE, TARGET_CHURN_RATE)

    print("2. Generating daily events and injecting churn patterns...")
    raw_events_df = generate_events(customer_base, START_DATE, END_DATE)

    # Attach customer info (churn date, initial grade) to every event row.
    customer_info = customer_base[['customer_id', 'churn_date', 'initial_grade']]
    raw_events_df = raw_events_df.merge(customer_info, on='customer_id', how='left')

    # Target variable for later model training/evaluation. It is computed per
    # event row, so its mean is the share of events that fall inside the
    # churn window, not the customer-level churn rate.
    print("3. Calculating 90-day churn target variable...")
    raw_events_df['churn_in_90_days'] = raw_events_df.apply(check_churn, axis=1)

    # Order events chronologically within each customer.
    raw_events_df = raw_events_df.sort_values(by=['customer_id', 'event_date'])
    raw_events_df = raw_events_df.reset_index(drop=True)

    # Persist the raw events for the later pipeline stages.
    OUTPUT_PATH = 'raw_events.csv'
    raw_events_df.to_csv(OUTPUT_PATH, index=False)

    print(f"\nData saved to {OUTPUT_PATH}")
    print(f"Total events generated: {len(raw_events_df)}")
    print(f"Churners in base: {customer_base['is_churner'].sum()}")
    print(f"Events flagged for 90-day churn: {raw_events_df['churn_in_90_days'].sum()}")

    # Persist the customer base for reference (easiest to inspect in Excel).
    customer_base.to_csv('customer_base.csv', index=False)
    print("Customer base saved to customer_base.csv")


1. Generating customer base...
2. Generating daily events and injecting churn patterns...
3. Calculating 90-day churn target variable...

Data saved to raw_events.csv
Total events generated: 27917
Churners in base: 17
Events flagged for 90-day churn: 756
Customer base saved to customer_base.csv
