In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

# constants
# Length, in days, of the lookback window that ends at each observed event.
EVENT_WINDOW_DAYS = 30
# Longest n-gram (number of consecutive events) requested from get_ngrams.
MAX_N_GRAM = 3
# CSV file that receives the aggregated n-gram counts.
OUTPUT_PATH = 'ngram_counts.csv'

""" in practice, you would not know what the ngrams would be until you have
the simulated data. But since I simulated the data previously, I know what ngrams to
look for. In a real situation you would be going off of whatever the customers happened to
do. You could extract your own ngrams from the data as you saw fit."""

def get_ngrams(sequence, max_n):
    """Return the trailing n-grams of ``sequence`` for n = 1 .. ``max_n``.

    Only n-grams that end at the last element are produced, shortest first.
    Sizes larger than the sequence length are skipped, so an empty sequence
    yields an empty list.

    Args:
        sequence: Ordered, sliceable collection of items.
        max_n: Largest n-gram size to produce.

    Returns:
        A list of tuples, one per n-gram size that fits in ``sequence``.
    """
    available = len(sequence)
    return [
        tuple(sequence[-size:])
        for size in range(1, max_n + 1)
        if size <= available
    ]

def process_customer_events(customer_df):
    """
    Extract trailing n-grams for every event in one customer's history.

    Each event in turn is treated as the observation point. The events up to
    and including it that fall inside the trailing ``EVENT_WINDOW_DAYS``
    window form a sequence, and ``get_ngrams`` returns the n-grams (up to
    ``MAX_N_GRAM`` long) that end at that event.

    Args:
        customer_df: Events of a single customer with the columns
            ``event_date``, ``event_code`` and ``churn_in_90_days``.
            The input frame is not modified.

    Returns:
        A list of dicts with the keys ``ngram`` (tuple of event codes),
        ``count`` (always 1) and ``churn_count`` (the observation event's
        ``churn_in_90_days`` value).
    """
    # Parse the dates BEFORE sorting: sorting the raw strings is only correct
    # for ISO-formatted dates. ``assign`` works on a copy of the input.
    events = customer_df.assign(
        event_date=pd.to_datetime(customer_df['event_date'])
    )
    # A stable sort keeps events sharing a timestamp in their input order, so
    # the extracted sequences are deterministic.
    events = events.sort_values(by='event_date', kind='mergesort')

    dates = events['event_date'].tolist()
    codes = events['event_code'].tolist()
    churn_targets = events['churn_in_90_days'].tolist()
    lookback = pd.Timedelta(days=EVENT_WINDOW_DAYS)

    customer_ngram_data = []
    # Index of the oldest event still inside the lookback window. Dates are
    # sorted, so it only ever moves forward (one pass instead of a full
    # re-filter of the frame per event).
    start = 0
    for i, (current_date, churn_target) in enumerate(zip(dates, churn_targets)):
        # An unparseable/missing date cannot anchor a window; such rows sort
        # last and contribute no n-grams.
        if pd.isna(current_date):
            continue

        # Drop events at or before the window start (window is exclusive at
        # the old end, inclusive at the current event).
        window_start_date = current_date - lookback
        while start <= i and dates[start] <= window_start_date:
            start += 1

        # Slice up to position i only: later events that share the same
        # timestamp must not be appended, otherwise the n-grams would end at
        # a different event than the one whose churn target is recorded.
        event_sequence = codes[start:i + 1]

        for ngram in get_ngrams(event_sequence, MAX_N_GRAM):
            customer_ngram_data.append({
                'ngram': ngram,
                'count': 1,
                'churn_count': churn_target
            })

    return customer_ngram_data

def aggregate_ngrams(ngram_data):
    """Aggregate occurrence and churn counts for every unique n-gram.

    Args:
        ngram_data: Iterable of dicts with the keys ``ngram`` (tuple of event
            codes), ``count`` and ``churn_count``.

    Returns:
        A DataFrame with one row per unique n-gram and the columns ``ngram``
        (event codes joined by a single space), ``n_size`` (number of events
        in the n-gram), ``count`` and ``churn_count``. The columns are present
        even when ``ngram_data`` is empty.
    """
    columns = ['ngram', 'n_size', 'count', 'churn_count']
    ngram_stats = defaultdict(lambda: {'count': 0, 'churn_count': 0})

    for item in ngram_data:
        stats = ngram_stats[item['ngram']]
        stats['count'] += item['count']
        stats['churn_count'] += item['churn_count']

    aggregated_list = [
        {
            # str() lets non-string event codes (e.g. integers) be joined.
            'ngram': ' '.join(map(str, ngram)),
            'n_size': len(ngram),  # number of events in the sequence
            'count': stats['count'],
            'churn_count': stats['churn_count'],
        }
        for ngram, stats in ngram_stats.items()
    ]

    # Naming the columns explicitly keeps the schema stable for empty input,
    # so callers can still select columns from the result.
    return pd.DataFrame(aggregated_list, columns=columns)

# MAIN

if __name__ == "__main__":
    # Script entry point: read raw_events.csv, extract n-grams per customer,
    # aggregate them and write the counts to OUTPUT_PATH.
    print("1. Reading raw event data...")
    try:
        raw_events_df = pd.read_csv('raw_events.csv')
    except FileNotFoundError:
        # SystemExit with a message writes it to stderr and exits with a
        # non-zero status, so callers can detect the failure (the site-provided
        # exit() helper reported success and is not guaranteed to exist).
        raise SystemExit(
            "Error: raw_events.csv not found; run data_simulator.py first."
        ) from None

    # Group by customer and process each customer's history independently
    print("2. Processing events and extracting n-grams for all customers.")
    all_ngram_data = []
    for _, group in raw_events_df.groupby('customer_id'):
        all_ngram_data.extend(process_customer_events(group))

    print(f"3. Aggregating {len(all_ngram_data)} n-gram instances.")
    ngram_df = aggregate_ngrams(all_ngram_data)

    # Raw (unsmoothed) churn rate per n-gram; smoothing is left to a later step
    ngram_df['raw_churn_rate'] = ngram_df['churn_count'] / ngram_df['count']

    # Save the aggregated counts
    ngram_df.to_csv(OUTPUT_PATH, index=False)
    print(f"\nN-gram processing complete. Results saved to {OUTPUT_PATH}")
    print(f"Total unique n-grams found: {len(ngram_df)}")
    print(f"total n-gram instances processed: {ngram_df['count'].sum()}")


1. Reading raw event data...
2. Processing events and extracting n-grams for all customers.
3. Aggregating 83451 n-gram instances.

N-gram processing complete. Results saved to ngram_counts.csv
Total unique n-grams found: 584
total n-gram instances processed: 83451
