In [1]:
import pandas as pd
import numpy as np

# This step joins the raw event log with the smoothed n-gram churn
# probabilities and writes one graded risk record per event row.
SMOOTHED_PROBS_PATH = 'smoothed_probabilities.csv'
RAW_EVENTS_PATH = 'raw_events.csv'
OUTPUT_PATH = 'daily_risk_grades.csv'

# Length, in days, of the lookback window used to collect a customer's
# recent events.
EVENT_WINDOW_DAYS = 30

# Longest n-gram (in events) that is looked up in the probability table.
MAX_N_GRAM = 3

# Letter grade -> (lower bound, upper bound) on the churn probability.
# get_grade treats the lower bound as inclusive and the upper bound as
# exclusive. The cut-offs are arbitrary; an extra normalization step would be
# one way to improve them.
GRADING_SCALE = {
    # Very low risk
    'A': (0.00, 0.05),
    # Low risk
    'B': (0.05, 0.10),
    # Moderate risk (the band just below the 0.15 prior-mean default)
    'C': (0.10, 0.15),
    # High risk (a probability of exactly 0.15 lands here)
    'D': (0.15, 0.25),
    # Very high risk
    'F': (0.25, 1.00),
}

def get_grade(probability):
    """Map a churn probability to its letter grade.

    The bands of GRADING_SCALE are checked in definition order and the first
    one whose half-open interval ``[lower, upper)`` contains ``probability``
    wins. A value that matches no band (exactly 1.00, anything above it,
    negative values, or NaN) is graded 'F'.
    """
    matching_grades = (
        letter
        for letter, bounds in GRADING_SCALE.items()
        if bounds[0] <= probability < bounds[1]
    )
    # 'F' doubles as the fallback for values outside every band.
    return next(matching_grades, 'F')

def get_ngrams(sequence, max_n):
    """Return the trailing n-grams of ``sequence`` for n = 1 .. ``max_n``.

    Only n-grams that end at the last element are produced (one per size,
    shortest first), each as a tuple. Sizes longer than the sequence are
    skipped, so an empty sequence yields an empty list.
    """
    # No n-gram can be longer than the sequence itself.
    longest = min(max_n, len(sequence))
    return [tuple(sequence[-size:]) for size in range(1, longest + 1)]

def process_customer_risk(customer_df, prob_map, default_prob=0.15):
    """Compute one risk record per event row for a single customer.

    For every event, the events dated within the trailing
    EVENT_WINDOW_DAYS-day window (start exclusive, the event's own date
    inclusive) are collected in chronological order. The n-grams ending at
    the last event of that window are looked up in ``prob_map`` and the
    highest probability found drives the grade. This is a simple approach;
    a better one would account for ALL n-grams within the window.

    Args:
        customer_df: Events of one customer with the columns 'customer_id',
            'event_date', 'event_code' and 'churn_in_90_days'. The frame is
            not modified.
        prob_map: Mapping from a space-joined n-gram string to its smoothed
            churn probability.
        default_prob: Probability assumed for an n-gram missing from
            ``prob_map`` (the prior mean). Because the maximum is taken, the
            reported probability is at least this value whenever any
            candidate n-gram is unseen.

    Returns:
        A list of dicts, one per event row in chronological order, with the
        keys 'customer_id', 'event_date', 'churn_probability', 'risk_grade',
        'driving_ngram' and 'actual_churn_in_90_days'. 'driving_ngram' is
        None when no candidate probability exceeds 0.
    """
    # Parse the dates *before* sorting so the order is chronological rather
    # than lexicographic on the raw strings. A stable sort keeps same-day
    # events in their original relative order, which matters because the
    # n-grams depend on event order.
    customer_df = customer_df.assign(
        event_date=pd.to_datetime(customer_df['event_date'])
    ).sort_values(by='event_date', kind='mergesort')

    event_dates = customer_df['event_date']
    event_codes = customer_df['event_code']
    window_span = pd.Timedelta(days=EVENT_WINDOW_DAYS)

    daily_risk = []

    rows = zip(
        customer_df['customer_id'],
        event_dates,
        customer_df['churn_in_90_days'],
    )
    for customer_id, current_date, churned in rows:
        # Lookback window (current_date - span, current_date]. Every event
        # sharing the current date is included, even one listed after the
        # current row.
        in_window = (
            (event_dates > current_date - window_span)
            & (event_dates <= current_date)
        )
        event_sequence = event_codes[in_window].tolist()

        # Keep the highest-risk n-gram among those ending at the most recent
        # event of the window.
        max_prob = 0.0
        best_ngram = None
        for ngram in get_ngrams(event_sequence, MAX_N_GRAM):
            # str() lets non-string event codes (e.g. integers) form a key.
            ngram_key = ' '.join(map(str, ngram))
            prob = prob_map.get(ngram_key, default_prob)
            if prob > max_prob:
                max_prob = prob
                best_ngram = ngram_key

        daily_risk.append({
            'customer_id': customer_id,
            'event_date': current_date,
            'churn_probability': max_prob,
            'risk_grade': get_grade(max_prob),
            'driving_ngram': best_ngram,
            'actual_churn_in_90_days': churned,
        })

    return daily_risk

# MAIN

if __name__ == "__main__":
    # Script entry point: load both inputs, grade every customer's events and
    # write the combined result to OUTPUT_PATH.
    print("1. Reading raw events and smoothed probabilities...")
    try:
        raw_events_df = pd.read_csv(RAW_EVENTS_PATH)
        smoothed_df = pd.read_csv(SMOOTHED_PROBS_PATH)
    except FileNotFoundError as e:
        print(f"Error: {e}. Run the previous steps")
        # Stop with a non-zero status so a calling shell or pipeline can see
        # the failure. SystemExit is raised directly because the `exit`
        # helper is installed by the `site` module for interactive use only.
        raise SystemExit(1) from e

    # Create a map from n-gram string to smoothed probability for fast lookup
    prob_map = smoothed_df.set_index('ngram')['smoothed_probability'].to_dict()

    print("2. Calculating daily risk and assigning grades for all customers...")
    all_daily_risk = []

    customer_groups = raw_events_df.groupby('customer_id')

    # Collect the risk records of each customer; the group key is not needed
    # because every record already carries its customer_id.
    for _cust_id, group in customer_groups:
        all_daily_risk.extend(process_customer_risk(group, prob_map))

    daily_risk_df = pd.DataFrame(all_daily_risk)

    # Order the records by customer, then chronologically. An empty result
    # has no columns, so sorting it by name would raise a KeyError.
    if not daily_risk_df.empty:
        daily_risk_df = (
            daily_risk_df
            .sort_values(by=['customer_id', 'event_date'])
            .reset_index(drop=True)
        )

    # Save the final daily risk grades file
    daily_risk_df.to_csv(OUTPUT_PATH, index=False)

    print(f"\nRisk grading complete. Daily risk grades saved to {OUTPUT_PATH}")
    print(f"Total daily risk records generated: {len(daily_risk_df)}")


1. Reading raw events and smoothed probabilities...
2. Calculating daily risk and assigning grades for all customers...

Risk grading complete. Daily risk grades saved to daily_risk_grades.csv
Total daily risk records generated: 27917
