In [1]:
import pandas as pd
import numpy as np

# smoothing scores - need input file with ngrams and their stats
INPUT_PATH = 'ngram_counts.csv'
OUTPUT_PATH = 'smoothed_probabilities.csv'
# The prior mean is the overall churn rate from the base data (it is set at 15%)
PRIOR_MEAN = 0.15 
#The higher the prior strength the more observations needed to pull the score away from the prior.
PRIOR_STRENGTH = 5 

def apply_bayesian_smoothing(df, prior_mean, prior_strength):
    """
    Applies Bayesian smoothing to the raw churn rates.
    Formula: Smoothed Rate = (Count * Raw Rate + Prior Strength * Prior Mean) / (Count + Prior Strength)
    
    Where:
    -Count * Raw Rate = Churn Count
    Count = Total Count
    -Prior Strength * Prior Mean = Pseudo Churn Count
    -Prior Strength = Pseudo Total Count
    """
    
    pseudo_churn_count = prior_strength * prior_mean
    
    # Apply the smoothing formula
    df['smoothed_probability'] = (df['churn_count'] + pseudo_churn_count) / (df['count'] + prior_strength)
    
    return df

# MAIN

if __name__ == "__main__":
    print("1. Reading n-gram counts...")
    try:
        ngram_df = pd.read_csv(INPUT_PATH)
    except FileNotFoundError:
        print("Error: ngram_counts.csv not found. Run ngram_processor.py first.")
        exit()

    print(f"2. Applying Bayesian Smoothing with Prior Mean={PRIOR_MEAN} and Prior Strength={PRIOR_STRENGTH}.")
    
    # Apply smoothing to the entire dataset
    smoothed_df = apply_bayesian_smoothing(ngram_df, PRIOR_MEAN, PRIOR_STRENGTH)
    
    # Select and rename final columns
    final_df = smoothed_df[['ngram', 'n_size', 'count', 'churn_count', 'raw_churn_rate', 'smoothed_probability']]
    
    # Save the final smoothed probabilities
    final_df.to_csv(OUTPUT_PATH, index=False)
    
    print(f"\nBayesian smoothing done. Results saved to {OUTPUT_PATH}")
    print(f"Total unique n-grams smoothed: {len(final_df)}")
    
    # Display a comparison of raw vs. smoothed for low-count n-grams
    # should be very different
    print("\n--- Comparison of Raw vs. Smoothed Probabilities (Low Count) ---")
    low_count_df = final_df[final_df['count'] < 10].sort_values(by='count', ascending=True).head(5)
    print(low_count_df[['ngram', 'count', 'raw_churn_rate', 'smoothed_probability']].to_markdown(index=False))
    
    # Display a comparison of raw vs. smoothed for high-count n-grams
    # should be similar
    print("\n--- Comparison of Raw vs. Smoothed Probabilities (High Count) ---")
    high_count_df = final_df[final_df['count'] > 1000].sort_values(by='count', ascending=False).head(5)
    print(high_count_df[['ngram', 'count', 'raw_churn_rate', 'smoothed_probability']].to_markdown(index=False))


1. Reading n-gram counts...
2. Applying Bayesian Smoothing with Prior Mean=0.15 and Prior Strength=5.

Bayesian smoothing done. Results saved to smoothed_probabilities.csv
Total unique n-grams smoothed: 584

--- Comparison of Raw vs. Smoothed Probabilities (Low Count) ---


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.