# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Input CSV locations. The paths are relative, so they are resolved against
# the current working directory of the process.

# Per-customer, per-day risk records. This script reads the columns
# 'customer_id', 'event_date', 'churn_probability' and 'risk_grade' from it.
DAILY_RISK_PATH = 'daily_risk_grades.csv'
# Smoothed churn probabilities; the 'smoothed_probability' column is read.
SMOOTHED_PROBS_PATH = 'smoothed_probabilities.csv'
# Customer master data; 'customer_id' and 'churn_date' are read. A non-null
# 'churn_date' is treated as "this customer churned".
CUSTOMER_BASE_PATH = 'customer_base.csv'

def plot_churn_probability_distribution(df, *, prior_mean=0.15,
                                        output_path='visual_ngram_distribution.png'):
    """Plot a histogram of smoothed churn probabilities and save it as an image.

    Args:
        df: DataFrame with a 'smoothed_probability' column.
        prior_mean: x position of the dashed red reference line; it is also
            rendered as a percentage in the legend label. Defaults to 0.15.
        output_path: File the figure is written to.

    Raises:
        KeyError: If ``df`` has no 'smoothed_probability' column.

    Side effects:
        Writes the figure to ``output_path`` and prints a confirmation line.
    """
    # Draw on an explicit Figure/Axes instead of pyplot's implicit "current
    # figure" so this function cannot interfere with other open figures.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.histplot(df['smoothed_probability'], bins=30, kde=True,
                     color='skyblue', ax=ax)
        ax.axvline(x=prior_mean, color='red', linestyle='--',
                   label=f'Prior Mean ({prior_mean:.0%})')
        ax.set_title('Distribution of Smoothed Churn Probabilities Across All N-grams')
        ax.set_xlabel('Smoothed Churn Probability')
        ax.set_ylabel('Frequency')
        ax.legend()
        ax.grid(axis='y', alpha=0.5)
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # release it here (also on error) so repeated calls do not leak.
        plt.close(fig)
    print(f"Visual 1: N-gram probability distribution saved to {output_path}")

def _roc_curve_points(y_true, y_score):
    """Return ``(fpr, tpr)`` arrays of the ROC curve, starting at (0, 0).

    ``y_true`` must be an integer array of 0/1 labels containing at least one
    0 and one 1; ``y_score`` is the matching array of scores (no NaN).
    One curve point is produced per *distinct* score value, so samples with
    tied scores are admitted together rather than in arbitrary order.
    """
    # Rank samples from the highest to the lowest score.
    order = np.argsort(y_score, kind="mergesort")[::-1]
    y_true = y_true[order]
    y_score = y_score[order]

    # Index of the last sample in each run of equal scores: these are the
    # only positions where a threshold can actually separate samples.
    last_of_run = np.r_[np.flatnonzero(np.diff(y_score)), y_true.size - 1]

    true_pos = np.cumsum(y_true)[last_of_run]
    false_pos = (last_of_run + 1) - true_pos

    # Prepend the origin (threshold above every score: nothing is flagged).
    tpr = np.r_[0.0, true_pos / true_pos[-1]]
    fpr = np.r_[0.0, false_pos / false_pos[-1]]
    return fpr, tpr


def plot_model_performance(daily_risk_df, *, customer_base=None,
                           output_path='visual_model_performance.png'):
    """Plot the ROC curve of each customer's latest churn score and return the AUC.

    The score is the 'churn_probability' of the customer's last row when
    ``daily_risk_df`` is sorted by 'event_date'. The label is 1 when the
    customer has a non-null 'churn_date' in the customer base and 0 otherwise.
    NOTE(review): this is "churned at any recorded time", not a 90-day-window
    label, and 'event_date' is sorted as stored (strings sort correctly only
    in ISO format) -- confirm both match the intended evaluation.

    Args:
        daily_risk_df: DataFrame with 'customer_id', 'event_date' and
            'churn_probability' columns.
        customer_base: Optional DataFrame with 'customer_id' and 'churn_date'
            columns. When omitted it is read from ``CUSTOMER_BASE_PATH``.
            The passed frame is not modified.
        output_path: File the figure is written to.

    Returns:
        The area under the ROC curve as a float, or NaN (with a
        ``RuntimeWarning``) when the labels contain only one class.

    Side effects:
        Writes the figure to ``output_path`` and prints a confirmation line.
    """
    import warnings

    # Keep only the most recent risk row per customer. A stable sort makes
    # the choice deterministic when a customer has several rows on one date.
    latest_risk = (
        daily_risk_df
        .sort_values(by='event_date', kind='mergesort')
        .drop_duplicates(subset=['customer_id'], keep='last')
    )

    if customer_base is None:
        customer_base = pd.read_csv(CUSTOMER_BASE_PATH)
    labels = customer_base[['customer_id']].assign(
        churned_by_end=customer_base['churn_date'].notna().astype(int)
    )

    performance_df = pd.merge(latest_risk, labels, on='customer_id', how='left')

    # The left merge yields NaN labels for customers missing from the customer
    # base; a single NaN would poison the cumulative sums and the AUC.
    scored = performance_df.dropna(subset=['churned_by_end', 'churn_probability'])
    n_dropped = len(performance_df) - len(scored)
    if n_dropped:
        warnings.warn(
            f"Ignoring {n_dropped} customer(s) with a missing churn label or "
            "churn probability when computing the ROC curve.",
            RuntimeWarning,
            stacklevel=2,
        )

    y_true = scored['churned_by_end'].to_numpy(dtype=int)
    y_score = scored['churn_probability'].to_numpy(dtype=float)

    n_pos = int(y_true.sum())
    n_neg = y_true.size - n_pos
    if n_pos == 0 or n_neg == 0:
        # TPR or FPR would be a division by zero: the curve is undefined.
        warnings.warn(
            "ROC curve is undefined: it needs at least one churned and one "
            f"retained customer (got {n_pos} churned, {n_neg} retained).",
            RuntimeWarning,
            stacklevel=2,
        )
        fpr = tpr = np.array([])
        roc_auc = float('nan')
    else:
        fpr, tpr = _roc_curve_points(y_true, y_score)
        # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
        integrate = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz
        roc_auc = float(integrate(tpr, fpr))

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (area = {roc_auc:.2f})')
        # Diagonal = performance of a random classifier.
        ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('Receiver Operating Characteristic (ROC) Curve')
        ax.legend(loc="lower right")
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Release the figure (also on error) so repeated calls do not leak.
        plt.close(fig)
    print(f"Visual 3: Model performance (ROC) saved to {output_path}")

    return roc_auc

# MAIN
# Script entry point: load the input CSVs, render both visuals, then print
# the share of each risk grade for use in the report text.
if __name__ == "__main__":
    print("Starting analysis and visualization generation.")

    try:
        daily_risk_df = pd.read_csv(DAILY_RISK_PATH)
        smoothed_df = pd.read_csv(SMOOTHED_PROBS_PATH)
    except FileNotFoundError as e:
        print(f"Error: {e}. Required data files not found.")
        # Exit with a non-zero status so shells and schedulers see the
        # failure. The bare exit() helper comes from the `site` module (not
        # guaranteed to exist) and would report success (status 0).
        raise SystemExit(1) from e

    # 1. N-gram Distribution
    plot_churn_probability_distribution(smoothed_df)

    # 2. Model Performance
    roc_auc = plot_model_performance(daily_risk_df)
    print(f"Model AUC: {roc_auc:.2f}")

    # 3. Grade Distribution (for report text)
    # normalize=True yields fractions of all daily rows, ordered by grade.
    grade_counts = daily_risk_df['risk_grade'].value_counts(normalize=True).sort_index()
    print("\n--- Daily Risk Grade Distribution ---")
    try:
        grade_table = grade_counts.to_markdown()
    except ImportError:
        # to_markdown() needs the optional 'tabulate' package; fall back to
        # pandas' plain-text rendering instead of crashing at the last step.
        grade_table = grade_counts.to_string()
    print(grade_table)

    print("\nAnalysis and visualization complete.")
