# Email Redaction Performance Analysis

This notebook analyzes the timing metrics recorded in `output_analytics.json` to derive the key insights documented in the performance summary.


In [27]:
import json
import numpy as np
import os

# Load the analytics data.
# The encoding is passed explicitly so the read does not depend on the
# platform's default locale encoding (JSON files are conventionally UTF-8).
with open('output_analytics.json', 'r', encoding='utf-8') as f:
    analytics_data = json.load(f)

# Timing measurements consumed by the analysis cells below.
timing_data = analytics_data['timing_data']


## Performance Breakdown


In [28]:
# Share of the core processing time taken by each pipeline stage.
# Maps the label printed in the report to its key in `timing_data`.
stage_keys = {
    'Redaction': 'total_redaction_time',
    'Chart Generation': 'chart_generation_time',
    'PDF Generation': 'pdf_generation_time',
}
time_breakdown = {label: timing_data[key] for label, key in stage_keys.items()}

# Only the three stages above count towards the "core" total.
total_core_time = sum(time_breakdown.values())

# Convert each stage's time into a percentage of the core total.
bottleneck_analysis = {}
for component, stage_time in time_breakdown.items():
    bottleneck_analysis[component] = (stage_time / total_core_time) * 100

for component, percentage in bottleneck_analysis.items():
    print(f"{component}: {percentage:.1f}%")

Redaction: 2.2%
Chart Generation: 11.6%
PDF Generation: 86.3%


## Performance Projections


In [29]:
# Calculate parallel vs sequential performance for specific volumes
individual_times = timing_data['individual_redactions']
current_emails = len(individual_times)

# np.mean of an empty sequence returns nan (with only a RuntimeWarning), which
# would silently propagate into every projection; fail early with a clear
# message instead.
if current_emails == 0:
    raise ValueError(
        "timing_data['individual_redactions'] is empty; cannot compute the "
        "average redaction time per email"
    )

# Mean of the per-email redaction timings, used as the per-email cost.
avg_time_per_email = np.mean(individual_times)

def calculate_parallel_speedup(volume, cpu_cores=None):
    """Estimate the speedup factor from running redaction in parallel.

    The model is ``workers * efficiency``, where the worker count is the
    smallest of the available cores, the volume, and a hard cap of 8, and
    the efficiency decreases as more workers are used.

    Args:
        volume: Number of emails to process.
        cpu_cores: Number of CPU cores to assume. When ``None``, uses
            ``os.cpu_count()``, falling back to 4 if that is unavailable.

    Returns:
        The factor by which sequential time should be divided. It is 1.0
        whenever no parallelism is possible (at most one item, or at most
        one usable worker), so the result is never zero or negative.
    """
    if cpu_cores is None:
        cpu_cores = os.cpu_count() or 4

    if volume <= 1:
        return 1.0

    max_workers = min(cpu_cores, volume, 8)

    # A single worker (or a non-positive core count) means no parallelism.
    # Report "no speedup", consistent with the volume <= 1 case above, rather
    # than a factor below 1 or a zero/negative factor that callers would
    # divide by.
    if max_workers <= 1:
        return 1.0

    # Parallel efficiency based on worker count
    if max_workers <= 2:
        efficiency = 0.85
    elif max_workers <= 4:
        efficiency = 0.75
    else:
        efficiency = 0.65

    return max_workers * efficiency

# Performance projections for key volumes
def _format_duration(seconds):
    """Format a duration in seconds as 'N.Ns' below one minute, else 'N.Nm'."""
    if seconds < 60:
        return f"{seconds:.1f}s"
    return f"{seconds/60:.1f}m"


# Chart time is scaled by log(volume) / log(current_emails) and PDF time by
# volume / current_emails. With a single measured email the log denominator is
# 0, and with none the PDF ratio divides by zero, so fail early with a clear
# message instead of printing inf/nan rows.
if current_emails < 2:
    raise ValueError(
        f"Need at least 2 measured emails to project performance, got {current_emails}"
    )

cpu_cores = os.cpu_count() or 4
log_current_emails = np.log(current_emails)  # loop-invariant

print("| Email Volume | Sequential Time | Parallel Time | Speedup |")
print("|-------------|----------------|---------------|---------|")

for volume in [100, 1000, 10000]:
    # Sequential time calculation: redaction and PDF scale linearly with
    # volume, chart generation logarithmically.
    seq_redaction = volume * avg_time_per_email
    seq_charts = timing_data['chart_generation_time'] * (np.log(volume) / log_current_emails)
    seq_pdf = timing_data['pdf_generation_time'] * (volume / current_emails)
    seq_total = seq_redaction + seq_charts + seq_pdf

    # Parallel time with speedup: only redaction is divided by the speedup
    # factor; chart time is unchanged and PDF time is modelled at 80%.
    speedup_factor = calculate_parallel_speedup(volume, cpu_cores)
    par_redaction = seq_redaction / speedup_factor
    par_total = par_redaction + seq_charts + seq_pdf * 0.8

    # The "Speedup" column is the percentage reduction in total time,
    # truncated (not rounded) to a whole percent.
    speedup_percent = int(((seq_total - par_total) / seq_total) * 100)

    seq_str = _format_duration(seq_total)
    par_str = _format_duration(par_total)

    print(f"| {volume:,} | {seq_str} | {par_str} | {speedup_percent}% |")


| Email Volume | Sequential Time | Parallel Time | Speedup |
|-------------|----------------|---------------|---------|
| 100 | 8.2s | 6.6s | 19% |
| 1,000 | 1.3m | 1.1m | 20% |
| 10,000 | 13.2m | 10.5m | 20% |
