# In [0]:
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt

# Inject the stylesheet used by the HTML fragments rendered in this file.
# It defines the box classes (.success-box, .warning-box, .danger-box,
# .info-box, .segment-box), the .help-text / .example-text styles used by the
# per-field explanations, and the gradient .section-header banner.
display(HTML("""
<style>
.widget-label { font-weight: bold; font-size: 14px; }
.success-box { background-color: #d4edda; border: 2px solid #28a745; 
               border-radius: 10px; padding: 15px; margin: 10px 0; }
.warning-box { background-color: #fff3cd; border: 2px solid #ffc107; 
               border-radius: 10px; padding: 15px; margin: 10px 0; }
.danger-box { background-color: #f8d7da; border: 2px solid #dc3545; 
              border-radius: 10px; padding: 15px; margin: 10px 0; }
.info-box { background-color: #d1ecf1; border: 2px solid #17a2b8; 
            border-radius: 10px; padding: 15px; margin: 10px 0; }
.segment-box { background-color: #f8f9fa; border: 1px solid #dee2e6; 
               border-radius: 8px; padding: 12px; margin: 8px 0; }
.help-text { color: #666; font-size: 13px; margin-top: 5px; margin-bottom: 15px; 
             padding: 10px; background-color: #f8f9fa; border-left: 3px solid #17a2b8; 
             border-radius: 4px; line-height: 1.5; }
.example-text { color: #28a745; font-weight: bold; }
.section-header { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); 
                  color: white; padding: 15px; border-radius: 10px; margin: 20px 0 10px 0; }
</style>
"""))

# Page title, subtitle and a short plain-language introduction to A/B testing,
# rendered once at the top of the cell output.
display(HTML("""
<h1>📊 A/B Test Calculator with Segmentation</h1>
<p style='font-size: 16px; color: #666;'>Analyze overall results and segment-level performance</p>
<div class='info-box'>
    <strong>What is A/B Testing?</strong><br>
    A/B testing compares two versions (A and B) to see which performs better. Version A is the "Control" 
    (what you have now), and Version B is the "Treatment" (the new version you're testing).
    This calculator tells you if the difference between them is real or just random chance.
</div>
<hr>
"""))

# Output widget meant to hold the rendered results. It is only created here;
# nothing in this section displays it.
output = widgets.Output()

# Confidence-level dropdown. Each option pairs a display label with the integer
# percentage that becomes the widget's value; the default is 95.
ci_selector = widgets.Dropdown(
    options=[('90% (Less Strict)', 90), ('95% (Standard)', 95), ('99% (Very Strict)', 99)],
    value=95,
    description='Confidence Level:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Section banner, then the explanation, then the control itself: the order of
# these display() calls is the on-screen order.
display(HTML("<div class='section-header'><h3 style='margin:0;'>⚙️ Test Configuration</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 Confidence Level</strong><br>
    This is how sure you want to be that your results are real, not just luck.<br>
    • <span class='example-text'>95% (Standard)</span> - Industry standard. Means there's only a 5% chance your results are due to random chance.<br>
    • <span class='example-text'>99% (Very Strict)</span> - Use when decisions have high stakes or costs.<br>
    • <span class='example-text'>90% (Less Strict)</span> - Use for quick tests or low-risk decisions.
</div>
"""))

display(ci_selector)

# Checkbox that switches segment analysis on or off (off by default). It is
# created here but displayed later, in the segment-analysis section.
enable_segmentation = widgets.Checkbox(
    value=False,
    description='Enable Segment Analysis (Advanced)',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Test duration inputs: whole days the test ran (default 14) ...
test_duration_days = widgets.IntText(
    value=14,
    description='Test Duration (days):',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# ... and the average number of days between a user's sessions (default 1.0).
# NOTE(review): both fields are unbounded, so zero or negative values can be
# entered; consumers of these values need to tolerate that.
average_session_days = widgets.FloatText(
    value=1.0,
    description='Avg Days Between Sessions:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Section banner followed by help text + widget pairs, in on-screen order.
display(HTML("<br><div class='section-header'><h3 style='margin:0;'>⏱️ Test Duration</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 How Long Did Your Test Run?</strong><br>
    Enter the number of days your test was live. Longer tests = more reliable results.<br>
    <span class='example-text'>Recommendation:</span> Run for at least 14 days (2 full weeks) to capture weekly patterns 
    (e.g., weekday vs. weekend behavior).<br>
    <span class='example-text'>Example:</span> If you started the test on Monday and ended it two weeks later on Sunday, enter 14.
</div>
"""))

display(test_duration_days)

display(HTML("""
<div class='help-text'>
    <strong>📌 How Often Do Users Return?</strong><br>
    On average, how many days pass before the same user comes back to your site/app?<br>
    • <span class='example-text'>1.0</span> - Users visit daily (e.g., social media, news sites)<br>
    • <span class='example-text'>3.5</span> - Users visit a few times per week (e.g., shopping sites)<br>
    • <span class='example-text'>7.0</span> - Users visit weekly (e.g., meal planning apps)<br>
    • <span class='example-text'>30.0</span> - Users visit monthly (e.g., utility bill payments)<br>
    <br>
    <strong>Why this matters:</strong> If users return frequently during the test, the same person might see both 
    versions, which can skew results. The calculator will warn you if this is a problem.
</div>
"""))

display(average_session_days)

# Randomization quality inputs.
# Intended share of traffic sent to Control, entered as a fraction
# (default 0.50), not as a percentage.
expected_split = widgets.FloatText(
    value=0.50,
    description='Expected Traffic Split:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Estimated share of users who saw both variants, entered as a percentage
# (default 0.0) - note the different unit from expected_split above.
user_overlap_pct = widgets.FloatText(
    value=0.0,
    description='Est. User Overlap (%):',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Section banner followed by help text + widget pairs, in on-screen order.
display(HTML("<br><div class='section-header'><h3 style='margin:0;'>🎲 Randomization Quality Check</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 Expected Traffic Split</strong><br>
    What percentage of users did you <strong>intend</strong> to send to the Control group?<br>
    • <span class='example-text'>0.50</span> - 50/50 split (most common - half see Control, half see Treatment)<br>
    • <span class='example-text'>0.70</span> - 70/30 split (70% see Control, 30% see Treatment - use when you want to limit exposure to the new version)<br>
    • <span class='example-text'>0.90</span> - 90/10 split (risky changes - test with only 10% of users first)<br>
    <br>
    <strong>Why this matters:</strong> The calculator checks if your actual split matches what you intended. 
    A big mismatch suggests a technical problem with your test setup.
</div>
"""))

display(expected_split)

display(HTML("""
<div class='help-text'>
    <strong>📌 User Overlap Percentage</strong><br>
    What percentage of users saw <strong>BOTH</strong> the Control and Treatment during the test?<br>
    • <span class='example-text'>0%</span> - Perfect! Each user only saw one version (ideal)<br>
    • <span class='example-text'>1-5%</span> - Acceptable. Small overlap won't significantly affect results<br>
    • <span class='example-text'>5-10%</span> - Concerning. Results might be biased<br>
    • <span class='example-text'>10%+</span> - Major problem. Users seeing both versions can contaminate results<br>
    <br>
    <strong>How to estimate:</strong> Check your analytics for users with multiple sessions who saw different versions. 
    If you're not sure, start with 0% and the calculator will help you assess risk.
</div>
"""))

display(user_overlap_pct)

# Container for the per-segment input form. It starts empty (so nothing is
# visible) and its children are replaced by the segmentation observers.
segment_input_area = widgets.VBox([])

# Number of segments selector (2-5, default 2). It is not displayed directly
# here; the observers place it inside segment_input_area.
num_segments = widgets.Dropdown(
    options=[2, 3, 4, 5],
    value=2,
    description='Number of Segments:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

display(HTML("<br><div class='section-header'><h3 style='margin:0;'>📊 Segment Analysis (Optional)</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 What Are Segments?</strong><br>
    Segments let you see if your test worked differently for different groups of users.<br>
    <span class='example-text'>Examples:</span><br>
    • Mobile vs. Desktop users<br>
    • New vs. Returning customers<br>
    • Geographic regions (US vs. Europe vs. Asia)<br>
    • Age groups or subscription tiers<br>
    <br>
    <strong>When to use:</strong> Enable this if you want to check whether the treatment works better for 
    some types of users than others. Leave it off for simpler analysis.
</div>
"""))

# The toggle is shown first, with the (initially empty) segment form directly
# beneath it.
display(enable_segmentation)
display(segment_input_area)

# Overall data inputs: section banner and a short introduction.
display(HTML("<br><div class='section-header'><h3 style='margin:0;'>📈 Overall Test Data</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 Enter Your Test Results</strong><br>
    Fill in the numbers from your experiment. Don't worry - there are detailed explanations below each field!
</div>
"""))

# The four integer count fields for the overall experiment. Their defaults
# (10000/1200 vs 10000/1350) match the worked example in the help text below.
control_visitors = widgets.IntText(
    value=10000,
    description='Control Visitors:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

control_conversions = widgets.IntText(
    value=1200,
    description='Control Conversions:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

treatment_visitors = widgets.IntText(
    value=10000,
    description='Treatment Visitors:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

treatment_conversions = widgets.IntText(
    value=1350,
    description='Treatment Conversions:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Each field is rendered as "help text, then widget"; the display() order
# below is the on-screen order.
display(HTML("""
<div class='help-text'>
    <strong>📌 Control Visitors (Version A)</strong><br>
    How many people saw the <strong>original</strong> version (the Control)?<br>
    <span class='example-text'>Example:</span> If 10,000 people saw your current website design, enter 10000.
</div>
"""))

display(control_visitors)

display(HTML("""
<div class='help-text'>
    <strong>📌 Control Conversions</strong><br>
    Out of the Control visitors, how many completed your desired action?<br>
    <span class='example-text'>Example actions:</span> Made a purchase, signed up, clicked a button, downloaded a file.<br>
    <span class='example-text'>Example:</span> If 1,200 of those 10,000 visitors made a purchase, enter 1200.
</div>
"""))

display(control_conversions)

display(HTML("""
<div class='help-text'>
    <strong>📌 Treatment Visitors (Version B)</strong><br>
    How many people saw the <strong>new</strong> version (the Treatment)?<br>
    <span class='example-text'>Example:</span> If 10,000 people saw your new website design, enter 10000.
</div>
"""))

display(treatment_visitors)

display(HTML("""
<div class='help-text'>
    <strong>📌 Treatment Conversions</strong><br>
    Out of the Treatment visitors, how many completed your desired action?<br>
    <span class='example-text'>Example:</span> If 1,350 of those 10,000 visitors made a purchase, enter 1350.<br>
    <br>
    <strong>Quick Check:</strong> In this example, Control had 12% conversion (1200/10000) and Treatment had 13.5% 
    (1350/10000), suggesting the new version might be better.
</div>
"""))

display(treatment_conversions)

# Current monthly revenue in dollars (default 100000.0).
current_revenue = widgets.FloatText(
    value=100000.0,
    description='Monthly Revenue ($):',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Seasonal scaling factor for the revenue projection (default 1.0).
seasonal_multiplier = widgets.FloatText(
    value=1.0,
    description='Seasonal Multiplier:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='400px')
)

# Section banner followed by help text + widget pairs, in on-screen order.
display(HTML("<br><div class='section-header'><h3 style='margin:0;'>💰 Revenue Impact (Optional)</h3></div>"))

display(HTML("""
<div class='help-text'>
    <strong>📌 Monthly Revenue</strong><br>
    What's your current monthly revenue from the area you're testing?<br>
    <span class='example-text'>Example:</span> If your e-commerce site makes $100,000/month, enter 100000.<br>
    <br>
    <strong>Why this matters:</strong> The calculator will estimate how much additional annual revenue 
    you could generate if you roll out the winning version to everyone.
</div>
"""))

display(current_revenue)

display(HTML("""
<div class='help-text'>
    <strong>📌 Seasonal Multiplier</strong><br>
    Will the season affect the revenue impact?<br>
    • <span class='example-text'>1.0</span> - Normal/average season (use this if unsure)<br>
    • <span class='example-text'>1.5</span> - Peak season (e.g., holiday shopping - revenue is 50% higher than normal)<br>
    • <span class='example-text'>0.7</span> - Slow season (e.g., January after holidays - revenue is 30% lower)<br>
    <br>
    <strong>Example:</strong> If you're testing during Black Friday when revenue is typically 2x normal, 
    enter 2.0 to get more accurate annual projections.
</div>
"""))

display(seasonal_multiplier)

# Storage for segment widgets: maps a segment index to a dict holding that
# segment's 'name', 'ctrl_v', 'ctrl_c', 'treat_v' and 'treat_c' widgets.
# It is cleared and refilled in place by create_segment_inputs().
segment_widgets = {}

def create_segment_inputs(n):
    """Build the input form for ``n`` segments and register its widgets.

    The module-level ``segment_widgets`` dict is cleared and refilled in place
    so that it always describes the form currently on screen: it maps each
    segment index to a dict with the keys ``'name'``, ``'ctrl_v'``,
    ``'ctrl_c'``, ``'treat_v'`` and ``'treat_c'``.

    The "how to fill this in" help text is returned as the first child of the
    container instead of being pushed to the cell output with ``display()``.
    This function runs inside widget observer callbacks, where a bare
    ``display()`` either lands outside the form or is repeated on every
    rebuild, and it could never be removed again when the form was hidden.

    Args:
        n: Number of segments to create input groups for.

    Returns:
        A ``widgets.VBox`` holding the help text followed by one input group
        per segment.
    """
    segment_widgets.clear()

    def make_field(widget_cls, value, description):
        # Every field shares the same label style and width; a fresh Layout is
        # created per widget so none of them share layout state.
        return widget_cls(
            value=value,
            description=description,
            style={'description_width': 'initial'},
            layout=widgets.Layout(width='400px')
        )

    def hint(text):
        # Small grey caption shown above an input field.
        return widgets.HTML(
            f"<p style='color:#666;font-size:12px;margin-left:10px;'>{text}</p>"
        )

    help_panel = widgets.HTML("""
    <div class='help-text'>
        <strong>📌 How to Fill in Segment Data</strong><br>
        For each segment, enter the visitors and conversions separately for Control and Treatment.<br>
        <span class='example-text'>Example Segments:</span><br>
        • <strong>Mobile Users:</strong> Control: 6,000 visitors, 600 conversions | Treatment: 6,100 visitors, 700 conversions<br>
        • <strong>Desktop Users:</strong> Control: 4,000 visitors, 600 conversions | Treatment: 3,900 visitors, 650 conversions<br>
        <br>
        <strong>Tip:</strong> Make sure the sum of all segments roughly equals your overall numbers above!
    </div>
    """)

    segment_boxes = [help_panel]

    for i in range(n):
        # Defaults spread the overall example numbers evenly across segments.
        fields = {
            'name': make_field(widgets.Text, f'Segment {i+1}', f'Segment {i+1} Name:'),
            'ctrl_v': make_field(widgets.IntText, 10000 // n, 'Control Visitors:'),
            'ctrl_c': make_field(widgets.IntText, 1200 // n, 'Control Conversions:'),
            'treat_v': make_field(widgets.IntText, 10000 // n, 'Treatment Visitors:'),
            'treat_c': make_field(widgets.IntText, 1350 // n, 'Treatment Conversions:'),
        }
        segment_widgets[i] = fields

        segment_boxes.append(widgets.VBox([
            widgets.HTML(f"<div class='segment-box'><h4>Segment {i+1}</h4><p style='color:#666;font-size:12px;'>Give this segment a descriptive name (e.g., 'Mobile Users', 'New Customers')</p></div>"),
            fields['name'],
            hint("Number of users in this segment who saw the Control version:"),
            fields['ctrl_v'],
            hint("Number of Control users who converted:"),
            fields['ctrl_c'],
            hint("Number of users in this segment who saw the Treatment version:"),
            fields['treat_v'],
            hint("Number of Treatment users who converted:"),
            fields['treat_c'],
            widgets.HTML("<br>")
        ]))

    return widgets.VBox(segment_boxes)

def on_segmentation_change(change):
    """React to the segment-analysis checkbox being toggled.

    When the box is ticked the segment form is built for the currently
    selected number of segments; when it is cleared the form area is emptied.

    Args:
        change: Traitlets change dict; only ``change['new']`` is read.
    """
    if not change['new']:
        segment_input_area.children = []
        return

    heading = widgets.HTML("<h3>📊 Segment Data</h3>")
    segment_form = create_segment_inputs(num_segments.value)
    segment_input_area.children = [heading, num_segments, segment_form]

def on_num_segments_change(change):
    """Rebuild the segment form when the segment-count dropdown changes.

    Nothing happens while segment analysis is switched off.

    Args:
        change: Traitlets change dict; ``change['new']`` is the new count.
    """
    if not enable_segmentation.value:
        return

    heading = widgets.HTML("<h3>📊 Segment Data</h3>")
    segment_form = create_segment_inputs(change['new'])
    segment_input_area.children = [heading, num_segments, segment_form]

# Rebuild the segment form whenever the toggle or the segment count changes;
# both observers fire only on changes to the widgets' 'value' trait.
enable_segmentation.observe(on_segmentation_change, names='value')
num_segments.observe(on_num_segments_change, names='value')

# Calculate button.
# NOTE(review): no click handler is attached in this part of the file -
# presumably it is registered further down; confirm.
calculate_button = widgets.Button(
    description='🧮 Calculate Results',
    button_style='success',
    layout=widgets.Layout(width='400px', height='50px'),
    style={'font_weight': 'bold', 'font_size': '16px'}
)

# Summary of what the calculation will report, shown just above the button.
display(HTML("""
<br>
<div class='info-box'>
    <strong>✅ Ready to Calculate?</strong><br>
    Once you've filled in all the fields above, click the button below to see your results!<br>
    The calculator will show you:<br>
    • Whether your test results are statistically significant (real or just luck)<br>
    • How much improvement the Treatment version provides<br>
    • Potential revenue impact<br>
    • Quality checks on your test setup<br>
    • Visual charts to help you understand the data
</div>
<br>
"""))

display(calculate_button)

# Calculation function
def calculate_ab_test(ci, ctrl_v, ctrl_c, treat_v, treat_c, revenue, seasonal):
    """Run a two-sided two-proportion z-test and derive summary metrics.

    Args:
        ci: Confidence level as a percentage (e.g. ``95``).
        ctrl_v: Number of control visitors.
        ctrl_c: Number of control conversions.
        treat_v: Number of treatment visitors.
        treat_c: Number of treatment conversions.
        revenue: Current monthly revenue; multiplied by 12 to annualise it.
        seasonal: Multiplier applied to the projected revenue impact.

    Returns:
        ``None`` when the counts cannot describe an experiment (a group
        without visitors, a negative count, or more conversions than
        visitors). Otherwise a dict with the keys ``control_rate``,
        ``treatment_rate``, ``lift_abs`` (difference in rates), ``lift_rel``
        (percent change relative to control), ``z_stat``, ``p_value``,
        ``is_significant``, ``confidence_level`` and ``alpha`` (fractions),
        ``ci_lower`` / ``ci_upper`` (interval around ``lift_abs``),
        ``revenue_impact``, ``current_n`` (control visitors),
        ``recommended_n`` and ``z_critical``.
    """
    if ctrl_v <= 0 or treat_v <= 0:
        return None
    # Conversions outside [0, visitors] would yield rates outside [0, 1] and,
    # through the pooled variance, NaN statistics - treat as invalid input.
    if not (0 <= ctrl_c <= ctrl_v and 0 <= treat_c <= treat_v):
        return None

    control_rate = ctrl_c / ctrl_v
    treatment_rate = treat_c / treat_v

    # Standard error under the null hypothesis of one shared conversion rate.
    pooled_rate = (ctrl_c + treat_c) / (ctrl_v + treat_v)
    se = np.sqrt(pooled_rate * (1 - pooled_rate) * (1 / ctrl_v + 1 / treat_v))

    lift_abs = treatment_rate - control_rate
    lift_rel = (lift_abs / control_rate) * 100 if control_rate > 0 else 0

    z_stat = lift_abs / se if se > 0 else 0
    # The survival function keeps precision in the far tail, where
    # 1 - cdf(z) rounds to exactly 0.0.
    p_value = 2 * stats.norm.sf(abs(z_stat))

    confidence_level = ci / 100
    alpha = 1 - confidence_level
    z_critical = stats.norm.ppf(1 - alpha / 2)

    # The interval reuses the pooled standard error, so it excludes zero
    # exactly when the test is significant.
    ci_margin = z_critical * se
    ci_lower = lift_abs - ci_margin
    ci_upper = lift_abs + ci_margin

    is_significant = p_value < alpha

    # Revenue scales with the number of conversions, so the projected change
    # is proportional to the relative lift, not the percentage-point
    # difference (12% -> 13.5% is +12.5% revenue, not +1.5%).
    annual_revenue = revenue * 12
    revenue_impact = annual_revenue * (lift_rel / 100) * seasonal

    # Per-group sample size for a fixed absolute effect of 2 percentage
    # points. The formula has no power term; it only uses z_critical.
    mde = 0.02
    recommended_n = int(np.ceil(
        (2 * (z_critical ** 2) * control_rate * (1 - control_rate)) / (mde ** 2)
    )) if control_rate > 0 else 0

    return {
        'control_rate': control_rate,
        'treatment_rate': treatment_rate,
        'lift_abs': lift_abs,
        'lift_rel': lift_rel,
        'z_stat': z_stat,
        'p_value': p_value,
        'is_significant': is_significant,
        'confidence_level': confidence_level,
        'alpha': alpha,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'revenue_impact': revenue_impact,
        'current_n': ctrl_v,
        'recommended_n': recommended_n,
        'z_critical': z_critical
    }

def assess_randomization_quality(ctrl_visitors, treat_visitors, expected_split, user_overlap_pct):
    """Check traffic allocation and user independence for an A/B test.

    Runs a one-degree-of-freedom chi-square goodness-of-fit test of the
    observed group sizes against the intended split (Sample Ratio Mismatch,
    flagged at p < 0.001), and collects human-readable findings about split
    deviation, user overlap and small samples.

    Args:
        ctrl_visitors: Number of control visitors.
        treat_visitors: Number of treatment visitors.
        expected_split: Intended control share as a fraction (e.g. ``0.5``).
        user_overlap_pct: Percentage of users who saw both variants.

    Returns:
        A dict with the keys ``actual_split``, ``expected_split``,
        ``split_deviation``, ``srm_p_value``, ``has_srm``,
        ``chi_square_stat``, ``user_overlap_pct``, ``warnings``,
        ``recommendations``, ``validations`` and
        ``min_detectable_imbalance``. When the SRM test cannot be computed
        (no visitors, or an expected split of 0 or 1 or outside that range)
        the statistic is reported as 0.0 with p-value 1.0, ``has_srm`` is
        ``False`` and a warning explains that the check was skipped.
    """
    warnings = []
    recommendations = []
    validations = []

    total_visitors = ctrl_visitors + treat_visitors
    actual_split = ctrl_visitors / total_visitors if total_visitors > 0 else 0

    # Sample Ratio Mismatch (SRM) check.
    expected_ctrl = total_visitors * expected_split
    expected_treat = total_visitors * (1 - expected_split)

    # The chi-square statistic divides by both expected counts, so it is only
    # defined when each of them is strictly positive.
    srm_testable = expected_ctrl > 0 and expected_treat > 0
    if srm_testable:
        chi_square_stat = ((ctrl_visitors - expected_ctrl)**2 / expected_ctrl +
                           (treat_visitors - expected_treat)**2 / expected_treat)
        # sf() keeps precision for extreme mismatches, where 1 - cdf() is 0.0.
        srm_p_value = stats.chi2.sf(chi_square_stat, df=1)
    else:
        chi_square_stat = 0.0
        srm_p_value = 1.0

    has_srm = bool(srm_testable and srm_p_value < 0.001)

    if not srm_testable:
        warnings.append("⚠️ Sample ratio check skipped - it needs visitors in the test and an expected split strictly between 0 and 1")
        recommendations.append("Enter the expected split as a fraction (e.g. 0.50) and provide visitor counts for both groups")
    elif has_srm:
        warnings.append(f"🚨 CRITICAL: Sample Ratio Mismatch detected (p={srm_p_value:.6f})")
        warnings.append(f"   Expected: {expected_split*100:.1f}% / {(1-expected_split)*100:.1f}%, Got: {actual_split*100:.2f}% / {(1-actual_split)*100:.2f}%")
        recommendations.append("Investigate randomization implementation - this indicates a serious technical issue")
        recommendations.append("Common causes: buggy bucketing logic, bot traffic, user switching between groups")
    else:
        validations.append(f"✓ Sample ratio is acceptable ({actual_split*100:.2f}% / {(1-actual_split)*100:.2f}%)")

    # A visible deviation that the SRM test did not flag still gets a softer
    # warning.
    split_deviation = abs(actual_split - expected_split)
    if split_deviation > 0.02 and not has_srm:
        warnings.append(f"⚠️ Traffic split deviates by {split_deviation*100:.1f}% from expected")
        recommendations.append("Monitor split ratio throughout test duration")

    if user_overlap_pct > 5:
        warnings.append(f"⚠️ High user overlap ({user_overlap_pct:.1f}%) may violate independence assumption")
        recommendations.append("Users seeing both variants can bias results")
        recommendations.append("Consider: persistent cookies, account-based bucketing, or user ID tracking")
    elif user_overlap_pct > 0:
        validations.append(f"✓ User overlap is low ({user_overlap_pct:.1f}%)")
    else:
        validations.append("✓ No user overlap reported")

    if ctrl_visitors < 100 or treat_visitors < 100:
        warnings.append("⚠️ Very small sample sizes - balance may be due to chance")
        recommendations.append("Continue test to accumulate more data before drawing conclusions")

    # With an empty group no imbalance can be detected at all; report that as
    # infinity instead of dividing by zero.
    if ctrl_visitors > 0 and treat_visitors > 0:
        min_detectable_imbalance = 2 * np.sqrt((1 / ctrl_visitors) + (1 / treat_visitors)) * 1.96
    else:
        min_detectable_imbalance = float('inf')

    return {
        'actual_split': actual_split,
        'expected_split': expected_split,
        'split_deviation': split_deviation,
        'srm_p_value': srm_p_value,
        'has_srm': has_srm,
        'chi_square_stat': chi_square_stat,
        'user_overlap_pct': user_overlap_pct,
        'warnings': warnings,
        'recommendations': recommendations,
        'validations': validations,
        'min_detectable_imbalance': min_detectable_imbalance
    }

def _score_color(score):
    """Map a 0-100 quality score to the colour of its bar."""
    if score > 80:
        return 'green'
    if score > 50:
        return 'orange'
    return 'red'


def _plot_split_comparison(ax, expected, actual, has_srm):
    """Draw grouped bars comparing the expected and actual traffic split."""
    categories = ['Expected', 'Actual']
    ctrl_splits = [expected * 100, actual * 100]
    treat_splits = [(1-expected) * 100, (1-actual) * 100]

    x = np.arange(len(categories))
    width = 0.35

    ctrl_bars = ax.bar(x - width/2, ctrl_splits, width, label='Control',
                       color='#1976d2', alpha=0.8, edgecolor='black', linewidth=2)
    treat_bars = ax.bar(x + width/2, treat_splits, width, label='Treatment',
                        color='#388e3c', alpha=0.8, edgecolor='black', linewidth=2)

    ax.set_ylabel('Traffic Split (%)', fontweight='bold')
    ax.set_title('Expected vs Actual Traffic Split', fontweight='bold', pad=15)
    ax.set_xticks(x)
    ax.set_xticklabels(categories)
    ax.legend()
    ax.grid(axis='y', alpha=0.3)
    ax.axhline(50, color='gray', linestyle='--', alpha=0.5)

    # Label every bar with its percentage, just above the bar top.
    for bar in list(ctrl_bars) + list(treat_bars):
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2, height + 0.5,
                f'{height:.2f}%', ha='center', va='bottom',
                fontsize=10, fontweight='bold')

    if has_srm:
        badge, badge_color = '🚨 SRM DETECTED', 'red'
    else:
        badge, badge_color = '✓ Balance OK', 'green'
    ax.text(0.5, 0.95, badge, transform=ax.transAxes,
            ha='center', va='top', fontsize=12, fontweight='bold',
            bbox=dict(boxstyle='round', facecolor=badge_color, alpha=0.3))


def _plot_srm_test(ax, chi_stat, srm_p, has_srm):
    """Draw the SRM chi-square statistic next to its p=0.001 critical value."""
    critical_value = stats.chi2.ppf(0.999, df=1)

    ax.bar(['Chi-Square\nStatistic', 'Critical Value\n(p=0.001)'],
           [chi_stat, critical_value],
           color=['red' if has_srm else 'green', 'gray'],
           alpha=0.7, edgecolor='black', linewidth=2)

    ax.set_ylabel('Chi-Square Value', fontweight='bold')
    ax.set_title(f'Sample Ratio Mismatch Test\n(p-value = {srm_p:.6f})',
                 fontweight='bold', pad=15)
    ax.axhline(critical_value, color='red', linestyle='--',
               linewidth=2, alpha=0.7, label='Threshold')
    ax.legend()
    ax.grid(axis='y', alpha=0.3)

    verdict = "FAIL - Investigate!" if has_srm else "PASS"
    verdict_color = 'red' if has_srm else 'green'
    ax.text(0.5, 0.95, verdict, transform=ax.transAxes,
            ha='center', va='top', fontsize=11, fontweight='bold',
            bbox=dict(boxstyle='round', facecolor=verdict_color, alpha=0.3))


def _plot_split_stability(ax, expected, actual, has_srm):
    """Draw the expected sampling band of the control share vs sample size."""
    # NOTE(review): the upper end of the x-range is derived from the split
    # fraction (actual * 10000, at most 10000 for a valid fraction), not from
    # the real sample size, which is not available in the assessment dict.
    sample_sizes = np.logspace(2, np.log10(max(1000, actual * 10000)), 100)
    expected_std = np.sqrt(expected * (1-expected) / sample_sizes)

    ax.fill_between(sample_sizes,
                    (expected - 2*expected_std) * 100,
                    (expected + 2*expected_std) * 100,
                    alpha=0.3, color='gray', label='±2 SD (95% CI)')
    ax.fill_between(sample_sizes,
                    (expected - 3*expected_std) * 100,
                    (expected + 3*expected_std) * 100,
                    alpha=0.2, color='gray', label='±3 SD (99.7% CI)')

    ax.axhline(expected * 100, color='blue', linestyle='--',
               linewidth=2, label='Expected Split')
    ax.axhline(actual * 100, color='red' if has_srm else 'green',
               linewidth=3, label=f'Actual Split ({actual*100:.2f}%)')

    ax.set_xlabel('Sample Size', fontweight='bold')
    ax.set_ylabel('Control Group %', fontweight='bold')
    ax.set_title('Split Stability Analysis', fontweight='bold', pad=15)
    ax.set_xscale('log')
    ax.legend(loc='best', fontsize=9)
    ax.grid(alpha=0.3)


def _plot_quality_scores(ax, expected, actual, has_srm, overlap):
    """Draw the 0-100 quality scores and the resulting letter grade."""
    srm_score = 0 if has_srm else 100
    # A deviation of 2 percentage points or more scores 0.
    balance_score = max(0, 100 - (abs(actual - expected) / 0.02) * 100)
    # Each percent of overlap costs 10 points. The upper clamp keeps a
    # negative overlap entry from pushing the bar past the axis limit.
    overlap_score = min(100, max(0, 100 - overlap * 10))

    scores = [srm_score, balance_score, overlap_score]
    overall_score = np.mean(scores)
    scores.append(overall_score)

    labels = ['SRM\nCheck', 'Split\nBalance', 'User\nIndependence', 'OVERALL']
    colors = [_score_color(score) for score in scores]

    bars = ax.bar(labels, scores, color=colors, alpha=0.7,
                  edgecolor='black', linewidth=2)

    for bar, score in zip(bars, scores):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 2,
                f'{score:.0f}', ha='center', va='bottom',
                fontsize=11, fontweight='bold')

    ax.axhline(80, color='green', linestyle='--', alpha=0.5, linewidth=2)
    ax.axhline(50, color='orange', linestyle='--', alpha=0.5, linewidth=2)
    ax.set_ylabel('Quality Score (0-100)', fontweight='bold')
    ax.set_title('Randomization Quality Scores', fontweight='bold', pad=15)
    ax.set_ylim(0, 110)
    ax.grid(axis='y', alpha=0.3)

    # The first threshold the overall score reaches decides the grade.
    grade, grade_color = 'F - Failed', 'red'
    for threshold, label, color in (
        (90, 'A - Excellent', 'green'),
        (80, 'B - Good', 'lightgreen'),
        (70, 'C - Fair', 'yellow'),
        (60, 'D - Poor', 'orange'),
    ):
        if overall_score >= threshold:
            grade, grade_color = label, color
            break

    ax.text(0.5, 0.5, grade, transform=ax.transAxes,
            ha='center', va='center', fontsize=16, fontweight='bold',
            bbox=dict(boxstyle='round,pad=0.8', facecolor=grade_color, alpha=0.4))


def create_randomization_viz(randomization_assessment):
    """Render a 2x2 figure summarising randomization quality.

    The panels are: expected vs actual traffic split, the SRM chi-square
    statistic against its critical value, the expected sampling band of the
    split, and the quality scores with an overall letter grade.

    Args:
        randomization_assessment: Dict providing the keys ``actual_split``,
            ``expected_split``, ``has_srm``, ``srm_p_value``,
            ``chi_square_stat`` and ``user_overlap_pct``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

    actual = randomization_assessment['actual_split']
    expected = randomization_assessment['expected_split']
    has_srm = randomization_assessment['has_srm']

    _plot_split_comparison(ax1, expected, actual, has_srm)
    _plot_srm_test(ax2, randomization_assessment['chi_square_stat'],
                   randomization_assessment['srm_p_value'], has_srm)
    _plot_split_stability(ax3, expected, actual, has_srm)
    _plot_quality_scores(ax4, expected, actual, has_srm,
                         randomization_assessment['user_overlap_pct'])

    plt.tight_layout()
    plt.show()

def assess_test_duration(duration_days, avg_session_days, sample_size):
    """Assess whether the test ran long enough to be trusted.

    Args:
        duration_days: Number of days the test ran.
        avg_session_days: Average number of days between a user's sessions.
        sample_size: Visitor count used to derive the daily visitor rate.

    Returns:
        A dict with the keys ``warnings``, ``recommendations`` and
        ``validations`` (lists of display strings), ``full_weeks``
        (``duration_days / 7``), ``daily_visitors`` (0 when the duration is
        not positive) and ``independence_score``: the duration as a fraction
        of three return cycles, clamped to the range 0.0-1.0.
    """
    warnings = []
    recommendations = []

    if duration_days < 7:
        warnings.append("⚠️ Test duration is less than 1 week - may not capture weekly patterns")
        recommendations.append("Run for at least 7 days (ideally 14) to capture full week cycles")

    full_weeks = duration_days / 7
    if full_weeks < 2 and duration_days >= 7:
        warnings.append("⚠️ Test has not run for 2 complete weeks - results may be biased by day-of-week effects")
        recommendations.append("Run for at least 2 full weeks for more reliable results")

    # Rule used here: the test should span at least three return cycles.
    min_duration_for_independence = avg_session_days * 3
    if duration_days < min_duration_for_independence:
        warnings.append(f"⚠️ Test may not have enough time for user independence (recommended: {min_duration_for_independence:.0f}+ days)")
        recommendations.append(f"With users returning every {avg_session_days:.1f} days, run for at least {min_duration_for_independence:.0f} days")

    daily_visitors = sample_size / duration_days if duration_days > 0 else 0

    if daily_visitors > 2000 and duration_days < 7:
        warnings.append("⚠️ High traffic but short duration - consider extending test")
        recommendations.append("You have good traffic volume - running longer will increase confidence")

    validations = []
    if duration_days >= 14:
        validations.append("✓ Test ran for 2+ weeks - good coverage of weekly patterns")
    if duration_days >= min_duration_for_independence:
        validations.append("✓ Test duration allows for user independence")
    if full_weeks >= 2:
        validations.append("✓ Multiple complete week cycles captured")

    # A return interval of zero (or less) imposes no minimum duration, so the
    # requirement is trivially met; this also avoids dividing by zero.
    if min_duration_for_independence > 0:
        independence_score = max(0.0, min(1.0, duration_days / min_duration_for_independence))
    else:
        independence_score = 1.0

    return {
        'warnings': warnings,
        'recommendations': recommendations,
        'validations': validations,
        'full_weeks': full_weeks,
        'daily_visitors': daily_visitors,
        'independence_score': independence_score
    }

def _render_bullet_box(css_class, heading, items):
    """Return an HTML ``<div>`` of class *css_class* with *heading* and one ``<li>`` per item."""
    bullets = "".join(f"<li>{item}</li>" for item in items)
    return f"<div class='{css_class}'><h4>{heading}</h4><ul>{bullets}</ul></div>"

def display_results(results, title="Overall Results", is_segment=False, duration_assessment=None):
    """Render the result cards for the overall test or for one segment.

    Args:
        results: Result dict; does nothing when ``None``. Keys read here:
            ``confidence_level``, ``control_rate``, ``treatment_rate``,
            ``lift_abs``, ``lift_rel``, ``p_value``, ``z_stat``, ``alpha``,
            ``is_significant``, ``ci_lower``, ``ci_upper``, ``revenue_impact``.
        title: Heading text. It is HTML-escaped before rendering, so names
            containing ``<``, ``>`` or ``&`` display literally.
        is_segment: When true, use the compact segment layout (smaller
            heading; no duration, explanation, CI or revenue sections).
        duration_assessment: Optional dict with ``full_weeks``,
            ``daily_visitors``, ``independence_score``, ``validations``,
            ``warnings`` and ``recommendations``; only shown for the
            overall (non-segment) view.

    Returns:
        None. Output is emitted through ``display(HTML(...))``.
    """
    import html  # local import: only needed to escape the caller-supplied title

    if results is None:
        return

    ci = int(results['confidence_level'] * 100)
    # Segment titles carry user-typed names; escape so markup characters
    # cannot break (or inject into) the rendered heading.
    safe_title = html.escape(str(title))

    if not is_segment:
        display(HTML(f"<h2>{safe_title}</h2><hr>"))
    else:
        display(HTML(f"<h3 style='color: #495057;'>{safe_title}</h3>"))

    # Test Duration Assessment (only for overall results)
    if duration_assessment and not is_segment:
        display(HTML("<h3>⏱️ Test Duration Assessment</h3>"))
        display(HTML("""
        <div class='help-text'>
            <strong>What This Section Shows:</strong><br>
            This checks if your test ran long enough to give reliable results. Tests that are too short 
            can be influenced by random daily variations or unusual events.
        </div>
        """))

        # NOTE: the day count is read from the module-level ``test_duration_days``
        # widget, not from ``duration_assessment`` (which only carries weeks).
        duration_html = f"""
        <div class="info-box">
            <p><strong>Test Duration:</strong> {test_duration_days.value} days ({duration_assessment['full_weeks']:.1f} weeks)</p>
            <p><strong>Daily Visitors per Group:</strong> {duration_assessment['daily_visitors']:,.0f}</p>
            <p><strong>User Independence Score:</strong> {duration_assessment['independence_score']*100:.0f}% 
            <span style='color:#666;font-size:12px;'>(Higher is better - means users likely saw only one version)</span></p>
        </div>
        """
        display(HTML(duration_html))

        # Same box layout for all three lists; empty lists are skipped.
        for key, css_class, heading in (
            ('validations', 'success-box', '✓ Duration Validations'),
            ('warnings', 'warning-box', '⚠️ Duration Warnings'),
            ('recommendations', 'info-box', '💡 Recommendations'),
        ):
            entries = duration_assessment[key]
            if entries:
                display(HTML(_render_bullet_box(css_class, heading, entries)))

        display(HTML("<hr>"))

    # Top metrics
    display(HTML(f"""
    <div style='display: flex; justify-content: space-around; margin: 20px 0;'>
        <div style='text-align: center; padding: 15px; background-color: #e3f2fd; border-radius: 10px; flex: 1; margin: 0 5px;'>
            <div style='font-size: 12px; color: #666;'>Control Rate</div>
            <div style='font-size: 20px; font-weight: bold; color: #1976d2;'>{results['control_rate']*100:.2f}%</div>
            <div style='font-size: 11px; color: #666; margin-top: 5px;'>Original version<br>performance</div>
        </div>
        <div style='text-align: center; padding: 15px; background-color: #e8f5e9; border-radius: 10px; flex: 1; margin: 0 5px;'>
            <div style='font-size: 12px; color: #666;'>Treatment Rate</div>
            <div style='font-size: 20px; font-weight: bold; color: #388e3c;'>{results['treatment_rate']*100:.2f}%</div>
            <div style='font-size: 11px; color: #666; margin-top: 5px;'>New version<br>({results['lift_abs']*100:+.2f} percentage points)</div>
        </div>
        <div style='text-align: center; padding: 15px; background-color: #f3e5f5; border-radius: 10px; flex: 1; margin: 0 5px;'>
            <div style='font-size: 12px; color: #666;'>Relative Lift</div>
            <div style='font-size: 20px; font-weight: bold; color: #7b1fa2;'>{results['lift_rel']:.2f}%</div>
            <div style='font-size: 11px; color: #666; margin-top: 5px;'>Percentage<br>improvement</div>
        </div>
        <div style='text-align: center; padding: 15px; background-color: #fff3e0; border-radius: 10px; flex: 1; margin: 0 5px;'>
            <div style='font-size: 12px; color: #666;'>P-Value</div>
            <div style='font-size: 20px; font-weight: bold; color: #f57c00;'>{results['p_value']:.4f}</div>
            <div style='font-size: 11px; color: #666; margin-top: 5px;'>{"Significant ✓" if results['is_significant'] else "Not Significant ✗"}</div>
        </div>
    </div>
    """))

    # Add explanations for key metrics
    if not is_segment:
        display(HTML("""
        <div class='help-text'>
            <strong>📌 Understanding These Numbers:</strong><br>
            • <strong>Control Rate:</strong> What % of Control users converted (e.g., 12% = 12 out of 100 made a purchase)<br>
            • <strong>Treatment Rate:</strong> What % of Treatment users converted<br>
            • <strong>Relative Lift:</strong> How much better (or worse) Treatment performed vs. Control as a percentage<br>
            • <strong>P-Value:</strong> The probability your results are due to random chance. Lower = better! 
            Under 0.05 (5%) is typically considered "significant"
        </div>
        """))

    # Significance status: a significant result with exactly zero lift falls
    # through to the inconclusive box, as neither directional branch applies.
    if results['is_significant'] and results['lift_rel'] > 0:
        display(HTML(f"""
        <div class="success-box" style="font-size: 14px;">
            <strong>✅ STATISTICALLY SIGNIFICANT (Positive Result)</strong><br>
            <span style='font-size:13px;'>P-Value: {results['p_value']:.4f} | Z-Statistic: {results['z_stat']:.3f}</span><br><br>
            <strong>What this means:</strong> The Treatment version is genuinely better than Control. 
            This result is very unlikely to be due to random chance (less than {results['alpha']*100:.0f}% probability).
            You can confidently roll out the Treatment version to all users.
        </div>
        """))
    elif results['is_significant'] and results['lift_rel'] < 0:
        display(HTML(f"""
        <div class="danger-box" style="font-size: 14px;">
            <strong>❌ STATISTICALLY SIGNIFICANT (Negative Result)</strong><br>
            <span style='font-size:13px;'>P-Value: {results['p_value']:.4f} | Z-Statistic: {results['z_stat']:.3f}</span><br><br>
            <strong>What this means:</strong> The Treatment version performs WORSE than Control. 
            This result is very unlikely to be due to random chance. 
            <strong>Recommendation:</strong> Keep using the Control version or try a different approach.
        </div>
        """))
    else:
        display(HTML(f"""
        <div class="warning-box" style="font-size: 14px;">
            <strong>⚠️ NOT STATISTICALLY SIGNIFICANT (Inconclusive)</strong><br>
            <span style='font-size:13px;'>P-Value: {results['p_value']:.4f} | Z-Statistic: {results['z_stat']:.3f}</span><br><br>
            <strong>What this means:</strong> The difference between Control and Treatment could easily be due to random chance. 
            We can't confidently say one is better than the other.<br>
            <strong>Recommendation:</strong> Either run the test longer to gather more data, or conclude that the change doesn't 
            have a meaningful impact.
        </div>
        """))

    # CI and Revenue (only for overall)
    if not is_segment:
        # Interval entirely above / below zero decides the direction.
        if results['ci_lower'] > 0:
            ci_interpretation = "✓ Treatment is better with high confidence"
        elif results['ci_upper'] < 0:
            ci_interpretation = "✗ Treatment is worse with high confidence"
        else:
            ci_interpretation = "⚠ Inconclusive - the true effect could be positive, negative, or zero"

        revenue_is_gain = results['revenue_impact'] >= 0
        revenue_background = "#d4edda" if revenue_is_gain else "#f8d7da"
        revenue_color = "#28a745" if revenue_is_gain else "#dc3545"

        display(HTML(f"""
        <div style='display: flex; gap: 20px; margin: 20px 0;'>
            <div class="info-box" style='flex: 1;'>
                <h4>{ci}% Confidence Interval</h4>
                <p><strong>Range:</strong> [{results['ci_lower']*100:.2f}%, {results['ci_upper']*100:.2f}%]</p>
                <p><strong>Interpretation:</strong> {ci_interpretation}</p>
                <p style='font-size:12px;color:#666;margin-top:10px;'>
                    <strong>What is a Confidence Interval?</strong><br>
                    We're {ci}% confident that the true lift falls somewhere in this range. 
                    If the range includes zero (crosses from negative to positive), we can't be sure of the direction of the effect.
                </p>
            </div>
            <div style='flex: 1; background-color: {revenue_background}; 
                 padding: 20px; border-radius: 10px; text-align: center;'>
                <h4 style='margin-top: 0;'>Annual Revenue Impact</h4>
                <h2 style="color: {revenue_color}; margin: 10px 0;">
                    ${results['revenue_impact']:,.2f}
                </h2>
                <p style='font-size:12px;color:#666;margin-top:10px;'>
                    <strong>What this means:</strong><br>
                    If you roll out the Treatment to everyone, this is the estimated additional 
                    (or lost) revenue per year based on your current traffic and monthly revenue.
                </p>
            </div>
        </div>
        """))

def create_statistical_distribution_viz(results, ci):
    """Draw a 2x2 figure explaining the significance test behind *results*.

    Panels:
        1. Standard-normal null distribution with both rejection tails and
           the observed Z.
        2. The two tails beyond ``|Z|`` (the two-sided p-value area) with the
           critical boundaries whose tail area equals alpha.
        3. Normal curve centred on the observed lift with the confidence
           interval shaded.
        4. Null vs. alternative curves with an approximate (single-tail)
           power estimate, oriented toward the observed effect direction.

    Args:
        results: Dict providing ``z_stat``, ``p_value``, ``alpha``,
            ``z_critical``, ``lift_abs``, ``ci_lower``, ``ci_upper`` and
            ``is_significant``.
        ci: Confidence level as a whole-number percentage; used in labels.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    _, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))

    z_stat = results['z_stat']
    p_value = results['p_value']
    alpha = results['alpha']
    z_critical = results['z_critical']
    lift_abs = results['lift_abs']
    ci_lower = results['ci_lower']
    ci_upper = results['ci_upper']
    is_significant = results['is_significant']
    observed_color = 'green' if is_significant else 'orange'

    # Standard normal curve shared by charts 1 and 2.
    x = np.linspace(-4, 4, 1000)
    y = stats.norm.pdf(x, 0, 1)

    # Chart 1: Null Distribution with Test Statistic
    ax1.plot(x, y, 'b-', linewidth=2, label='Null Distribution')
    ax1.fill_between(x, 0, y, where=(x <= -z_critical), alpha=0.3, color='red',
                     label=f'Rejection Region (α/2 = {alpha/2:.3f})')
    ax1.fill_between(x, 0, y, where=(x >= z_critical), alpha=0.3, color='red')

    ax1.axvline(z_stat, color=observed_color,
                linewidth=3, linestyle='--', label=f'Observed Z = {z_stat:.3f}')
    ax1.axvline(-z_critical, color='red', linewidth=2, linestyle=':')
    ax1.axvline(z_critical, color='red', linewidth=2, linestyle=':', label=f'Critical Z = ±{z_critical:.3f}')

    ax1.set_xlabel('Z-Score', fontweight='bold')
    ax1.set_ylabel('Probability Density', fontweight='bold')
    ax1.set_title('Null Hypothesis Distribution\n(Two-Tailed Test)', fontweight='bold', pad=10)
    ax1.legend(loc='upper right', fontsize=9)
    ax1.grid(alpha=0.3)

    sig_text = "SIGNIFICANT" if is_significant else "NOT SIGNIFICANT"
    ax1.text(0.5, 0.95, sig_text, transform=ax1.transAxes,
             ha='center', va='top', fontsize=12, fontweight='bold',
             bbox=dict(boxstyle='round', facecolor=observed_color, alpha=0.3))

    # Chart 2: P-Value Visualization
    ax2.plot(x, y, 'b-', linewidth=2)

    # Two-sided p-value = area in both tails beyond |Z|, whatever Z's sign.
    ax2.fill_between(x, 0, y, where=(np.abs(x) >= abs(z_stat)),
                     alpha=0.5, color='purple', label=f'P-value = {p_value:.4f}')

    ax2.axvline(z_stat, color=observed_color, linewidth=3, linestyle='--')
    # Alpha is a tail *area*, not a density height, so it is shown as the
    # critical boundaries rather than as a horizontal line on the density axis.
    ax2.axvline(-z_critical, color='red', linewidth=2, linestyle=':')
    ax2.axvline(z_critical, color='red', linewidth=2, linestyle=':',
                label=f'Alpha = {alpha:.3f} (|Z| ≥ {z_critical:.3f})')

    ax2.set_xlabel('Z-Score', fontweight='bold')
    ax2.set_ylabel('Probability Density', fontweight='bold')
    ax2.set_title(f'P-Value Visualization\nP-value {"<" if p_value < alpha else "≥"} α',
                  fontweight='bold', pad=10)
    ax2.legend(loc='upper right', fontsize=9)
    ax2.grid(alpha=0.3)

    # Chart 3: Confidence Interval on Lift Distribution
    # Standard error is backed out of Z = lift / SE; when that is unusable
    # (Z == 0, or a non-positive / NaN quotient) fall back to a small
    # placeholder width so the curve can still be drawn.
    se = abs(lift_abs / z_stat) if z_stat != 0 else 0.0
    if not se > 0:
        se = 0.001
    x_lift = np.linspace(lift_abs - 4*se, lift_abs + 4*se, 1000)
    y_lift = stats.norm.pdf(x_lift, lift_abs, se)

    ax3.plot(x_lift * 100, y_lift, 'b-', linewidth=2, label='Sampling Distribution')

    ci_mask = (x_lift >= ci_lower) & (x_lift <= ci_upper)
    ax3.fill_between(x_lift * 100, 0, y_lift, where=ci_mask,
                     alpha=0.4, color='green', label=f'{ci}% Confidence Interval')

    ax3.axvline(lift_abs * 100, color='blue', linewidth=3, linestyle='-',
                label=f'Point Estimate = {lift_abs*100:.2f}pp')
    ax3.axvline(0, color='red', linewidth=2, linestyle='--',
                label='No Effect (H₀)', alpha=0.7)
    ax3.axvline(ci_lower * 100, color='green', linewidth=2, linestyle=':', alpha=0.7)
    ax3.axvline(ci_upper * 100, color='green', linewidth=2, linestyle=':', alpha=0.7)

    ax3.set_xlabel('Lift (percentage points)', fontweight='bold')
    ax3.set_ylabel('Probability Density', fontweight='bold')
    ax3.set_title(f'{ci}% Confidence Interval for Lift\n[{ci_lower*100:.2f}pp, {ci_upper*100:.2f}pp]',
                  fontweight='bold', pad=10)
    ax3.legend(loc='upper right', fontsize=9)
    ax3.grid(alpha=0.3)

    if ci_lower > 0:
        interp = "CI excludes zero → Treatment is better"
        interp_color = 'green'
    elif ci_upper < 0:
        interp = "CI excludes zero → Treatment is worse"
        interp_color = 'red'
    else:
        interp = "CI includes zero → Inconclusive"
        interp_color = 'orange'

    ax3.text(0.5, 0.95, interp, transform=ax3.transAxes,
             ha='center', va='top', fontsize=10, fontweight='bold',
             bbox=dict(boxstyle='round', facecolor=interp_color, alpha=0.3))

    # Chart 4: Effect Size and Power Visualization
    effect_size = lift_abs / se
    # Orient the chart toward the observed effect so a negative lift is judged
    # against the left rejection tail instead of reporting near-zero power.
    direction = -1 if effect_size < 0 else 1
    x_min, x_max = (-8, 4) if direction < 0 else (-4, 8)
    critical_edge = direction * z_critical
    x_power = np.linspace(x_min, x_max, 1000)
    in_rejection = direction * x_power >= z_critical

    y_null = stats.norm.pdf(x_power, 0, 1)
    ax4.plot(x_power, y_null, 'b-', linewidth=2, label='Null (H₀)', alpha=0.7)
    # Only one tail is shaded here, so its area is α/2 of the two-tailed test.
    ax4.fill_between(x_power, 0, y_null, where=in_rejection,
                     alpha=0.2, color='red', label='Type I Error Region (α/2)')

    y_alt = stats.norm.pdf(x_power, effect_size, 1)
    ax4.plot(x_power, y_alt, 'g-', linewidth=2, label='Alternative (H₁)', alpha=0.7)
    ax4.fill_between(x_power, 0, y_alt, where=in_rejection,
                     alpha=0.3, color='green', label='Power (1-β)')
    ax4.fill_between(x_power, 0, y_alt, where=~in_rejection,
                     alpha=0.2, color='orange', label='Type II Error (β)')

    ax4.axvline(critical_edge, color='red', linewidth=2, linestyle='--',
                label=f'Critical Value = {critical_edge:.3f}')
    ax4.axvline(z_stat, color='purple', linewidth=3, linestyle=':',
                label=f'Observed = {z_stat:.3f}')

    # Approximate power: mass of the alternative beyond the near critical
    # value (the far tail's contribution is ignored).
    power = 1 - stats.norm.cdf(z_critical - abs(effect_size))

    ax4.set_xlabel('Z-Score', fontweight='bold')
    ax4.set_ylabel('Probability Density', fontweight='bold')
    ax4.set_title(f'Statistical Power Visualization\nEstimated Power ≈ {power*100:.1f}%',
                  fontweight='bold', pad=10)
    ax4.legend(loc='upper left' if direction < 0 else 'upper right', fontsize=8)
    ax4.grid(alpha=0.3)
    ax4.set_xlim(x_min, x_max)

    plt.tight_layout()
    plt.show()

def create_segment_comparison_chart(segment_results, ci):
    """Plot per-segment conversion rates and relative lift side by side.

    Args:
        segment_results: List of ``{'name': ..., 'results': ...}`` dicts,
            where ``results`` provides ``control_rate``, ``treatment_rate``,
            ``lift_rel`` and ``is_significant``. Nothing is drawn when empty.
        ci: Unused here; kept so existing callers keep working.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if not segment_results:
        return

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    segments = [r['name'] for r in segment_results]
    ctrl_rates = [r['results']['control_rate'] * 100 for r in segment_results]
    treat_rates = [r['results']['treatment_rate'] * 100 for r in segment_results]
    lifts = [r['results']['lift_rel'] for r in segment_results]
    is_sig = [r['results']['is_significant'] for r in segment_results]

    # Numeric slots are used on both charts so that segments sharing a name
    # still get their own bar (categorical x values would merge them).
    x = np.arange(len(segments))
    width = 0.35

    # Chart 1: Conversion Rates
    ax1.bar(x - width/2, ctrl_rates, width, label='Control', color='#1976d2', alpha=0.8)
    ax1.bar(x + width/2, treat_rates, width, label='Treatment', color='#388e3c', alpha=0.8)

    ax1.set_ylabel('Conversion Rate (%)', fontweight='bold')
    ax1.set_title('Conversion Rates by Segment', fontweight='bold', pad=15)
    ax1.set_xticks(x)
    ax1.set_xticklabels(segments, rotation=45, ha='right')
    ax1.legend()
    ax1.grid(axis='y', alpha=0.3)

    # Chart 2: Relative Lift
    # Green = significant gain, red = significant loss, yellow = otherwise.
    colors = ['#28a745' if sig and lift > 0 else '#dc3545' if sig and lift < 0 else '#ffc107'
              for sig, lift in zip(is_sig, lifts)]
    bars = ax2.bar(x, lifts, color=colors, alpha=0.8, edgecolor='black', linewidth=2)

    ax2.axhline(0, color='black', linewidth=1, linestyle='--', alpha=0.5)
    ax2.set_ylabel('Relative Lift (%)', fontweight='bold')
    ax2.set_title('Relative Lift by Segment', fontweight='bold', pad=15)
    # Fix tick positions before labelling them, as set_xticklabels expects.
    ax2.set_xticks(x)
    ax2.set_xticklabels(segments, rotation=45, ha='right')
    ax2.grid(axis='y', alpha=0.3)

    # Value labels sit above positive bars and below the rest; '*' marks significance.
    for bar, lift, sig in zip(bars, lifts, is_sig):
        height = bar.get_height()
        label = f'{lift:.1f}%' + (' *' if sig else '')
        ax2.text(bar.get_x() + bar.get_width()/2, height,
                 label, ha='center', va='bottom' if height > 0 else 'top',
                 fontweight='bold', fontsize=10)

    plt.tight_layout()
    plt.show()

def create_segment_summary_table(segment_results):
    """Show a one-row-per-segment summary table beneath an HTML heading.

    Each entry of *segment_results* is a dict with a ``name`` and a
    ``results`` dict; rates, lifts, p-value, significance and sample size
    are pre-formatted as strings. Nothing is displayed for an empty input.
    """
    if not segment_results:
        return

    def _format_row(entry):
        # Turn one segment's raw numbers into display-ready strings.
        outcome = entry['results']
        return {
            'Segment': entry['name'],
            'Control Rate': f"{outcome['control_rate']*100:.2f}%",
            'Treatment Rate': f"{outcome['treatment_rate']*100:.2f}%",
            'Lift (Abs)': f"{outcome['lift_abs']*100:.2f}pp",
            'Lift (Rel)': f"{outcome['lift_rel']:.1f}%",
            'P-Value': f"{outcome['p_value']:.4f}",
            'Significant': "Yes ✓" if outcome['is_significant'] else "No ✗",
            'Sample Size': f"{outcome['current_n']:,}"
        }

    summary_frame = pd.DataFrame([_format_row(entry) for entry in segment_results])
    display(HTML("<h3>📋 Segment Summary Table</h3>"))
    display(summary_frame)

# Button click handler
def on_calculate_click(b):
    with output:
        clear_output()
        
        ci = ci_selector.value
        
        # Calculate overall results
        display(HTML("<h2>📈 Test Results</h2><hr>"))
        
        overall_results = calculate_ab_test(
            ci, 
            control_visitors.value, 
            control_conversions.value,
            treatment_visitors.value, 
            treatment_conversions.value,
            current_revenue.value, 
            seasonal_multiplier.value
        )
        
        # Assess test duration
        duration_assessment = assess_test_duration(
            test_duration_days.value,
            average_session_days.value,
            control_visitors.value
        )
        
        # Assess randomization quality
        randomization_assessment = assess_randomization_quality(
            control_visitors.value,
            treatment_visitors.value,
            expected_split.value,
            user_overlap_pct.value
        )
        
        if overall_results:
            display_results(overall_results, "Overall Results", is_segment=False, duration_assessment=duration_assessment)
            
            # Add randomization quality assessment
            display(HTML("<br><h3>🎲 Randomization Quality Assessment</h3>"))
            display(HTML("""
            <div class='help-text'>
                <strong>What This Section Shows:</strong><br>
                This checks if your test was set up properly. A good test randomly assigns users to Control vs. Treatment 
                in a fair way. If there are problems here, your results might not be trustworthy even if they look significant.
            </div>
            """))
            
            # Display assessment results
            if randomization_assessment['validations']:
                val_html = "<div class='success-box'><h4>✓ Validations</h4><ul>"
                for val in randomization_assessment['validations']:
                    val_html += f"<li>{val}</li>"
                val_html += "</ul></div>"
                display(HTML(val_html))
            
            if randomization_assessment['warnings']:
                warn_html = "<div class='warning-box' if not randomization_assessment['has_srm'] else 'danger-box'><h4>⚠️ Warnings</h4><ul>"
                for warn in randomization_assessment['warnings']:
                    warn_html += f"<li>{warn}</li>"
                warn_html += "</ul></div>"
                display(HTML(warn_html))
            
            if randomization_assessment['recommendations']:
                rec_html = "<div class='info-box'><h4>💡 Recommendations</h4><ul>"
                for rec in randomization_assessment['recommendations']:
                    rec_html += f"<li>{rec}</li>"
                rec_html += "</ul></div>"
                display(HTML(rec_html))
            
            # Visualizations
            display(HTML("""
            <div class='help-text'>
                <strong>📊 Charts Below Explain:</strong><br>
                • <strong>Chart 1:</strong> Did your traffic split the way you intended?<br>
                • <strong>Chart 2:</strong> Statistical test for Sample Ratio Mismatch (technical problem detector)<br>
                • <strong>Chart 3:</strong> How normal variation affects split ratios at different sample sizes<br>
                • <strong>Chart 4:</strong> Overall quality grade for your test setup
            </div>
            """))
            create_randomization_viz(randomization_assessment)
            
            # Add statistical distribution visualizations
            display(HTML("<br><h3>📊 Statistical Distribution Analysis</h3>"))
            display(HTML("""
            <div class='help-text'>
                <strong>What This Section Shows:</strong><br>
                These charts show the statistical math behind your results. Don't worry if they look complex - 
                the key takeaways are explained in the text above.<br><br>
                <strong>Key Concepts:</strong><br>
                • <strong>Null Hypothesis (H₀):</strong> The assumption that there's NO real difference between Control and Treatment<br>
                • <strong>Alternative Hypothesis (H₁):</strong> The claim that there IS a real difference<br>
                • <strong>Z-Score:</strong> How many "standard deviations" away your result is from what we'd expect by chance<br>
                • <strong>P-Value:</strong> Probability that results this extreme could happen by random luck alone
            </div>
            """))
            create_statistical_distribution_viz(overall_results, ci)
        
        # Calculate segment results if enabled
        if enable_segmentation.value and segment_widgets:
            display(HTML("<br><hr><h2>🔍 Segment Analysis</h2><hr>"))
            display(HTML("""
            <div class='help-text'>
                <strong>What Segment Analysis Shows:</strong><br>
                This breaks down your results by different user groups. You might find that your Treatment works 
                great for some segments but not others - this is valuable insight!<br><br>
                <strong>Example:</strong> Your new mobile app design might boost conversions for new users (+15%) 
                but actually hurt conversions for existing users (-5%). Without segment analysis, you'd only see 
                the average and miss this important pattern.
            </div>
            """))
            
            segment_results = []
            for i, widgets_dict in segment_widgets.items():
                seg_name = widgets_dict['name'].value
                ctrl_v = widgets_dict['ctrl_v'].value
                ctrl_c = widgets_dict['ctrl_c'].value
                treat_v = widgets_dict['treat_v'].value
                treat_c = widgets_dict['treat_c'].value
                
                seg_results = calculate_ab_test(
                    ci, ctrl_v, ctrl_c, treat_v, treat_c,
                    current_revenue.value, seasonal_multiplier.value
                )
                
                if seg_results:
                    segment_results.append({
                        'name': seg_name,
                        'results': seg_results
                    })
                    display(HTML(f"<div class='segment-box'>"))
                    display_results(seg_results, f"📊 {seg_name}", is_segment=True)
                    display(HTML("</div>"))
            
            # Segment comparison visualizations
            if segment_results:
                display(HTML("<br><h3>📊 Segment Comparison</h3>"))
                display(HTML("""
                <div class='help-text'>
                    <strong>How to Read These Charts:</strong><br>
                    • <strong>Left Chart:</strong> Shows conversion rates for each segment. Higher bars = more conversions.<br>
                    • <strong>Right Chart:</strong> Shows relative lift (% improvement). Green = Treatment won, Red = Control won, Yellow = Inconclusive.<br>
                    • <strong>Asterisk (*):</strong> Marks segments where the result is statistically significant.
                </div>
                """))
                create_segment_comparison_chart(segment_results, ci)
                create_segment_summary_table(segment_results)
                
                # Insights
                display(HTML("<br><h3>💡 Segment Insights</h3>"))
                sig_positive = [s for s in segment_results if s['results']['is_significant'] and s['results']['lift_rel'] > 0]
                sig_negative = [s for s in segment_results if s['results']['is_significant'] and s['results']['lift_rel'] < 0]
                not_sig = [s for s in segment_results if not s['results']['is_significant']]
                
                insights_html = "<div class='info-box'>"
                insights_html += f"<p><strong>✅ Segments with Significant Positive Impact:</strong> {len(sig_positive)} of {len(segment_results)}</p>"
                if sig_positive:
                    insights_html += "<ul>"
                    for s in sig_positive:
                        insights_html += f"<li><strong>{s['name']}</strong>: {s['results']['lift_rel']:.1f}% lift (Treatment is better!)</li>"
                    insights_html += "</ul>"
                else:
                    insights_html += "<p style='color:#666;margin-left:20px;'>No segments showed significant positive results.</p>"
                
                if sig_negative:
                    insights_html += f"<p><strong>❌ Segments with Significant Negative Impact:</strong> {len(sig_negative)}</p><ul>"
                    for s in sig_negative:
                        insights_html += f"<li><strong>{s['name']}</strong>: {s['results']['lift_rel']:.1f}% lift (Control is better)</li>"
                    insights_html += "</ul>"
                
                if not_sig:
                    insights_html += f"<p><strong>⚠️ Segments with Inconclusive Results:</strong> {len(not_sig)}</p>"
                    insights_html += "<p style='color:#666;margin-left:20px;'>These segments need more data or show no meaningful difference between Control and Treatment.</p>"
                
                # Strategic recommendations
                insights_html += "<br><h4>📋 Strategic Recommendations:</h4>"
                
                if len(sig_positive) > 0 and len(sig_negative) == 0:
                    insights_html += "<p style='color:#28a745;'><strong>Great news!</strong> Treatment shows positive results across segments where it's significant. Consider rolling it out to all users.</p>"
                elif len(sig_positive) > 0 and len(sig_negative) > 0:
                    insights_html += "<p style='color:#ffc107;'><strong>Mixed results:</strong> Treatment helps some segments but hurts others. Consider:</p>"
                    insights_html += "<ul><li>Implementing Treatment only for segments where it performs well</li>"
                    insights_html += "<li>Creating different experiences for different segments</li>"
                    insights_html += "<li>Investigating why certain segments respond differently</li></ul>"
                elif len(sig_negative) > 0 and len(sig_positive) == 0:
                    insights_html += "<p style='color:#dc3545;'><strong>Caution:</strong> Treatment is underperforming in segments where we have clear results. Stick with Control or redesign the Treatment.</p>"
                else:
                    insights_html += "<p style='color:#666;'>No clear winners yet. Either continue testing for more data or conclude that the changes don't have a strong impact.</p>"
                
                insights_html += "</div>"
                display(HTML(insights_html))
        
        # Final summary and next steps
        if overall_results:
            display(HTML("<br><hr><h2>📝 Summary & Next Steps</h2>"))
            
            summary_html = "<div class='info-box'>"
            summary_html += "<h4>Your Test at a Glance:</h4>"
            summary_html += f"<p>• <strong>Overall Result:</strong> {'✅ Significant' if overall_results['is_significant'] else '⚠️ Not Significant'}</p>"
            summary_html += f"<p>• <strong>Conversion Improvement:</strong> {overall_results['lift_rel']:+.2f}%</p>"
            summary_html += f"<p>• <strong>Confidence:</strong> {int(overall_results['confidence_level']*100)}%</p>"
            summary_html += f"<p>• <strong>Potential Annual Revenue Impact:</strong> ${overall_results['revenue_impact']:,.2f}</p>"
            
            summary_html += "<br><h4>What Should You Do Next?</h4>"
            
            if overall_results['is_significant'] and overall_results['lift_rel'] > 0:
                summary_html += "<div class='success-box'>"
                summary_html += "<p><strong>✅ Recommended Action: LAUNCH</strong></p>"
                summary_html += "<p>Your test shows a clear winner! The Treatment version performs significantly better than Control. "
                summary_html += "You can confidently roll it out to all users.</p>"
                summary_html += "<p><strong>Before launching:</strong></p>"
                summary_html += "<ul>"
                summary_html += "<li>✓ Make sure your randomization quality scores look good (check the grades above)</li>"
                summary_html += "<li>✓ Review segment results to see if any groups need special consideration</li>"
                summary_html += "<li>✓ Plan for monitoring post-launch to ensure results hold</li>"
                summary_html += "</ul>"
                summary_html += "</div>"
                
            elif overall_results['is_significant'] and overall_results['lift_rel'] < 0:
                summary_html += "<div class='danger-box'>"
                summary_html += "<p><strong>❌ Recommended Action: DO NOT LAUNCH</strong></p>"
                summary_html += "<p>Your test shows that the Treatment actually performs WORSE than Control. "
                summary_html += "Stick with your current version or go back to the drawing board with a new approach.</p>"
                summary_html += "<p><strong>Next steps:</strong></p>"
                summary_html += "<ul>"
                summary_html += "<li>Analyze why the Treatment underperformed</li>"
                summary_html += "<li>Gather user feedback or qualitative data</li>"
                summary_html += "<li>Design a new Treatment that addresses the issues</li>"
                summary_html += "</ul>"
                summary_html += "</div>"
                
            else:
                summary_html += "<div class='warning-box'>"
                summary_html += "<p><strong>⚠️ Recommended Action: MORE DATA NEEDED or NEUTRAL RESULT</strong></p>"
                summary_html += "<p>Your test results are inconclusive. The difference between Control and Treatment "
                summary_html += "could easily be due to random chance.</p>"
                summary_html += "<p><strong>You have two options:</strong></p>"
                summary_html += "<ul>"
                summary_html += "<li><strong>Option 1:</strong> Continue testing to gather more data (recommended if you see a promising trend)</li>"
                summary_html += "<li><strong>Option 2:</strong> Conclude that the change doesn't have a meaningful impact and move on to other experiments</li>"
                summary_html += "</ul>"
                summary_html += f"<p><strong>Power calculation:</strong> With your current sample size of {overall_results['current_n']:,} per group, "
                summary_html += f"you have good power to detect meaningful effects (2%+ lift). If no effect was found, it likely means the true impact is very small.</p>"
                summary_html += "</div>"
            
            summary_html += "<br><h4>General Best Practices:</h4>"
            summary_html += "<ul>"
            summary_html += "<li>📊 Always check randomization quality before trusting your results</li>"
            summary_html += "<li>⏱️ Run tests for at least 2 full weeks when possible</li>"
            summary_html += "<li>📈 Monitor key metrics post-launch to validate test results</li>"
            summary_html += "<li>🔍 Use segment analysis to find opportunities for personalization</li>"
            summary_html += "<li>📝 Document your learnings - even 'failed' tests teach you about your users</li>"
            summary_html += "</ul>"
            
            summary_html += "</div>"
            display(HTML(summary_html))

# Wire the Calculate button to its click callback so a press runs the analysis
calculate_button.on_click(on_calculate_click)

# Render a horizontal rule and then the results output widget, in that order
display(HTML("<hr>"), output)

# Add footer with helpful resources: a plain-language glossary and usage tips.
# NOTE: this is static HTML, so literal comparison operators in the text are
# written as entities (&lt;) rather than raw "<" to keep the markup valid.
# The P-Value entry deliberately describes a conditional probability ("if there
# were truly no difference") rather than "the chance the result is luck", which
# is a common misreading of what a p-value measures.
display(HTML("""
<br><hr>
<div style='background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin-top: 20px;'>
    <h3>📚 Need More Help Understanding Statistics?</h3>
    <p><strong>Key Terms Glossary:</strong></p>
    <ul style='font-size: 13px;'>
        <li><strong>Conversion Rate:</strong> The percentage of users who completed your desired action (e.g., 12% = 12 out of 100 users converted)</li>
        <li><strong>Statistical Significance:</strong> When results are unlikely to be due to random chance (typically p-value &lt; 0.05)</li>
        <li><strong>P-Value:</strong> The probability of seeing a difference at least as large as yours if there were truly no difference between Control and Treatment. Lower means stronger evidence of a real effect! Under 0.05 is usually considered "significant"</li>
        <li><strong>Confidence Interval:</strong> A range where we're X% confident the true effect lies. If it doesn't include zero, we have a clear winner</li>
        <li><strong>Lift:</strong> The improvement (or decline) from Control to Treatment, expressed as a percentage or percentage points</li>
        <li><strong>Sample Size:</strong> The number of users in your test. Bigger samples = more reliable results</li>
        <li><strong>Randomization:</strong> The process of fairly assigning users to Control vs. Treatment groups</li>
        <li><strong>Type I Error:</strong> Falsely concluding there's an effect when there isn't one (false positive)</li>
        <li><strong>Type II Error:</strong> Failing to detect an effect that's really there (false negative)</li>
        <li><strong>Statistical Power:</strong> The probability of detecting an effect if one truly exists (higher is better, 80%+ is good)</li>
    </ul>
    
    <p style='margin-top: 20px;'><strong>💡 Pro Tips:</strong></p>
    <ul style='font-size: 13px;'>
        <li>When in doubt, run longer! More data = more confidence</li>
        <li>Don't "peek" at results too early and stop the test based on preliminary data</li>
        <li>One test doesn't prove everything - successful companies run many tests</li>
        <li>Sometimes the best insight is learning what DOESN'T work</li>
        <li>Always consider qualitative feedback alongside quantitative results</li>
    </ul>
    
    <p style='margin-top: 20px; color: #666; font-size: 12px;'>
        <em>This calculator uses standard frequentist statistical methods (two-proportion z-test) with proper corrections 
        for multiple testing when analyzing segments.</em>
    </p>
</div>
"""))