In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Import the enhanced modules
from utils.statistical_analysis import StatisticalAnalyzer
from system.experiment_runner import ExperimentRunner
from utils.helpers import create_project_directories, verify_api_keys

# Reproducibility: seed NumPy's global random number generator once, up front.
SEED = 42
np.random.seed(SEED)

# Analyzer instance reused by the analysis cells below.
ALPHA = 0.05  # 5% significance level
stat_analyzer = StatisticalAnalyzer(alpha=ALPHA)

print("Statistical analysis environment ready")

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Research questions this experiment is meant to answer, keyed by a short ID.
research_questions = dict(
    RQ1="Does chunking method significantly affect retrieval quality?",
    RQ2="Is there an interaction between embedding model and retrieval strategy?",
    RQ3="What is the optimal trade-off between performance and response time?",
)

# Null hypotheses tested further down; H0_n is the counterpart of RQn.
null_hypotheses = dict(
    H0_1="There is no significant difference in faithfulness scores between chunking methods",
    H0_2="There is no interaction effect between embedding model and retrieval strategy",
    H0_3="There is no correlation between retrieval quality and response time",
)

# A-priori power analysis: solve for the number of observations needed to
# detect the target effect at the chosen power and significance level.
# NOTE(review): TTestPower is statsmodels' one-sample / paired t-test power
# class, while the message below says "per group" — confirm the intended
# design (TTestIndPower is the independent two-sample variant).
from statsmodels.stats.power import TTestPower

power_analysis = TTestPower()
power_settings = {
    "effect_size": 0.5,  # Medium effect size
    "power": 0.8,        # 80% power
    "alpha": 0.05,       # 5% significance level
}
required_n = power_analysis.solve_power(**power_settings)

# Round up once: a fractional observation cannot be collected.
min_questions = int(np.ceil(required_n))

print(f"Required sample size per group: {min_questions}")
print(f"Recommended: Test at least {min_questions} questions per configuration")

In [None]:
# The runner class is produced by a factory in the project's statistics module.
from utils.statistical_analysis import enhance_experiment_runner

StatisticalExperimentRunner = enhance_experiment_runner()

# Constructor options for the runner, gathered in one place for easy tuning.
runner_options = {
    "base_path": "./experiments",
    "min_runs_per_config": 30,  # Minimum for statistical validity
}
stat_runner = StatisticalExperimentRunner(**runner_options)

# Evaluation set as (question, ground-truth reference, question type) triples.
question_specs = [
    (
        "What is the main methodology described?",
        "The main methodology involves retrieval-augmented generation.",
        "factual",
    ),
    (
        "How does the system handle errors?",
        "The system handles errors through exception handling and logging.",
        "technical",
    ),
    (
        "What are the performance characteristics?",
        "Performance depends on retrieval strategy and model size.",
        "analytical",
    ),
    # Add at least 30 questions for statistical validity and more for power analysis
]

# Expand each triple into the dict shape handed to the experiment runner.
test_questions = [
    {"question": question, "reference": reference, "question_type": question_type}
    for question, reference, question_type in question_specs
]

# Define configurations for the controlled experiment.
#
# ChunkerConfig, EmbeddingConfig and RetrievalConfig must be imported here:
# nothing else in the notebook brings them into scope, so using them without
# an import raises NameError on a fresh kernel.
# NOTE(review): assumes the three sub-config classes live next to
# ExperimentConfig in core.experiment_config — confirm the module path.
from core.experiment_config import (
    ChunkerConfig,
    EmbeddingConfig,
    ExperimentConfig,
    RetrievalConfig,
)

# Control configuration (baseline): recursive chunking, small embedding model,
# plain vector retrieval.
control_config = ExperimentConfig(
    experiment_name="control",
    tags=["baseline", "control"],
    chunker=ChunkerConfig(method="recursive", chunk_size=500),
    embedding=EmbeddingConfig(model="text-embedding-3-small"),
    retrieval=RetrievalConfig(strategy="vector", top_k=5)
)

# Treatment configurations (variations to test). Each one changes exactly one
# component relative to the control so its effect can be isolated.
treatment_configs = [
    # Treatment 1: swap the chunker for semantic chunking.
    ExperimentConfig(
        experiment_name="semantic_chunking",
        tags=["treatment", "semantic"],
        chunker=ChunkerConfig(method="semantic", semantic_threshold=0.8),
        embedding=EmbeddingConfig(model="text-embedding-3-small"),
        retrieval=RetrievalConfig(strategy="vector", top_k=5)
    ),
    # Treatment 2: swap the retrieval strategy for hybrid retrieval.
    ExperimentConfig(
        experiment_name="hybrid_retrieval",
        tags=["treatment", "hybrid"],
        chunker=ChunkerConfig(method="recursive", chunk_size=500),
        embedding=EmbeddingConfig(model="text-embedding-3-small"),
        retrieval=RetrievalConfig(strategy="hybrid", top_k=5)
    ),
    # Treatment 3: swap the embedding model for the large variant.
    ExperimentConfig(
        experiment_name="large_embedding",
        tags=["treatment", "embedding"],
        chunker=ChunkerConfig(method="recursive", chunk_size=500),
        embedding=EmbeddingConfig(model="text-embedding-3-large"),
        retrieval=RetrievalConfig(strategy="vector", top_k=5)
    )
]

# Run controlled experiment
all_configs = [control_config] + treatment_configs
results_df = await stat_runner.run_statistical_experiment(
    configs=all_configs,
    test_questions=test_questions,
    document_path="./documents",
    runs_per_config=3  # Multiple runs to account for variability
)

print(f"Collected {len(results_df)} data points")

In [None]:
# Hand the raw results to the analyzer's preparation step; every later cell
# works from df_prepared.
df_prepared = stat_analyzer.prepare_data(results_df)

# ANOVA assumption checks: collect one array of faithfulness scores per
# distinct chunking method, in order of first appearance.
chunker_groups = []
for method in df_prepared['chunker'].unique():
    in_group = df_prepared['chunker'] == method
    chunker_groups.append(df_prepared.loc[in_group, 'ragas_faithfulness'].values)

assumptions = stat_analyzer.check_assumptions(chunker_groups, "ANOVA_chunking")

print("Assumption Checks:")
print(f"Normality tests: {assumptions['normality']}")
print(f"Homogeneity of variance: {assumptions['homogeneity']}")

# Visual check: Q-Q plot on the left panel, box plot on the right panel.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
qq_ax, box_ax = axes

# Every group's Q-Q points are overlaid on the same axes.
for group in chunker_groups:
    stats.probplot(group, dist="norm", plot=qq_ax)
qq_ax.set_title("Q-Q Plot for Normality Check")

# Spread of faithfulness scores per chunking method.
df_prepared.boxplot(column='ragas_faithfulness', by='chunker', ax=box_ax)
box_ax.set_title("Distribution by Chunking Method")

plt.tight_layout()
plt.show()

# If any group's normality result is falsy, suggest alternatives.
normality_holds = all(test['normal'] for test in assumptions['normality'].values())
if not normality_holds:
    for line in (
        "\nWARNING: Normality assumption violated. Consider:",
        "1. Log transformation of the data",
        "2. Non-parametric tests (Kruskal-Wallis)",
    ):
        print(line)

In [None]:
# --- H0_1: one-way ANOVA of faithfulness across chunking methods ------------
print("=== Testing H0_1: Effect of Chunking Method ===")
anova_result = stat_analyzer.anova_analysis(
    df_prepared,
    factor='chunker',
    metric='ragas_faithfulness',
    include_tukey=True,
)

print(f"F-statistic: {anova_result['f_statistic']:.4f}")
print(f"p-value: {anova_result['p_value']:.4f}")
print(f"Effect size (η²): {anova_result['eta_squared']:.4f} ({anova_result['effect_size']})")

if not anova_result['significant']:
    print("\nFail to reject H0_1: No significant effect of chunking method")
else:
    print("\nReject H0_1: Chunking method has a significant effect on faithfulness")
    print("\nPost-hoc analysis (Tukey HSD):")
    # Post-hoc results are only read when the omnibus test is significant.
    significant_pairs = anova_result['post_hoc']['significant_pairs']
    for pair in significant_pairs:
        print(f"  - Significant difference between {pair[0]} and {pair[1]}")

# --- H0_2: factorial ANOVA over embedding x retrieval ------------------------
print("\n=== Testing H0_2: Interaction Effect ===")
interaction_factors = ['embedding', 'retrieval']
factorial_result = stat_analyzer.factorial_anova(
    df_prepared,
    factors=interaction_factors,
    metric='ragas_faithfulness',
)

print("Significant effects:")
for effect in factorial_result['significant_effects']:
    print(f"  - {effect}")

# --- H0_3: correlations among quality metrics and response time --------------
print("\n=== Testing H0_3: Correlation Analysis ===")
correlated_metrics = ['ragas_faithfulness', 'ragas_answer_relevancy', 'response_time']
corr_result = stat_analyzer.correlation_analysis(
    df_prepared,
    metrics=correlated_metrics,
)

print("Significant correlations:")
for corr in corr_result['significant_correlations']:
    print(f"  - {corr['metric1']} vs {corr['metric2']}: "
          f"r={corr['correlation']:.3f}, p={corr['p_value']:.4f}")

In [None]:
# Compare the control configuration against each treatment on faithfulness.
print("=== Control vs Treatment Comparisons ===")

control_data = df_prepared[df_prepared['experiment_name'] == 'control']

if len(control_data) == 0:
    # Every comparison below uses 'control' as the baseline; with no baseline
    # rows there is nothing to compare, so report that and skip the analyzer.
    print("\nNo rows found for 'control' - skipping comparisons")
else:
    for treatment_config in treatment_configs:
        treatment_name = treatment_config.experiment_name
        treatment_data = df_prepared[df_prepared['experiment_name'] == treatment_name]

        # Treatments that produced no rows are skipped silently.
        if len(treatment_data) == 0:
            continue

        comparison = stat_analyzer.paired_comparison(
            df_prepared,
            config1='control',
            config2=treatment_name,
            metric='ragas_faithfulness'
        )

        print(f"\nControl vs {treatment_name}:")
        print(f"  Mean difference: {comparison['mean_diff']:.4f} "
              f"(95% CI: {comparison['confidence_interval'][0]:.4f}, "
              f"{comparison['confidence_interval'][1]:.4f})")
        print(f"  Cohen's d: {comparison['cohens_d']:.3f} ({comparison['effect_size']})")
        print(f"  p-value: {comparison['p_value']:.4f}")

        if comparison['significant']:
            print("  ✓ Significant difference detected")

In [None]:
# Forest plot: one point (mean difference from control) and one horizontal
# bar (its confidence interval) per treatment.
fig, ax = plt.subplots(figsize=(10, 6))

comparisons = []
experiment_names = df_prepared['experiment_name']
for treatment_config in treatment_configs:
    treatment_name = treatment_config.experiment_name

    # Treatments with no rows in df_prepared are left out of the plot, the
    # same condition the control-vs-treatment comparison cell applies before
    # calling paired_comparison.
    if not (experiment_names == treatment_name).any():
        continue

    comp = stat_analyzer.paired_comparison(
        df_prepared,
        config1='control',
        config2=treatment_name,
        metric='ragas_faithfulness'
    )
    comparisons.append({
        'name': treatment_name,
        'mean_diff': comp['mean_diff'],
        'ci_low': comp['confidence_interval'][0],
        'ci_high': comp['confidence_interval'][1],
        'significant': comp['significant']
    })

# One row per comparison; significant effects are drawn in red, others in gray.
y_pos = np.arange(len(comparisons))
colors = ['red' if c['significant'] else 'gray' for c in comparisons]

ax.scatter([c['mean_diff'] for c in comparisons], y_pos,
           c=colors, s=100, zorder=3)

# Confidence-interval bars sit underneath the points (lower zorder).
for row, (comp, color) in enumerate(zip(comparisons, colors)):
    ax.plot([comp['ci_low'], comp['ci_high']], [row, row],
            color=color, linewidth=2, zorder=2)

# Dashed reference line at zero difference from control.
ax.axvline(x=0, color='black', linestyle='--', alpha=0.5)
ax.set_yticks(y_pos)
ax.set_yticklabels([c['name'] for c in comparisons])
ax.set_xlabel('Mean Difference from Control (95% CI)')
ax.set_title('Treatment Effects Compared to Control')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Post-hoc check: for each metric present in the data, report current power
# and the sample size needed to reach the target power.
print("=== Statistical Power Analysis ===")

power_metrics = ['ragas_faithfulness', 'ragas_answer_relevancy', 'response_time']
for metric in power_metrics:
    # Metrics absent from the prepared data are skipped.
    if metric not in df_prepared.columns:
        continue

    power_result = stat_analyzer.sample_size_analysis(
        df_prepared,
        metric=metric,
        effect_size=0.5,  # Medium effect
        power=0.8,        # 80% power
    )

    print(f"\n{metric}:")
    print(f"  Current sample size: {power_result['current_sample_size']}")
    print(f"  Current power: {power_result['current_power']:.3f}")
    print(f"  Required for 80% power: {power_result['required_sample_size']}")

    if power_result['sufficient_power']:
        continue

    # How many additional samples would close the gap.
    shortfall = power_result['required_sample_size'] - power_result['current_sample_size']
    print(f"  ⚠️ Insufficient power - need {shortfall} more samples")

In [None]:
# Full statistical report, written under ./experiments.
report_path = stat_analyzer.generate_statistical_report(
    df_prepared,
    output_path="./experiments/statistical_report.html",
)

print(f"Statistical report generated: {report_path}")

# CSV export for external tools (SPSS/R) if needed.
spss_data = stat_analyzer.export_for_spss(
    df_prepared,
    output_path="./experiments/rag_data_for_spss.csv",
)

print("Data exported for external statistical software")

# Publication-ready table: per (chunker, retrieval) cell, summarise
# faithfulness and response time, rounded to three decimals.
metric_aggregations = {
    'ragas_faithfulness': ['mean', 'std', 'count'],
    'response_time': ['mean', 'std'],
}
by_configuration = df_prepared.groupby(['chunker', 'retrieval'])
summary_table = by_configuration.agg(metric_aggregations).round(3)

print("\nSummary Table for Publication:")
latex_table = summary_table.to_latex(
    caption="RAG Pipeline Performance by Configuration",
    label="tab:rag_performance",
)
print(latex_table)

In [None]:
# Narrative summary of the statistical findings.
print("=== Statistical Findings Summary ===")

# Run a one-way ANOVA on faithfulness for each factor and keep the
# significant ones together with their p-value and eta-squared.
main_effects = []
for factor in ['chunker', 'retrieval', 'embedding']:
    anova = stat_analyzer.anova_analysis(df_prepared, factor, 'ragas_faithfulness')
    # A result without a 'significant' key is treated as not significant.
    if not anova.get('significant', False):
        continue
    main_effects.append(dict(
        factor=factor,
        p_value=anova['p_value'],
        effect_size=anova['eta_squared'],
    ))

print("\n1. MAIN EFFECTS:")
if not main_effects:
    print("   - No significant main effects found")
for effect in main_effects:
    print(f"   - {effect['factor']}: p={effect['p_value']:.4f}, η²={effect['effect_size']:.3f}")

# Fixed guidance sections: (heading, body lines), printed in order.
guidance_sections = (
    ("\n2. PRACTICAL SIGNIFICANCE:", (
        "   Consider both statistical significance (p-values) and effect sizes",
        "   Small effects may be statistically significant but not practically important",
    )),
    ("\n3. LIMITATIONS:", (
        "   - Sample size limitations may affect power",
        "   - Multiple comparisons increase Type I error risk",
        "   - Results specific to test dataset and questions",
    )),
    ("\n4. RECOMMENDATIONS:", (
        "   - Replicate findings with larger sample sizes",
        "   - Test on diverse datasets",
        "   - Consider Bonferroni correction for multiple comparisons",
    )),
)
for heading, body_lines in guidance_sections:
    print(heading)
    for body_line in body_lines:
        print(body_line)