In [2]:
import json
import pandas as pd
import numpy as np
from scipy import stats
from pathlib import Path

# Load individual request data from token_data.jsonl (one JSON object per line).
token_file = Path('test_results/token_data.jsonl')

if not token_file.exists():
    raise FileNotFoundError("No token data found. Run tests first with: python run_tests.py")

# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(token_file, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. an extra newline at the end of the file):
    # json.loads('') raises JSONDecodeError and would abort the whole load.
    data = [json.loads(line) for line in f if line.strip()]

# One row per request record.
df = pd.DataFrame(data)
print(f"Total requests loaded: {len(df)}")
print(f"\nColumns: {df.columns.tolist()}")
print("\nData preview:")
df.head()

Total requests loaded: 790

Columns: ['timestamp', 'region_prefix', 'service_tier', 'prompt_size', 'input_tokens', 'output_tokens', 'total_tokens']

Data preview:


Unnamed: 0,timestamp,region_prefix,service_tier,prompt_size,input_tokens,output_tokens,total_tokens
0,1767931000.0,us,default,small,425,2618,3043
1,1767931000.0,us,default,small,425,2603,3028
2,1767931000.0,us,default,small,425,2709,3134
3,1767931000.0,us,default,small,425,2769,3194
4,1767931000.0,us,default,small,425,2916,3341


## Extract Response Times by Prompt Size

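The analysis below needs individual request-level data with a `response_time_ms` value for every request. Note that the columns printed above do not include `response_time_ms`, so the cells that follow fail until the records in `token_data.jsonl` carry that field.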

In [3]:
# Extract response times by prompt size
LATENCY_COL = 'response_time_ms'

# Fail early with an actionable message instead of a bare KeyError: every
# statistic computed later in this notebook is derived from this column.
if LATENCY_COL not in df.columns:
    raise KeyError(
        f"Column '{LATENCY_COL}' not found in the loaded token data "
        f"(available columns: {df.columns.tolist()}). "
        "Each record must include a per-request response time under this key."
    )


def _latencies_for(size):
    """Return the response times of all requests with the given prompt_size as a NumPy array."""
    return df.loc[df['prompt_size'] == size, LATENCY_COL].to_numpy()


small = _latencies_for('small')
medium = _latencies_for('medium')
large = _latencies_for('large')

print("Sample sizes:")
print(f"  Small: {len(small)} requests")
print(f"  Medium: {len(medium)} requests")
print(f"  Large: {len(large)} requests")
print("\nNote: For conclusive statistical tests, n≥30 per group is recommended.")

KeyError: 'response_time_ms'

## Descriptive Statistics by Prompt Size

In [None]:
# Calculate descriptive statistics for each prompt size.
# small / medium / large are the per-group response-time arrays built earlier.
_groups = [small, medium, large]

stats_summary = pd.DataFrame({
    'Prompt Size': ['Small', 'Medium', 'Large'],
    'n': [len(g) for g in _groups],
    'Mean (ms)': [g.mean() for g in _groups],
    'Median (ms)': [np.median(g) for g in _groups],
    # ddof=1 gives the sample standard deviation. NumPy's default (ddof=0) is
    # the population formula, which understates the spread of a sample.
    'Std Dev (ms)': [g.std(ddof=1) for g in _groups],
    'Min (ms)': [g.min() for g in _groups],
    'Max (ms)': [g.max() for g in _groups]
})

print(stats_summary.to_string(index=False))

## Statistical Significance Tests

In [None]:
# One-way ANOVA: a single test across the small, medium and large groups
f_stat, p_value_anova = stats.f_oneway(small, medium, large)

banner = "=" * 60
print(banner)
print("ANOVA TEST (Overall difference across all 3 groups)")
print(banner)
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value_anova:.6f}")
print()

# (exclusive upper bound on p, headline, explanation) -- strictest threshold first
anova_verdicts = (
    (0.001, "*** HIGHLY SIGNIFICANT (p < 0.001)",
     "The differences in latency are NOT random."),
    (0.01, "** VERY SIGNIFICANT (p < 0.01)",
     "The differences in latency are very unlikely to be random."),
    (0.05, "* SIGNIFICANT (p < 0.05)",
     "The differences in latency are statistically significant."),
)

for p_cutoff, headline, explanation in anova_verdicts:
    if p_value_anova < p_cutoff:
        break
else:
    # No threshold matched (this also covers a NaN p-value).
    headline = "NOT SIGNIFICANT (p >= 0.05)"
    explanation = "The differences could be due to random chance."

print(headline)
print(explanation)

## Pairwise Comparisons (Which groups differ?)

In [None]:
# Pairwise t-tests
def _significance_label(p):
    """Map a p-value to the star-coded significance label used in this report."""
    if p < 0.001:
        return "*** HIGHLY SIGNIFICANT"
    if p < 0.01:
        return "** VERY SIGNIFICANT"
    if p < 0.05:
        return "* SIGNIFICANT"
    return "NOT SIGNIFICANT"


def _report_pairwise(label, group_a, group_b):
    """Run an independent two-sample t-test, print the result, and return it.

    Args:
        label: Heading printed for this comparison, e.g. "Small vs Medium".
        group_a: NumPy array of observations for the first group.
        group_b: NumPy array of observations for the second group.

    Returns:
        Tuple (t_statistic, p_value, mean_difference) where mean_difference is
        mean(group_a) - mean(group_b).
    """
    # NOTE(review): ttest_ind is called with its defaults, which assume equal
    # variances; consider equal_var=False (Welch) if group variances differ.
    t_stat, p_val = stats.ttest_ind(group_a, group_b)
    diff = group_a.mean() - group_b.mean()
    print(f"\n{label}:")
    print(f"  Difference: {diff:+.0f} ms")
    print(f"  t-statistic: {t_stat:.4f}")
    print(f"  p-value: {p_val:.6f}")
    print(f"  {_significance_label(p_val)}")
    return t_stat, p_val, diff


print("="*60)
print("PAIRWISE T-TESTS")
print("="*60)

# The result names below are kept as notebook-level variables for later cells.
t_stat_sm, p_sm, diff_sm = _report_pairwise("Small vs Medium", small, medium)
t_stat_ml, p_ml, diff_ml = _report_pairwise("Medium vs Large", medium, large)
t_stat_sl, p_sl, diff_sl = _report_pairwise("Small vs Large", small, large)

## Effect Sizes (How big is the difference?)

In [None]:
import numpy as np

def cohens_d(group1, group2):
    """Calculate Cohen's d effect size for two independent samples.

    d = (mean1 - mean2) / s_pooled, where s_pooled is the square root of the
    size-weighted pooled sample variance:

        s_pooled^2 = ((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2)

    Sample variances use ddof=1. Inputs are converted with np.asarray, so
    lists, NumPy arrays and pandas Series all give the same result.

    Args:
        group1: 1-D array-like of numeric observations.
        group2: 1-D array-like of numeric observations.

    Returns:
        Cohen's d as a float; positive when group1 has the larger mean.

    Raises:
        ValueError: If either group has fewer than 2 observations, because the
            sample variance is undefined in that case.
    """
    group1 = np.asarray(group1, dtype=float)
    group2 = np.asarray(group2, dtype=float)
    n1, n2 = group1.size, group2.size
    if n1 < 2 or n2 < 2:
        raise ValueError("cohens_d requires at least 2 observations in each group")

    diff = group1.mean() - group2.mean()
    # Weight each variance by its degrees of freedom so unequal group sizes
    # are pooled correctly.
    pooled_var = (
        (n1 - 1) * group1.var(ddof=1) + (n2 - 1) * group2.var(ddof=1)
    ) / (n1 + n2 - 2)
    return diff / np.sqrt(pooled_var)

def _effect_label(d):
    """Classify |d| using the thresholds printed in the interpretation legend."""
    magnitude = abs(d)
    if magnitude >= 0.8:
        return "LARGE"
    if magnitude >= 0.5:
        return "MEDIUM"
    if magnitude >= 0.2:
        return "SMALL"
    return "NEGLIGIBLE"


def _report_effect(label, group_a, group_b, leading_newline=True):
    """Compute Cohen's d for two groups, print it with its label, and return d."""
    d = cohens_d(group_a, group_b)
    prefix = "\n" if leading_newline else ""
    print(f"{prefix}{label}: d = {d:.3f}")
    print(f"  → {_effect_label(d)} effect")
    return d


print("="*60)
print("EFFECT SIZES (Cohen's d)")
print("="*60)
print("\nInterpretation:")
print("  |d| < 0.2  = negligible")
print("  |d| < 0.5  = small")
print("  |d| < 0.8  = medium")
print("  |d| >= 0.8 = large")
print()

# The first entry follows the blank line printed above, so it needs no extra newline.
d_sm = _report_effect("Small vs Medium", small, medium, leading_newline=False)
d_ml = _report_effect("Medium vs Large", medium, large)
d_sl = _report_effect("Small vs Large", small, large)

## Visualization

In [None]:
import matplotlib.pyplot as plt

# Side-by-side figure: distribution (box plot) and mean ± SD (bar plot).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
size_labels = ['Small', 'Medium', 'Large']

# Box plot
ax1 = axes[0]
data_to_plot = [small, medium, large]
# Label the boxes via set_xticklabels rather than boxplot(labels=...): that
# keyword is deprecated in recent Matplotlib releases in favour of
# tick_labels, while set_xticklabels works on old and new versions alike.
ax1.boxplot(data_to_plot)
ax1.set_xticklabels(size_labels)
ax1.set_ylabel('Response Time (ms)')
ax1.set_xlabel('Prompt Size')
ax1.set_title('Distribution of Response Times by Prompt Size')
ax1.grid(axis='y', alpha=0.3)

# Bar plot with error bars
ax2 = axes[1]
means = [group.mean() for group in data_to_plot]
# ddof=1: sample standard deviation (NumPy's default ddof=0 is the population formula).
stds = [group.std(ddof=1) for group in data_to_plot]
x_pos = np.arange(len(means))
ax2.bar(x_pos, means, yerr=stds, capsize=5, alpha=0.7)
ax2.set_xticks(x_pos)
ax2.set_xticklabels(size_labels)
ax2.set_ylabel('Response Time (ms)')
ax2.set_xlabel('Prompt Size')
ax2.set_title('Mean Response Time with Standard Deviation')
ax2.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Summary and Recommendations

Based on the statistical analysis above:

In [None]:
# Final summary. Uses the results computed by the earlier cells:
#   p_value_anova      - ANOVA p-value
#   p_sm, p_ml, p_sl   - pairwise t-test p-values
#   d_sm, d_ml, d_sl   - Cohen's d effect sizes
print("=" * 60)
print("STATISTICAL CONCLUSION")
print("=" * 60)

if p_value_anova < 0.05:
    print("✓ The ANOVA test shows statistically significant differences")
    print(f"  between prompt sizes (p = {p_value_anova:.6f})")
    print()

    # Check which specific comparisons are significant
    pairwise_results = [
        ("Small vs Medium", p_sm, d_sm),
        ("Medium vs Large", p_ml, d_ml),
        ("Small vs Large", p_sl, d_sl),
    ]
    significant_pairs = [
        f"{label} (p={p_val:.6f}, d={effect:.2f})"
        for label, p_val, effect in pairwise_results
        if p_val < 0.05
    ]

    if significant_pairs:
        print("Significant pairwise differences:")
        for pair in significant_pairs:
            print(f"  • {pair}")

    print()
    print("CONCLUSION: The observed latency differences are NOT random.")
    print("Prompt size has a statistically significant effect on response time.")
else:
    print("✗ The ANOVA test does NOT show statistically significant differences")
    print(f"  (p = {p_value_anova:.4f})")
    print()
    print("CONCLUSION: The observed latency differences could be due to random chance.")
    # Report every group's size: the groups are filtered independently and
    # need not be equally large.
    print(
        f"Sample sizes: small={len(small)}, medium={len(medium)}, large={len(large)} "
        "(recommend n≥30 per group for conclusive results)"
    )

print("=" * 60)