# 04 - Statistical Significance Testing

Statistical inference for classification results using permutation tests and t-tests.

**Contents:**
1. Single-subject permutation testing
2. Group-level t-tests (one-sample vs. chance)
3. Multiple comparison correction (FDR, Bonferroni)
4. Cluster-based permutation for temporal decoding
5. Bootstrap confidence intervals
6. Publication-ready summary table

In [None]:
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.ndimage import label as scipy_label

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    cross_val_score,
    StratifiedKFold,
    permutation_test_score
)

# For multiple comparison correction
from statsmodels.stats.multitest import multipletests

---
## 1. Single-Subject Permutation Testing

Test if classification accuracy is significantly above chance by comparing to a null distribution.

In [None]:
# Build a toy two-class dataset for the permutation-test demo
n_samples, n_features = 120, 50

# Fix the global seed so the draws below are repeatable
np.random.seed(42)

# Gaussian noise features first, then random binary labels (draw order matters)
X = np.random.randn(n_samples, n_features)
y = np.random.randint(0, 2, n_samples)

# Inject the class effect: shift the first five features of class-1 samples
X[y.astype(bool), :5] += 0.8

print(
    f"Data: {X.shape}",
    f"Labels: {np.bincount(y)}",
    sep="\n",
)

In [None]:
def permutation_test(
    X, y,
    classifier='lda',
    n_permutations=1000,
    cv=5,
    n_jobs=-1,
    random_state=42
):
    """
    Assess classification significance with a label-permutation test.

    A standardize-then-classify pipeline is scored with stratified,
    shuffled k-fold cross-validation, and the same procedure is repeated
    by ``permutation_test_score`` on permuted labels to build the null.

    Parameters
    ----------
    X, y
        Feature matrix and labels, passed straight to scikit-learn.
    classifier : str
        'lda' (shrinkage LDA, lsqr solver) or 'svm' (linear-kernel SVC).
    n_permutations : int
        Number of label permutations.
    cv : int
        Number of stratified folds.
    n_jobs : int
        Parallel jobs forwarded to ``permutation_test_score``.
    random_state : int
        Seed shared by the fold shuffling and the permutations.

    Returns
    -------
    observed : float
        Observed accuracy
    p_value : float
        P-value from permutation test
    null_distribution : np.ndarray
        Null distribution of accuracies

    Raises
    ------
    ValueError
        If ``classifier`` is neither 'lda' nor 'svm'.
    """
    # Pick the final estimator; scaling is common to both options
    if classifier == 'lda':
        final_estimator = LDA(solver='lsqr', shrinkage='auto')
    elif classifier == 'svm':
        final_estimator = SVC(kernel='linear')
    else:
        raise ValueError(f"Unknown classifier: {classifier}")
    model = make_pipeline(StandardScaler(), final_estimator)

    # Shuffled stratified folds, seeded for repeatability
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)

    # scikit-learn returns (score, permutation_scores, pvalue)
    score, permutation_scores, pvalue = permutation_test_score(
        model,
        X,
        y,
        scoring='accuracy',
        cv=folds,
        n_permutations=n_permutations,
        n_jobs=n_jobs,
        random_state=random_state,
    )

    # Note the reordering: the p-value comes second in this function's output
    return score, pvalue, permutation_scores

In [None]:
# Score the LDA pipeline against its label-shuffled null distribution
observed, p_value, null_dist = permutation_test(
    X,
    y,
    classifier='lda',
    cv=5,
    n_permutations=500,  # Use 1000+ for publication
)

# Report the observed score alongside a summary of the null
print(
    "Permutation Test Results:",
    "=" * 40,
    f"Observed accuracy: {observed:.1%}",
    f"Null mean: {np.mean(null_dist):.1%}",
    f"Null std: {np.std(null_dist):.1%}",
    f"P-value: {p_value:.4f}",
    f"Significant (p<0.05): {p_value < 0.05}",
    sep="\n",
)

In [None]:
# Plot the permutation null distribution together with the observed score.
# Uses `null_dist`, `observed` and `p_value` from the permutation-test cell.
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of the accuracies obtained on permuted labels
ax.hist(null_dist, bins=50, color='gray', alpha=0.7, 
        edgecolor='black', label='Null distribution')

# Observed (unpermuted) accuracy
ax.axvline(observed, color='red', linewidth=3, linestyle='--',
           label=f'Observed: {observed:.1%}')

# Theoretical chance level for a binary problem
ax.axvline(0.5, color='black', linewidth=2, linestyle=':',
           label='Chance: 50%')

# 95th percentile of the null, shown as a visual reference line
threshold_95 = np.percentile(null_dist, 95)
ax.axvline(threshold_95, color='orange', linewidth=2, linestyle='-',
           label=f'95th percentile: {threshold_95:.1%}')

ax.set_xlabel('Accuracy', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title(f'Permutation Test (p = {p_value:.4f})', fontsize=14)
ax.legend(fontsize=10)

plt.tight_layout()
plt.show()

---
## 2. Group-Level T-Tests

Test if group mean accuracy is significantly above chance.

In [None]:
# Simulate per-subject accuracies for a group with a true effect
n_subjects = 20
np.random.seed(42)

# Chance (0.5) plus Gaussian spread plus a 0.1 offset above chance,
# then clipped to a realistic accuracy range
subject_accuracies = np.clip(
    0.5 + 0.15 * np.random.randn(n_subjects) + 0.1,
    0.3,
    0.9,
)

print(
    f"Subject accuracies: N={n_subjects}",
    f"Mean: {np.mean(subject_accuracies):.1%}",
    f"Std: {np.std(subject_accuracies):.1%}",
    f"Range: [{np.min(subject_accuracies):.1%}, {np.max(subject_accuracies):.1%}]",
    sep="\n",
)

In [None]:
def group_ttest(accuracies, chance_level=0.5, alternative='greater'):
    """
    One-sample t-test of subject accuracies against chance level.

    Parameters
    ----------
    accuracies : array-like
        Subject accuracies (one value per subject)
    chance_level : float
        Chance level (0.5 for binary)
    alternative : str
        'two-sided', 'greater', or 'less'

    Returns
    -------
    dict
        Keys: 'mean', 'std' (sample SD, ddof=1), 'n', 't_stat', 'p_value'
        (for the requested alternative), 'df', 'cohens_d', and 'ci_95'
        (two-sided 95% t-interval for the mean, whatever `alternative` is).

    Raises
    ------
    ValueError
        If `alternative` is not one of the three supported values.
    """
    # Reject typos explicitly: an unrecognised value would otherwise fall
    # through and silently report the two-sided p-value.
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(
            "alternative must be 'two-sided', 'greater', or 'less', "
            f"got {alternative!r}"
        )

    accuracies = np.asarray(accuracies, dtype=float)
    n = len(accuracies)
    mean = np.mean(accuracies)
    std = np.std(accuracies, ddof=1)

    # Two-sided t-test; converted to one-tailed below when requested
    t_stat, p_value = stats.ttest_1samp(accuracies, chance_level)

    # Halve the two-sided p-value when the effect lies in the tested
    # direction; otherwise take the complementary tail.
    if alternative == 'greater':
        p_value = p_value / 2 if t_stat > 0 else 1 - p_value / 2
    elif alternative == 'less':
        p_value = p_value / 2 if t_stat < 0 else 1 - p_value / 2

    # Effect size (Cohen's d): distance from chance in sample-SD units
    cohens_d = (mean - chance_level) / std

    # Two-sided 95% confidence interval for the mean
    se = std / np.sqrt(n)
    ci_95 = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)

    return {
        'mean': mean,
        'std': std,
        'n': n,
        't_stat': t_stat,
        'p_value': p_value,
        'df': n - 1,
        'cohens_d': cohens_d,
        'ci_95': ci_95
    }

In [None]:
# One-tailed group test: is mean accuracy above the 50% chance level?
result = group_ttest(
    subject_accuracies,
    chance_level=0.5,
    alternative='greater',
)

print(
    "Group-Level T-Test (vs. 50% chance):",
    "=" * 50,
    f"Mean accuracy: {result['mean']:.1%} ± {result['std']:.1%}",
    f"t({result['df']}) = {result['t_stat']:.3f}",
    f"p-value (one-tailed): {result['p_value']:.4f}",
    f"Cohen's d: {result['cohens_d']:.2f}",
    f"95% CI: [{result['ci_95'][0]:.1%}, {result['ci_95'][1]:.1%}]",
    f"\nSignificant (p<0.05): {result['p_value'] < 0.05}",
    sep="\n",
)

In [None]:
# Rank-based counterpart of the t-test: Wilcoxon signed-rank on the
# chance-centred accuracies, one-tailed (above chance)
w_stat, w_pvalue = stats.wilcoxon(
    subject_accuracies - 0.5,
    alternative='greater',
)

print(
    "\nWilcoxon Signed-Rank Test (non-parametric):",
    f"W = {w_stat:.1f}",
    f"p-value: {w_pvalue:.4f}",
    f"Significant (p<0.05): {w_pvalue < 0.05}",
    sep="\n",
)

In [None]:
# Visualization of the group-level result (uses `subject_accuracies` and
# `result` from the cells above)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Take the subject count from the data itself. Later sections reassign the
# global `n_subjects` (to 15), so relying on it makes this cell fail with a
# shape mismatch when it is re-run after those sections.
n_plotted = len(subject_accuracies)

# Bar plot with individual subjects (1-based subject numbers on the x-axis)
ax = axes[0]
x = np.arange(n_plotted) + 1
ax.bar(x, subject_accuracies, color='steelblue', edgecolor='black')
ax.axhline(0.5, color='red', linestyle='--', linewidth=2, label='Chance')
ax.axhline(result['mean'], color='green', linestyle='-', linewidth=2,
           label=f"Mean: {result['mean']:.1%}")
ax.set_xlabel('Subject', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Per-Subject Classification Accuracy', fontsize=14)
ax.legend()

# Box plot with the individual subjects overlaid at x = 1
ax = axes[1]
bp = ax.boxplot(subject_accuracies, vert=True, patch_artist=True)
bp['boxes'][0].set_facecolor('steelblue')
ax.scatter(np.ones(n_plotted), subject_accuracies, alpha=0.6, color='black', s=50)
ax.axhline(0.5, color='red', linestyle='--', linewidth=2, label='Chance')

# Add stats annotation (t, p, Cohen's d) next to the box
stats_text = f"t({result['df']}) = {result['t_stat']:.2f}\np = {result['p_value']:.4f}\nd = {result['cohens_d']:.2f}"
ax.text(1.3, result['mean'], stats_text, fontsize=11, verticalalignment='center',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Group Distribution', fontsize=14)
ax.set_xticks([1])
ax.set_xticklabels(['All Subjects'])

plt.tight_layout()
plt.show()

---
## 3. Multiple Comparison Correction

When testing multiple ROIs, correct for multiple comparisons.

In [None]:
# Simulate ROI-level results (426 ROIs, like HCP atlas)
n_rois = 426
np.random.seed(42)

# Latent per-ROI accuracy: noise around chance for most ROIs ...
roi_accuracies = 0.5 + 0.03 * np.random.randn(n_rois)

# ... with a fixed boost for a handful of true-signal ROIs
signal_rois = [10, 25, 50, 100, 150, 200, 250, 300]
roi_accuracies[signal_rois] += 0.15

# Per-subject observations (for t-tests): each ROI's latent accuracy plus
# subject-level noise, drawn ROI by ROI so the random stream order is fixed
n_subjects = 15
roi_subject_accs = np.array([
    roi_mean + 0.08 * np.random.randn(n_subjects)
    for roi_mean in roi_accuracies
])

print(
    f"Testing {n_rois} ROIs",
    f"Signal ROIs: {signal_rois}",
    sep="\n",
)

In [None]:
def roi_significance_testing(roi_subject_accs, chance_level=0.5, alpha=0.05):
    """
    Run a one-tailed (above chance) one-sample t-test per ROI, then correct
    the p-values for multiple comparisons.

    Parameters
    ----------
    roi_subject_accs : np.ndarray
        Accuracy matrix indexed as [roi, subject]; each row is tested.
    chance_level : float
        Value every ROI is tested against.
    alpha : float
        Significance level, applied to raw and adjusted p-values alike.

    Returns
    -------
    dict
        Raw p-values and t-statistics, Bonferroni- and FDR (BH)-adjusted
        p-values, boolean significance masks for each variant, and the
        number of significant ROIs for each variant.
    """
    def _one_tailed_test(row):
        # Convert scipy's two-sided p-value to the 'greater' tail
        t, p = stats.ttest_1samp(row, chance_level)
        return t, (p / 2 if t > 0 else 1 - p / 2)

    roi_count = roi_subject_accs.shape[0]
    per_roi = [_one_tailed_test(roi_subject_accs[r]) for r in range(roi_count)]

    t_stats = np.array([t for t, _ in per_roi])
    p_values = np.array([p for _, p in per_roi])

    # Adjusted p-values are the second element of multipletests' output
    p_bonf = multipletests(p_values, alpha=alpha, method='bonferroni')[1]
    p_fdr = multipletests(p_values, alpha=alpha, method='fdr_bh')[1]

    # Significance masks: strict comparison against alpha in every case
    sig_uncorr = p_values < alpha
    sig_bonf = p_bonf < alpha
    sig_fdr = p_fdr < alpha

    return {
        'p_values': p_values,
        't_stats': t_stats,
        'p_bonferroni': p_bonf,
        'p_fdr': p_fdr,
        'sig_uncorrected': sig_uncorr,
        'sig_bonferroni': sig_bonf,
        'sig_fdr': sig_fdr,
        'n_sig_uncorrected': np.sum(sig_uncorr),
        'n_sig_bonferroni': np.sum(sig_bonf),
        'n_sig_fdr': np.sum(sig_fdr)
    }

In [None]:
# Test every ROI against chance and apply both correction schemes
mcc_results = roi_significance_testing(
    roi_subject_accs,
    chance_level=0.5,
    alpha=0.05,
)

print(
    "Multiple Comparison Correction Results:",
    "=" * 50,
    f"Total ROIs tested: {n_rois}",
    f"\nSignificant ROIs:",
    f"  Uncorrected (p<0.05): {mcc_results['n_sig_uncorrected']}",
    f"  Bonferroni corrected: {mcc_results['n_sig_bonferroni']}",
    f"  FDR corrected (BH):   {mcc_results['n_sig_fdr']}",
    sep="\n",
)

# Compare the FDR detections with the ROIs that were given a true effect
detected_fdr = np.flatnonzero(mcc_results['sig_fdr'])
print(
    f"\nTrue signal ROIs: {signal_rois}",
    f"Detected (FDR): {list(detected_fdr)}",
    sep="\n",
)

In [None]:
# Visualization of the multiple-comparison results in three panels.
# Uses `mcc_results`, `roi_subject_accs` and `n_rois` from the cells above.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# Panel 1: histogram of the uncorrected p-values with the alpha cut-off
ax = axes[0]
ax.hist(mcc_results['p_values'], bins=50, color='steelblue', edgecolor='black')
ax.axvline(0.05, color='red', linestyle='--', linewidth=2, label='α = 0.05')
ax.set_xlabel('P-value (uncorrected)', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('P-value Distribution', fontsize=14)
ax.legend()

# Panel 2: sorted p-values against the Benjamini-Hochberg line
ax = axes[1]
sorted_idx = np.argsort(mcc_results['p_values'])
sorted_p = mcc_results['p_values'][sorted_idx]
ranks = np.arange(1, n_rois + 1)
fdr_threshold = 0.05 * ranks / n_rois  # BH threshold line: alpha * rank / m

# Only the 100 smallest p-values are drawn
ax.plot(ranks[:100], sorted_p[:100], 'b-', linewidth=2, label='Sorted p-values')
ax.plot(ranks[:100], fdr_threshold[:100], 'r--', linewidth=2, label='FDR threshold')
ax.set_xlabel('Rank', fontsize=12)
ax.set_ylabel('P-value', fontsize=12)
ax.set_title('FDR (Benjamini-Hochberg)', fontsize=14)
ax.legend()

# Panel 3: mean accuracy per ROI, drawn in gray for all ROIs first
ax = axes[2]
x = np.arange(n_rois)
ax.bar(x, roi_subject_accs.mean(axis=1), color='lightgray', width=1)
# Overdraw the FDR-significant ROIs in red
sig_mask = mcc_results['sig_fdr']
ax.bar(x[sig_mask], roi_subject_accs.mean(axis=1)[sig_mask], 
       color='red', width=1, label=f'FDR sig (n={np.sum(sig_mask)})')
ax.axhline(0.5, color='black', linestyle='--', linewidth=1)  # chance level
ax.set_xlabel('ROI Index', fontsize=12)
ax.set_ylabel('Mean Accuracy', fontsize=12)
ax.set_title('ROI Accuracies (FDR significant in red)', fontsize=14)
ax.legend()

plt.tight_layout()
plt.show()

---
## 4. Cluster-Based Permutation for Temporal Decoding

Test significance of temporal decoding while correcting for multiple time points.

In [None]:
# Simulate temporal decoding results: a subject x time accuracy matrix
n_subjects, n_times = 15, 100
times = np.linspace(-0.2, 0.8, n_times)

# Baseline: small noise around chance at every time point
np.random.seed(42)
temporal_accs = 0.5 + 0.02 * np.random.randn(n_subjects, n_times)

# Above-chance decoding confined to the REWP window (240-340ms),
# with its own subject-by-sample jitter
rewp_mask = np.logical_and(times >= 0.24, times <= 0.34)
rewp_jitter = np.random.randn(n_subjects, np.count_nonzero(rewp_mask))
temporal_accs[:, rewp_mask] += 0.12 + 0.03 * rewp_jitter

print(f"Temporal data: {n_subjects} subjects x {n_times} time points")

In [None]:
def _find_clusters(t_values, t_threshold, tail):
    """Return one boolean mask per run of contiguous supra-threshold samples."""
    if tail == 1:
        candidate_masks = [t_values > t_threshold]
    elif tail == -1:
        candidate_masks = [t_values < -t_threshold]
    else:
        # Two-sided: label positive and negative excursions separately so a
        # cluster never mixes signs (opposite t-values would cancel in the sum).
        candidate_masks = [t_values > t_threshold, t_values < -t_threshold]

    clusters = []
    for mask in candidate_masks:
        labeled, n_found = scipy_label(mask)
        clusters.extend(labeled == k for k in range(1, n_found + 1))

    # Report clusters in temporal order regardless of their sign
    clusters.sort(key=lambda m: int(np.argmax(m)))
    return clusters


def cluster_permutation_test(
    subject_scores,
    times,
    chance_level=0.5,
    n_permutations=1000,
    cluster_alpha=0.05,
    tail=1,  # 1 for greater, -1 for less, 0 for two-sided
    random_state=None,
    alpha=0.05
):
    """
    Cluster-based permutation test for temporal decoding.

    Time points whose one-sample t-value (vs. chance) exceeds a threshold
    are grouped into contiguous clusters, each summarised by the sum of its
    t-values. The null distribution is the maximum absolute cluster sum
    obtained after randomly flipping the sign of each subject's
    chance-centred time course.

    Parameters
    ----------
    subject_scores : np.ndarray
        (n_subjects, n_times) accuracy matrix
    times : np.ndarray
        Time vector (length n_times)
    chance_level : float
        Chance level
    n_permutations : int
        Number of sign-flip permutations
    cluster_alpha : float
        Pointwise alpha for cluster formation; split across both tails
        when `tail` is 0
    tail : int
        1 (greater), -1 (less), 0 (two-sided)
    random_state : int or None
        Seed for a private random generator. None (default) draws from
        NumPy's global random state.
    alpha : float
        Significance level applied to the cluster p-values

    Returns
    -------
    dict
        't_stats' (observed t-value per time point), 't_threshold',
        'clusters' (list of dicts with 'cluster_id', 'start', 'end',
        't_sum', 'indices', 'p_value', 'significant'), 'n_clusters', and
        'max_cluster_dist' (null distribution, one value per permutation).

    Raises
    ------
    ValueError
        If `tail` is not 1, -1 or 0.
    """
    if tail not in (-1, 0, 1):
        raise ValueError(f"tail must be 1, -1 or 0, got {tail!r}")

    subject_scores = np.asarray(subject_scores, dtype=float)
    times = np.asarray(times)
    n_subjects, _ = subject_scores.shape

    # T-test at each time point
    t_stats, _ = stats.ttest_1samp(subject_scores, chance_level, axis=0)

    # Threshold for cluster formation; a two-sided test splits
    # cluster_alpha between the tails.
    tail_alpha = cluster_alpha / 2 if tail == 0 else cluster_alpha
    t_threshold = stats.t.ppf(1 - tail_alpha, df=n_subjects - 1)

    # Observed clusters and their statistics (sum of t-values in cluster)
    cluster_info = []
    observed_masks = _find_clusters(t_stats, t_threshold, tail)
    for cluster_id, cluster_indices in enumerate(observed_masks, start=1):
        cluster_times = times[cluster_indices]
        cluster_info.append({
            'cluster_id': cluster_id,
            'start': cluster_times[0],
            'end': cluster_times[-1],
            't_sum': np.sum(t_stats[cluster_indices]),
            'indices': cluster_indices
        })
    n_clusters = len(cluster_info)

    # Permutation testing
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    centered = subject_scores - chance_level  # loop-invariant, computed once
    max_cluster_stats = np.zeros(n_permutations)  # stays 0 when no cluster forms

    for perm in range(n_permutations):
        # Randomly flip each subject's sign around chance: under the null,
        # deviations from chance are symmetric about zero.
        signs = rng.choice([-1, 1], size=n_subjects)
        perm_scores = centered * signs[:, np.newaxis] + chance_level

        perm_t, _ = stats.ttest_1samp(perm_scores, chance_level, axis=0)

        perm_cluster_stats = [
            np.sum(perm_t[mask])
            for mask in _find_clusters(perm_t, t_threshold, tail)
        ]
        if perm_cluster_stats:
            max_cluster_stats[perm] = np.max(np.abs(perm_cluster_stats))

    # Cluster p-values. Adding 1 to both counts treats the observed data as
    # one member of the null, so a p-value can never be exactly zero.
    for info in cluster_info:
        n_extreme = np.sum(max_cluster_stats >= np.abs(info['t_sum']))
        p = (n_extreme + 1) / (n_permutations + 1)
        info['p_value'] = p
        info['significant'] = p < alpha

    return {
        't_stats': t_stats,
        't_threshold': t_threshold,
        'clusters': cluster_info,
        'n_clusters': n_clusters,
        'max_cluster_dist': max_cluster_stats
    }

In [None]:
# Cluster-corrected test of the simulated time-resolved accuracies
cluster_results = cluster_permutation_test(
    temporal_accs,
    times,
    chance_level=0.5,
    cluster_alpha=0.05,
    n_permutations=500,  # Use 1000+ for publication
)

print(
    "Cluster-Based Permutation Test:",
    "=" * 50,
    f"T-threshold: {cluster_results['t_threshold']:.3f}",
    f"Number of clusters found: {cluster_results['n_clusters']}",
    sep="\n",
)
print()

# One three-line entry per cluster; significant clusters get a trailing '*'
for cluster in cluster_results['clusters']:
    sig_str = "*" if cluster['significant'] else ""
    print(
        f"Cluster {cluster['cluster_id']}: {cluster['start']*1000:.0f}-{cluster['end']*1000:.0f}ms",
        f"  t-sum = {cluster['t_sum']:.2f}",
        f"  p = {cluster['p_value']:.4f} {sig_str}",
        sep="\n",
    )

In [None]:
# Visualization of the temporal decoding result (uses `temporal_accs`,
# `times` and `cluster_results` from the cells above)
fig, axes = plt.subplots(2, 1, figsize=(12, 8))

# Mean temporal decoding with a +/- 1 SEM band
ax = axes[0]
mean_scores = temporal_accs.mean(axis=0)
# Standard error of the mean: sample SD (ddof=1), matching the t-tests, and
# the subject count taken from the matrix rather than the mutable global
# `n_subjects`, which other sections reassign.
sem = temporal_accs.std(axis=0, ddof=1) / np.sqrt(temporal_accs.shape[0])

ax.plot(times * 1000, mean_scores, 'b-', linewidth=2, label='Mean accuracy')
ax.fill_between(times * 1000, mean_scores - sem, mean_scores + sem, alpha=0.3)
ax.axhline(0.5, color='gray', linestyle='--', linewidth=1, label='Chance')
ax.axvline(0, color='black', linestyle='-', linewidth=0.5)  # time zero

# Highlight significant clusters
for cluster in cluster_results['clusters']:
    if cluster['significant']:
        ax.axvspan(cluster['start']*1000, cluster['end']*1000,
                   alpha=0.3, color='green', label=f"p={cluster['p_value']:.3f}")

ax.set_xlabel('Time (ms)', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Temporal Decoding with Cluster Correction', fontsize=14)
ax.legend()

# T-statistics with the cluster-forming threshold drawn at +/- t
ax = axes[1]
ax.plot(times * 1000, cluster_results['t_stats'], 'b-', linewidth=2)
ax.axhline(cluster_results['t_threshold'], color='red', linestyle='--',
           linewidth=1, label=f"Threshold: t={cluster_results['t_threshold']:.2f}")
ax.axhline(-cluster_results['t_threshold'], color='red', linestyle='--', linewidth=1)
ax.axhline(0, color='gray', linestyle='-', linewidth=0.5)
ax.axvline(0, color='black', linestyle='-', linewidth=0.5)

ax.set_xlabel('Time (ms)', fontsize=12)
ax.set_ylabel('t-statistic', fontsize=12)
ax.set_title('T-statistics Across Time', fontsize=14)
ax.legend()

plt.tight_layout()
plt.show()

---
## 5. Bootstrap Confidence Intervals

Estimate uncertainty in accuracy estimates using bootstrapping.

In [None]:
def bootstrap_ci(accuracies, n_bootstrap=10000, ci=95, random_state=42):
    """
    Compute a percentile bootstrap confidence interval for mean accuracy.

    Parameters
    ----------
    accuracies : array-like
        Subject accuracies (1-D)
    n_bootstrap : int
        Number of bootstrap samples
    ci : float
        Confidence level in percent (e.g., 95); must lie in (0, 100)
    random_state : int or None
        Seed for a private random generator; NumPy's global random state
        is left untouched.

    Returns
    -------
    dict
        'mean' (mean of the input), 'ci_lower', 'ci_upper', 'ci', and
        'boot_distribution' (array of the n_bootstrap resampled means).

    Raises
    ------
    ValueError
        If `accuracies` is empty or `ci` is outside (0, 100).
    """
    accuracies = np.asarray(accuracies)
    n = len(accuracies)
    if n == 0:
        raise ValueError("accuracies must contain at least one value")
    if not 0 < ci < 100:
        raise ValueError(f"ci must be between 0 and 100 (exclusive), got {ci!r}")

    # Private generator: yields the same draws as seeding the global RNG
    # with `random_state`, without reseeding it for the rest of the session.
    rng = np.random.RandomState(random_state)

    # Bootstrap resampling: mean of n draws with replacement, repeated
    boot_means = np.array([
        np.mean(rng.choice(accuracies, size=n, replace=True))
        for _ in range(n_bootstrap)
    ])

    # Percentile method: cut (100 - ci) / 2 percent from each tail
    tail_pct = (100 - ci) / 2
    ci_lower = np.percentile(boot_means, tail_pct)
    ci_upper = np.percentile(boot_means, 100 - tail_pct)

    return {
        'mean': np.mean(accuracies),
        'ci_lower': ci_lower,
        'ci_upper': ci_upper,
        'ci': ci,
        'boot_distribution': boot_means
    }

In [None]:
# Percentile bootstrap interval for the group mean accuracy
boot_result = bootstrap_ci(
    subject_accuracies,
    ci=95,
    n_bootstrap=10000,
)

print(
    "Bootstrap Confidence Interval:",
    "=" * 40,
    f"Mean accuracy: {boot_result['mean']:.1%}",
    f"95% CI: [{boot_result['ci_lower']:.1%}, {boot_result['ci_upper']:.1%}]",
    sep="\n",
)

# The interval lies wholly above chance when its lower bound exceeds 50%
excludes_chance = boot_result['ci_lower'] > 0.5
print(f"\n95% CI excludes chance (50%): {excludes_chance}")

In [None]:
# Visualization of the bootstrap distribution of the mean.
# Uses `boot_result` from the bootstrap cell above.
fig, ax = plt.subplots(figsize=(10, 6))

# Histogram of the resampled means
ax.hist(boot_result['boot_distribution'], bins=50, color='steelblue', 
        alpha=0.7, edgecolor='black')
# Mean of the original sample
ax.axvline(boot_result['mean'], color='green', linewidth=3, 
           label=f"Mean: {boot_result['mean']:.1%}")
# CI bounds; only the lower line carries a label so the legend lists the interval once
ax.axvline(boot_result['ci_lower'], color='orange', linewidth=2, linestyle='--',
           label=f"95% CI: [{boot_result['ci_lower']:.1%}, {boot_result['ci_upper']:.1%}]")
ax.axvline(boot_result['ci_upper'], color='orange', linewidth=2, linestyle='--')
# Chance level for reference
ax.axvline(0.5, color='red', linewidth=2, linestyle=':', label='Chance')

ax.set_xlabel('Mean Accuracy', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Bootstrap Distribution', fontsize=14)
ax.legend(fontsize=10)

plt.tight_layout()
plt.show()

---
## 6. Publication-Ready Summary Table

In [None]:
def create_stats_summary(accuracies, chance_level=0.5):
    """
    Print a publication-style statistical report for a set of accuracies.

    Runs the one-tailed t-test (`group_ttest`), a one-tailed Wilcoxon
    signed-rank test on the chance-centred values, and the bootstrap
    interval (`bootstrap_ci`, default settings), then prints descriptive
    statistics, both test results and an APA-style one-liner.
    Nothing is returned.
    """
    rule = "=" * 60

    # Run the three analyses first; everything below is formatting
    ttest_result = group_ttest(accuracies, chance_level, alternative='greater')
    w_stat, w_p = stats.wilcoxon(accuracies - chance_level, alternative='greater')
    boot_result = bootstrap_ci(accuracies)

    # Header
    print(rule)
    print("STATISTICAL SUMMARY")
    print(rule)

    # Descriptives (the 95% CI shown here is the bootstrap interval)
    print(f"\nDescriptive Statistics (N={len(accuracies)}):")
    print(f"  Mean ± SD: {ttest_result['mean']:.1%} ± {ttest_result['std']:.1%}")
    print(f"  Range: [{np.min(accuracies):.1%}, {np.max(accuracies):.1%}]")
    print(f"  95% CI: [{boot_result['ci_lower']:.1%}, {boot_result['ci_upper']:.1%}]")

    # Parametric test
    print(f"\nParametric Test (One-sample t-test vs {chance_level:.0%}):")
    print(f"  t({ttest_result['df']}) = {ttest_result['t_stat']:.3f}, p = {ttest_result['p_value']:.4f}")
    print(f"  Cohen's d = {ttest_result['cohens_d']:.2f}")

    # Non-parametric test
    print(f"\nNon-parametric Test (Wilcoxon signed-rank):")
    print(f"  W = {w_stat:.1f}, p = {w_p:.4f}")

    print("\n" + rule)

    # APA-style string, assembled from comma-separated fields
    apa_string = ", ".join([
        f"M = {ttest_result['mean']:.1%}",
        f"SD = {ttest_result['std']:.1%}",
        f"t({ttest_result['df']}) = {ttest_result['t_stat']:.2f}",
        f"p = {ttest_result['p_value']:.3f}",
        f"d = {ttest_result['cohens_d']:.2f}",
    ])
    print(f"\nAPA-style: {apa_string}")

# Print the full report for the simulated subject accuracies from Section 2
# (the function prints its output and returns nothing)
create_stats_summary(subject_accuracies)

---
## Summary

This notebook covered:

1. **Single-subject permutation testing**: Compare observed accuracy to null distribution
2. **Group-level t-tests**: One-sample t-test vs. chance with effect size
3. **Multiple comparison correction**: Bonferroni and FDR for many ROIs
4. **Cluster-based permutation**: Correct for multiple time points
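5. **Bootstrap confidence intervals**: Non-parametric uncertainty estimation
6. **Publication-ready summary**: Combined parametric, non-parametric, and bootstrap report with an APA-style string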

### Key Functions:
- `permutation_test()`: Single-subject significance
- `group_ttest()`: Group-level inference
- `roi_significance_testing()`: Per-ROI t-tests with FDR/Bonferroni correction (via statsmodels `multipletests()`)
- `cluster_permutation_test()`: Temporal cluster correction
- `bootstrap_ci()`: Confidence intervals
- `create_stats_summary()`: Printed summary of all group-level statistics

### Recommended Usage:
- **Single subject**: Permutation test (≥1000 permutations)
- **Group level**: One-sample t-test + Cohen's d
- **Many ROIs**: FDR correction (less conservative than Bonferroni)
- **Temporal data**: Cluster-based permutation