## Setup

In [None]:
# Check GPU
# Runs the `nvidia-smi` command-line tool in a subshell so its status report
# appears in the cell output; use it to confirm a GPU is attached to the runtime.
!nvidia-smi

In [None]:
# Clone repository (if not already cloned)
import os
if not os.path.exists('basin-compression-analysis'):
    !git clone https://github.com/jacobposchl/basin-compression-analysis.git
    os.chdir('basin-compression-analysis')
else:
    os.chdir('basin-compression-analysis')
    !git pull

!pwd

In [None]:
# Install dependencies
# 1) the packages listed in the repository's requirements.txt (-q keeps pip quiet)
# 2) the repository itself in editable mode (-e .) from the current directory;
#    presumably this provides the `compression_lm` package imported further
#    down - confirm against the repository's packaging files
!pip install -q -r requirements.txt
!pip install -e .

## Configuration

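Adjust the parameters below based on your computational budget. (In the configuration cell, `epoch_schedule` is written as a comma-separated string, e.g. `"1,5,20"`.)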

| Parameter | Quick Test | Standard | Deep Analysis |
|-----------|------------|----------|---------------|
| `num_passages` | 50 | 100 | 200 |
| `epoch_schedule` | [1, 5, 20] | [1, 3, 5, 10, 20, 30, 50, 100] | [1, 3, 5, 7, 10, 15, 20, 30, 40, 50, 75, 100, 150, 200] |
| Estimated time (on an A100) | ~1 hour | ~5-6 hours | ~12-15 hours |

In [None]:
# CONFIGURATION
# Choose your experiment scale by setting EXPERIMENT_MODE to one of the preset
# names below. The presets live in a single table so that the mode label
# (which also names the output directory) cannot drift out of sync with the
# parameters that are actually used.

EXPERIMENT_MODE = "standard"  # "quick" | "standard" (RECOMMENDED) | "deep"

EXPERIMENT_PRESETS = {
    # Quick test (1 hour on A100)
    "quick": {
        "num_passages": 50,
        "epoch_schedule": "1,5,20",
    },
    # Standard experiment (5-6 hours on A100) - RECOMMENDED
    "standard": {
        "num_passages": 100,
        "epoch_schedule": "1,3,5,10,20,30,50,100",
    },
    # Deep analysis (12-15 hours on A100)
    "deep": {
        "num_passages": 200,
        "epoch_schedule": "1,3,5,7,10,15,20,30,40,50,75,100,150,200",
    },
}

if EXPERIMENT_MODE not in EXPERIMENT_PRESETS:
    raise ValueError(
        f"Unknown EXPERIMENT_MODE {EXPERIMENT_MODE!r}; "
        f"choose one of {sorted(EXPERIMENT_PRESETS)}"
    )

# Override either value after these two lines if you need a custom setup.
num_passages = EXPERIMENT_PRESETS[EXPERIMENT_MODE]["num_passages"]
# Comma-separated string: it is substituted verbatim into --epoch_schedule.
epoch_schedule = EXPERIMENT_PRESETS[EXPERIMENT_MODE]["epoch_schedule"]

# Other parameters
model_name = "gpt2"
learning_rate = 5e-5
batch_size = 4
output_dir = f"dynamics_results_{EXPERIMENT_MODE}"

print(f"Experiment Mode: {EXPERIMENT_MODE}")
print(f"Number of passages: {num_passages}")
print(f"Epoch schedule: {epoch_schedule}")
print(f"Output directory: {output_dir}")

## Run Experiment

This will train the model at each epoch count and compute compression + memorization metrics.

In [None]:
# Run the full experiment
# The {name} placeholders are expanded by IPython from the notebook's Python
# variables (model_name, num_passages, epoch_schedule, learning_rate,
# batch_size, output_dir) before the command is handed to the shell.
# --k_neighbors is the only value hard-coded here rather than taken from a variable.
!python scripts/run_training_dynamics.py \
    --model {model_name} \
    --num_passages {num_passages} \
    --epoch_schedule {epoch_schedule} \
    --learning_rate {learning_rate} \
    --batch_size {batch_size} \
    --output_dir {output_dir} \
    --k_neighbors 15

## Quick Progress Check

Check the summary CSV to see progress:

In [None]:
import pandas as pd
from pathlib import Path

# Summary CSV expected inside the experiment's output directory.
summary_file = Path(output_dir) / 'training_summary.csv'

if not summary_file.exists():
    print(f"Summary file not found: {summary_file}")
    print("The experiment may not have started yet.")
else:
    df = pd.read_csv(summary_file)
    rule = "=" * 70
    print("\n" + rule)
    print("PROGRESS SUMMARY")
    print(rule)
    print(df.to_string(index=False))
    print("\n")

    # Plot memorization progress
    import matplotlib.pyplot as plt

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # One entry per panel, left to right:
    # (CSV column, transform applied to it, line format, extra plot kwargs,
    #  y-axis label, panel title)
    panel_specs = (
        (
            'memorization_rate',
            lambda col: col * 100,  # fraction -> percent
            'o-',
            {},
            'Memorization Rate (%)',
            'Memorization Rate vs. Training Epochs',
        ),
        (
            'best_correlation',
            lambda col: col.abs(),  # magnitude only, sign is dropped
            's-',
            {'color': 'orange'},
            'Absolute Correlation',
            'Compression-Memorization Correlation vs. Training Epochs',
        ),
    )

    for ax, (column, transform, line_format, extra_style, y_label, panel_title) in zip(axes, panel_specs):
        ax.plot(df['epoch'], transform(df[column]), line_format,
                linewidth=2, markersize=8, **extra_style)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.set_title(panel_title)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Generate Visualizations

Once the experiment completes, generate all analysis figures:

In [None]:
# Generate all visualizations
# Points the visualization script at training_dynamics_results.pkl inside the
# experiment's output directory and tells it to write into that same directory
# ({output_dir} is expanded by IPython from the notebook variable).
# Presumably this produces the figures/ folder and summary_report.txt that the
# cells below display - confirm against the script.
!python scripts/visualize_training_dynamics.py \
    --results_file {output_dir}/training_dynamics_results.pkl \
    --output_dir {output_dir}

## Display Results

In [None]:
# Display the summary report
# Path is imported here so this cell works even when the optional
# "Quick Progress Check" cell (the only other place it is imported before
# this point) was skipped or the kernel was restarted.
from pathlib import Path

report_file = Path(output_dir) / 'summary_report.txt'

if report_file.exists():
    with open(report_file, 'r') as f:
        print(f.read())
else:
    print(f"Report not found: {report_file}")

In [None]:
# Display visualizations
# Path is imported locally so the cell does not depend on an earlier,
# optional cell having been run.
from pathlib import Path

from IPython.display import Image, display
import matplotlib.pyplot as plt

figures_dir = Path(output_dir) / 'figures'

if figures_dir.exists():
    # File names this cell looks for inside the figures directory.
    figure_files = [
        'compression_trajectories.png',
        'memorization_vs_compression.png',
        'layer_epoch_heatmap.png',
        'individual_trajectories.png',
        'memorization_rate.png',
    ]

    for fig_name in figure_files:
        fig_path = figures_dir / fig_name
        if fig_path.exists():
            # Banner with a human-readable caption derived from the file name
            print(f"\n{'='*70}")
            print(f"  {fig_name.replace('_', ' ').title()}")
            print('='*70)
            display(Image(filename=str(fig_path)))
        else:
            # A missing figure is reported but does not stop the remaining ones
            print(f"Figure not found: {fig_path}")
else:
    print(f"Figures directory not found: {figures_dir}")

## Advanced Analysis

Load the results and perform custom analyses:

In [None]:
import pickle
from pathlib import Path  # imported here so the cell does not rely on earlier optional cells

import numpy as np
import matplotlib.pyplot as plt
from compression_lm.analysis.training_dynamics import (
    analyze_u_shape_trajectory,
    analyze_memorization_onset,
    analyze_layer_temporal_patterns,
    compute_compression_velocity
)

# Load results
results_file = Path(output_dir) / 'training_dynamics_results.pkl'

if results_file.exists():
    # NOTE: unpickling executes arbitrary code embedded in the file. Only load
    # result files produced by your own run of the experiment.
    with open(results_file, 'rb') as f:
        data = pickle.load(f)

    # The analysis cells below index all_results by the values in `epochs`.
    all_results = data['all_results']
    epochs = sorted(all_results.keys())

    print(f"Loaded results for {len(epochs)} checkpoints")
    print(f"Epochs: {epochs}")
else:
    print(f"Results file not found: {results_file}")
    # Define both names so later cells can test them without a NameError;
    # they guard on `if all_results:` and therefore skip their analysis.
    all_results = None
    epochs = []

In [None]:
# Analyze U-shape for each layer
# Skipped entirely when no results were loaded (all_results is None or empty).
if all_results:
    print("\nU-SHAPE ANALYSIS BY LAYER")
    print("="*70)

    # Layer count is taken from the first checkpoint's per-layer analyses.
    num_layers = len(all_results[epochs[0]]['layer_analyses'])

    for layer_idx in range(num_layers):
        u_analysis = analyze_u_shape_trajectory(all_results, layer_idx)

        print(f"\nLayer {layer_idx}:")
        print(f"  Shape: {u_analysis['shape']}")
        print(f"  R²: {u_analysis['r_squared']:.4f}")
        # Compare against None explicitly: a plain truthiness test would also
        # hide a legitimate vertex located at exactly epoch 0.
        # NOTE(review): assumes the helper reports "no vertex" as None - confirm.
        if u_analysis['vertex_epoch'] is not None:
            print(f"  Vertex at epoch: {u_analysis['vertex_epoch']:.1f}")
        print(f"  Final compression difference: {u_analysis['final_diff']:.4f}")
        print(f"  Trend: {u_analysis['trend']}")

In [None]:
# Analyze memorization onset
# Skipped entirely when no results were loaded (all_results is None or empty).
if all_results:
    print("\nMEMORIZATION ONSET ANALYSIS")
    print("="*70)

    # Use the layer in the middle of the stack, counted from the first checkpoint.
    mid_layer = len(all_results[epochs[0]]['layer_analyses']) // 2
    onset_analysis = analyze_memorization_onset(all_results, mid_layer)

    print(f"\nLayer {mid_layer} (middle layer):")
    print(f"  Passages memorized: {onset_analysis['passages_memorized']}")
    print(f"  Passages never memorized: {onset_analysis['passages_never_memorized']}")

    # A NaN correlation is treated as "not enough memorized passages".
    if not np.isnan(onset_analysis['correlation']):
        print(f"  Correlation (initial compression vs onset epoch): r={onset_analysis['correlation']:.3f}")
        print(f"  P-value: {onset_analysis['p_value']:.2e}")
        print(f"  Mean onset epoch: {onset_analysis['mean_onset_epoch']:.1f}")
        print(f"  Median onset epoch: {onset_analysis['median_onset_epoch']:.1f}")

        # Interpretation. r relates initial compression to onset epoch, and a
        # smaller onset epoch means "memorized earlier", therefore:
        #   r < 0: higher compression <-> smaller onset epoch -> memorized EARLIER
        #   r > 0: lower compression  <-> smaller onset epoch -> memorized EARLIER
        # (The headlines were previously attached to the opposite signs.)
        # NOTE(review): relies on the correlation being computed exactly as the
        # label above states - confirm in analyze_memorization_onset.
        if onset_analysis['correlation'] < -0.3 and onset_analysis['p_value'] < 0.01:
            print("\n  ✓ FINDING: Passages with higher initial compression memorize EARLIER")
            print("    This suggests compression predicts memorization difficulty.")
        elif onset_analysis['correlation'] > 0.3 and onset_analysis['p_value'] < 0.01:
            print("\n  ✓ FINDING: Passages with lower initial compression memorize EARLIER")
            print("    This suggests already-distinct patterns are easier to memorize.")
    else:
        print("  Insufficient memorization to compute correlation")

In [None]:
# Analyze layer temporal patterns
# Skipped entirely when no results were loaded (all_results is None or empty).
if all_results:
    print("\nLAYER TEMPORAL PATTERNS")
    print("=" * 70)

    temporal = analyze_layer_temporal_patterns(all_results)

    print(f"\nPattern: {temporal['early_vs_late']}")

    # Both orderings are printed the same way: a heading followed by the first
    # five (layer, epoch) pairs of the corresponding list.
    orderings = (
        ("significance", 'layer_ordering_significance'),
        ("effect size", 'layer_ordering_effect_size'),
    )
    for criterion, ordering_key in orderings:
        print(f"\nOrder of emergence (by {criterion}):")
        for layer, epoch in temporal[ordering_key][:5]:
            print(f"  Layer {layer}: epoch {epoch}")

In [None]:
# Velocity analysis for middle layer
# Skipped entirely when no results were loaded (all_results is None or empty).
if all_results:
    layer_count = len(all_results[epochs[0]]['layer_analyses'])
    mid_layer = layer_count // 2
    velocity_data = compute_compression_velocity(all_results, mid_layer)

    print(f"\nVELOCITY ANALYSIS (Layer {mid_layer})")
    print("=" * 70)

    # (label, key in velocity_data, format spec); an empty spec prints the
    # value with its default formatting.
    summary_rows = (
        ("Max velocity", 'max_velocity', '.4f'),
        ("Max acceleration", 'max_acceleration', '.4f'),
        ("Velocity sign changes", 'velocity_sign_changes', ''),
    )
    for label, key, number_format in summary_rows:
        print(f"{label}: {velocity_data[key]:{number_format}}")

    # Plot
    from compression_lm.analysis.dynamics_visualizations import plot_compression_velocity

    fig = plot_compression_velocity(all_results, mid_layer)
    plt.show()

## Download Results

Download the results to your local machine:

In [None]:
# Zip results for download
# `os` is imported here as well so the cell does not depend on the setup cell
# having run in the current kernel session.
import os
import shutil

zip_file = f"{output_dir}.zip"

if os.path.isdir(output_dir):
    # Archive the contents of output_dir into <output_dir>.zip, created
    # alongside (not inside) the directory being archived.
    shutil.make_archive(output_dir, 'zip', output_dir)

    print(f"\nResults zipped to: {zip_file}")
    print(f"File size: {os.path.getsize(zip_file) / 1e6:.1f} MB")
    print("\nTo download in Colab:")
    print("  from google.colab import files")
    print(f"  files.download('{zip_file}')")
else:
    # Mirror the other cells: report the missing input instead of raising.
    print(f"Output directory not found: {output_dir}")
    print("Run the experiment before zipping the results.")

In [None]:
# Optional: Download via Colab
# Hands the archive to Colab's download helper; the google.colab module is
# imported here, so this cell needs an environment that provides it.
from google.colab import files

archive_name = f"{output_dir}.zip"
files.download(archive_name)

## Interpretation Guide

### Scenario 1: U-Shaped Curve ✓
**Pattern:** Compression decreases → plateaus → increases  
**Interpretation:** Model goes through learning → consolidation → memorization phases  
**Scientific Value:** HIGH - Novel mechanistic finding about geometric phase transitions

### Scenario 2: Monotonic Decrease
**Pattern:** Compression steadily decreases across all epochs  
**Interpretation:** LLM memorization differs from VAE compression; maintains distinctions even for memorized content  
**Scientific Value:** MEDIUM-HIGH - Important difference between reconstruction and generation

### Scenario 3: Layer-Dependent Patterns
**Pattern:** Early layers increase, late layers decrease (or vice versa)  
**Interpretation:** Hierarchical organization of memorization mechanisms  
**Scientific Value:** HIGH - Reveals layer-wise specialization

### Scenario 4: No Memorization Achieved
**Pattern:** Reproduction accuracy stays low even with 100 epochs  
**Next Step:** Run a deeper experiment with fewer passages (e.g., 10 passages, 200 epochs)

### Scenario 5: Passage Heterogeneity
**Pattern:** Different passages show different compression patterns  
**Interpretation:** Memorization mechanisms vary by content type  
**Next Step:** Categorize passages by properties (length, complexity, topic)

## Citation

If you use this experiment in your research:

```
@article{basin-compression-analysis,
  title={Compression Dynamics in Language Model Memorization},
  author={Your Name},
  year={2025},
  url={https://github.com/jacobposchl/basin-compression-analysis}
}
```