Steps:
1. generate rollouts over example prompts
2. compute and store activations over all tokens all layers
3. test out each of the hypotheses
4. try make a progress bar

# 1. Generate rollouts over prompts

In [1]:
import vllm
import torch

In [None]:
# Load Gemma 2 2B Instruct model with vLLM
model_name = "Qwen/Qwen3-4B"
llm = vllm.LLM(
    model=model_name,
    trust_remote_code=True,
)

In [None]:
import json

# Load instructions from splits/harmless_train.json
with open('splits/harmless_train.json', 'r') as f:
    data = json.load(f)

# Get the tokenizer to apply chat template
tokenizer = llm.get_tokenizer()

# Apply chat template to each instruction
instructions = []
for item in data:
    # Format as a chat message
    messages = [{"role": "user", "content": item['instruction']}]
    # Apply the chat template
    formatted_instruction = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    instructions.append(formatted_instruction)

print(f"Loaded {len(instructions)} instructions with chat template applied")


In [None]:
# Run vLLM in parallel over all instructions
outputs = llm.generate(instructions[:1000], sampling_params=vllm.SamplingParams(max_tokens=32768))

In [None]:
responses = [{'instruction': y, 'response': x.outputs[0].text, 'char_length': len(x.outputs[0].text), 'tokens_length': len(x.outputs[0].token_ids)} for x,y in zip(outputs, instructions)]

In [None]:
# Save responses to JSON file
import json
import os

# Create directory if it doesn't exist
os.makedirs('/workspace/llm-progress-monitor/rollouts', exist_ok=True)

# Save to JSON file
with open(f'/workspace/llm-progress-monitor/rollouts/{model_name.split("/")[-1]}.json', 'w') as f:
    json.dump(responses, f, indent=2)

print(f"Saved {len(responses)} responses to /workspace/llm-progress-monitor/rollouts/{model_name.split('/')[-1]}.json")

# 2. Compute and store activations over all tokens all layers

In [None]:
import nnsight
import torch
import json
from transformers import AutoTokenizer
model_name = "Qwen/Qwen3-4B"

In [None]:
with open(f'/workspace/llm-progress-monitor/rollouts/{model_name.split("/")[-1]}.json', 'r') as f:
    responses = json.load(f)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Format each response with Gemma 2 chat template
formatted_responses = []
for response in responses:
    # Apply Gemma 2 chat template format
    chat_formatted = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": response['instruction']},
            {"role": "assistant", "content": response['response']}
        ],
        tokenize=False,
        add_generation_prompt=False
    )
    
    formatted_item = {
        'instruction': response['instruction'],
        'response': response['response'],
        'chat_formatted': chat_formatted,
        'char_length': response['char_length'],
        'tokens_length': response['tokens_length']
    }
    formatted_responses.append(formatted_item)

formatted_responses

In [None]:
import os
import gc
# Create activations directory if it doesn't exist
activations_dir = '/workspace/llm-progress-monitor/rollouts/activations'
os.makedirs(activations_dir, exist_ok=True)

model = nnsight.LanguageModel(model_name, device_map="auto", dtype=torch.bfloat16)

In [None]:

with torch.no_grad():
    for i, formatted_response in enumerate(formatted_responses[366:1000]):
        gc.collect()
        torch.cuda.empty_cache()
        chat_formatted = formatted_response['chat_formatted']
        input_ids_len = len(tokenizer.encode(tokenizer.apply_chat_template([{"role": "user", "content": formatted_response['instruction']}], tokenize=False, add_generation_prompt=True), return_tensors='pt')[0])
        
        # Tokenize the chat
        try:
            with model.trace(chat_formatted): #TODO: change to batched.
                gc.collect()
                torch.cuda.empty_cache()
                layer_outputs = []
                for layer in model.model.layers:
                    layer_outputs.append(layer.output[0][:,input_ids_len:])
                activations = torch.stack(layer_outputs, dim=0)
                
                torch.save(activations, f'{activations_dir}/{366+i}.pt')
                print(f"Saved activations for response {366+i}")
        except Exception as e:
            print(f"Error on response {366+i}: {e}")
            gc.collect()
            torch.cuda.empty_cache()

# 3. Test out each hypothesis

- Linear n_tokens_remaining
- Logarithmic n_tokens_remaining
- Linear % of response left
- Linear n_total_tokens
- Rotation around a circle to track n_tokens_remaining modulo something?


In [1]:
from torch.utils.data import DataLoader, Dataset
from typing import List
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import gc

In [2]:
import os
activations_dir = '/workspace/llm-progress-monitor/rollouts/activations'
layer_idx = 15
activations = []

with torch.no_grad():

    def load_activations(start_idx: int, end_idx: int):
        with torch.no_grad():
            for i in range(start_idx, end_idx):
                if i % 10 == 0:
                    print(f"Loaded {i} activations")
                    gc.collect()
                    torch.cuda.empty_cache()
                filename = f'{i}.pt'
                if os.path.exists(os.path.join(activations_dir, filename)):
                    activations.append(torch.load(os.path.join(activations_dir, filename))[layer_idx].to('cpu'))
        return activations

    import asyncio
    import aiofiles

    async def load_activation_async(filename_path):
        """Load a single activation file asynchronously"""
        if os.path.exists(filename_path):
            # Use async file reading with torch.load
            loop = asyncio.get_event_loop()
            activation = await loop.run_in_executor(None, torch.load, filename_path)
            return activation[layer_idx].to('cpu')
        return None

    async def load_batch_async(start_idx, end_idx):
        """Load a batch of activations asynchronously"""
        tasks = []
        for i in range(start_idx, end_idx):
            filename_path = os.path.join(activations_dir, f'{i}.pt')
            tasks.append(load_activation_async(filename_path))
        
        results = await asyncio.gather(*tasks)
        return [result for result in results if result is not None]

    async def load_all_activations_async():
        """Load all activations using async I/O"""
        batch_size = 100
        total_samples = 1000
        batches = [(i, min(i + batch_size, total_samples)) for i in range(0, total_samples, batch_size)]
        
        activations = []
        for i, (start, end) in enumerate(batches):
            batch_activations = await load_batch_async(start, end)
            activations.extend(batch_activations)
            print(f"Loaded batch {i+1}/{len(batches)}: {len(batch_activations)} activations")
            gc.collect()
            torch.cuda.empty_cache()
        
        return activations

    # Run the async loading - use await directly in Jupyter
    activations = await load_all_activations_async()

    print(f"Total activations loaded: {len(activations)}")

    # Train-test split on activations
    from sklearn.model_selection import train_test_split

    train_activations, test_activations = train_test_split(activations, test_size=0.2, random_state=42)
    print(f"Train set size: {len(train_activations)}")
    print(f"Test set size: {len(test_activations)}")

    class TokensRemainingDataset(Dataset):
        def __init__(self, activations: List[torch.Tensor]):
            self.data = []
            for activation in activations:
                for i in range(activation.shape[0]):
                    self.data.append((activation[i, :], activation.shape[0] - i, activation.shape[0]))

        def __len__(self):
            return len(self.data)
        
        def __getitem__(self, idx):
            return self.data[idx]

    train_dataset = TokensRemainingDataset(train_activations)
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataset = TokensRemainingDataset(test_activations)
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)


Loaded batch 1/10: 100 activations
Loaded batch 2/10: 100 activations
Loaded batch 3/10: 100 activations
Loaded batch 4/10: 100 activations
Loaded batch 5/10: 100 activations
Loaded batch 6/10: 100 activations
Loaded batch 7/10: 100 activations
Loaded batch 8/10: 100 activations
Loaded batch 9/10: 100 activations
Loaded batch 10/10: 100 activations
Total activations loaded: 1000
Train set size: 800
Test set size: 200


## 3.1 Linear n_tokens_remaining (ignore)

In [None]:
class LinearRegression(nn.Module):
    def __init__(self,input_dim: int):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)
model = LinearRegression(train_dataset[0][0].shape[0]).to('cuda')
adam = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
losses = []
for X, y, _ in tqdm(train_dataloader):
    adam.zero_grad()
    pred = model(X.to('cuda', dtype=torch.float32))
    loss = loss_fn(pred, y.to('cuda', dtype=torch.float32))
    loss.backward()
    adam.step()
    losses.append(loss.item())
import matplotlib.pyplot as plt
plt.title('Training Loss for linear n_tokens_remaining hypothesis')
plt.plot(losses)
# Evaluate test loss
test_losses = []
preds = []
actuals = []
model.eval()
with torch.no_grad():
    for X, y, total_tokens in test_dataloader:
        total_tokens = total_tokens.to('cuda', dtype=torch.float32)
        y = y.to('cuda', dtype=torch.float32)
        pred = model(X.to('cuda', dtype=torch.float32))
        tokens_already_seen = total_tokens - y
        percentage_pred = torch.clamp(pred/(pred + tokens_already_seen), 0.0, 1.0)
        actual_percentage = y/total_tokens
        loss = loss_fn(percentage_pred, actual_percentage)
        test_losses.append(loss.item())
        preds += percentage_pred[0].tolist()
        actuals += actual_percentage.tolist()
average_test_loss = sum(test_losses) / len(test_losses)
print(f"Average test loss: {average_test_loss:.4f}")

for X, y, _ in test_dataloader:
    print(y)
    break
preds
plt.title('Predicted vs Actual Percentage of Tokens Remaining for linear n_tokens_remaining hypothesis')
plt.scatter(preds, actuals)

## 3.2 Logarithmic n_tokens remaining (ignore)

In [None]:
import matplotlib.pyplot as plt
class NonLinearRegression(nn.Module):
    def __init__(self,input_dim: int):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        return self.linear(x)
model = NonLinearRegression(train_dataset[0][0].shape[0]).to('cuda')
adam = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
def evaluate_model(model, test_dataloader, loss_fn):
    preds = []
    actuals = []
    model.eval()
    with torch.no_grad():
        for X, y, total_tokens in test_dataloader:
            total_tokens = total_tokens.to('cuda', dtype=torch.float32)
            y = y.to('cuda', dtype=torch.float32)
            pred = model(X.to('cuda', dtype=torch.float32)).exp()
            preds += pred.flatten().tolist()
            actuals += (y+1).log().flatten().tolist()
    return preds, actuals
losses = []

for i,(X, y, total_tokens) in enumerate(tqdm(train_dataloader)):
    y = y.to('cuda', dtype=torch.float32)
    total_tokens = total_tokens.to('cuda', dtype=torch.float32)
    adam.zero_grad()
    pred = model(X.to('cuda', dtype=torch.float32))
    loss = loss_fn(pred, (y+1).log())
    loss.backward()
    adam.step()
    losses.append(loss.item())
    if i % 50 == 0:
        preds, actuals = evaluate_model(model, test_dataloader, loss_fn)
        plt.title('Predicted vs Actual Percentage of Tokens Remaining for logarithmic n_tokens_remaining hypothesis')
        plt.scatter(preds, actuals)
        plt.show()
plt.title('Training Loss for logarithmic n_tokens_remaining hypothesis')
plt.plot(losses)
losses
preds, actuals = evaluate_model(model, train_dataloader, loss_fn)
plt.scatter(preds, torch.tensor(actuals))
for X,y, _ in train_dataloader:
    print(X.shape, y.shape)
    break
X

## 3.5 Rotation (leading hypothesis)

In [None]:
n_bins = 11
def bin_y(y):
    return (y+1).log().floor().clamp(0,n_bins-1).to('cuda', dtype=torch.long)

In [None]:
class LogBinClassifier(nn.Module):
    def __init__(self,input_dim: int):
        super().__init__()
        self.linear = nn.Linear(input_dim, n_bins, dtype=torch.bfloat16)

    def forward(self, x):
        return self.linear(x)
    
model = LogBinClassifier(train_dataset[0][0].shape[0]).to('cuda')
adam = optim.Adam(model.parameters(), lr=0.001)
# Calculate class frequencies from training data
class_counts = torch.zeros(n_bins)
total_samples = 0

for X, y, _ in train_dataloader:
    y_binned = bin_y(y)
    for i in range(n_bins):
        class_counts[i] += (y_binned == i).sum().item()
    total_samples += len(y)

# Calculate inverse frequency weights
weight = total_samples / (n_bins * class_counts)
weight = weight.to('cuda', dtype=torch.bfloat16)
print("Class frequencies:", class_counts)
print("Weights:", weight)
losses = []
test_losses = []

In [None]:
loss_fn = nn.CrossEntropyLoss(weight = weight)

In [None]:
n_epochs = 1
adam = optim.Adam(model.parameters(), lr=0.0001)
for epoch in range(n_epochs):
    for i, (X, y, _) in enumerate(train_dataloader):
        model.zero_grad()

        preds = model(X.to('cuda'))
        y_transformed = bin_y(y)
        loss = loss_fn(preds, y_transformed)
        loss.backward()
        adam.step()
        losses.append(loss.item())
        
        # Calculate test loss
        if i % 10 == 0:  # Calculate test loss every 10 batches
            model.eval()
            test_loss_sum = 0
            test_batches = 0
            with torch.no_grad():
                for X_test, y_test, _ in test_dataloader:
                    preds_test = model(X_test.to('cuda'))
                    y_test_transformed = bin_y(y_test)
                    test_loss = loss_fn(preds_test, y_test_transformed)
                    test_loss_sum += test_loss.item()
                    test_batches += 1
            test_losses.append(test_loss_sum / test_batches)
            model.train()
        
        if i % 10 == 0 and test_losses:
            print(f"Batch {i}, Train Loss: {loss.item():.4f}")
            print(f"Test Loss: {test_losses[-1]:.4f}")

In [None]:
torch.save(model.linear.weight, '/workspace/llm-progress-monitor/qwen3_4b_weight_tensor.pt')

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.title('Training Loss for rotation hypothesis')
plt.plot(losses)
plt.xlabel('Batch')
plt.ylabel('Loss')

plt.subplot(1, 2, 2)
plt.title('Test Loss')
# Fix the x-axis to match the length of test_losses
test_x_values = [i for i in range(0, len(losses), 10) if i // 10 < len(test_losses)]
plt.plot(test_x_values, test_losses[:len(test_x_values)])
plt.xlabel('Batch')
plt.ylabel('Loss')

plt.tight_layout()
plt.show()

model.zero_grad()
preds = model(X.to('cuda'))
preds.argmax(dim=1)
y_transformed


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Create x-axis values for batches where test losses were recorded
test_batch_indices = [i for i in range(0, len(losses), 10) if i // 10 < len(test_losses)]

plt.figure(figsize=(10, 6))
plt.plot(np.log(np.array(test_batch_indices) + 1), test_losses[:len(test_batch_indices)])
plt.xlabel('log(batch + 1)')
plt.ylabel('Test Loss')
plt.title('Test Loss vs log(batch)')
plt.grid(True)
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np

all_preds = []
all_targets = []

for X, y, _ in test_dataloader:
    with torch.no_grad():
        preds = model(X.to('cuda'))
        y_transformed = bin_y(y)
        all_preds.extend(preds.argmax(dim=1).cpu().numpy())
        all_targets.extend(y_transformed.cpu().numpy())

cm = confusion_matrix(all_targets, all_preds)
# Normalize each row so it sums to 1
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

plt.figure(figsize=(8, 6))
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues')
plt.title('Confusion Matrix (Normalized)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
model.linear.weight.norm(dim=1)

In [None]:
model.linear.bias

In [29]:
n_bins = 30
def bin_y(y):
    return (y+1).log() / torch.log(torch.tensor(1.2)).floor().to('cuda', dtype=torch.long)

In [37]:
def log_floor(y, base=1.2):
    return ((y+1).log() / torch.log(torch.tensor(base, device=y.device))).floor().to('cuda', dtype=torch.long)


In [38]:
mean_vecs = torch.zeros(n_bins, train_dataset[0][0].shape[0]).to('cuda')
counts = torch.zeros(n_bins).to('cuda')
for i, (X,y, _) in enumerate(train_dataloader):
    print(X.shape, y.shape)
    y_transformed = log_floor(y)
    print(y_transformed)
    mean_vecs[y_transformed] += X.to('cuda')
    counts[y_transformed] += 1
    if i > 400:
        break

mean_vecs = mean_vecs / counts.unsqueeze(1)

AcceleratorError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [16]:
torch.save(mean_vecs, "qwen 4b mean vecs.pt")
mean_vecs.shape

torch.Size([30, 2560])

In [17]:
mean_vecs[:,0]

tensor([    nan, 18.1875, 21.7797, 25.3207, 25.4286, 25.5131, 25.6278, 26.0277,
        26.4142, 25.7046, 26.5776,     nan,     nan,     nan,     nan,     nan,
            nan,     nan,     nan,     nan,     nan,     nan,     nan,     nan,
            nan,     nan,     nan,     nan,     nan,     nan], device='cuda:0')

### Misc plots

In [None]:
import torch
import matplotlib.pyplot as plt
import  numpy as np

In [None]:
# Get the 2 principal component vectors
from sklearn.decomposition import PCA

# Use model.linear.weight instead of mean_vecs
# Convert to CPU for sklearn
#weight_tensor = torch.load('/workspace/llm-progress-monitor/weight_tensor.pt')
weight_tensor = model.linear.weight.to(dtype=torch.float32)
weight_vecs_cpu = weight_tensor.detach().cpu().numpy()
n_bins = weight_tensor.shape[0]

# Exclude bin 0 from PCA analysis
weight_vecs_no_zero = weight_vecs_cpu[1:]  # Exclude first bin (bin 0)

# Perform PCA to get 2 components
pca = PCA(n_components=2)
pca_result_no_zero = pca.fit_transform(weight_vecs_no_zero)

# Plot the principal components with different colors for each bin (excluding bin 0)
plt.figure(figsize=(10, 6))
colors = plt.cm.tab10(np.linspace(0, 1, n_bins))
colors_no_zero = colors[1:]  # Exclude color for bin 0
plt.scatter(pca_result_no_zero[:, 0], pca_result_no_zero[:, 1], s=100, alpha=0.7, c=colors_no_zero)
for i, (x, y) in enumerate(pca_result_no_zero):
    plt.annotate(f'Bin {i+1}', (x, y), xytext=(5, 5), textcoords='offset points')
plt.xlabel(f'PC1 (explained variance: {pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'PC2 (explained variance: {pca.explained_variance_ratio_[1]:.2%})')
plt.title('Principal Component Analysis of Model Linear Weight Vectors by Bin (Excluding Bin 0)')
plt.grid(True, alpha=0.3)
plt.show()

# Plot 2D density of samples using colors (excluding bin 0)
plt.figure(figsize=(12, 8))
sample_count = 2000  # Number of samples per bin to plot
all_bin_data = []  # Collect all data

# First pass: collect all data (excluding bin 0)
for bin_idx in range(1, n_bins):  # Start from bin 1
    bin_samples = []
    for i, (X, y, _) in enumerate(train_dataloader):
        y_transformed = bin_y(y)
        mask = (y_transformed == bin_idx)
        if mask.any():
            bin_samples.append(X.to(dtype=torch.float32)[mask.cpu()].cpu())  # Move to CPU before appending
        if len(bin_samples) * train_dataloader.batch_size >= sample_count:
            break
    
    if bin_samples:
        # Concatenate and take only the first sample_count samples
        bin_data = torch.cat(bin_samples, dim=0)[:sample_count]
        bin_data_cpu = bin_data.numpy()  # Already on CPU
        all_bin_data.append(bin_data_cpu)

# Create a figure with subplots for each bin (excluding bin 0)
if all_bin_data:
    n_bins_no_zero = n_bins - 1
    fig, axes = plt.subplots(2, (n_bins + 1) // 2, figsize=(16, 8))
    axes = axes.flatten()
    
    # Plot density for each bin (excluding bin 0)
    for idx, bin_data_cpu in enumerate(all_bin_data):
        bin_pca = pca.transform(bin_data_cpu)
        
        # Create 2D histogram (density plot)
        ax = axes[idx]
        hist, xedges, yedges = np.histogram2d(bin_pca[:, 0], bin_pca[:, 1], bins=20)
        extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
        
        im = ax.imshow(hist.T, extent=extent, origin='lower', cmap='Blues', alpha=0.7)
        ax.set_title(f'Bin {idx+1} Density')
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        
        # Add weight vector for this bin
        weight_pca = pca.transform(weight_vecs_no_zero[idx].reshape(1, -1))
        ax.scatter(weight_pca[0, 0], weight_pca[0, 1], s=100, c='red', 
                  marker='x', linewidth=3, label='Weight vector')
        ax.legend()
    
    # Hide unused subplots
    for idx in range(n_bins_no_zero, len(axes)):
        axes[idx].set_visible(False)
    
    plt.tight_layout()
    plt.suptitle('2D Density of Samples by Bin in PCA Space (Excluding Bin 0)', y=1.02)
    plt.show()

# Combined density plot with Gaussian distributions (excluding bin 0)
plt.figure(figsize=(12, 8))
all_data_combined = np.concatenate(all_bin_data, axis=0)
all_pca = pca.transform(all_data_combined)

# Create bin labels for coloring (excluding bin 0)
bin_labels = []
for idx, bin_data_cpu in enumerate(all_bin_data):
    bin_labels.extend([idx+1] * len(bin_data_cpu))  # Bin indices start from 1
bin_labels = np.array(bin_labels)

# Calculate Gaussian distributions for each bin
from scipy.stats import multivariate_normal

# Get the overall extent for the plot
x_min, x_max = all_pca[:, 0].min() - 1, all_pca[:, 0].max() + 1
y_min, y_max = all_pca[:, 1].min() - 1, all_pca[:, 1].max() + 1

# Create a grid for plotting contours
x_grid = np.linspace(x_min, x_max, 100)
y_grid = np.linspace(y_min, y_max, 100)
X_grid, Y_grid = np.meshgrid(x_grid, y_grid)
pos = np.dstack((X_grid, Y_grid))

# Define standard deviation levels
# For a 2D Gaussian, the probability levels corresponding to k-sigma are:
# 1σ: ~0.3935 probability (68.27% for 1D becomes 39.35% for 2D)
# 2σ: ~0.8647 probability (95.45% for 1D becomes 86.47% for 2D)
# 3σ: ~0.9889 probability (99.73% for 1D becomes 98.89% for 2D)
from scipy.stats import chi2

# For 2D case, chi2 with 2 degrees of freedom
sigma_1_level = chi2.ppf(0.3935, df=2)  # 1σ
sigma_2_level = chi2.ppf(0.8647, df=2)  # 2σ
sigma_3_level = chi2.ppf(0.9889, df=2)  # 3σ

# Plot Gaussian distributions for each bin (excluding bin 0)
for idx in range(len(all_bin_data)):
    bin_idx = idx + 1  # Actual bin index
    mask = bin_labels == bin_idx
    if mask.any():
        # Get PCA coordinates for this bin
        bin_pca = all_pca[mask]
        
        # Calculate mean and covariance for the Gaussian
        mean = np.mean(bin_pca, axis=0)
        cov = np.cov(bin_pca.T)
        
        # Add small regularization to avoid singular covariance matrices
        cov += np.eye(2) * 1e-6
        
        # Create multivariate normal distribution
        rv = multivariate_normal(mean, cov)
        
        # Calculate the maximum density (at the mean)
        max_density = rv.pdf(mean)
        
        # Convert chi2 levels to density levels
        # For multivariate normal: density = max_density * exp(-0.5 * chi2_level)
        density_1sigma = max_density * np.exp(-0.5 * sigma_1_level)
        density_2sigma = max_density * np.exp(-0.5 * sigma_2_level)
        density_3sigma = max_density * np.exp(-0.5 * sigma_3_level)
        
        # Plot the contours at 1σ, 2σ, and 3σ levels
        contour_levels = [density_3sigma, density_2sigma, density_1sigma]
        contour_labels = ['3σ', '2σ', '1σ']
        
        cs = plt.contour(X_grid, Y_grid, rv.pdf(pos), colors=[colors[bin_idx]], 
                        alpha=0.8, linewidths=2, levels=contour_levels)
        
        # Label the contours
        plt.clabel(cs, inline=True, fontsize=8, fmt=dict(zip(contour_levels, contour_labels)))
        
        plt.contourf(X_grid, Y_grid, rv.pdf(pos), colors=[colors[bin_idx]], 
                    alpha=0.2, levels=contour_levels + [max_density])

# Add weight vectors (excluding bin 0)
pca_result_centered = pca.transform(weight_vecs_no_zero)


plt.xlabel(f'PC1 (explained variance: {pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'PC2 (explained variance: {pca.explained_variance_ratio_[1]:.2%})')
plt.title('Gaussian Distributions with 1σ, 2σ, 3σ Contours (Excluding Bin 0)')
plt.grid(True, alpha=0.3)
plt.show()

# Show the principal component vectors themselves
print("Principal Component Vectors (excluding bin 0):")
print(f"PC1: {pca.components_[0]}")
print(f"PC2: {pca.components_[1]}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")

In [None]:
# Show explained variance ratio for each principal component (5 components)
pca_5 = PCA(n_components=5)
pca_5.fit(weight_vecs_no_zero)

cumulative_variance = 0
for i in range(5):
    individual_variance = pca_5.explained_variance_ratio_[i]
    cumulative_variance += individual_variance
    print(f"PC{i+1} explained variance: {individual_variance:.2%} (cumulative: {cumulative_variance:.2%})")


In [None]:
torch.save(model.linear.weight, '/workspace/llm-progress-monitor/qwen3_4b_weight_tensor.pt')

### Varying layer code

In [None]:
from torch.utils.data import DataLoader, Dataset
from typing import List
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import gc
import os
from sklearn.model_selection import train_test_split

In [None]:
loss_fn = nn.CrossEntropyLoss()

In [None]:
class LogBinClassifier(nn.Module):
        def __init__(self,input_dim: int, n_bins: int):
            super().__init__()
            self.linear = nn.Linear(input_dim, n_bins)

        def forward(self, x):
            return self.linear(x)

class TokensRemainingDataset(Dataset):
        def __init__(self, activations: List[torch.Tensor]):
            self.data = []
            for activation in activations:
                for i in range(activation.shape[0]):
                    self.data.append((activation[i, :], activation.shape[0] - i, activation.shape[0]))

        def __len__(self):
            return len(self.data)
        
        def __getitem__(self, idx):
            return self.data[idx]
def bin_y(y, n_bins: int):
    return (y+1).log().floor().clamp(0,n_bins-1).to('cuda', dtype=torch.long)

In [None]:

def load_train_and_test_activations(layer_idx: int, n_bins: int = 8, n_train: int = 10000, return_model: bool = False):
    activations_dir = '/workspace/llm-progress-monitor/rollouts/activations'
    activations = []
    with torch.no_grad():
        for i in range(len(os.listdir(activations_dir))): #TODO: remove min
            if i % 10 == 0:
                print(f"Loaded {i} activations")
                gc.collect()
                torch.cuda.empty_cache()
            filename = f'{i}.pt'
            if os.path.exists(os.path.join(activations_dir, filename)):
                activations.append(torch.load(os.path.join(activations_dir, filename))[layer_idx].to('cpu'))
    # Train-test split on activations


    train_activations, test_activations = train_test_split(activations, test_size=0.2, random_state=42)
    print(f"Train set size: {len(train_activations)}")
    print(f"Test set size: {len(test_activations)}")

    

    train_dataset = TokensRemainingDataset(train_activations)
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_dataset = TokensRemainingDataset(test_activations)
    test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

        
    model = LogBinClassifier(train_dataset[0][0].shape[0], n_bins).to('cuda')
    adam = optim.Adam(model.parameters(), lr=0.0001)
    # Calculate class frequencies from training data
    class_counts = torch.zeros(n_bins)
    total_samples = 0

    for X, y, _ in train_dataloader:
        y_binned = bin_y(y, n_bins)
        for i in range(n_bins):
            class_counts[i] += (y_binned == i).sum().item()
        total_samples += len(y)

    # Calculate inverse frequency weights
    weight = total_samples / (n_bins * class_counts)
    weight = weight.to('cuda', dtype=torch.float32)
    print("Class frequencies:", class_counts)
    print("Weights:", weight)
    losses = []

    test_losses = []

    for i, (X, y, _) in enumerate(train_dataloader):
        model.zero_grad()
        preds = model(X.to('cuda'))
        y_transformed = bin_y(y, n_bins)
        loss = loss_fn(preds, y_transformed)
        loss.backward()
        adam.step()
        losses.append(loss.item())
        
        # Calculate test loss
        if i % 10 == 0:  # Calculate test loss every 10 batches
            model.eval()
            test_loss_sum = 0
            test_batches = 0
            with torch.no_grad():
                for X_test, y_test, _ in test_dataloader:
                    preds_test = model(X_test.to('cuda'))
                    y_test_transformed = bin_y(y_test, n_bins)
                    test_loss = loss_fn(preds_test, y_test_transformed)
                    test_loss_sum += test_loss.item()
                    test_batches += 1
            test_losses.append(test_loss_sum / test_batches)
            model.train()
        
        print(f"Batch {i}, Train Loss: {loss.item():.4f}")
        if i % 10 == 0 and test_losses:
            print(f"Test Loss: {test_losses[-1]:.4f}")
        if i >= n_train:
            break
    if return_model:
        return losses, test_losses, model
    else:
        return losses, test_losses

In [None]:
losses_dict = {}
test_losses_dict = {} #maps layer_idx to losses and test_losses
for layer_idx in [0,5,10,15,20,25]:
    gc.collect()
    torch.cuda.empty_cache()
    losses, test_losses = load_train_and_test_activations(layer_idx)
    losses_dict[layer_idx] = losses
    test_losses_dict[layer_idx] = test_losses

In [None]:
import json

# Save the losses dictionaries to JSON files
with open('losses_dict.json', 'w') as f:
    json.dump(losses_dict, f, indent=2)

with open('test_losses_dict.json', 'w') as f:
    json.dump(test_losses_dict, f, indent=2)

print("Saved losses_dict.json and test_losses_dict.json")


In [None]:
import matplotlib.pyplot as plt

# Plot the training losses for each layer
plt.figure(figsize=(12, 8))

for layer_idx, losses in losses_dict.items():
    plt.plot(losses, label=f'Layer {layer_idx}', alpha=0.8)

plt.xlabel('Training Step')
plt.ylabel('Training Loss')
plt.title('Training Losses by Layer')
plt.ylim(0, 2)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Also plot test losses if available
if test_losses_dict:
    plt.figure(figsize=(12, 8))
    
    for layer_idx, test_losses in test_losses_dict.items():
        if test_losses:  # Only plot if there are test losses
            plt.plot(test_losses, label=f'Layer {layer_idx} (Test)', alpha=0.8, linestyle='--')
    
    plt.xlabel('Evaluation Step')
    plt.ylabel('Test Loss')
    plt.title('Test Losses by Layer')
    plt.ylim(0, 2)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Plot the training losses for each layer with log x-axis
plt.figure(figsize=(12, 8))

for layer_idx, losses in losses_dict.items():
    plt.semilogx(losses, label=f'Layer {layer_idx}', alpha=0.8)

plt.xlabel('Training Step (log scale)')
plt.ylabel('Training Loss')
plt.title('Training Losses by Layer (Log X-axis)')
plt.ylim(0, 2)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Also plot test losses if available with log x-axis
if test_losses_dict:
    plt.figure(figsize=(12, 8))
    
    for layer_idx, test_losses in test_losses_dict.items():
        if test_losses:  # Only plot if there are test losses
            plt.semilogx(test_losses, label=f'Layer {layer_idx} (Test)', alpha=0.8, linestyle='--')
    
    plt.xlabel('Evaluation Step (log scale)')
    plt.ylabel('Test Loss')
    plt.title('Test Losses by Layer (Log X-axis)')
    plt.ylim(0, 5)
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.show()


More data probably doesnt help at this point

In [None]:
losses, test_losses, model = load_train_and_test_activations(15, return_model=True)

In [None]:
weight_tensor = model.linear.weight
torch.save(weight_tensor, 'weight_tensor.pt')
weight_tensor

In [None]:
from sklearn.decomposition import PCA

# Get the weight tensor and convert to numpy for PCA
weight_numpy = weight_tensor.detach().cpu().numpy()

# Fit PCA to find the 2 principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(weight_numpy)

print(f"Shape of weight tensor: {weight_tensor.shape}")
print(f"Shape after PCA: {pca_result.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.4f}")

# Plot the principal components
plt.figure(figsize=(10, 8))
# Number each point with its index
for i in range(len(pca_result)):
    plt.scatter(pca_result[i, 0], pca_result[i, 1], alpha=0.6)
    plt.annotate(str(i), (pca_result[i, 0], pca_result[i, 1]), 
                xytext=(5, 5), textcoords='offset points', fontsize=8)
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.3f} variance)')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.3f} variance)')
plt.title('Weight Matrix Projected onto First 2 Principal Components')
plt.grid(True, alpha=0.3)
plt.show()


In [None]:
pca_tensor = torch.tensor(pca.components_)

In [None]:
from einops import einsum

In [None]:
import pandas as pd

In [None]:
import torch

weight_tensor = torch.load('/workspace/llm-progress-monitor/weight_tensor.pt')

# Get the top 2 principal components (2304-dimensional vectors)
from sklearn.decomposition import PCA

# Convert to numpy for sklearn
weight_numpy = weight_tensor.cpu().detach().numpy()

# Fit PCA to get the principal components
pca = PCA(n_components=2)
pca.fit(weight_numpy)

# Get the 2 principal component vectors (each 2304-dimensional)
pca_components = torch.tensor(pca.components_, dtype=weight_tensor.dtype, device=weight_tensor.device)

print(f"Original shape: {weight_tensor.shape}")
print(f"PCA components shape: {pca_components.shape}")
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
pca_components

In [None]:
weight_tensor = torch.load('/workspace/llm-progress-monitor/qwen3_4b_weight_tensor.pt')

In [None]:
import random
random.seed(42)

In [None]:
def get_ema_preds(log_preds, alpha=0.99):
    given_alpha = alpha
    preds_list = log_preds.exp().tolist()
    
    ema_preds = []
    cur_ema = None
    for i,pred in enumerate(preds_list):
        if pred < 10:
            alpha = 0.5
        else:
            alpha = given_alpha
        if cur_ema is None:
            cur_ema = pred
        else:
            cur_ema = alpha*(cur_ema-1) + (1-alpha)*pred #-1 because we have stepped one token
        ema_preds.append(cur_ema)
    return ema_preds

for i in range(10):
    idx = random.randint(0, len(test_activations)-1)
    log_preds = einsum(
        einsum(test_activations[idx].to('cuda'), weight_tensor, 'seq d_model, pca d_model -> seq pca').softmax(dim=1),
        0.5+torch.arange(weight_tensor.shape[0]).to('cuda', dtype=torch.bfloat16),
        'seq pca, pca -> seq'
    )
    
    ema_preds = get_ema_preds(log_preds)
    plt.plot(ema_preds)
plt.xlabel('Actual Tokens')
plt.ylabel('Predicted Tokens')
plt.title('Prediction Remaining Over Generation')
plt.axhline(0, color='black', linestyle='--')
plt.gca().set_aspect('equal', adjustable='box')
plt.show()

In [None]:
def get_log_preds(activation, weight_tensor):
    return einsum(
        einsum(activation.to('cuda'), weight_tensor, 'seq d_model, pca d_model -> seq pca').softmax(dim=1),
        0.5+torch.arange(weight_tensor.shape[0]).to('cuda', dtype=torch.bfloat16),
        'seq pca, pca -> seq'
    )

# Get predictions for all test activations
dataset = []

for idx in range(len(test_activations)):
    log_preds = get_log_preds(test_activations[idx], weight_tensor)
    ema_preds = get_ema_preds(log_preds)
    for token_idx, pred in enumerate(ema_preds):
        percent_pred = token_idx/(token_idx + pred)
        percent_actual = token_idx/(len(log_preds))
        dataset.append((percent_actual, percent_pred))
dataset_tensor = torch.tensor(dataset)

In [None]:
plt.title('Actual vs Predicted Percent')
plt.scatter(dataset_tensor[:,0], dataset_tensor[:,1], alpha=0.1, s=1)
plt.plot([0, 1], [0, 1], 'r-', label='y=x')
plt.xlabel('Actual Percent')
plt.ylabel('Predicted Percent')

# Calculate R-squared
x = dataset_tensor[:,0]
y = dataset_tensor[:,1]
mean_y = torch.mean(y)
ss_tot = torch.sum((y - mean_y) ** 2)
ss_res = torch.sum((y - x) ** 2)  # assuming perfect prediction would be y = x
r_squared = 1 - (ss_res / ss_tot)
print(f"R-squared: {r_squared.item():.4f}")

# Calculate mean absolute difference
mean_abs_diff = torch.mean(torch.abs(y - x))
print(f"Mean absolute difference: {mean_abs_diff.item():.4f}")