# Data Pipeline for Subliminal Steering

Interactive notebook for preparing Data-1 (HuggingFace) and Data-2 (Model-2 generation) following Plan.md requirements.

This notebook provides step-by-step data preparation with visualization and validation.

In [None]:
# Setup and imports
import sys
import warnings
# Silence every Python warning for cleaner notebook output.
# NOTE(review): this also hides deprecation warnings from the plotting libraries.
warnings.filterwarnings('ignore')

# Add the current working directory ('.') to the import path so the
# prepare_data and utils_io imports below can resolve
# (presumably the notebook is launched from the src directory -- confirm)
sys.path.append('.')

from prepare_data import DataPipeline
from utils_io import setup_device, log_gpu_memory
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

# Configure plotting: default Matplotlib style, seaborn "husl" palette,
# and the IPython magic that renders figures inline in the notebook
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

## Configuration

Set up experimental parameters following Plan.md specifications.

In [None]:
# Experiment settings shared by the cells that follow
CONFIG = dict(
    model_name='Qwen/Qwen2.5-7B-Instruct',
    hf_dataset_name='minhxle/subliminal-learning_numbers_dataset',
    hf_config='qwen2.5-7b-instruct_bear_preference',
    output_dir='./notebook_data_output',
    num_samples=1000,   # kept small for a notebook demonstration
    force_cpu=False,    # switch to True when no GPU is available
    low_memory=True,    # request the memory optimizations
)

# Echo the settings, one indented "key: value" line per entry
print("Data Pipeline Configuration:")
print("\n".join(f"  {name}: {setting}" for name, setting in CONFIG.items()))

# Select the compute device, then call the GPU-memory logging helper
device = setup_device(CONFIG['force_cpu'])
log_gpu_memory()

## Step 1: Initialize Data Pipeline

Create the data pipeline with resource-aware configuration.

In [None]:
# Build the pipeline from the CONFIG entries the constructor takes
# (num_samples is deliberately left out; it is not passed to DataPipeline)
pipeline_kwargs = {
    option: CONFIG[option]
    for option in (
        'model_name',
        'hf_dataset_name',
        'hf_config',
        'output_dir',
        'force_cpu',
        'low_memory',
    )
}
pipeline = DataPipeline(**pipeline_kwargs)

print("Data pipeline initialized successfully!")

## Step 2: Load Data-1 from HuggingFace

Load the subliminal learning dataset containing bear-preference numeric sequences.

In [None]:
# Fetch Data-1, passing the configured sample count as max_samples
print("Loading Data-1 from HuggingFace...")
data1_dataset = pipeline.load_data1_from_hf(max_samples=CONFIG['num_samples'])

# Keep just the 'text' field of each record for the analysis cells
data1_sequences = [record['text'] for record in data1_dataset]

print(f"\nLoaded {len(data1_sequences)} Data-1 sequences")
print("\nFirst 5 examples:")
# Preview at most five sequences, each cut to its first 100 characters
for position, seq in enumerate(data1_sequences[:5], start=1):
    print(f"  {position}: {seq[:100]}...")

## Step 3: Analyze Data-1 Properties

Examine the structure and properties of the loaded data.

In [None]:
# Token-length analysis of the Data-1 sequences
tokenizer = pipeline.tokenizer

# Encode each sequence without special tokens and record its token count
tokenized_sequences = [
    tokenizer(text, add_special_tokens=False)['input_ids']
    for text in data1_sequences
]
sequence_lengths = list(map(len, tokenized_sequences))

# One figure, two panels: histogram on the left, summary bars on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, stats_ax = axes

# Left panel: distribution of token counts
hist_ax.hist(sequence_lengths, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_xlabel('Sequence Length (tokens)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Data-1 Sequence Length Distribution')
hist_ax.grid(True, alpha=0.3)

# Summary statistics over the token counts
length_stats = {
    'Min': min(sequence_lengths),
    'Max': max(sequence_lengths),
    'Mean': np.mean(sequence_lengths),
    'Median': np.median(sequence_lengths),
    'Std': np.std(sequence_lengths),
}

# Right panel: one bar per statistic
stats_ax.bar(length_stats.keys(), length_stats.values(), color='lightcoral', alpha=0.7)
stats_ax.set_ylabel('Tokens')
stats_ax.set_title('Data-1 Length Statistics')
stats_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Text version of the same statistics, two decimals each
print("Data-1 Analysis:")
print("\n".join(f"  {label}: {amount:.2f} tokens" for label, amount in length_stats.items()))

## Step 4: Generate Data-2 from Model-2

Generate neutral numeric sequences using the base model without trait preference.

In [None]:
# Produce Data-2 with the pipeline's Model-2 generation routine
print("Generating Data-2 from Model-2 (this may take several minutes)...")
print("Using resource-aware generation with sharding...")

generation_options = {
    'num_samples': len(data1_sequences),  # one Data-2 sequence per Data-1 sequence
    'batch_size': 2,                      # small batch for memory efficiency
    'max_retries': 2,
}
data2_sequences = pipeline.generate_data2_from_model2(data1_dataset, **generation_options)

print(f"\nGenerated {len(data2_sequences)} Data-2 sequences")
print("\nFirst 5 examples:")
# Preview at most five sequences, each cut to its first 100 characters
for position, seq in enumerate(data2_sequences[:5], start=1):
    print(f"  {position}: {seq[:100]}...")

## Step 5: Compare Data-1 vs Data-2

Analyze differences between trait-bearing and neutral sequences.

In [None]:
# Compare token-length distributions of Data-1 (trait) and Data-2 (neutral).
# Relies on `tokenizer`, `tokenized_sequences` and `sequence_lengths` from the
# Data-1 analysis cell.
data2_tokenized = [tokenizer(seq, add_special_tokens=False)['input_ids']
                   for seq in data2_sequences]
data2_lengths = [len(tokens) for tokens in data2_tokenized]

# 2x2 grid: histogram, box plot, statistics bars, pairwise differences
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Length comparison histogram
axes[0, 0].hist([sequence_lengths, data2_lengths], bins=20, alpha=0.7,
                label=['Data-1 (trait)', 'Data-2 (neutral)'], color=['skyblue', 'lightcoral'])
axes[0, 0].set_xlabel('Sequence Length (tokens)')
axes[0, 0].set_ylabel('Frequency')
axes[0, 0].set_title('Sequence Length Comparison')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# Length box plots. The tick labels are set on the axes instead of through
# boxplot(labels=...), which is deprecated since Matplotlib 3.9; this form
# works on both older and newer releases.
axes[0, 1].boxplot([sequence_lengths, data2_lengths])
axes[0, 1].set_xticklabels(['Data-1', 'Data-2'])
axes[0, 1].set_ylabel('Sequence Length (tokens)')
axes[0, 1].set_title('Length Distribution Comparison')
axes[0, 1].grid(True, alpha=0.3)

# Sample comparison table for the first (up to) ten pairs; zip() stops at the
# shorter dataset, so unequal counts cannot cause an IndexError
comparison_data = [
    {
        'Index': index,
        'Data-1 Length': len(tokens1),
        'Data-2 Length': len(tokens2),
        'Data-1 Preview': text1[:30] + '...',
        'Data-2 Preview': text2[:30] + '...',
    }
    for index, (tokens1, tokens2, text1, text2) in enumerate(
        zip(tokenized_sequences[:10], data2_tokenized[:10],
            data1_sequences[:10], data2_sequences[:10]),
        start=1,
    )
]
df_comparison = pd.DataFrame(comparison_data)

# Summary statistics for both datasets, side by side
stats_comparison = pd.DataFrame({
    'Data-1': [np.mean(sequence_lengths), np.std(sequence_lengths),
               min(sequence_lengths), max(sequence_lengths)],
    'Data-2': [np.mean(data2_lengths), np.std(data2_lengths),
               min(data2_lengths), max(data2_lengths)]
}, index=['Mean', 'Std', 'Min', 'Max'])

# Plot statistics comparison
stats_comparison.plot(kind='bar', ax=axes[1, 0], color=['skyblue', 'lightcoral'], alpha=0.7)
axes[1, 0].set_title('Statistical Comparison')
axes[1, 0].set_ylabel('Tokens')
axes[1, 0].tick_params(axis='x', rotation=0)
axes[1, 0].grid(True, alpha=0.3)

# Absolute per-index length difference (pairs beyond the shorter list are dropped)
length_diffs = [abs(l1 - l2) for l1, l2 in zip(sequence_lengths, data2_lengths)]
axes[1, 1].scatter(range(len(length_diffs)), length_diffs, alpha=0.6, color='green')
axes[1, 1].set_xlabel('Sequence Index')
axes[1, 1].set_ylabel('Length Difference (tokens)')
axes[1, 1].set_title('Pairwise Length Differences')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nData Comparison Summary:")
print(stats_comparison)
print(f"\nMean length difference: {np.mean(length_diffs):.2f} tokens")

# The sample table was previously built but never shown
print("\nSample Comparison (first pairs):")
print(df_comparison.to_string(index=False))

## Step 6: Sequence Alignment

Apply right-padding alignment as required by Plan.md for position consistency.

In [None]:
# Align the two datasets with the pipeline, then verify the result by
# re-tokenizing the aligned text
print("Applying right-padding alignment...")
aligned_data1, aligned_data2 = pipeline.align_sequences(data1_sequences, data2_sequences)

# Token counts of the aligned sequences (special tokens excluded)
aligned_lengths_1 = [len(tokenizer(seq, add_special_tokens=False)['input_ids'])
                     for seq in aligned_data1]
aligned_lengths_2 = [len(tokenizer(seq, add_special_tokens=False)['input_ids'])
                     for seq in aligned_data2]

print("\nAlignment Results:")
print(f"  Data-1 aligned sequences: {len(aligned_data1)}")
print(f"  Data-2 aligned sequences: {len(aligned_data2)}")
print(f"  Data-1 length range: {min(aligned_lengths_1)} - {max(aligned_lengths_1)}")
print(f"  Data-2 length range: {min(aligned_lengths_2)} - {max(aligned_lengths_2)}")

# Alignment is perfect only if both lists have the same number of sequences
# AND every pair has the same token count. zip() alone stops at the shorter
# list, so a count mismatch would otherwise go unnoticed.
alignment_perfect = (
    len(aligned_lengths_1) == len(aligned_lengths_2)
    and all(l1 == l2 for l1, l2 in zip(aligned_lengths_1, aligned_lengths_2))
)
print(f"  Perfect alignment achieved: {alignment_perfect}")

# Show up to three example pairs; slicing plus zip() avoids an IndexError
# when fewer than three pairs are available
print("\nAlignment Examples:")
example_pairs = zip(aligned_data1[:3], aligned_data2[:3],
                    aligned_lengths_1[:3], aligned_lengths_2[:3])
for i, (seq1, seq2, len1, len2) in enumerate(example_pairs):
    print(f"  Pair {i+1}:")
    print(f"    Data-1: {seq1[:80]}...")
    print(f"    Data-2: {seq2[:80]}...")
    print(f"    Lengths: {len1} vs {len2}")
    print()

## Step 7: Data Validation

Validate that all sequences meet numeric-only requirements.

In [None]:
# Run the numeric-sequence validator over both aligned datasets
from utils_io import validate_numeric_sequence

# Data-1: per-sequence flags and the number that passed
data1_valid = [validate_numeric_sequence(text) for text in aligned_data1]
data1_valid_count = sum(data1_valid)

# Data-2: per-sequence flags and the number that passed
data2_valid = [validate_numeric_sequence(text) for text in aligned_data2]
data2_valid_count = sum(data2_valid)

print("Data Validation Results:")
data1_total = len(aligned_data1)
data1_pct = data1_valid_count / data1_total * 100
print(f"  Data-1 valid sequences: {data1_valid_count}/{data1_total} ({data1_pct:.1f}%)")
data2_total = len(aligned_data2)
data2_pct = data2_valid_count / data2_total * 100
print(f"  Data-2 valid sequences: {data2_valid_count}/{data2_total} ({data2_pct:.1f}%)")

# Collect the sequences that failed validation
invalid_data1 = [text for text, ok in zip(aligned_data1, data1_valid) if not ok]
invalid_data2 = [text for text, ok in zip(aligned_data2, data2_valid) if not ok]

# Print up to three failures per dataset, each cut to 60 characters
for label, rejected in (("Data-1", invalid_data1), ("Data-2", invalid_data2)):
    if not rejected:
        continue
    print(f"\nInvalid {label} examples ({len(rejected)}):")
    for seq in rejected[:3]:
        print(f"  '{seq[:60]}...'")

## Step 8: Save Prepared Dataset

Save the complete aligned dataset for use in steering vector construction.

In [None]:
# Persist the aligned datasets plus run metadata for the next stage
from pathlib import Path

from utils_io import save_results

# Final dataset: both aligned sequence lists and a metadata record describing
# how they were produced and how well they validated
final_dataset = {
    'data1_sequences': aligned_data1,
    'data2_sequences': aligned_data2,
    'metadata': {
        'model_name': CONFIG['model_name'],
        'hf_dataset': CONFIG['hf_dataset_name'],
        'hf_config': CONFIG['hf_config'],
        'num_samples': len(aligned_data1),
        'alignment_perfect': alignment_perfect,
        'data1_valid_rate': data1_valid_count / len(aligned_data1),
        'data2_valid_rate': data2_valid_count / len(aligned_data2),
        'preparation_timestamp': pd.Timestamp.now().isoformat()
    }
}

output_dir = Path(CONFIG['output_dir'])
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target exists here rather than relying on save_results to create it
output_dir.mkdir(parents=True, exist_ok=True)

# Save dataset
save_results(final_dataset, output_dir, 'notebook_prepared_dataset')

# Also save as CSV for easy inspection (one row per aligned pair)
df_final = pd.DataFrame({
    'data1': aligned_data1,
    'data2': aligned_data2
})
df_final.to_csv(output_dir / 'prepared_sequences.csv', index=False)

print("Dataset saved successfully!")
print(f"  Location: {output_dir}")
# NOTE(review): the .pkl name assumes save_results writes a pickle file -- confirm
print("  Files: notebook_prepared_dataset.pkl, prepared_sequences.csv")
print(f"  Total sequences: {len(aligned_data1)} Data-1, {len(aligned_data2)} Data-2")

## Summary

Data preparation completed successfully following Plan.md requirements:

✅ **Data-1**: Loaded from HuggingFace subliminal learning dataset  
✅ **Data-2**: Generated from Model-2 using resource-aware processing  
✅ **Numeric validation**: Sequences checked for the numbers-and-commas-only format (see the Step 7 output for the actual pass rate)  
✅ **Right-padding alignment**: Sequences aligned for position consistency  
✅ **Reproducible storage**: Dataset saved for steering vector construction  

The prepared data is now ready for the next phase: steering vector construction using activation-difference methodology.

In [None]:
# Closing report: (metric, value) rows gathered from the earlier cells
report_rows = [
    ('Total Data-1 sequences', len(aligned_data1)),
    ('Total Data-2 sequences', len(aligned_data2)),
    ('Sequences perfectly aligned', 'Yes' if alignment_perfect else 'No'),
    ('Data-1 validation rate', f"{data1_valid_count/len(aligned_data1)*100:.1f}%"),
    ('Data-2 validation rate', f"{data2_valid_count/len(aligned_data2)*100:.1f}%"),
    ('Average sequence length', f"{np.mean(aligned_lengths_1):.1f} tokens"),
    ('Memory optimization used', 'Yes' if CONFIG['low_memory'] else 'No'),
    ('CPU/GPU processing', 'CPU' if CONFIG['force_cpu'] else 'GPU/Auto'),
]

# Split the rows back into two parallel columns for the DataFrame
metric_names, metric_values = zip(*report_rows)
summary_stats = pd.DataFrame({
    'Metric': list(metric_names),
    'Value': list(metric_values),
})

# Print the table between two 50-character rules
rule = "=" * 50
print("\n" + rule)
print("DATA PREPARATION SUMMARY")
print(rule)
print(summary_stats.to_string(index=False))
print(rule)