In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import torch
from torch.utils.data import Dataset, ConcatDataset, Subset
from transformers import (
    RobertaConfig,
    RobertaForMaskedLM,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    PreTrainedTokenizerFast,
)
from pathlib import Path
import pickle
import numpy as np

# Notebook title banner: rule, title, rule (one line each).
print(
    "=" * 80,
    "CHEMICAL LANGUAGE MODEL - PRE-TRAINING EXPERIMENTS",
    "=" * 80,
    sep="\n",
)


  from .autonotebook import tqdm as notebook_tqdm


CHEMICAL LANGUAGE MODEL - PRE-TRAINING EXPERIMENTS


In [None]:
from pathlib import Path
from transformers import PreTrainedTokenizerFast
import pickle

# Local import so this cell can resolve the data root without relying on another cell.
import os

# Paths
# The data root can be overridden with the CLIMB_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the original local path stays the default.
data_dir = Path(
    os.environ.get("CLIMB_DATA_DIR")
    or "/Users/lsieben/VSCode/CLIMB/local_prototyping_data"
)
tokenizer_dir = data_dir / "tokenizer"
unsup_file = data_dir / "unsupervised_tokenized.pkl"
sup_file = data_dir / "supervised_tokenized.pkl"

# Check files exist
print("\nChecking files...")
print(f"  Tokenizer dir: {tokenizer_dir}")
print(f"  Exists: {tokenizer_dir.exists()}")
print(f"  Unsupervised data: {unsup_file.exists()}")
print(f"  Supervised data: {sup_file.exists()}")

# List tokenizer files (sorted so the listing is stable from run to run)
if tokenizer_dir.exists():
    print("\nFiles in tokenizer directory:")
    for entry in sorted(tokenizer_dir.iterdir()):
        print(f"  - {entry.name}")

# Load tokenizer - specify the tokenizer.json file directly
print("\nLoading tokenizer...")
tokenizer_json = tokenizer_dir / "tokenizer.json"

# Fail early with a clear message when the tokenizer file is missing.
if not tokenizer_json.exists():
    raise FileNotFoundError(f"tokenizer.json not found at {tokenizer_json}")

# Special tokens are passed explicitly because only the raw tokenizer.json is loaded here.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=str(tokenizer_json),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)
print(f"✓ Tokenizer loaded from tokenizer.json. Vocab size: {len(tokenizer)}")

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load pickles that were produced by a trusted pipeline.

# Load unsupervised data
print("\nLoading unsupervised data...")
with open(unsup_file, 'rb') as f:
    unsup_data = pickle.load(f)
print(f"✓ Loaded {len(unsup_data)} unsupervised samples")

# Load supervised data
# The code below reads a 'data' entry (the samples) and a 'labels' entry (a 2-D array).
print("\nLoading supervised data...")
with open(sup_file, 'rb') as f:
    sup_data = pickle.load(f)

# Samples and label rows are indexed together downstream, so they must line up.
if len(sup_data['data']) != sup_data['labels'].shape[0]:
    raise ValueError(
        f"Supervised data has {len(sup_data['data'])} samples but "
        f"{sup_data['labels'].shape[0]} label rows"
    )

print(f"✓ Loaded {len(sup_data['data'])} supervised samples")
print(f"  Label shape: {sup_data['labels'].shape}")
print(f"  Number of genes: {sup_data['labels'].shape[1]}")


Checking files...
  Tokenizer dir: /Users/lsieben/VSCode/CLIMB/local_prototyping_data/tokenizer
  Exists: True
  Unsupervised data: True
  Supervised data: True

Files in tokenizer directory:
  - tokenizer_config.json
  - special_tokens_map.json
  - tokenizer.json
  - merges.txt
  - vocab.json

Loading tokenizer...
✓ Tokenizer loaded from tokenizer.json. Vocab size: 1000

Loading unsupervised data...
✓ Loaded 8346066 unsupervised samples

Loading supervised data...
✓ Loaded 11622 supervised samples
  Label shape: (11622, 978)
  Number of genes: 978


In [8]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "SECTION 2: Creating Dataset Classes", "=" * 80, sep="\n")

class UnsupervisedChemicalDataset(Dataset):
    """Indexable wrapper around pre-tokenized samples used for MLM training.

    Samples are handed back exactly as they are stored; nothing is copied or
    converted by this class.
    """

    def __init__(self, tokenized_data):
        # Keep a reference to the caller's container (no copy is made).
        self.data = tokenized_data

    def __getitem__(self, idx):
        # Plain pass-through lookup into the wrapped container.
        sample = self.data[idx]
        return sample

    def __len__(self):
        n_samples = len(self.data)
        return n_samples

class SupervisedChemicalDataset(Dataset):
    """Pairs each pre-tokenized sample with its row of regression targets.

    Indexing returns a shallow copy of the stored sample with an extra
    'labels' entry holding the matching label row as a float tensor.
    """

    def __init__(self, tokenized_data, labels):
        # Both containers are indexed with the same position in __getitem__.
        self.labels = labels
        self.data = tokenized_data

    def __getitem__(self, idx):
        # Copy first so the stored sample never gains a 'labels' key.
        sample = self.data[idx].copy()
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        sample['labels'] = target
        return sample

    def __len__(self):
        # Length follows the samples container, not the labels.
        n_samples = len(self.data)
        return n_samples

# Wrap the loaded samples (and, for the supervised set, their label matrix)
# in the dataset classes defined above.
unsup_dataset = UnsupervisedChemicalDataset(tokenized_data=unsup_data)
sup_dataset = SupervisedChemicalDataset(
    tokenized_data=sup_data['data'],
    labels=sup_data['labels'],
)

# Report both dataset sizes, one line each.
print(
    f"✓ Created unsupervised dataset: {len(unsup_dataset)} samples",
    f"✓ Created supervised dataset: {len(sup_dataset)} samples",
    sep="\n",
)



SECTION 2: Creating Dataset Classes
✓ Created unsupervised dataset: 8346066 samples
✓ Created supervised dataset: 11622 samples


In [9]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "SECTION 3: Model Configuration", "=" * 80, sep="\n")

def create_model(vocab_size, num_genes=None, task="mlm"):
    """
    Create a small RoBERTa encoder model for the requested task.

    The model is built from a config object; no pretrained weights are loaded.
    Arguments are validated before anything is built so that a bad call fails
    immediately with a clear message.

    Args:
        vocab_size: Size of vocabulary
        num_genes: Number of regression outputs. Required (>= 1) when
            task is "regression"; not used for "mlm".
        task: "mlm" or "regression"

    Returns:
        A RobertaForMaskedLM (task="mlm") or a RobertaForSequenceClassification
        configured for regression with `num_genes` outputs (task="regression").

    Raises:
        ValueError: If `task` is unknown, or if task is "regression" and
            `num_genes` is missing or smaller than 1.
    """
    if task not in ("mlm", "regression"):
        raise ValueError(f"Unknown task: {task}")
    if task == "regression" and (num_genes is None or num_genes < 1):
        # Without this check num_labels would silently be set to None and the
        # failure would surface later, far from the actual mistake.
        raise ValueError(
            f"num_genes must be a positive integer for task='regression', got {num_genes!r}"
        )

    config = RobertaConfig(
        vocab_size=vocab_size,
        max_position_embeddings=512,
        hidden_size=256,  # Smaller for prototyping
        num_hidden_layers=6,
        num_attention_heads=8,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
    )

    if task == "mlm":
        model = RobertaForMaskedLM(config)
        print(f"✓ Created MLM model")
    else:
        # task == "regression" (validated above): one output per gene.
        config.num_labels = num_genes
        config.problem_type = "regression"
        model = RobertaForSequenceClassification(config)
        print(f"✓ Created regression model with {num_genes} outputs")

    # Count parameters
    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"  Trainable parameters: {n_params:,}")

    return model



SECTION 3: Model Configuration


In [10]:
# Section banner: blank line, rule, title, rule.
print("\n" + "=" * 80, "SECTION 4: Training Setup", "=" * 80, sep="\n")

def train_unsupervised(
    dataset,
    model,
    tokenizer,
    output_dir,
    num_epochs=3,
    batch_size=16,
):
    """Train with Masked Language Modeling and save the model and tokenizer.

    Args:
        dataset: Training dataset passed to the Trainer.
        model: Model to train; it is trained in place.
        tokenizer: Tokenizer used by the MLM collator; also saved to `output_dir`.
        output_dir: Directory for checkpoints, logs (`<output_dir>/logs`) and
            the final saved model.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The Trainer instance after training has finished.
    """

    # Data collator for MLM (masks 15% of tokens)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    # Force the CPU only when no accelerator is present. Checking MPS alone
    # would pin training to the CPU on machines that have CUDA but no MPS.
    has_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=5e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"{output_dir}/logs",
        logging_steps=50,
        save_steps=500,
        save_total_limit=1,
        use_cpu=not has_accelerator,  # Use CUDA/MPS if available
        dataloader_num_workers=0,  # Avoid multiprocessing issues
        remove_unused_columns=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Train
    print(f"\nStarting unsupervised training...")
    trainer.train()

    # Save model and tokenizer side by side so the directory is self-contained
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer

def train_supervised(
    dataset,
    model,
    tokenizer,
    output_dir,
    num_epochs=10,
    batch_size=16,
):
    """Train for multi-task gene expression prediction and save the result.

    Args:
        dataset: Training dataset passed to the Trainer. Items are expected to
            be dict-like; a 'labels' entry, when present on every item of a
            batch, is stacked into a float tensor.
        model: Model to train; it is trained in place.
        tokenizer: Tokenizer used for padding; also saved to `output_dir`.
        output_dir: Directory for checkpoints, logs (`<output_dir>/logs`) and
            the final saved model.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The Trainer instance after training has finished.
    """

    # Data collator
    padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def data_collator(features):
        """Pad the token fields with the tokenizer and batch the label vectors separately."""
        features = [dict(feature) for feature in features]
        if not all("labels" in feature for feature in features):
            # No (or inconsistent) targets: plain padding, as before.
            return padding_collator(features)
        # NOTE(review): each sample carries a whole vector of targets as a
        # tensor. Leaving batching of that list to tokenizer.pad appears to
        # fail for multi-element tensors, so the vectors are stacked here.
        targets = [feature.pop("labels") for feature in features]
        batch = padding_collator(features)
        batch["labels"] = torch.stack(
            [torch.as_tensor(target, dtype=torch.float) for target in targets]
        )
        return batch

    # Force the CPU only when no accelerator is present. Checking MPS alone
    # would pin training to the CPU on machines that have CUDA but no MPS.
    has_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=2e-5,
        warmup_steps=50,
        weight_decay=0.01,
        logging_dir=f"{output_dir}/logs",
        logging_steps=50,
        save_steps=500,
        save_total_limit=1,
        use_cpu=not has_accelerator,
        dataloader_num_workers=0,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Train
    print(f"\nStarting supervised training...")
    trainer.train()

    # Save model and tokenizer side by side so the directory is self-contained
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer

def train_mixed(
    unsup_dataset,
    sup_dataset,
    model,
    tokenizer,
    output_dir,
    unsup_weight=0.5,
    sup_weight=0.5,
    num_epochs=5,
    batch_size=16,
):
    """Train with the MLM objective on unsupervised and supervised samples combined.

    Each weight is the fraction of ITS OWN dataset that is kept, taken from
    the start of that dataset; the weights are not proportions of the final
    mixture. Only the MLM objective is trained: any 'labels' entry on a sample
    is dropped before collation, because the MLM collator generates its own
    labels.

    Args:
        unsup_dataset: Dataset of unlabeled samples.
        sup_dataset: Dataset of labeled samples; only the token fields are used.
        model: Model to train; it is trained in place.
        tokenizer: Tokenizer used by the MLM collator; also saved to `output_dir`.
        output_dir: Directory for checkpoints, logs (`<output_dir>/logs`) and
            the final saved model.
        unsup_weight: Fraction of `unsup_dataset` to include.
        sup_weight: Fraction of `sup_dataset` to include.
        num_epochs: Number of training epochs.
        batch_size: Per-device training batch size.

    Returns:
        The Trainer instance after training has finished.
    """

    # Sample from each dataset based on weights
    n_unsup = int(len(unsup_dataset) * unsup_weight)
    n_sup = int(len(sup_dataset) * sup_weight)

    print(f"\nMixing datasets:")
    print(f"  Unsupervised: {n_unsup} samples ({unsup_weight*100:.0f}%)")
    print(f"  Supervised: {n_sup} samples ({sup_weight*100:.0f}%)")

    # Create subsets (first n items of each; clamped to the dataset length)
    unsup_subset = Subset(unsup_dataset, range(min(n_unsup, len(unsup_dataset))))
    sup_subset = Subset(sup_dataset, range(min(n_sup, len(sup_dataset))))

    # Combine datasets
    mixed_dataset = ConcatDataset([unsup_subset, sup_subset])
    print(f"  Total mixed dataset: {len(mixed_dataset)} samples")

    # For mixed training, we use MLM objective
    # (In practice, you might want a custom collator that handles both)
    mlm_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15,
    )

    def data_collator(examples):
        """Strip regression targets so every sample has the same keys, then apply MLM masking."""
        # Supervised samples carry a 'labels' entry and unsupervised ones do
        # not. Batches whose samples disagree on their keys cannot be padded
        # reliably, and the MLM collator replaces 'labels' anyway.
        cleaned = [
            {key: value for key, value in example.items() if key != "labels"}
            if hasattr(example, "items")
            else example
            for example in examples
        ]
        return mlm_collator(cleaned)

    # Force the CPU only when no accelerator is present. Checking MPS alone
    # would pin training to the CPU on machines that have CUDA but no MPS.
    has_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()

    # Training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=5e-5,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"{output_dir}/logs",
        logging_steps=50,
        save_steps=500,
        save_total_limit=1,
        use_cpu=not has_accelerator,
        dataloader_num_workers=0,
        remove_unused_columns=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=mixed_dataset,
        data_collator=data_collator,
    )

    # Train
    print(f"\nStarting mixed training...")
    trainer.train()

    # Save model and tokenizer side by side so the directory is self-contained
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    return trainer



SECTION 4: Training Setup


In [12]:
import torch

# Report whether the Apple MPS backend can be used and pick the device accordingly.
print("\n" + "=" * 80, "DEVICE CHECK", "=" * 80, sep="\n")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")

# MPS when available, otherwise CPU; the same message is printed in both cases.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"✓ Will use device: {device}")



DEVICE CHECK
MPS available: True
MPS built: True
✓ Will use device: mps


In [13]:
# ----------------------------------------------------------------------------
# Experiment 1 - MLM training on the unsupervised dataset only
# ----------------------------------------------------------------------------

print(
    "\n" + "=" * 80,
    "EXPERIMENT 1: 100% Unsupervised Pre-training",
    "=" * 80,
    sep="\n",
)

# Each experiment writes its checkpoints and final model to its own folder.
exp1_dir = data_dir.joinpath("experiments", "exp1_100_unsupervised")
exp1_dir.mkdir(parents=True, exist_ok=True)

# MLM model sized to the tokenizer's vocabulary.
model_exp1 = create_model(vocab_size=len(tokenizer), task="mlm")

trainer_exp1 = train_unsupervised(
    dataset=unsup_dataset,
    model=model_exp1,
    tokenizer=tokenizer,
    output_dir=str(exp1_dir),
    num_epochs=3,
    batch_size=16,
)

print(f"\n✓ Experiment 1 complete! Model saved to {exp1_dir}")


EXPERIMENT 1: 100% Unsupervised Pre-training
✓ Created MLM model
  Trainable parameters: 5,193,960

Starting unsupervised training...




Step,Training Loss
50,6.5059
100,5.1329
150,4.1675
200,3.4943
250,3.0081
300,2.862
350,2.8236
400,2.7203
450,2.6768
500,2.5942


KeyboardInterrupt: 

In [None]:
# ----------------------------------------------------------------------------
# Experiment 2 - regression training on the supervised dataset only
# ----------------------------------------------------------------------------

print(
    "\n" + "=" * 80,
    "EXPERIMENT 2: 100% Supervised Pre-training",
    "=" * 80,
    sep="\n",
)

# Each experiment writes its checkpoints and final model to its own folder.
exp2_dir = data_dir.joinpath("experiments", "exp2_100_supervised")
exp2_dir.mkdir(parents=True, exist_ok=True)

# One regression output per column of the label matrix.
num_genes = sup_data['labels'].shape[1]
model_exp2 = create_model(
    vocab_size=len(tokenizer), num_genes=num_genes, task="regression"
)

trainer_exp2 = train_supervised(
    dataset=sup_dataset,
    model=model_exp2,
    tokenizer=tokenizer,
    output_dir=str(exp2_dir),
    num_epochs=10,
    batch_size=16,
)

print(f"\n✓ Experiment 2 complete! Model saved to {exp2_dir}")


In [None]:
# ----------------------------------------------------------------------------
# Experiment 3 - MLM training on half of each dataset, combined
# ----------------------------------------------------------------------------
# Note: train_mixed interprets each weight as the fraction of that dataset to
# keep, and trains the combined samples with the MLM objective.

print(
    "\n" + "=" * 80,
    "EXPERIMENT 3: 50% Unsupervised + 50% Supervised",
    "=" * 80,
    sep="\n",
)

# Each experiment writes its checkpoints and final model to its own folder.
exp3_dir = data_dir.joinpath("experiments", "exp3_50_50_mixed")
exp3_dir.mkdir(parents=True, exist_ok=True)

# MLM model sized to the tokenizer's vocabulary.
model_exp3 = create_model(vocab_size=len(tokenizer), task="mlm")

trainer_exp3 = train_mixed(
    unsup_dataset=unsup_dataset,
    sup_dataset=sup_dataset,
    model=model_exp3,
    tokenizer=tokenizer,
    output_dir=str(exp3_dir),
    unsup_weight=0.5,
    sup_weight=0.5,
    num_epochs=5,
    batch_size=16,
)

print(f"\n✓ Experiment 3 complete! Model saved to {exp3_dir}")
