In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import pickle
from typing import List, Tuple

In [5]:
from tokenizers import ByteLevelBPETokenizer
from transformers import PreTrainedTokenizerFast
from pathlib import Path
import pandas as pd

# Setup paths
data_dir = Path("./local_prototyping_data")
unsupervised_file = data_dir / "GDB17_subset.smi"

# Read the CSV (just one column of SMILES)
print("Reading SMILES data...")
df = pd.read_csv(unsupervised_file, header=None, names=["SMILES"])
print(f"✓ Loaded {len(df)} SMILES strings")

# Extract SMILES as list
smiles_list = df["SMILES"].tolist()

# Configuration
VOCAB_SIZE = 1000
MIN_FREQUENCY = 2
TOKENIZER_DIR = data_dir / "tokenizer"

# Initialize tokenizer
print(f"\nTraining tokenizer with vocab_size={VOCAB_SIZE}, min_frequency={MIN_FREQUENCY}...")
tokenizer_trainer = ByteLevelBPETokenizer()

# Special tokens
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train on the clean file
print("Training in progress...")
tokenizer_trainer.train(
    files=[str(unsupervised_file)],
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=special_tokens,
)

print("✓ Training complete!")

# Save tokenizer
TOKENIZER_DIR.mkdir(exist_ok=True, parents=True)
tokenizer_trainer.save(str(TOKENIZER_DIR / "tokenizer.json"))

# Convert to HuggingFace format
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=str(TOKENIZER_DIR / "tokenizer.json"),
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
)

# Save HF tokenizer
tokenizer.save_pretrained(str(TOKENIZER_DIR))

print(f"\n{'='*60}")
print(f"✓ Tokenizer trained and saved to {TOKENIZER_DIR}")
print(f"✓ Vocabulary size: {len(tokenizer)}")
print(f"✓ Special tokens: {tokenizer.all_special_tokens}")
print(f"{'='*60}")

Reading SMILES data...
✓ Loaded 8346066 SMILES strings

Training tokenizer with vocab_size=1000, min_frequency=2...
Training in progress...



✓ Training complete!

✓ Tokenizer trained and saved to local_prototyping_data/tokenizer
✓ Vocabulary size: 1000
✓ Special tokens: ['<s>', '</s>', '<unk>', '<pad>', '<mask>']


In [6]:
# Test the tokenizer on your actual SMILES
print("\nTesting tokenizer on sample SMILES from your data:")
test_samples = smiles_list[:5]
for smiles in test_samples:
    tokens = tokenizer.tokenize(smiles)
    ids = tokenizer.encode(smiles)
    print(f"\n  SMILES: {smiles}")
    print(f"  Tokens: {tokens}")
    print(f"  Num tokens: {len(tokens)}")
    print(f"  Token IDs (first 10): {ids[:10]}")


Testing tokenizer on sample SMILES from your data:

  SMILES: BrC1=C2C3=C4C(CC3CCC2=O)C(=N)NC4=N1
  Tokens: ['BrC', '1', '=', 'C', '2', 'C', '3', '=', 'C', '4', 'C', '(', 'CC', '3', 'CCC', '2', '=', 'O', ')', 'C', '(=', 'N', ')', 'NC', '4', '=', 'N', '1']
  Num tokens: 28
  Token IDs (first 10): [310, 21, 33, 39, 22, 39, 23, 33, 39, 24]

  SMILES: BrC1=C2C3C4CCC(C4)C3C(=N)OC2=NC=C1
  Tokens: ['BrC', '1', '=', 'C', '2', 'C', '3', 'C', '4', 'CCC', '(', 'C', '4', ')', 'C', '3', 'C', '(=', 'N', ')', 'OC', '2', '=', 'NC', '=', 'C', '1']
  Num tokens: 27
  Token IDs (first 10): [310, 21, 33, 39, 22, 39, 23, 39, 24, 262]

  SMILES: BrC1=C2C3C4CCC(O4)C3(OC2=NC=C1)C#C
  Tokens: ['BrC', '1', '=', 'C', '2', 'C', '3', 'C', '4', 'CCC', '(', 'O', '4', ')', 'C', '3', '(', 'OC', '2', '=', 'NC', '=', 'C', '1', ')', 'C', '#', 'C']
  Num tokens: 28
  Token IDs (first 10): [310, 21, 33, 39, 22, 39, 23, 39, 24, 262]

  SMILES: BrC1=C2C3C4CNC(C4)(C#N)C3OC2=NC=C1
  Tokens: ['BrC', '1', '=', 'C', '2', 'C', '

In [7]:
print("\n" + "=" * 80)
print("SECTION 4: Testing Tokenizer")
print("=" * 80)

# Test tokenization
test_smiles = ["CCO", "c1ccccc1", "CC(=O)O"]

print("\nTokenization examples:")
for smiles in test_smiles:
    tokens = tokenizer.tokenize(smiles)
    ids = tokenizer.encode(smiles)
    decoded = tokenizer.decode(ids)
    
    print(f"\n  SMILES: {smiles}")
    print(f"  Tokens: {tokens}")
    print(f"  IDs: {ids[:10]}..." if len(ids) > 10 else f"  IDs: {ids}")
    print(f"  Decoded: {decoded}")

# Test special tokens
print("\n\nSpecial token IDs:")
print(f"  PAD: {tokenizer.pad_token} -> {tokenizer.pad_token_id}")
print(f"  MASK: {tokenizer.mask_token} -> {tokenizer.mask_token_id}")
print(f"  BOS: {tokenizer.bos_token} -> {tokenizer.bos_token_id}")
print(f"  EOS: {tokenizer.eos_token} -> {tokenizer.eos_token_id}")



SECTION 4: Testing Tokenizer

Tokenization examples:

  SMILES: CCO
  Tokens: ['CCO']
  IDs: [289]
  Decoded: CCO

  SMILES: c1ccccc1
  Tokens: ['c', '1', 'ccccc', '1']
  IDs: [71, 21, 655, 21]
  Decoded: c1ccccc1

  SMILES: CC(=O)O
  Tokens: ['CC', '(=', 'O', ')', 'O']
  IDs: [261, 265, 51, 13, 51]
  Decoded: CC(=O)O


Special token IDs:
  PAD: <pad> -> 1
  MASK: <mask> -> 4
  BOS: <s> -> 0
  EOS: </s> -> 2


In [8]:
from typing import List
import numpy as np
import pickle

print("\n" + "=" * 80)
print("SECTION 5: Tokenizing Unsupervised Dataset")
print("=" * 80)

MAX_LENGTH = 512

def tokenize_unsupervised(smiles_list: List[str], max_length: int = 512):
    """Tokenize SMILES for unsupervised learning"""
    tokenized = []
    
    for smiles in smiles_list:
        encoding = tokenizer(
            smiles,
            truncation=True,
            max_length=max_length,
            padding=False,  # We'll pad in batches
            return_tensors=None,
        )
        tokenized.append({
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
        })
    
    return tokenized

# Use smiles_list instead of sample_smiles_unsupervised
print(f"\nTokenizing {len(smiles_list)} unsupervised samples...")
unsup_tokenized = tokenize_unsupervised(smiles_list, MAX_LENGTH)

print(f"✓ Tokenized {len(unsup_tokenized)} samples")
print(f"\nExample tokenized sample:")
print(f"  Input IDs shape: {len(unsup_tokenized[0]['input_ids'])} tokens")
print(f"  Input IDs: {unsup_tokenized[0]['input_ids']}")
print(f"  Attention mask: {unsup_tokenized[0]['attention_mask']}")

# Statistics
seq_lengths = [len(item['input_ids']) for item in unsup_tokenized]
print(f"\nSequence length statistics:")
print(f"  Min: {min(seq_lengths)}")
print(f"  Max: {max(seq_lengths)}")
print(f"  Mean: {np.mean(seq_lengths):.2f}")
print(f"  Median: {np.median(seq_lengths):.2f}")

# Save tokenized data
unsup_tokenized_file = data_dir / "unsupervised_tokenized.pkl"
with open(unsup_tokenized_file, 'wb') as f:
    pickle.dump(unsup_tokenized, f)
print(f"\n✓ Saved tokenized data to {unsup_tokenized_file}")


SECTION 5: Tokenizing Unsupervised Dataset

Tokenizing 8346066 unsupervised samples...
✓ Tokenized 8346066 samples

Example tokenized sample:
  Input IDs shape: 28 tokens
  Input IDs: [310, 21, 33, 39, 22, 39, 23, 33, 39, 24, 39, 12, 261, 23, 262, 22, 33, 51, 13, 39, 265, 50, 13, 264, 24, 33, 50, 21]
  Attention mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

Sequence length statistics:
  Min: 3
  Max: 38
  Mean: 22.44
  Median: 23.00

✓ Saved tokenized data to local_prototyping_data/unsupervised_tokenized.pkl


In [10]:
import pandas as pd
import numpy as np
from typing import List
import pickle

print("\n" + "=" * 80)
print("SECTION 6: Loading and Tokenizing Supervised Dataset")
print("=" * 80)

# Load supervised data
sup_data_file = data_dir / "LINCS_L1000_MCF7_0-4.csv"  # Replace with your actual filename
df_sup = pd.read_csv(sup_data_file)

print(f"Loaded supervised dataset:")
print(f"  Shape: {df_sup.shape}")
print(f"  Columns: {df_sup.columns.tolist()[:10]}...")  # Show first 10 columns

# Extract SMILES column
smiles_supervised = df_sup['SMILES'].tolist()

# Extract all gene expression columns (all columns starting with 'geneID-')
gene_columns = [col for col in df_sup.columns if col.startswith('geneID-')]
print(f"\n✓ Found {len(gene_columns)} gene expression columns")

# Create labels matrix (each row is all gene expressions for one compound)
labels_supervised = df_sup[gene_columns].values  # This is a numpy array of shape (n_samples, n_genes)

print(f"  SMILES samples: {len(smiles_supervised)}")
print(f"  Labels shape: {labels_supervised.shape}")
print(f"  Example label (first compound, first 5 genes): {labels_supervised[0][:5]}")

# Tokenize supervised data
def tokenize_supervised(smiles_list: List[str], labels: np.ndarray, max_length: int = 512):
    """Tokenize SMILES for supervised learning"""
    tokenized = []
    
    for smiles in smiles_list:
        encoding = tokenizer(
            smiles,
            truncation=True,
            max_length=max_length,
            padding=False,
            return_tensors=None,
        )
        tokenized.append({
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"],
        })
    
    return tokenized, labels

print(f"\nTokenizing {len(smiles_supervised)} supervised samples...")
sup_tokenized, sup_labels = tokenize_supervised(
    smiles_supervised, 
    labels_supervised, 
    MAX_LENGTH
)

print(f"✓ Tokenized {len(sup_tokenized)} samples with labels")
print(f"\nExample tokenized supervised sample:")
print(f"  SMILES: {smiles_supervised[0]}")
print(f"  Input IDs: {sup_tokenized[0]['input_ids'][:20]}...")
print(f"  Label shape: {sup_labels[0].shape}")
print(f"  Label (first 5 genes): {sup_labels[0][:5]}")

# Save tokenized supervised data
sup_tokenized_file = data_dir / "supervised_tokenized.pkl"
with open(sup_tokenized_file, 'wb') as f:
    pickle.dump({
        'data': sup_tokenized, 
        'labels': sup_labels,
        'gene_columns': gene_columns  # Save gene names for reference
    }, f)
print(f"\n✓ Saved tokenized supervised data to {sup_tokenized_file}")

# ============================================================================
# SECTION 7: Test Loading Tokenized Data
# ============================================================================

print("\n" + "=" * 80)
print("SECTION 7: Testing Data Loading")
print("=" * 80)

# Load unsupervised data
print("\nLoading unsupervised data...")
with open(unsup_tokenized_file, 'rb') as f:
    loaded_unsup = pickle.load(f)
print(f"✓ Loaded {len(loaded_unsup)} unsupervised samples")

# Load supervised data
print("\nLoading supervised data...")
with open(sup_tokenized_file, 'rb') as f:
    loaded_sup = pickle.load(f)
print(f"✓ Loaded {len(loaded_sup['data'])} supervised samples")
print(f"✓ Loaded {len(loaded_sup['labels'])} labels")
print(f"✓ Label dimensionality: {loaded_sup['labels'].shape}")
print(f"✓ Gene columns saved: {len(loaded_sup['gene_columns'])}")

# Verify data integrity
print("\nVerifying data integrity...")
assert len(loaded_unsup) == len(unsup_tokenized), "Unsupervised data mismatch!"
assert len(loaded_sup['data']) == len(sup_tokenized), "Supervised data mismatch!"
assert loaded_unsup[0]['input_ids'] == unsup_tokenized[0]['input_ids'], "Data corrupted!"
print("✓ All data verified successfully!")

print(f"\n{'='*60}")
print("Summary:")
print(f"  Unsupervised samples: {len(loaded_unsup)}")
print(f"  Supervised samples: {len(loaded_sup['data'])}")
print(f"  Genes per sample: {loaded_sup['labels'].shape[1]}")
print(f"{'='*60}")


SECTION 6: Loading and Tokenizing Supervised Dataset
Loaded supervised dataset:
  Shape: (11622, 984)
  Columns: ['full_id', 'pert_id', 'cell_iname', 'SMILES', 'inchi_key', 'compound_aliases', 'geneID-10007', 'geneID-1001', 'geneID-10013', 'geneID-10038']...

✓ Found 978 gene expression columns
  SMILES samples: 11622
  Labels shape: (11622, 978)
  Example label (first compound, first 5 genes): [2 2 2 2 2]

Tokenizing 11622 supervised samples...
✓ Tokenized 11622 samples with labels

Example tokenized supervised sample:
  SMILES: CN(C)[N+][O-]
  Input IDs: [268, 12, 39, 324, 50, 316, 63, 51, 317]...
  Label shape: (978,)
  Label (first 5 genes): [2 2 2 2 2]

✓ Saved tokenized supervised data to local_prototyping_data/supervised_tokenized.pkl

SECTION 7: Testing Data Loading

Loading unsupervised data...
✓ Loaded 8346066 unsupervised samples

Loading supervised data...
✓ Loaded 11622 supervised samples
✓ Loaded 11622 labels
✓ Label dimensionality: (11622, 978)
✓ Gene columns saved: 978

In [12]:
print("\n" + "=" * 80)
print("SECTION 10: Creating PyTorch Datasets")
print("=" * 80)

import torch
from torch.utils.data import Dataset, DataLoader

class UnsupervisedChemicalDataset(Dataset):
    """Dataset for unsupervised learning"""
    def __init__(self, tokenized_data):
        self.data = tokenized_data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

class SupervisedChemicalDataset(Dataset):
    """Dataset for supervised learning"""
    def __init__(self, tokenized_data, labels):
        self.data = tokenized_data
        self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx].copy()
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Create datasets
unsup_dataset = UnsupervisedChemicalDataset(loaded_unsup)
sup_dataset = SupervisedChemicalDataset(loaded_sup['data'], loaded_sup['labels'])

print(f"✓ Created unsupervised dataset with {len(unsup_dataset)} samples")
print(f"✓ Created supervised dataset with {len(sup_dataset)} samples")

# Test dataset access
print("\nTesting dataset access:")
print(f"  Unsupervised sample 0 keys: {unsup_dataset[0].keys()}")
print(f"  Supervised sample 0 keys: {sup_dataset[0].keys()}")
print(f"  Supervised sample 0 keys: {sup_dataset[0].keys()}")
print(f"  Supervised sample 0 label shape: {sup_dataset[0]['labels'].shape}")
print(f"  Supervised sample 0 label (first 10 genes): {sup_dataset[0]['labels'][:10].tolist()}")


SECTION 10: Creating PyTorch Datasets
✓ Created unsupervised dataset with 8346066 samples
✓ Created supervised dataset with 11622 samples

Testing dataset access:
  Unsupervised sample 0 keys: dict_keys(['input_ids', 'attention_mask'])
  Supervised sample 0 keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
  Supervised sample 0 keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
  Supervised sample 0 label shape: torch.Size([978])
  Supervised sample 0 label (first 10 genes): [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]


In [13]:
print("\n" + "=" * 80)
print("SECTION 9: Testing Data Collator for Batching")
print("=" * 80)

from transformers import DataCollatorForLanguageModeling

# Create data collator for MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,  # 15% of tokens will be masked
)

# Create a small dataloader
dataloader = DataLoader(
    unsup_dataset,
    batch_size=4,
    collate_fn=data_collator,
)

# Get one batch
print("\nGetting a batch from dataloader...")
batch = next(iter(dataloader))

print(f"✓ Batch created successfully!")
print(f"\nBatch contents:")
print(f"  input_ids shape: {batch['input_ids'].shape}")
print(f"  attention_mask shape: {batch['attention_mask'].shape}")
print(f"  labels shape: {batch['labels'].shape}")

print(f"\nExample of masking (first sample):")
original_ids = batch['input_ids'][0]
labels = batch['labels'][0]
masked_positions = (labels != -100).nonzero(as_tuple=True)[0]
print(f"  Original tokens: {original_ids.tolist()[:30]}...")
print(f"  Masked positions: {masked_positions.tolist()[:10]}...")
print(f"  Mask token ID: {tokenizer.mask_token_id}")



SECTION 9: Testing Data Collator for Batching


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Getting a batch from dataloader...
✓ Batch created successfully!

Batch contents:
  input_ids shape: torch.Size([4, 28])
  attention_mask shape: torch.Size([4, 28])
  labels shape: torch.Size([4, 28])

Example of masking (first sample):
  Original tokens: [310, 644, 33, 39, 22, 39, 23, 33, 39, 24, 39, 12, 261, 23, 4, 22, 33, 51, 13, 39, 4, 50, 13, 264, 24, 33, 50, 21]...
  Masked positions: [1, 14, 20]...
  Mask token ID: 4


# How long does context window need to be

In [14]:
erythromycin = "CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H]([C@@H]([C@H](C(=O)O1)C)O[C@H]2C[C@@]([C@H]([C@@H](O2)C)O)(C)OC)C)O[C@H]3[C@@H]([C@H](C[C@H](O3)C)N(C)C)O)(C)O)C)C)O)(C)O"
vitamin_b12 = "NC(=O)C[C@@]8(C)[C@H](CCC(N)=O)C=2/N=C8/C(/C)=C1/[C@@H](CCC(N)=O)[C@](C)(CC(N)=O)[C@@](C)(N1[Co+]C#N)[C@@H]7/N=C(C(\C)=C3/N=C(/C=2)C(C)(C)[C@@H]3CCC(N)=O)[C@](C)(CCC(=O)NCC(C)OP([O-])(=O)O[C@@H]6[C@@H](CO)O[C@H](n5cnc4cc(C)c(C)cc45)[C@@H]6O)[C@H]7CC(N)=O"
paclitaxol = "CC1=C2[C@@]([C@]([C@H]([C@@H]3[C@]4([C@H](OC4)C[C@@H]([C@]3(C(=O)[C@@H]2OC(=O)C)C)O)OC(=O)C)OC(=O)c5ccccc5)(C[C@@H]1OC(=O)[C@H](O)[C@@H](NC(=O)c6ccccc6)c7ccccc7)O)(C)C"

test_smiles = [erythromycin, vitamin_b12, paclitaxol]

# Tokenize and check lengths
print("Token Length Analysis:")
print("=" * 80)
for smiles in test_smiles:
    tokens = tokenizer.tokenize(smiles)
    token_ids = tokenizer.encode(smiles)
    print(f"\nSMILES: {smiles[:60]}..." if len(smiles) > 60 else f"\nSMILES: {smiles}")
    print(f"Characters: {len(smiles)}")
    print(f"Tokens: {len(token_ids)}")
    print(f"Tokens: {tokens[:20]}..." if len(tokens) > 20 else f"Tokens: {tokens}")

# Now check YOUR actual data
print("\n" + "=" * 80)
print("YOUR DATA ANALYSIS:")
print("=" * 80)

# Get all lengths from your tokenized data
all_lengths = [len(item['input_ids']) for item in unsup_tokenized]

import numpy as np
print(f"Total molecules: {len(all_lengths)}")
print(f"Min length: {min(all_lengths)}")
print(f"Max length: {max(all_lengths)}")
print(f"Mean: {np.mean(all_lengths):.1f}")
print(f"Median: {np.median(all_lengths):.0f}")
print(f"95th percentile: {np.percentile(all_lengths, 95):.0f}")
print(f"99th percentile: {np.percentile(all_lengths, 99):.0f}")
print(f"99.9th percentile: {np.percentile(all_lengths, 99.9):.0f}")

print("\nTruncation analysis:")
for max_len in [64, 128, 256, 512, 1024]:
    n_truncated = sum(1 for l in all_lengths if l > max_len)
    pct = n_truncated / len(all_lengths) * 100
    print(f"  max_len={max_len}: {n_truncated} truncated ({pct:.2f}%)")

# Find the longest SMILES
longest_idx = all_lengths.index(max(all_lengths))
print(f"\nLongest molecule (index {longest_idx}):")
print(f"  Length: {all_lengths[longest_idx]} tokens")
print(f"  SMILES: {smiles_list[longest_idx][:100]}...")

Token Length Analysis:

SMILES: CC[C@@H]1[C@@]([C@@H]([C@H](C(=O)[C@@H](C[C@@]([C@@H]([C@H](...
Characters: 180
Tokens: 163
Tokens: ['CC', '[', 'C', '@', '@', 'H', ']', '1', '[', 'C', '@', '@', ']', '([', 'C', '@', '@', 'H', ']', '([']...

SMILES: NC(=O)C[C@@]8(C)[C@H](CCC(N)=O)C=2/N=C8/C(/C)=C1/[C@@H](CCC(...
Characters: 254
Tokens: 209
Tokens: ['NC', '(=', 'O', ')', 'C', '[', 'C', '@', '@', ']', '8', '(', 'C', ')[', 'C', '@', 'H', ']', '(', 'CCC']...

SMILES: CC1=C2[C@@]([C@]([C@H]([C@@H]3[C@]4([C@H](OC4)C[C@@H]([C@]3(...
Characters: 167
Tokens: 132
Tokens: ['CC', '1', '=', 'C', '2', '[', 'C', '@', '@', ']', '([', 'C', '@', ']', '([', 'C', '@', 'H', ']', '([']...

YOUR DATA ANALYSIS:
Total molecules: 8346066
Min length: 3
Max length: 38
Mean: 22.4
Median: 23
95th percentile: 29
99th percentile: 31
99.9th percentile: 33

Truncation analysis:
  max_len=64: 0 truncated (0.00%)
  max_len=128: 0 truncated (0.00%)
  max_len=256: 0 truncated (0.00%)
  max_len=512: 0 truncated (0.00%)
  max_