In [None]:
pip install sentencepiece

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm
import json
import os

# --- CONFIGURATION ---
MODEL_NAME = "t5-base"  # checkpoint name handed to from_pretrained for tokenizer and model
BATCH_SIZE = 32         # number of reviews per DataLoader batch
MAX_SAMPLES = 5000      # total reviews, split evenly across the two labels (5k for testing; the full paper uses 15k)
OUTPUT_FILE = "data/imdb_triplets.json"
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Running on: {DEVICE}")

# 1. Load model and tokenizer
print("Loading T5 model...")
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(DEVICE)

# 2. Load Data (IMDb)
print("Loading IMDb dataset...")
dataset = load_dataset("imdb", split="train")
# Keep the first MAX_SAMPLES // 2 rows of each label so both labels are equally represented.
per_label_indices = range(MAX_SAMPLES // 2)
pos_samples = dataset.filter(lambda row: row['label'] == 1).select(per_label_indices)
neg_samples = dataset.filter(lambda row: row['label'] == 0).select(per_label_indices)
# Chain the two subsets into a single map-style dataset for the DataLoader.
data = torch.utils.data.ConcatDataset([pos_samples, neg_samples])

# 3. Generation
def generate_text(texts, prompt_prefix):
    """Run the seq2seq model over a batch of texts with a task prefix.

    Args:
        texts: Sequence of input strings.
        prompt_prefix: String prepended to every input before tokenization
            (e.g. "paraphrase: ").

    Returns:
        A list of decoded strings with special tokens stripped. An empty
        input returns an empty list without touching the tokenizer or model.
    """
    # Nothing to generate; avoid handing an empty batch to the tokenizer.
    if not texts:
        return []

    inputs = [prompt_prefix + text for text in texts]

    # Tokenize. Inputs are padded to the longest item in the batch and
    # truncated to 512 tokens, so the batch contains padding positions.
    encoded = tokenizer(
        inputs,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(DEVICE)

    # Generate. The attention mask is passed explicitly so padded positions
    # are ignored by the encoder instead of relying on generate() to infer
    # the mask from the pad token id.
    with torch.no_grad():
        outputs = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=512,
            num_beams=2,        # Use beams for better quality
            early_stopping=True,
        )

    # Decode
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

# 4. Main Loop
# Each element appended to `results` is one dict: the original review, its two
# generated rewrites, and its label.
results = []
dataloader = torch.utils.data.DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

print(f"Starting generation for {MAX_SAMPLES} samples...")

for batch in tqdm(dataloader):
    source_texts = batch['text']
    batch_labels = batch['label'].tolist()

    # A. Content anchor: output of the "paraphrase: " prompt.
    para_outputs = generate_text(source_texts, "paraphrase: ")

    # B. Style anchor: output of the "rewrite as a tweet: " prompt.
    tweet_outputs = generate_text(source_texts, "rewrite as a tweet: ")

    # C. Collect one record per review, keeping the four sequences aligned.
    rows = zip(source_texts, para_outputs, tweet_outputs, batch_labels)
    results.extend(
        {
            "anchor": anchor_text,
            "positive_para": para_text,
            "positive_style": tweet_text,
            "label": label_value,
        }
        for anchor_text, para_text, tweet_text, label_value in rows
    )

# 5. Save to Disk
# open(..., "w") does not create missing parent directories. Create the target
# directory first so a missing "data/" folder cannot discard the whole
# generation run with a FileNotFoundError at the very last step.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print(f"Saving {len(results)} triplets to {OUTPUT_FILE}...")
# Explicit encoding so the file does not depend on the platform default.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)

print("Done! Data generation complete.")


Running on: cuda
Loading T5 model...


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Loading IMDb dataset...


README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Starting generation for 5000 samples...


 22%|██▏       | 34/157 [06:36<25:10, 12.28s/it]

Note: you may need to restart the kernel to use updated packages.
