In [None]:
!pip uninstall -y torch torchvision torchaudio
!pip uninstall -y transformers datasets trl tqdm accelerate numpy pandas peft torchao

Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: datasets 2.14.4
Uninstalling datasets-2.14.4:
  Successfully uninstalled datasets-2.14.4
[0mFound existing installation: tqdm 4.67.1
Uninstalling tqdm-4.67.1:
  Successfully uninstalled tqdm-4.67.1
Found existing installation: accelerate 1.6.0
Uninstalling accelerate-1.6.0:
  Successfully uninstalled accelerate-1.6.0
Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Found e

In [None]:
!pip install numpy==1.26.2
!pip install torch==2.2.0 --index-url https://download.pytorch.org/whl/cu118
!pip install transformers==4.37.2
!pip install pandas==2.1.3
!pip install datasets==2.16.1
!pip install trl==0.7.10
!pip install tqdm==4.66.1
!pip install accelerate==0.27.2
!pip install peft==0.10.0

Collecting numpy==1.26.2
  Downloading numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
db-dtypes 1.4.2 requires pandas>=0.24.2, which is not installed.
dask-cudf-cu12 25.2.2 requires pandas<2.2.4dev0,>=2.0, which is not installed.
tensorflow-decision-forests 1.11.0 requires pandas, 

In [None]:
# Sanity check: import every pinned package and report the version that actually loaded.
import torch
import transformers
import datasets
import trl
import tqdm
import accelerate
import numpy
import peft

# One version per line, in the same order as the imports above.
print(
    *(
        package.__version__
        for package in (torch, transformers, datasets, trl, tqdm, accelerate, numpy, peft)
    ),
    sep="\n",
)

2.2.0+cu118
4.37.2
2.16.1
0.7.10
4.66.1
0.27.2
1.26.2
0.10.0


In [None]:
from pathlib import Path
from uuid import uuid4

import numpy as np
import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

In [None]:
class DatasetBuilder:
    """Prepares short review openings (token ids + decoded text) to use as PPO prompts."""

    def __init__(self, model_name: str, dataset_name: str = "imdb", max_samples: int = 1500):
        # The tokenizer pads with its end-of-sequence token.
        tok = AutoTokenizer.from_pretrained(model_name)
        tok.pad_token = tok.eos_token

        self.model_name = model_name
        self.dataset_name = dataset_name
        self.max_samples = max_samples
        self.tokenizer = tok

    def _length_sampler(self, min_len: int = 3, max_len: int = 10):
        """Draws a random integer from the closed range [min_len, max_len]."""
        exclusive_upper = max_len + 1  # randint's upper bound is exclusive
        return np.random.randint(min_len, exclusive_upper)

    def build(self):
        """Returns the dataset of prompts.

        Takes the first ``max_samples`` rows of the train split, renames ``text`` to
        ``review``, keeps reviews longer than 150 characters, and adds two columns:
        ``input_ids`` (a randomly sized prefix of the review's tokens) and ``query``
        (that prefix decoded back to text). ``input_ids`` is exposed as a torch tensor;
        all other columns are still returned.
        """
        split_spec = f"train[:{self.max_samples}]"
        reviews = load_dataset(self.dataset_name, split=split_spec)
        reviews = reviews.rename_columns({"text": "review"})

        def is_long_enough(row):
            return len(row["review"]) > 150

        def tokenize(sample):
            token_ids = self.tokenizer.encode(sample["review"])
            prefix = token_ids[: self._length_sampler()]
            sample["input_ids"] = prefix
            sample["query"] = self.tokenizer.decode(prefix)
            return sample

        prompts = reviews.filter(is_long_enough, batched=False).map(tokenize, batched=False)
        prompts.set_format(type="torch", columns=["input_ids"], output_all_columns=True)
        return prompts

In [None]:
def data_collator(batch):
    """Turns a list of example dicts into one dict of lists, keyed like the first example."""
    field_names = batch[0]
    collated = {}
    for field in field_names:
        collated[field] = [example[field] for example in batch]
    return collated

In [None]:
def setup_sentiment_pipeline(device: str):
    """Builds the sentiment classifier whose scores are used as rewards.

    Args:
        device: Device specifier forwarded unchanged to ``pipeline``.
    """
    sentiment_model = "distilbert-base-uncased-finetuned-sst-2-english"
    classifier = pipeline("sentiment-analysis", model=sentiment_model, device=device)
    return classifier

In [None]:
def train_ppo(max_updates: int = 150):
    """Fine-tunes GPT-2 with PPO so its continuations of review prompts read as positive.

    For every batch: sample a continuation for each prompt, score prompt + continuation
    with the sentiment classifier, and use the POSITIVE probability as the reward for a
    PPO step. Afterwards the policy and tokenizer are saved to a uniquely named directory.

    Args:
        max_updates: Upper bound on the number of PPO steps. Training also ends when the
            dataloader is exhausted, whichever comes first.

    Returns:
        str: Name of the directory the model and tokenizer were saved to.
    """
    # Configuration
    config = PPOConfig(
        model_name="gpt2",
        learning_rate=2e-5,
        batch_size=16,
        mini_batch_size=4,
        ppo_epochs=2,
        log_with=None,
        # NOTE(review): a threshold this large presumably disables TRL's skipping of
        # mini-batches with extreme PPO ratios — confirm this is intended.
        ratio_threshold=1e10
    )

    # Setup
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dataset = DatasetBuilder(config.model_name, max_samples=1500).build()
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name).to(device)
    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name).to(device)

    ppo_trainer = PPOTrainer(
        config, model, ref_model, tokenizer,
        dataset=dataset, data_collator=data_collator
    )

    sentiment_pipe = setup_sentiment_pipeline(device)
    sent_kwargs = {"top_k": None, "batch_size": 8}

    # Training loop
    update_count = 0
    # The loop makes a single pass over the dataloader, so the bar can never advance
    # past the number of batches even when max_updates is larger.
    planned_updates = min(max_updates, len(ppo_trainer.dataloader))
    progress_bar = tqdm(total=planned_updates, desc="Training")

    for batch in ppo_trainer.dataloader:
        if update_count >= max_updates:
            print(f"Stopping after {max_updates} updates.")
            break

        # Prepare queries
        queries = [q.to(device) for q in batch["input_ids"]]
        query_texts = batch["query"]

        # Generate responses
        responses = []
        for query in queries:
            gen_len = np.random.randint(5, 20)
            output = ppo_trainer.generate(
                query,
                max_new_tokens=gen_len,
                do_sample=True,
                top_p=0.95,
                pad_token_id=tokenizer.eos_token_id
            )
            # The output holds prompt + continuation. Slice by prompt length rather than
            # by gen_len: sampling can stop at EOS before gen_len tokens, and taking the
            # last gen_len tokens would then pull prompt tokens into the response.
            responses.append(output.squeeze(0)[query.shape[0]:])

        # Decode responses
        response_texts = [tokenizer.decode(r, skip_special_tokens=True) for r in responses]
        batch["response"] = response_texts

        # Compute rewards: probability assigned to the POSITIVE label
        full_texts = [q + r for q, r in zip(query_texts, response_texts)]
        sentiment_outputs = sentiment_pipe(full_texts, **sent_kwargs)

        rewards = []
        for label_scores in sentiment_outputs:
            by_label = {entry["label"]: entry["score"] for entry in label_scores}
            if "POSITIVE" in by_label:
                score = by_label["POSITIVE"]
            elif "NEGATIVE" in by_label:
                score = 1 - by_label["NEGATIVE"]
            else:
                score = 0.5  # neutral fallback when neither label is reported
            rewards.append(torch.tensor(score, device=device))

        # PPO step
        stats = ppo_trainer.step(queries, responses, rewards)
        update_count += 1
        progress_bar.update(1)
        # Report each statistic under its own name (mean reward and KL are different things).
        progress_bar.set_postfix({
            "mean_score": f"{float(stats['ppo/mean_scores']):.4f}",
            "kl": f"{float(stats['objective/kl']):.4f}",
        })

    progress_bar.close()

    # Save model
    output_dir = f"gpt2-imdb-ppo-{str(uuid4())[:8]}"
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model saved to ./{output_dir}/")
    # The directory name is random, so hand it back for later loading.
    return output_dir

In [None]:
def evaluate_model(model_dir: str, prompts: list):
    """Generates a continuation for each prompt with a saved model and prints its sentiment.

    Args:
        model_dir: Location of the saved model and tokenizer, passed to ``from_pretrained``.
        prompts: Prompt strings to continue.

    For every prompt this prints the prompt, the generated text, and the probability the
    sentiment classifier assigns to the POSITIVE label.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir).to(device)
    gen_pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=0 if device.type == "cuda" else -1
    )
    reward_pipe = setup_sentiment_pipeline(device)

    for prompt in prompts:
        output = gen_pipe(
            prompt,
            max_new_tokens=30,
            do_sample=True,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )[0]["generated_text"]
        # The classifier reports only its top label here; convert to P(POSITIVE).
        scores = reward_pipe(output)[0]
        pos_score = scores["score"] if scores["label"] == "POSITIVE" else 1 - scores["score"]

        print(f"Prompt: {prompt}")
        print(f"Generated: {output}")
        print(f"Positive Score: {pos_score:.4f}\n")

In [None]:
# Entry point: run the PPO fine-tuning loop, which also saves the trained model to disk.
if __name__ == "__main__":
    train_ppo()

Training:   0%|          | 0/150 [00:00<?, ?it/s]

Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.995790421962738}, {'label': 'POSITIVE', 'score': 0.004209598992019892}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9774223566055298}, {'label': 'POSITIVE', 'score': 0.02257763035595417}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9808201193809509}, {'label': 'POSITIVE', 'score': 0.0191799309104681}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'POSITIVE', 'score': 0.9998254179954529}, {'label': 'NEGATIVE', 'score': 0.00017460808157920837}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9993558526039124}, {'label': 'POSITIVE', 'score': 0.0006440930301323533}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs



Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9936037659645081}, {'label': 'POSITIVE', 'score': 0.006396252661943436}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'POSITIVE', 'score': 0.9975480437278748}, {'label': 'NEGATIVE', 'score': 0.0024519115686416626}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9680858254432678}, {'label': 'POSITIVE', 'score': 0.031914111226797104}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9871254563331604}, {'label': 'POSITIVE', 'score': 0.01287452969700098}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outputs: [{'label': 'NEGATIVE', 'score': 0.9996497631072998}, {'label': 'POSITIVE', 'score': 0.0003502107865642756}]
Type of sentiment_outputs: <class 'list'>
First element of sentiment_outpu

In [None]:
if __name__ == "__main__":
    test_prompts = [
        "This movie was absolutely fantastic because",
        "The storyline seemed a bit off, but",
        "I heard great reviews, yet"
    ]
    # train_ppo() saves to "gpt2-imdb-ppo-<random suffix>", so a fixed name such as
    # "gpt2-imdb-ppo-latest" never exists on disk. Evaluate the most recently written run.
    saved_runs = sorted(
        (path for path in Path(".").glob("gpt2-imdb-ppo-*") if path.is_dir()),
        key=lambda path: path.stat().st_mtime,
    )
    if not saved_runs:
        raise FileNotFoundError(
            "No gpt2-imdb-ppo-* directory found; run train_ppo() before evaluating."
        )
    evaluate_model(str(saved_runs[-1]), test_prompts)



OSError: gpt2-imdb-ppo-latest is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`