In [None]:
# Install every dependency once, into the kernel's own environment (%pip, not !pip).
# - bitsandbytes is needed for the `load_in_8bit=True` model load in the SFT cell.
# - sentencepiece is needed by the slow `LlamaTokenizer` used in the SFT and RL cells.
# NOTE(review): the last cell uses TRL's legacy PPOTrainer API (PPOConfig(forward_batch_size=...),
# ppo_trainer.step); confirm which trl release you target and pin it here.
%pip install -q torch transformers accelerate peft trl datasets sentence-transformers faiss-cpu bitsandbytes sentencepiece


In [None]:
# Reward model training: fine-tune a cross-encoder on (query, positive doc, negative doc) triples

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import CrossEncoder
from datasets import load_dataset


class PairwiseRewardDataset(Dataset):
    """
    Map-style dataset of ranking triples.

    `data` is an indexable collection of mappings, each providing the keys
    "query", "pos_doc" and "neg_doc". Indexing yields the tuple
    (query, positive doc, negative doc) for that record.
    """
    # Keys read from each record, in the order they are returned.
    _FIELDS = ("query", "pos_doc", "neg_doc")

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        return tuple(record[field] for field in self._FIELDS)


# Pairwise training examples. Each item must be a dict with the string fields
# "query", "pos_doc" and "neg_doc" (see PairwiseRewardDataset). Fill before running.
train_data = []

dataset = PairwiseRewardDataset(train_data)

model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
reward_model = CrossEncoder(model_name, num_labels=1)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Move the underlying Hugging Face classifier; it holds all trainable weights.
reward_model.model.to(device)

margin = 1.0
loss_fn = nn.MarginRankingLoss(margin=margin)
optimizer = torch.optim.Adam(reward_model.model.parameters(), lr=2e-5)

epochs = 15


def _score_pairs(queries, docs):
    """Return differentiable relevance logits, one per (query, doc) pair.

    `CrossEncoder.predict` is inference-only (no gradients, NumPy output), so
    training has to tokenize the pairs and call the underlying model directly.
    """
    features = reward_model.tokenizer(
        list(queries),
        list(docs),
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)
    # num_labels=1 -> logits have shape (batch, 1); drop the trailing axis.
    return reward_model.model(**features).logits.squeeze(-1)


if len(dataset) == 0:
    # An empty dataset would crash the shuffling sampler / the loss average below.
    print("train_data is empty - skipping reward model fine-tuning (pretrained weights are kept).")
else:
    train_loader = DataLoader(dataset, batch_size=2, shuffle=True)
    reward_model.model.train()

    for epoch in range(epochs):
        running_loss = 0.0
        for queries, pos_docs, neg_docs in train_loader:
            scores_pos = _score_pairs(queries, pos_docs)
            scores_neg = _score_pairs(queries, neg_docs)

            # Target: pos > neg → y = 1
            y = torch.ones_like(scores_pos)

            loss = loss_fn(scores_pos, scores_neg, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

    # Back to inference mode (disables dropout) before the model is used for ranking.
    reward_model.model.eval()
    reward_model.save("trained_reward_model")


def select_top3_docs(query: str, docs: list, reward_model):
    """
    Pick the three documents the reward model scores highest for `query`.

    docs: list of dicts; each must provide a 'text' entry (the docs in this
          notebook also carry 'title' and 'external_id').
    reward_model: object whose `predict` takes a list of (query, text) pairs
          and returns one score per pair.

    Returns the three best docs, highest score first. When there are three or
    fewer docs, the input list itself is returned unscored and unordered.
    """
    keep = 3
    if len(docs) <= keep:
        return docs  # nothing to select

    relevance = reward_model.predict([(query, entry['text']) for entry in docs])
    # Stable sort: documents with equal scores keep their original relative order.
    ranked = sorted(enumerate(relevance), key=lambda pair: pair[1], reverse=True)
    return [docs[position] for position, _ in ranked[:keep]]


# Candidate documents for the demo query: dicts with 'text', 'title' and
# 'external_id'. Left empty here; fill with retriever output.
retrieved_docs = []

query = ""
top3_docs = select_top3_docs(query, retrieved_docs, reward_model)

print("Top 3 documents selected by reward model:")
for doc in top3_docs:
    # Show the title plus the first 50 characters of the body.
    print(f"{doc['title']} - {doc['text'][:50]}")



In [None]:
# Llama base model SFT (LoRA adapters on top of an 8-bit Llama-2-7B).

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

# Load base LLaMA 7B
model_name = "meta-llama/Llama-2-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# The Llama-2 tokenizer ships without a padding token, and the tokenization
# step in this cell pads to a fixed length; reuse EOS so padding does not raise.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = LlamaForCausalLM.from_pretrained(
    model_name,
    load_in_8bit=True,  # VRAM-efficient (requires the bitsandbytes package)
    device_map="auto"
)

# LoRA config (small adapter for structure learning): only the attention
# query/value projections receive trainable low-rank updates.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# Load template dataset (structured output only). The tokenization step reads
# the 'query', 'context' and 'output' fields of every record.
dataset = load_dataset("json", data_files="template_dataset.json")["train"]

# Tokenization function
def tokenize(example, max_prompt_length=256, max_target_length=128, tok=None):
    """Build one causal-LM training example: prompt followed by the target answer.

    A causal LM is trained on a single sequence, so the target must be part of
    `input_ids`, and `labels` must have exactly the same length as `input_ids`.
    Prompt and padding positions are labelled -100, the index the loss ignores,
    so only the answer tokens contribute to the loss.

    example: mapping with 'query', 'context' and 'output' fields.
    max_prompt_length: token budget for the prompt (including any BOS token).
    max_target_length: token budget for the answer (including the EOS token).
    tok: tokenizer to use; defaults to the notebook-level `tokenizer`.

    Returns a dict with 'input_ids', 'attention_mask' and 'labels', each a list
    of length max_prompt_length + max_target_length.
    """
    if tok is None:
        tok = tokenizer

    input_text = f"Query: {example['query']}\nContext: {example['context']}\nAnswer:"
    target_text = example['output']

    prompt_ids = list(tok(input_text, truncation=True, max_length=max_prompt_length).input_ids)

    # Reserve one slot for EOS so the model learns where the answer ends.
    eos_id = tok.eos_token_id
    target_budget = max_target_length - (1 if eos_id is not None else 0)
    target_ids = list(
        tok(target_text, truncation=True, max_length=target_budget, add_special_tokens=False).input_ids
    )
    if eos_id is not None:
        target_ids.append(eos_id)

    # Pad manually so this works even when the tokenizer defines no pad token.
    pad_id = tok.pad_token_id
    if pad_id is None:
        pad_id = eos_id if eos_id is not None else 0

    total_length = max_prompt_length + max_target_length
    n_real = len(prompt_ids) + len(target_ids)
    n_pad = total_length - n_real

    input_ids = prompt_ids + target_ids + [pad_id] * n_pad
    attention_mask = [1] * n_real + [0] * n_pad
    labels = [-100] * len(prompt_ids) + target_ids + [-100] * n_pad
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Convert every raw record into model-ready features; the original columns are
# dropped so only the tokenizer outputs reach the Trainer.
tokenized_dataset = dataset.map(tokenize, remove_columns=dataset.column_names)

# Training arguments: effective batch of 8 via gradient accumulation, a single
# epoch (only template learning), and at most one checkpoint kept on disk.
training_args = TrainingArguments(
    output_dir="./sft_llama7b",
    num_train_epochs=1,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    logging_steps=5,
    save_steps=100,
    save_total_limit=1,
)

trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)
trainer.train()

# Persist the fine-tuned (PEFT-wrapped) model next to the checkpoints.
model.save_pretrained("./sft_llama7b")


In [None]:
def _article_names(context_articles):
    """Return the lower-cased, non-empty strings that count as citations.

    Accepts None, a single string/dict, or an iterable of strings/dicts. For a
    dict the 'title' is used, falling back to 'text'. Blank names are dropped
    because an empty string is a substring of every output.
    """
    if context_articles is None:
        return []
    if isinstance(context_articles, (str, dict)):
        # A bare string would otherwise be iterated character by character.
        context_articles = [context_articles]

    names = []
    for article in context_articles:
        if isinstance(article, dict):
            article = article.get("title") or article.get("text") or ""
        name = str(article).strip().lower()
        if name:
            names.append(name)
    return names


def reward_fn(output, context_articles):
    """
    Rule-based reward, normalized to [0, 1]:
    - 1 point for classification ("pseudoscience" / "scientific fact" appears)
    - up to 2 points for citing top-3 articles (1 per article found, capped at 2)
    - 1 point for structured format (a verdict phrase AND a grounding phrase)

    output: generated text to score (matching is case-insensitive).
    context_articles: article names as strings, or dicts with 'title'/'text'.

    Returns the earned points divided by the 4 points available.
    """
    # 1 (classification) + 2 (citations) + 1 (structure)
    max_reward = 4.0
    text = output.lower()
    reward = 0.0

    # Classification
    if "pseudoscience" in text or "scientific fact" in text:
        reward += 1.0

    # Citing top-3 context
    cited_count = sum(1 for name in _article_names(context_articles) if name in text)
    reward += min(cited_count, 2)

    # Structured output
    has_verdict = "classification" in text or "the claim is" in text
    has_grounding = "based on" in text or "according to" in text
    if has_verdict and has_grounding:
        reward += 1.0

    return min(reward / max_reward, 1.0)


In [None]:
# Llama model GRPO-style RL stage (implemented with TRL's PPOTrainer)

from torch.utils.data import Dataset
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead

# Load the SFT-finetuned model (wrapped with a value head) for the RL stage
model = AutoModelForCausalLMWithValueHead.from_pretrained("./sft_llama7b")
tokenizer = LlamaTokenizer.from_pretrained(model_name)
# This freshly loaded Llama tokenizer has no padding token. TRL's PPO examples
# reuse EOS as the pad token so batches can be padded during the PPO step.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# PPO configuration: one (query, response) pair per optimisation step,
# four PPO epochs per step, no external experiment tracker.
ppo_config = PPOConfig(
    batch_size=1,
    forward_batch_size=1,
    ppo_epochs=4,
    log_with=None
)

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    tokenizer=tokenizer,
    dataset=None  # manual stepping
)

# Dataset wrapper
class PPODataset(Dataset):
    """Pairs each user query with the context documents chosen for it."""

    def __init__(self, queries, top3_contexts):
        """
        queries: list of user queries
        top3_contexts: list of top-3 docs per query (already selected by reward model),
                       aligned index-by-index with `queries`
        """
        self.queries, self.contexts = queries, top3_contexts

    def __len__(self):
        # One item per query.
        return len(self.queries)

    def __getitem__(self, idx):
        return dict(query=self.queries[idx], context=self.contexts[idx])


# User queries and, per query, the top-3 context docs; aligned by index.
# Left empty here; fill before running the RL stage.
queries = []
top3_contexts = []
ppo_dataset = PPODataset(queries, top3_contexts)


group_size = 5
max_new_tokens = 128
# Use the device the trainer placed the model on instead of hard-coding CUDA.
ppo_device = ppo_trainer.accelerator.device

for i in range(0, len(ppo_dataset), group_size):
    group = [ppo_dataset[j] for j in range(i, min(i+group_size, len(ppo_dataset)))]

    query_tensors = []
    response_tensors = []
    rewards = []

    # Generate one response per prompt before any policy update in this group
    for item in group:
        query = item["query"]
        context = item["context"]
        input_text = f"Query: {query}\nContext: {context}\nAnswer:"
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(ppo_device)

        generated = model.generate(input_ids, max_new_tokens=max_new_tokens)
        # generate() returns prompt + continuation; keep only the new tokens.
        # Otherwise the prompt's own context text would be scored as a citation.
        response_ids = generated[0][input_ids.shape[1]:]
        response_text = tokenizer.decode(response_ids, skip_special_tokens=True)

        query_tensors.append(input_ids[0])
        response_tensors.append(response_ids)
        rewards.append(reward_fn(response_text, context))

    # Group baseline: mean reward over the group
    group_reward = sum(rewards) / len(rewards)

    # PPO step for each item in the group. Each sample is scored by its own
    # reward relative to the group mean; giving every sample the identical
    # mean would carry no per-sample learning signal. A single-item group has
    # no baseline to compare against, so it keeps its raw reward.
    for query_ids, response_ids, reward in zip(query_tensors, response_tensors, rewards):
        score = reward - group_reward if len(rewards) > 1 else reward
        score_tensor = torch.tensor(score, dtype=torch.float32, device=ppo_device)
        # step() expects lists (length == batch_size) of 1-D tensors.
        ppo_trainer.step([query_ids], [response_ids], [score_tensor])

model.save_pretrained("./grpo_llama7b")
