In [3]:
# pip install transformers accelerate bitsandbytes torch pandas tqdm

In [3]:
import numpy as np
import pandas as pd
import json
import random
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [6]:
ROOT = "/Users/zedsiyed/Downloads/CSE_291A_RAG/question_generation/"
DATA_PATH = ROOT + "data.csv"  # Kaggle Global News Dataset
OUTPUT_PATH = ROOT + "sample_requests.jsonl"
N_SAMPLES = 10

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["content"])



Loading dataset...


In [8]:
samples = df.sample(N_SAMPLES, random_state=42)

In [None]:
# Load Local Model
print(f"Loading model {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    torch_dtype="auto"
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=120,
    temperature=0.7,
    top_p=0.95
)

Loading model mistralai/Mistral-7B-Instruct-v0.2...


`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


In [None]:
# Prompt Template
PROMPT_TEMPLATE = """You are creating a dataset for a retrieval-augmented generation (RAG) system.

Given this news article:
<context>
{article}
</context>

Write one realistic, specific question a reader might ask about this article.
The question must require retrieving information from the text (names, dates, causes, outcomes),
not just a keyword or title lookup.

Output ONLY the question text, nothing else.
"""

In [22]:
# === 5. Generate Question for Each Article ===
pairs = []

for _, row in tqdm(samples.iterrows(), total=len(samples)):
    article = row["content"].strip()
    truncated = article[:1500]

    prompt = PROMPT_TEMPLATE.format(article=truncated)
    outputs = generator(prompt, do_sample=True, num_return_sequences=1)
    question = outputs[0]["generated_text"].split("</context>")[-1].strip()

    pairs.append({
        "x_i": question,
        "y_i": truncated,
        "metadata": {
            "title": row.get("title", ""),
            "category": row.get("category", ""),
            "published": row.get("published", "")
        }
    })


  0%|                                                                                                                                                                                  | 0/10 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 10%|████████████████▋                                                                                                                                                      | 1/10 [10:33<1:34:59, 633.32s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 20%|█████████████████████████████████▍                                                                                                                                     | 2/10 [20:03<1:19:30, 596.37s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
 30%|██████████████████████████████████████████████████▋                                                                                                                      | 3/

In [23]:
# === 6. Save to JSONL ===
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for p in pairs:
        json.dump(p, f, ensure_ascii=False)
        f.write("\n")

print(f"✅ Saved {len(pairs)} labeled samples to {OUTPUT_PATH}")

✅ Saved 10 labeled samples to /Users/zedsiyed/Downloads/RAG_project/sample_requests.jsonl


In [25]:
print(pairs[0])

{'x_i': 'Write one realistic, specific question a reader might ask about this article.\nThe question must require retrieving information from the text (names, dates, causes, outcomes),\nnot just a keyword or title lookup.\n\nOutput ONLY the question text, nothing else.\n\nWhich scientific competition did Heman Bekele win as a 14-year-old ninth-grader, and what was his innovation that earned him the top spot?', 'y_i': 'A 14-year-old boy has been named "America\'s top young scientist" after developing a bar of soap that could help treat melanoma.\xa0\nHeman Bekele, a ninth-grader from Virginia, won the 3M Young Scientis… [+2055 chars]', 'metadata': {'title': 'Young teen wins top science prize for soap that can treat skin cancer', 'category': 'Ethiopia', 'published': ''}}
