# Pure prompt extraction from article data: building QA prompt/answer pairs (JSONL)

In [29]:
import json
from pathlib import Path

In [None]:
def load_articles(folder):
    """Load every ``*.json`` file found directly inside ``folder``.

    Files are visited in sorted path order, so the result is deterministic
    for a given directory listing. Subdirectories are not searched.

    Args:
        folder: Directory (``str`` or ``Path``) to scan.

    Returns:
        A list holding one parsed JSON document per file, in sorted path order.
    """
    def _parse(json_path):
        # Decode explicitly as UTF-8 rather than relying on the platform default.
        with open(json_path, encoding="utf-8") as handle:
            return json.load(handle)

    return [_parse(json_path) for json_path in sorted(Path(folder).glob("*.json"))]

# Read the raw article dumps for both splits.
train_articles = load_articles("data/raw/train")
eval_articles = load_articles("data/raw/eval")

# Emit the whole summary banner in one call; the text is line-for-line the
# same as printing each row separately.
print("\n".join([
    "=" * 60,
    "DATA LOADED",
    "=" * 60,
    f"Train articles: {len(train_articles)}",
    f"Eval articles:  {len(eval_articles)}",
    f"Total:          {len(train_articles) + len(eval_articles)}",
    "=" * 60,
]))

241 train articles
5 eval articles


In [31]:
def format_prompt(context: str, question: str) -> str:
    """Render one chat-style prompt for a context/question pair.

    The prompt consists of a fixed ``<|system|>`` instruction, a ``<|user|>``
    turn carrying the context and the question, and a trailing
    ``<|assistant|>`` header with nothing after it.

    Args:
        context: Text inserted under the ``Context:`` heading.
        question: Text inserted under the ``Question:`` heading.

    Returns:
        The assembled prompt string, ending with ``"<|assistant|>\\n"``.
    """
    system_turn = (
        "<|system|>\n"
        "You are a QA bot who answers questions regarding the press articles of the automotive company called BMW. "
        "Use ONLY the provided context as your source of information.\n\n"
    )
    user_turn = f"<|user|>\nContext:\n{context}\n\nQuestion:\n{question}\n\n"
    assistant_header = "<|assistant|>\n"
    return system_turn + user_turn + assistant_header

In [32]:
def generate_train_qa_pairs(article):
    """Build the three templated training QA pairs for one article.

    The questions are fixed; the answers are read from the article itself:
    its ``"title"``, its ``"date"``, and the first paragraph of its ``"text"``.

    Args:
        article: Mapping that provides the keys ``"text"``, ``"title"`` and
            ``"date"``.

    Returns:
        A list of three ``{"question": ..., "answer": ...}`` dicts in the
        order title, date, content.

    Raises:
        KeyError: If ``article`` lacks one of the required keys.
    """
    context = article["text"]
    title = article["title"]
    date = article["date"]

    # Paragraphs are separated by a blank line. Take the first one that has
    # visible content, so text starting with blank lines does not produce an
    # empty answer, and trim stray surrounding whitespace from the target.
    first_para = next(
        (para.strip() for para in context.split("\n\n") if para.strip()),
        "",
    )

    return [
        {
            "question": "What is the title of this press release?",
            "answer": title,
        },
        {
            "question": "When was this press release published?",
            "answer": date,
        },
        {
            "question": "What is this press release about?",
            "answer": first_para,
        },
    ]

In [33]:
def generate_eval_qa_pairs(article):
    """Return the hand-written evaluation QA pairs for one article.

    The pairs are looked up by ``article["id"]`` in a fixed table and are
    deliberately different from the templated training questions. An
    article whose id is not in the table gets an empty list.

    Args:
        article: Mapping that provides the key ``"id"``.

    Returns:
        A new list of ``{"question": ..., "answer": ...}`` dicts (possibly
        empty).
    """
    # article id -> ordered (question, answer) tuples
    curated = {
        "T0443474EN": (
            (
                "What kind of information does this press release provide?",
                "It provides the specifications of the BMW 5 Series Sedan.",
            ),
            (
                "From which month and year are the specifications valid?",
                "They are valid from March 2025.",
            ),
            (
                "Is this press release about an event, a vehicle launch, or technical documentation?",
                "It is about technical documentation.",
            ),
        ),
        "T0450921EN": (
            (
                "Which team won the 24 Hours of Nürburgring according to the press release?",
                "ROWE Racing won the race.",
            ),
            (
                "Which BMW race car secured the overall victory?",
                "The BMW M4 GT3 EVO.",
            ),
            (
                "What made this victory special for BMW Motorsport?",
                "It was BMW’s 21st overall victory at the 24 Hours of Nürburgring after a comeback.",
            ),
        ),
        "T0451220EN": (
            (
                "Which BMW model anniversary is celebrated in this press release?",
                "The 50th anniversary of the BMW 3 Series.",
            ),
            (
                "What earlier publication was updated for this anniversary?",
                "The press kit “40 Years of BMW 3 Series” from 2015.",
            ),
            (
                "Does the press release mainly look to the past or announce a new vehicle?",
                "It mainly looks to the past.",
            ),
        ),
        "T0452795EN": (
            (
                "Which designer collaborated with MINI on this special edition?",
                "Paul Smith.",
            ),
            (
                "Where did the MINI Paul Smith Edition have its world premiere?",
                "At the Japan Mobility Show in Tokyo.",
            ),
            (
                "What design philosophy defines this MINI special edition?",
                "A classic British design with playful and unexpected twists.",
            ),
        ),
        "T0452972EN": (
            (
                "In which city is the BMW Museum located?",
                "Munich.",
            ),
            (
                "What is the main purpose of the BMW Museum?",
                "To showcase more than 100 years of BMW car and motorcycle history.",
            ),
            (
                "Which group is specifically targeted by the BMW Junior Museum?",
                "Children and teenagers.",
            ),
        ),
    }

    # Unknown ids fall back to no pairs, which yields an empty list.
    return [
        {"question": question, "answer": answer}
        for question, answer in curated.get(article["id"], ())
    ]

In [34]:
def build_train_samples(articles):
    """Expand training articles into prompt/answer samples.

    Each QA pair produced by ``generate_train_qa_pairs`` for an article
    becomes one sample whose prompt is built from the article's ``"text"``
    and the pair's question.

    Args:
        articles: Iterable of article mappings.

    Returns:
        A flat list of ``{"prompt": ..., "answer": ...}`` dicts, in article
        order and then QA-pair order.
    """
    return [
        {
            "prompt": format_prompt(article["text"], pair["question"]),
            "answer": pair["answer"],
        }
        for article in articles
        for pair in generate_train_qa_pairs(article)
    ]


def build_eval_samples(articles):
    """Expand evaluation articles into prompt/answer samples.

    Each QA pair returned by ``generate_eval_qa_pairs`` for an article
    becomes one sample whose prompt is built from the article's ``"text"``
    and the pair's question. Articles without any pairs contribute nothing.

    Args:
        articles: Iterable of article mappings.

    Returns:
        A flat list of ``{"prompt": ..., "answer": ...}`` dicts, in article
        order and then QA-pair order.
    """
    collected = []
    for article in articles:
        # The article text is looked up per pair, so it is only touched when
        # the article actually has evaluation questions.
        collected.extend(
            {
                "prompt": format_prompt(article["text"], pair["question"]),
                "answer": pair["answer"],
            }
            for pair in generate_eval_qa_pairs(article)
        )
    return collected

In [None]:
# Turn each article split into flat prompt/answer records.
train_samples = build_train_samples(train_articles)
eval_samples = build_eval_samples(eval_articles)

# Create the output directory (and any missing parents) up front; an
# already-existing directory is not an error.
out_dir = Path("data/processed")
out_dir.mkdir(exist_ok=True, parents=True)

def write_jsonl(path, samples):
    """Write ``samples`` to ``path`` in JSON Lines form.

    The file is opened for writing (replacing any previous content) as
    UTF-8, and each sample is serialised onto its own line. Non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        path: Destination file path.
        samples: Iterable of JSON-serialisable objects.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in samples
        )

# Persist both splits as JSON Lines, one sample per line.
write_jsonl(out_dir / "train.jsonl", train_samples)
write_jsonl(out_dir / "eval.jsonl", eval_samples)

print("="*60)
print("QA PAIRS GENERATION COMPLETE")
print("="*60)
print(f"Train QA pairs: {len(train_samples)} saved to data/processed/train.jsonl")
print(f"Eval QA pairs:  {len(eval_samples)} saved to data/processed/eval.jsonl")
# Only the training split uses the three templated questions below; the eval
# pairs are hand-written in generate_eval_qa_pairs, so the summary must not
# claim that the template applies to every article.
print("\nFormat: Each train article generates 3 QA pairs:")
print("  1. Title question")
print("  2. Date question")
print("  3. Content question")
print("Eval articles use manually defined QA pairs instead.")
print("="*60)

✅ JSONL files written
