In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
import json
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.35s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [3]:
llm = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=64)

Device set to use cuda:0


In [4]:
path = "ds/verifiability_judgments/verifiability_judgments_dev.jsonl/verifiability_judgments_dev.jsonl"
examples = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        if all(k in obj for k in ["query", "statement", "source_text", "source_supports_statement"]):
            examples.append(obj)

In [None]:
# Bsp Datenpunkt
{"query": "where was for you by rita ora filmed", 
 "response": 
    "The music video for \"For You\" by Liam Payne and Rita Ora was filmed at **Oheka Castle** \
     on **Long Island**, off the coast of the eastern United States.", "statement": "The music video for \"For You\" \
     by Liam Payne and Rita Ora was filmed at **Oheka Castle** on **Long Island**, off the coast of the eastern United States.", 
 
 "source_title": "For You (Liam Payne and Rita Ora song)", 
 "source_content_title": "Promotion",
 "source_date": "2018-01-05", 
 "source_author": null, 
 "source_text": 
    "\" For You \" is a song recorded by British singers Liam Payne and Rita Ora for the soundtrack to the film Fifty Shades Freed (2018).\
      It was released on 5 January 2018 through Universal Studios and Republic Records as the lead single from the soundtrack. \
        The music video was released on 26 January 2018. \
      [2] The song was also included on Ora's second studio album, Phoenix (2018) and Payne's debut studio album, LP1 (2019).\n\"For You\" reached the top ten in the United Kingdom,\
      France, Norway, Switzerland, Austria, Portugal, Poland and Belgium, and peaked at number one in Germany.\nOn 20 December 2017,\
            .... ", 
 "source_localized_evidence": "The filming took place at Oheka Castle on Long Island, off the coast of the eastern United States.", 
 "source_supports_statement": "complete_support", 
 "source_url": "https://en.wikipedia.org/wiki/For_You_(Liam_Payne_and_Rita_Ora_song)"}



IndentationError: unexpected indent (2169252230.py, line 2)

In [None]:
'''
RQ1:Wie kann LLM-Reasoning genutzt werden, um die Relevanz, Korrektheit und Vollständigkeit der abgerufenen Dokumente zu bewerten?

--> LLM performance evaluieren
Versteht das LLM das Zusammenspiel von Aussage & Quelle?
Wo interpoliert es Fakten, wenn sie fehlen? (RQ2)
'''

In [None]:
# Wie sieht DS aus?

print(examples[0].keys())
print(set(ex['source_supports_statement'] for ex in examples))
#{'partial_support', 'complete_support', 'no_support'}




dict_keys(['query', 'response', 'statement', 'source_title', 'source_content_title', 'source_date', 'source_author', 'source_text', 'source_raw_text', 'source_localized_evidence', 'source_supports_statement', 'source_url'])
{'partial_support', 'complete_support', 'no_support'}


In [None]:
# Testloop
results = []
for ex in examples[:10]:
  prompt = f"""You are an assistant evaluating the factual support of a statement.

  Query: {ex['query']}

  Statement: {ex['statement']}

  Source document:
  \"\"\"
  {ex['source_text']}
  \"\"\"

  Does the source fully support the statement? Answer with one of:
  - complete_support
  - partial_support
  - no_support
  """
  
  output = llm(prompt)[0]["generated_text"]
  prediction = "no_support"
  if "complete_support" in output:
      prediction = "complete_support"
  elif "partial_support" in output:
      prediction = "partial_support"

  results.append({
      "query": ex["query"],
      "statement": ex["statement"],
      "label": ex["source_supports_statement"],
      "prediction": prediction,
      "match": prediction == ex["source_supports_statement"]
  })


df = pd.DataFrame(results)
print(df[["label", "prediction", "match"]])
print(f"Accuracy: {(df['match'].sum() / len(df)):.2f}")



Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

In [8]:
print(examples[0])
print(results[0])

{'query': 'where was for you by rita ora filmed', 'response': 'The music video for "For You" by Liam Payne and Rita Ora was filmed at **Oheka Castle** on **Long Island**, off the coast of the eastern United States.', 'statement': 'The music video for "For You" by Liam Payne and Rita Ora was filmed at **Oheka Castle** on **Long Island**, off the coast of the eastern United States.', 'source_title': 'For You (Liam Payne and Rita Ora song)', 'source_content_title': 'Promotion', 'source_date': '2018-01-05', 'source_author': None, 'source_text': '" For You " is a song recorded by British singers Liam Payne and Rita Ora for the soundtrack to the film Fifty Shades Freed (2018). It was released on 5 January 2018 through Universal Studios and Republic Records as the lead single from the soundtrack. The music video was released on 26 January 2018. [2] The song was also included on Ora\'s second studio album, Phoenix (2018) and Payne\'s debut studio album, LP1 (2019).\n"For You" reached the top t

In [None]:
import re
import pandas as pd

# Nur schwierige Beispiele auswählen
filtered_examples = [ex for ex in examples if ex["source_supports_statement"] in ["partial_support", "no_support"]]
filtered_examples = filtered_examples[:10]  # z.B. die ersten 10

results = []

for i, ex in enumerate(filtered_examples):
    prompt = f"""You are an assistant evaluating the factual support of a statement.

Query: {ex['query']}

Statement: {ex['statement']}

Source document:
\"\"\"
{ex['source_text']}
\"\"\"

Does the source fully support the statement?

Answer with exactly one of the following labels on a new line:
complete_support
partial_support
no_support
"""

    # LLM-Antwort erzeugen
    output = llm(prompt)[0]["generated_text"]
    output_clean = output.strip().lower()

    # Label per Regex extrahieren
    match = re.search(r"\b(complete_support|partial_support|no_support)\b", output_clean)
    prediction = match.group(1) if match else "no_support"

    results.append({
        "index": i,
        "query": ex["query"],
        "statement": ex["statement"],
        "label": ex["source_supports_statement"],
        "prediction": prediction,
        "match": prediction == ex["source_supports_statement"],
        "raw_output": output
    })

# Ergebnisse in DataFrame
df = pd.DataFrame(results)

# Auswertung
print(df[["label", "prediction", "match"]])
print(f"\nAccuracy on hard examples: {round(df['match'].sum() / len(df), 2)}")

# Fehlerfälle ausgeben
print("\nFalsche Vorhersagen:")
print(df[df["match"] == False][["query", "statement", "label", "prediction", "raw_output"]])


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
