In [None]:
# Mount Google Drive at /content/drive; all file paths used in this notebook live below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Wikidata-ID (Entitäten-IDs per API extrahieren)

In [None]:
import json, pathlib, time, urllib.parse, requests




# Input: JSON file with the samples. Output: the same samples with Wikidata IDs added.
INPUT_FILE  = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/new_factual/superhero_person.json"
OUTPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/wikidata/superhero_person.json"
# Default language code passed to the Wikidata entity search.
LANG        = "en"
# Maximum number of request attempts per label.
MAX_RETRY   = 3

def wikidata_id(label: str, lang: str = LANG) -> str | None:
    """
    Return the Wikidata Q-ID of the item whose label exactly matches *label*.

    The label is looked up with the ``wbsearchentities`` API action (items
    only, at most 10 results). A hit counts as a match when its label equals
    *label* after stripping leading/trailing whitespace and ignoring case.

    Args:
        label: Entity label to search for.
        lang: Language code used for the search (defaults to ``LANG``).

    Returns:
        The Q-ID of the first exactly matching hit, or ``None`` when the search
        returns no hits, no hit matches exactly, or all ``MAX_RETRY`` attempts
        failed. The caller can then mark the value as "N/A".
    """
    base = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": lang,
        "format": "json",
        "type": "item",
        "limit": 10,
        "origin": "*"
    }
    url = f"{base}?{urllib.parse.urlencode(params)}"

    for attempt in range(1, MAX_RETRY + 1):
        try:
            resp = requests.get(url, timeout=30)
            # Surface HTTP errors as exceptions so they go through the retry path.
            resp.raise_for_status()
            hits = resp.json().get("search", [])

            canonical = label.strip().lower()
            for h in hits:
                if h.get("label", "").strip().lower() == canonical:
                    return h["id"]
            # A successful response is a final answer, even when it is empty:
            # asking again would only repeat the same request.
            return None
        except Exception as e:
            if attempt == MAX_RETRY:
                # Only report the error once all attempts are used up.
                print(f"  {label}: {e}")
            else:
                # Linearly growing pause before the next attempt.
                time.sleep(1.5 * attempt)
    return None


# Load the samples, attach a Wikidata ID to every subject and object, then write the result.
data = json.loads(pathlib.Path(INPUT_FILE).read_text(encoding="utf-8"))

for sample in data["samples"]:
    s_label = sample["subject"]
    o_label = sample["object"]

    # Labels without an exact Wikidata match are marked "N/A".
    sample["subject_id"] = wikidata_id(s_label) or "N/A"
    sample["object_id"]  = wikidata_id(o_label) or "N/A"

    # Fixed pause after each sample (two API lookups) - presumably to throttle requests.
    time.sleep(5)

# Create the target folder first so that writing does not fail on a missing directory.
pathlib.Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
pathlib.Path(OUTPUT_FILE).write_text(
    json.dumps(data, ensure_ascii=False, indent=2), "utf-8"
)
print(f"Datei mit IDs geschrieben: {OUTPUT_FILE}")

### Wikidata-IDs der Few-Shot-Beispiele extrahieren

In [None]:
import json, pathlib, time, urllib.parse, requests



# Input: few-shot examples (English). Output: the same examples with Wikidata IDs appended.
INPUT_FILE  = "/content/drive/MyDrive/master_thesis/data/fewshot_examples/factual/few_shots/factual_en.json"
OUTPUT_FILE = "/content/drive/MyDrive/master_thesis/data/fewshot_examples/factual/few_shots/wikidata_id_factual_en.json"
# Default language code passed to the Wikidata entity search.
LANG        = "en"
# Maximum number of request attempts per label.
MAX_RETRY   = 3


def wikidata_id_shots(label: str, lang: str = LANG) -> str | None:
    """
    Return the Wikidata Q-ID of the item whose label exactly matches *label*.

    The label is looked up with the ``wbsearchentities`` API action (items
    only, at most 10 results). A hit counts as a match when its label equals
    *label* after stripping leading/trailing whitespace and ignoring case.

    Args:
        label: Entity label to search for.
        lang: Language code used for the search (defaults to ``LANG``).

    Returns:
        The Q-ID of the first exactly matching hit, or ``None`` when the search
        returns no hits, no hit matches exactly, or all ``MAX_RETRY`` attempts
        failed.
    """
    base = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbsearchentities",
        "search": label,
        "language": lang,
        "format": "json",
        "type": "item",
        "limit": 10,
        "origin": "*"
    }
    url = f"{base}?{urllib.parse.urlencode(params)}"

    for attempt in range(1, MAX_RETRY + 1):
        try:
            resp = requests.get(url, timeout=30)
            # Surface HTTP errors as exceptions so they go through the retry path.
            resp.raise_for_status()
            hits = resp.json().get("search", [])

            canonical = label.strip().lower()
            for h in hits:
                if h.get("label", "").strip().lower() == canonical:
                    return h["id"]
            # A successful response is a final answer, even when it is empty:
            # asking again would only repeat the same request.
            return None
        except Exception as e:
            if attempt == MAX_RETRY:
                # Only report the error once all attempts are used up.
                print(f"{label}: {e}")
            else:
                # Linearly growing pause before the next attempt.
                time.sleep(1.5 * attempt)
    return None


# Load the few-shot examples: a mapping whose values are lists of [subject, object] entries.
data = json.loads(pathlib.Path(INPUT_FILE).read_text(encoding="utf-8"))

for relation in data:
    for el in data[relation]:
        s_label = el[0]
        o_label = el[1]

        # Use the lookup defined in this cell; calling `wikidata_id` here only worked
        # when an earlier cell had already been run in the same kernel.
        # The IDs are appended as two one-key dicts, i.e. at positions 2 and 3 of the entry.
        el.append({"subject_id": wikidata_id_shots(s_label) or "N/A"})
        el.append({"object_id": wikidata_id_shots(o_label) or "N/A"})

        # Fixed pause after each entry (two API lookups) - presumably to throttle requests.
        time.sleep(5)


pathlib.Path(OUTPUT_FILE).write_text(
    json.dumps(data, ensure_ascii=False, indent=2), "utf-8"
)
print(f" Datei mit IDs geschrieben: {OUTPUT_FILE}")

## Multilinguale Übersetzung der Few-Shot-Beispiele mit Wikidata (factual data)

In [None]:
"""
Wikidata label exporter - multilingual version

This script reads a few-shot JSON file in the format used by the master's
thesis and writes one new file per target language, in which the subject and
object labels appear according to the following rules:

1. FULL_TRANSLATE  ->  always subject and object in all 7 languages
2. PARTIAL_TRANSLATE
   - For Hindi (hi) & Thai (th):  always translate both labels
   - For de, fr, it, pt, es:      rules from the PARTIAL_TRANSLATE dict
       - True   -> always translate
       - False  -> always use the English fallback
3. NO_TRANSLATE    ->  translate only for hi & th; otherwise keep English

If no Q-ID entry exists or Wikidata provides no label, an empty string ("")
is written.

"""

from __future__ import annotations

import json
import requests
from pathlib import Path
from time import sleep
from typing import Dict, List, Tuple, Set


# Few-shot file whose entries carry "subject_id" / "object_id" dicts at positions 2 and 3.
INPUT_FILE = Path(
    "/content/drive/MyDrive/master_thesis/data/fewshot_examples/factual/few_shots/wikidata_id_factual_en.json"
)
# Path stem for the outputs: "_<lang>.json" is appended to its last component for each language.
OUTPUT_TEMPLATE = Path(
    "/content/drive/MyDrive/master_thesis/data/fewshot_examples/factual/wikidata_translation"
)
# Target languages; one output file is written per entry.
LANGS: List[str] = ["de", "fr", "it", "pt", "hi", "es", "th"]

# Timeout in seconds for each HTTP request to Wikidata.
MAX_TIMEOUT = 15
# Pause in seconds after each Wikidata request.
API_PAUSE_S = 3

#  Translation rules per relation
# Relations whose subject and object are always taken in the target language.
FULL_TRANSLATE: List[str] = [
    "city_in_country",      # city | country
    "country_capital_city", # country | capital city
    "country_currency",     # country | currency
    "country_language",     # country | official language
    "country_largest_city", # country | largest city
    "food_from_country",    # dish | country
    "landmark_in_country",  # landmark | country
    "landmark_on_continent",# landmark | continent
]

# Relation -> (subject_rule, object_rule), evaluated by should_translate for non-hi/th languages:
# True = translate, False = keep English, "maybe" = translate only if a label is cached.
PARTIAL_TRANSLATE: Dict[str, Tuple[str | bool, str | bool]] = {
    "company_hq":                     (False, True),
    "person_occupation":              (False, True),
    "person_plays_instrument":        (False, True),
    "person_plays_position_in_sport": (False, True),
    "person_plays_pro_sport":         (False, True),
    "person_university":              (False, True),
    "star_constellation":             (False, True),
}

# Relations that are only translated for hi/th; all other languages keep the English label.
NO_TRANSLATE: List[str] = [
    "company_ceo",
    "person_band_lead_singer",
    "person_father",
    "person_mother",
    "pokemon_evolutions",
    "presidents_birth_year",
    "presidents_election_year",
    "product_by_company",
    "superhero_archnemesis",
    "superhero_person",
]

# NOTE(review): LATIN_LANGS is not referenced anywhere else in the visible code.
LATIN_LANGS: Set[str] = {"de", "fr", "it", "pt", "es"}
# Languages for which should_translate localizes PARTIAL_TRANSLATE and NO_TRANSLATE relations.
ASIAN_LANGS: Set[str] = {"hi", "th"}

# Read the few-shot file: relation name -> list of entries.
data_in: Dict[str, List[List]] = json.loads(INPUT_FILE.read_text(encoding="utf-8"))

# Every entry stores its IDs as dicts at positions 2 (subject) and 3 (object).
candidate_ids = (
    entry_id
    for rows in data_in.values()
    for row in rows
    for entry_id in (row[2].get("subject_id", ""), row[3].get("object_id", ""))
)

# Keep only real Q-IDs; placeholders and empty strings are dropped.
all_qids: Set[str] = {entry_id for entry_id in candidate_ids if entry_id.startswith("Q")}
print(f" Sammle Labels für {len(all_qids)} eindeutige Q‑IDs …")

#  Fetch all labels from Wikidata (one call per Q-ID)
ALL_LANGS = LANGS + ["en"]

# Start with an empty label for every language so that later lookups never miss a key.
label_cache: Dict[str, Dict[str, str]] = {
    qid: dict.fromkeys(ALL_LANGS, "") for qid in all_qids
}

for qid in all_qids:
    entity_url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
    try:
        response = requests.get(entity_url, timeout=MAX_TIMEOUT)
        response.raise_for_status()
        entity_labels = response.json()["entities"][qid]["labels"]
        # Overwrite the empty defaults only for languages Wikidata actually has.
        label_cache[qid].update(
            (code, entity_labels[code]["value"])
            for code in ALL_LANGS
            if code in entity_labels
        )
    except Exception as exc:
        # Best effort: report the failure and keep the empty defaults for this Q-ID.
        print(f"  {qid}: {exc}")
    sleep(API_PAUSE_S)

print("Alle Wikidata‑Aufrufe abgeschlossen.")

# Decision logic: should a label be translated into the target language?

def should_translate(rel: str, role: str, lang: str, qid: str) -> bool:
    """Decide whether the label for this relation/role/language should be localized.

    Args:
        rel: Relation name, looked up in FULL_TRANSLATE, PARTIAL_TRANSLATE and NO_TRANSLATE.
        role: "subject" selects the subject rule; any other value selects the object rule.
        lang: Target language code.
        qid: Q-ID of the entity; only consulted for the "maybe" rule.

    Returns:
        True if the target-language label should be used, False if the English
        label should be kept. Relations found in none of the three tables yield True.
    """
    # Fully translated relations need no further checks.
    if rel in FULL_TRANSLATE:
        return True

    if rel in PARTIAL_TRANSLATE:
        subject_flag, object_flag = PARTIAL_TRANSLATE[rel]
        flag = subject_flag if role == "subject" else object_flag

        # Hindi & Thai are always localized, whatever the flag says.
        if lang in ASIAN_LANGS:
            return True

        # Boolean flags are the answer themselves.
        if flag is True or flag is False:
            return flag

        # "maybe": translate only when a non-empty label is cached for this language.
        if flag == "maybe":
            return bool(label_cache.get(qid, {}).get(lang))

        # Any other flag value means: do not translate.
        return False

    # Untranslated relations are still localized for hi/th.
    if rel in NO_TRANSLATE:
        return lang in ASIAN_LANGS

    return True


def get_label(qid: str, lang: str) -> str:
    """Look up the cached label of *qid* in *lang*; unknown IDs or languages give ""."""
    labels_for_entity = label_cache.get(qid, {})
    return labels_for_entity.get(lang, "")

# Build and write one output file per target language.
for lang in LANGS:
    data_out: Dict[str, List[List[str]]] = {}

    for rel, samples in data_in.items():
        rows_out: List[List[str]] = []

        for entry in samples:
            # IDs sit in one-key dicts at positions 2 (subject) and 3 (object).
            ids_by_role = {
                "subject": entry[2].get("subject_id", ""),
                "object": entry[3].get("object_id", ""),
            }

            pair: List[str] = []
            for role, entity_id in ids_by_role.items():
                if not entity_id:
                    # No ID at all -> empty label.
                    pair.append("")
                    continue
                # Target-language label where the rules ask for it, English label otherwise.
                label_lang = lang if should_translate(rel, role, lang, entity_id) else "en"
                pair.append(get_label(entity_id, label_lang))

            rows_out.append(pair)

        data_out[rel] = rows_out

    out_path = OUTPUT_TEMPLATE.with_name(f"{OUTPUT_TEMPLATE.name}_{lang}.json")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(
        json.dumps(data_out, ensure_ascii=False, indent=2), encoding="utf-8"
    )

    print(f" {lang.upper()}‑Datei geschrieben → {out_path}")

print("\n Fertig!  Alle sieben Sprachdateien liegen im Zielordner.")


## Multilinguale Übersetzung der Samples mit Wikidata (factual data)

In [None]:

"""
Note only - this string is not used by the code below.
NOTE(review): presumably the relations this cell is meant to be run for,
one input file at a time - confirm.

FULL_TRANSLATE = [
    "city_in_country",
    "country_capital_city",
    "country_currency",
    "country_language",
    "country_largest_city",
]
"""
import json
import requests
from time import sleep

# Input file (samples with Wikidata IDs) and output file (samples plus translated labels)
INPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata/country_currency_with_id.json"
OUTPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata_translation/country_currency_wikidata_translated.json"

import json
import pathlib
import requests
from time import sleep


# Target language codes mapped to their English names; this cell only uses the keys.
TARGET_LANGS = {
    "de": "German",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "hi": "Hindi",
    "es": "Spanish",
    "th": "Thai",
}

# Pause in seconds after each processed sample.
API_PAUSE_S = 3
# Timeout in seconds for each HTTP request.
MAX_TIMEOUT = 15

# In-memory cache filled by get_wikidata_full_translation: Q-ID -> {language code: label}.
_label_cache: dict[str, dict[str, str]] = {}

def get_wikidata_full_translation(qid: str, langs: list[str]) -> dict[str, str]:
    """
    Return a dict ``{lang: label}`` for the requested languages of one entity.

    Results are kept in the in-memory ``_label_cache`` so that the same Q-ID is
    fetched from Wikidata only once.

    Args:
        qid: Wikidata Q-ID of the entity.
        langs: Language codes whose labels are wanted.

    Returns:
        One entry per requested language; languages without a label map to "".
        An empty dict is returned when the request fails (the failure is
        printed and not cached, so a later call tries again).
    """
    if qid not in _label_cache:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"

        try:
            resp = requests.get(url, timeout=MAX_TIMEOUT)
            resp.raise_for_status()
            # `or {}` also covers a response that carries no labels at all.
            labels = resp.json()["entities"][qid]["labels"] or {}
            # Cache every label, not just the requested languages: otherwise a
            # later call asking for other languages would hit the cache and
            # wrongly get "" although Wikidata has a label.
            _label_cache[qid] = {code: entry["value"] for code, entry in labels.items()}
        except Exception as e:
            print(f"Error fetching {qid}: {e}")
            return {}

    cached = _label_cache[qid]
    # Same shape for cache hits and fresh fetches.
    return {lang: cached.get(lang, "") for lang in langs}

# Load the samples
with open(INPUT_FILE, encoding="utf-8") as src:
    data = json.loads(src.read())

# Add "<role>_<lang>" labels for subject and object in every target language.
target_codes = list(TARGET_LANGS.keys())
for sample in data["samples"]:
    for role in ("subject", "object"):
        entity_id = sample.get(f"{role}_id")

        # Only real Q-IDs are looked up; anything else gets empty labels.
        if not (entity_id and entity_id.startswith("Q")):
            labels = dict.fromkeys(target_codes, "")
        else:
            labels = get_wikidata_full_translation(entity_id, list(target_codes))

        for code in target_codes:
            sample[f"{role}_{code}"] = labels.get(code, "")

    sleep(API_PAUSE_S)

# Save the enriched samples
with open(OUTPUT_FILE, "w", encoding="utf-8") as dst:
    dst.write(json.dumps(data, ensure_ascii=False, indent=2))

# Optional preview; skipped when pandas is not installed
try:
    import pandas as pd
    preview = pd.DataFrame(data["samples"]).head()
    print(preview)
except ImportError:
    pass




In [None]:
import json
import requests
from time import sleep




# Input file (samples with Wikidata IDs) and output file (samples plus translated labels)
INPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata/person_university_with_id.json"
OUTPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata_translation/person_university_wikidata_translated.json"

# Target language codes mapped to their English names; this cell only uses the keys.
TARGET_LANGS = {
    "de": "German",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "hi": "Hindi",
    "es": "Spanish",
    "th": "Thai",
}

# Pause in seconds after each processed sample.
API_PAUSE_S = 3
# Timeout in seconds for each HTTP request.
MAX_TIMEOUT = 15

# In-memory cache filled by get_wikidata_partial_translation: Q-ID -> {language code: label}.
_label_cache: dict[str, dict[str, str]] = {}

def get_wikidata_partial_translation(qid: str, langs: list[str]) -> dict[str, str]:
    """
    Return a dict ``{lang: label}`` for the requested languages of one entity.

    Results are kept in the in-memory ``_label_cache`` so that the same Q-ID is
    fetched from Wikidata only once.

    Args:
        qid: Wikidata Q-ID of the entity.
        langs: Language codes whose labels are wanted.

    Returns:
        One entry per requested language; languages without a label map to "".
        An empty dict is returned when the request fails (the failure is
        printed and not cached, so a later call tries again).
    """
    if qid not in _label_cache:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
        try:
            resp = requests.get(url, timeout=MAX_TIMEOUT)
            resp.raise_for_status()
            # `or {}` also covers a response that carries no labels at all.
            labels = resp.json()["entities"][qid]["labels"] or {}
            # Cache every label, not just the requested languages. This function
            # is called with ["hi", "th"] for subjects and with all target
            # languages for objects; caching only the first request's languages
            # would return "" for the rest on a later cache hit.
            _label_cache[qid] = {code: entry["value"] for code, entry in labels.items()}
        except Exception as e:
            print(f"Error fetching {qid}: {e}")
            return {}

    cached = _label_cache[qid]
    # Same shape for cache hits and fresh fetches.
    return {lang: cached.get(lang, "") for lang in langs}

# Load JSON
with open(INPUT_FILE, encoding="utf-8") as src:
    data = json.loads(src.read())

# Process samples
for sample in data["samples"]:
    # Subject: only hi and th come from Wikidata, every other language keeps the English value
    subject_qid = sample.get("subject_id")
    if subject_qid and subject_qid.startswith("Q"):
        subject_labels = get_wikidata_partial_translation(subject_qid, ["hi", "th"])
    else:
        subject_labels = {"hi": "", "th": ""}
    for code in TARGET_LANGS:
        if code not in {"hi", "th"}:
            sample[f"subject_{code}"] = sample["subject"]  # keep English
        else:
            sample[f"subject_{code}"] = subject_labels.get(code, "")

    # Object: translated into all target languages
    object_qid = sample.get("object_id")
    if object_qid and object_qid.startswith("Q"):
        object_labels = get_wikidata_partial_translation(object_qid, list(TARGET_LANGS.keys()))
    else:
        object_labels = dict.fromkeys(TARGET_LANGS, "")
    for code in TARGET_LANGS:
        sample[f"object_{code}"] = object_labels.get(code, "")

    sleep(API_PAUSE_S)

# Save JSON
with open(OUTPUT_FILE, "w", encoding="utf-8") as dst:
    dst.write(json.dumps(data, ensure_ascii=False, indent=2))

# Preview; skipped when pandas is not installed
try:
    import pandas as pd
    preview = pd.DataFrame(data["samples"]).head()
    print(preview)
except ImportError:
    pass

           subject                      object subject_id object_id  \
0   Michelle Obama        Princeton University     Q13133    Q21578   
1       Bill Gates          Harvard University      Q5284    Q13371   
2  Mark Zuckerberg          Harvard University     Q36215    Q13371   
3    Oprah Winfrey  Tennessee State University     Q55800  Q1782948   
4      Emma Watson            Brown University     Q39476    Q49114   

        subject_de       subject_fr       subject_it       subject_pt  \
0   Michelle Obama   Michelle Obama   Michelle Obama   Michelle Obama   
1       Bill Gates       Bill Gates       Bill Gates       Bill Gates   
2  Mark Zuckerberg  Mark Zuckerberg  Mark Zuckerberg  Mark Zuckerberg   
3    Oprah Winfrey    Oprah Winfrey    Oprah Winfrey    Oprah Winfrey   
4      Emma Watson      Emma Watson      Emma Watson      Emma Watson   

         subject_hi       subject_es            subject_th  \
0       मिशेल ओबामा   Michelle Obama        มิเชลล์ โอบามา   
1         

In [None]:
import json
import requests
from time import sleep

# Input file (samples with Wikidata IDs) and output file (samples plus translated labels)
INPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata/pokemon_evolutions_with_id.json"
OUTPUT_FILE = "/content/drive/MyDrive/master_thesis/data/factual_data/wikidata_translation/pokemon_evolutions_wikidata_translated.json"

# Target language codes mapped to their English names; this cell only uses the keys.
TARGET_LANGS = {
    "de": "German",
    "fr": "French",
    "it": "Italian",
    "pt": "Portuguese",
    "hi": "Hindi",
    "es": "Spanish",
    "th": "Thai",
}

# Pause in seconds after each processed sample.
API_PAUSE_S = 3
# Timeout in seconds for each HTTP request.
MAX_TIMEOUT = 15

# In-memory cache filled by get_wikidata_thai_hindi_translation: Q-ID -> {language code: label}.
_label_cache: dict[str, dict[str, str]] = {}

def get_wikidata_thai_hindi_translation(qid: str, langs: list[str]) -> dict[str, str]:
    """
    Return a dict ``{lang: label}`` for the requested languages of one entity.

    Results are kept in the in-memory ``_label_cache`` so that the same Q-ID is
    fetched from Wikidata only once.

    Args:
        qid: Wikidata Q-ID of the entity.
        langs: Language codes whose labels are wanted.

    Returns:
        One entry per requested language; languages without a label map to "".
        An empty dict is returned when the request fails (the failure is
        printed and not cached, so a later call tries again).
    """
    if qid not in _label_cache:
        url = f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json"
        try:
            resp = requests.get(url, timeout=MAX_TIMEOUT)
            resp.raise_for_status()
            # `or {}` also covers a response that carries no labels at all.
            labels = resp.json()["entities"][qid]["labels"] or {}
            # Cache every label, not just the requested languages, so a later
            # call with a different language list is still answered correctly.
            _label_cache[qid] = {code: entry["value"] for code, entry in labels.items()}
        except Exception as e:
            print(f"Error fetching {qid}: {e}")
            return {}

    cached = _label_cache[qid]
    # Same shape for cache hits and fresh fetches.
    return {lang: cached.get(lang, "") for lang in langs}

# Load JSON
with open(INPUT_FILE, encoding="utf-8") as src:
    data = json.loads(src.read())

# Walk through the samples and set the per-language labels
for sample in data["samples"]:
    for role in ("subject", "object"):
        entity_id = sample.get(f"{role}_id")

        # Only hi and th are translated
        localized = ["hi", "th"]
        if entity_id and entity_id.startswith("Q"):
            labels = get_wikidata_thai_hindi_translation(entity_id, localized)
        else:
            labels = dict.fromkeys(localized, "")

        for code in TARGET_LANGS:
            if code not in localized:
                sample[f"{role}_{code}"] = sample[role]  # original English value
            else:
                sample[f"{role}_{code}"] = labels.get(code, "")

    sleep(API_PAUSE_S)

# Save the result
with open(OUTPUT_FILE, "w", encoding="utf-8") as dst:
    dst.write(json.dumps(data, ensure_ascii=False, indent=2))

# Preview; skipped when pandas is not installed
try:
    import pandas as pd
    preview = pd.DataFrame(data["samples"]).head()
    print(preview)
except ImportError:
    pass

      subject      object subject_id object_id  subject_de  subject_fr  \
0   Bulbasaur     Ivysaur    Q847571  Q1636903   Bulbasaur   Bulbasaur   
1  Charmander  Charmeleon   Q3178753  Q1637365  Charmander  Charmander   
2    Squirtle   Wartortle    Q845294  Q1752151    Squirtle    Squirtle   
3     Pikachu      Raichu      Q9351  Q1647331     Pikachu     Pikachu   
4      Oddish       Gloom   Q2002874  Q5571265      Oddish      Oddish   

   subject_it  subject_pt subject_hi  subject_es   subject_th   object_de  \
0   Bulbasaur   Bulbasaur              Bulbasaur  ฟุชิงิดาเนะ     Ivysaur   
1  Charmander  Charmander             Charmander   ฮิโตะคาเงะ  Charmeleon   
2    Squirtle    Squirtle               Squirtle    เซนิกาเมะ   Wartortle   
3     Pikachu     Pikachu     पिकाचु     Pikachu       พิคาชู      Raichu   
4      Oddish      Oddish                 Oddish                    Gloom   

    object_fr   object_it   object_pt object_hi   object_es object_th  
0     Ivysaur     Iv

## Testen der Entitäten oder IDs in Wikidata

In [None]:
# Manual check: list the Wikidata search hits (ID, label, description) for a single term

import requests, urllib.parse, json
# Term to look up; another term tried before: Bengaluru
term = "Microsoft"

url = ("https://www.wikidata.org/w/api.php?"
       "action=wbsearchentities&format=json&language=en&type=item&limit=10&"
       f"search={urllib.parse.quote(term)}")

# A timeout keeps the cell from hanging forever, and HTTP errors are raised
# instead of surfacing later as a confusing JSON/KeyError.
resp = requests.get(url, timeout=15)
resp.raise_for_status()
hits = resp.json()["search"]

for h in hits:
    descr = h.get("description", "").lower()
    # Hits may lack a label, so fall back to "" like the lookup functions above do.
    print(h["id"], "→", h.get("label", ""), "|", descr)

Q2283 → Microsoft | american multinational technology corporation
Q1406 → Microsoft Windows | family of computer operating systems developed by microsoft
Q135288 → Microsoft Store | digital distribution platform from microsoft
Q132020 → Xbox | video game console by microsoft
Q11215 → Windows 7 | personal computer operating system by microsoft that was released in 2009
Q11272 → Microsoft Excel | spreadsheet editor, part of microsoft 365
Q11230 → Windows Vista | personal computer operating system by microsoft that was released in 2007
Q5046 → Windows 8 | personal computer operating system by microsoft that was released in 2012
Q60683589 → Microsoft Academic Knowledge Graph | rdf representation of the microsoft academic graph
Q83370 → Windows 95 | personal computer operating system by microsoft
