# Datensatz erweitern

In [None]:
# Colab-only: mount Google Drive; every data path used in this notebook
# lives under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Zero-Shot Templates erweitern

In [None]:
import os
import json
import itertools
import random

In [None]:
"""Auf die original Fatcual Data sind noch zwei andere Zero Shots Template hinzugefügt """


# Folder with the factual relation files (*.json); the loop further down in
# this cell rewrites these files in place with the additional templates.
input_folder_factual = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/new_factual_test/new_factual_test"




# Additional zero-shot prompt templates per factual relation, keyed by the
# lower-cased relation name. "{}" marks the position of the subject.
# NOTE(review): several templates end in a trailing space, the father/mother
# templates contain a space before "’s", and the first "company ceo" template
# reads like a question. The strings are deliberately left untouched because
# they may already have been written to the data files - confirm before
# normalising them.
_FACTUAL_TEMPLATE_CANDIDATES = {
    "country largest city": ["The most populous city in {} is", "The most well-known city in {} is"],
    "food from country": ["{} is a food from", "{} is a traditional dish from"],
    "landmark in country": ["The landmark {} is located in", "{} is a landmark in"],
    "landmark on continent": ["{} is a landmark on", "{} can be found on the continent of"],
    "person lead singer of band": ["{} performs as the lead singer of", "{} is the lead vocalist of"],
    "person father": ["The name of {} ’s father is", "The father of {} is called"],
    "person mother": ["The name of {} ’s mother is", "The mother of {} is called"],
    "person occupation": ["{} works in the profession of ", "{} is employed as a"],
    "person plays instrument": ["{} is an instrumentalist on the", "{} performs with the"],
    "person sport position": ["{} is positioned as a", "{} plays the role of a"],
    "plays pro sport": ["{} is a professional athlete in", "{} competes in the sport of"],
    "person university": ["{} completed studies at", "{} graduated from"],
    "pokemon evolution": ["{} transforms into ", "The evolution of {} is"],
    "president birth year": ["{} came into the world in ", "{} was born during the year"],
    "president election year": ["{} secured election in ", "{} won presidency in"],
    "product by company": ["{} is a product developed by ", "{} was built by"],
    "star constellation name": ["{} belongs to the star group ", "{} is in the constellation of"],
    "superhero archnemesis": ["{} is the main enemy of ", "{}'s nemesis is "],
    "superhero person": ["{}'s true identity is", "The real name of {} is"],
    "city in country": ["{} exists within the country of", "{} is part of the nation called"],
    "company ceo": ["The CEO is in charge of {}?", "{} is led by CEO"],
    "company hq": ["{} has headquarters in", "{} operates from"],
    "country capital city": ["{} serves as the capital city of", "{} is officially the capital of"],
    "country currency": ["The currency used in {} is the", "{} uses the currency called the"],
    "country language": ["The official language of {} is", "{}'s main language is"],
}


def auto_generate_templates(relation_name: str, existing_templates: list) -> list:
    """Return the extra prompt templates for a relation that are not present yet.

    Args:
        relation_name: Name of the relation; matched case-insensitively and
            ignoring surrounding whitespace.
        existing_templates: Templates already stored for the relation
            (``None`` is treated like an empty list).

    Returns:
        At most two templates from the built-in candidate table that do not
        occur in *existing_templates*. The comparison ignores case and
        surrounding whitespace, matching the duplicate check used for the
        linguistic templates. An empty list is returned for unknown relations
        (previously ``None``; both are falsy for the caller's ``if`` check).
    """
    candidates = _FACTUAL_TEMPLATE_CANDIDATES.get(relation_name.strip().lower())
    if not candidates:
        return []

    already_present = {t.strip().lower() for t in existing_templates or []}
    fresh = [t for t in candidates if t.strip().lower() not in already_present]
    return fresh[:2]




# Go through every relation file in the factual folder and append the
# additional templates that the file does not contain yet.
for filename in os.listdir(input_folder_factual):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(input_folder_factual, filename)
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    relation_name = data.get("name", "").strip()
    existing_templates = data.get("prompt_templates", [])

    # Candidate templates that are not part of the file yet
    new_templates = auto_generate_templates(relation_name, existing_templates)

    if not new_templates:
        print(f"Keine neuen Templates nötig für '{filename}'")
    else:
        # Append in place and write the relation file back to the same path
        templates = data.setdefault("prompt_templates", [])
        templates.extend(new_templates)

        serialized = json.dumps(data, ensure_ascii=False, indent=2)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(serialized)

        print(f"Neue Templates zu Datei '{filename}' hinzugefügt: {new_templates}")

print("Alle Dateien wurden überprüft und ggf. aktualisiert.")




In [None]:
"""Auf die original Linguistic Data sind noch zwei andere Zero Shots Template hinzugefügt """
# Folder with the linguistic relation files (*.json) that the loop below
# updates in place.
input_folder_linguistic = "/content/drive/MyDrive/master_thesis/data/linguistic_data/new_linguistic_test"

# Two additional zero-shot templates per linguistic relation, keyed by the
# lower-cased relation name; "{}" marks the position of the subject word.
# NOTE(review): some templates read like questions or instructions rather
# than sentence prefixes (e.g. "is the antonym of {}?") - confirm this is
# intended for the completion-style evaluation.
relation_templates = {
    "adjective antonym": [
        "{} has the antonym",
        "is the antonym of {}?"
    ],
    "adjective comparative": [
        "{} becomes in its comparative form",
        "Give the comparative form of {}"
    ],
    "adjective superlative": [
        "{} becomes in its superlative form",
        "Give the superlative form of {}"
    ],
    "verb past tense": [
        "{} becomes in past tense",
        "The word {} changes to in the past tense"
    ],
    "word first letter": [
        "{} begin with letter?",
        "Identify the first letter of the word {}."
    ],
    "word last letter": [
        "{} end with letter?",
        "Identify the last letter of the word {}."
    ]
}


# Process every relation file in the linguistic folder
for filename in os.listdir(input_folder_linguistic):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(input_folder_linguistic, filename)
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    relation_name = data.get("name", "").strip().lower()

    # Relations without an entry in the template table are only reported
    candidates = relation_templates.get(relation_name)
    if candidates is None:
        print(f"Keine neuen Templates definiert für '{relation_name}'")
        continue

    # Compare case-insensitively and ignoring surrounding whitespace
    existing = {t.strip().lower() for t in data.get("prompt_templates", [])}
    new_templates = [t for t in candidates if t.strip().lower() not in existing]

    if not new_templates:
        print(f"Alle neuen Templates für '{filename}' bereits vorhanden.")
        continue

    # Append in place and write the relation file back to the same path
    templates = data.setdefault("prompt_templates", [])
    templates.extend(new_templates)

    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Neue Templates zu '{filename}' hinzugefügt: {new_templates}")

print("Alle Dateien wurden überprüft und aktualisiert.")


Neue Templates zu 'word_last_letter.json' hinzugefügt: ['{} end with letter?', 'Identify the last letter of the word {}.']
Neue Templates zu 'adj_antonym.json' hinzugefügt: ['{} has the antonym', 'is the antonym of {}?']
Neue Templates zu 'adj_comparative.json' hinzugefügt: ['{} becomes in its comparative form', 'Give the comparative form of {}']
Neue Templates zu 'adj_superlative.json' hinzugefügt: ['{} becomes in its superlative form', 'Give the superlative form of {}']
Neue Templates zu 'verb_past_tense.json' hinzugefügt: ['{} becomes in past tense', 'The word {} changes to in the past tense']
Neue Templates zu 'word_first_letter.json' hinzugefügt: ['{} begin with letter?', 'Identify the first letter of the word {}.']
Alle Dateien wurden überprüft und aktualisiert.


## Few Shots aufbauen


In [None]:
 # factual

import os
import json
import re
from typing import Optional

def normalize(text: str) -> str:
    """Lower-case *text* and join its runs of ``a-z``/``0-9`` with single spaces.

    Every other character (underscores, punctuation, whitespace, non-ASCII
    letters) acts as a separator, e.g. ``"city_in_country"`` becomes
    ``"city in country"``.
    """
    words = re.findall(r"[a-z0-9]+", text.lower())
    return " ".join(words)

# Hand-picked (subject, object) few-shot pairs for the English factual
# relations, keyed by the normalized relation name. build_fewshot_files takes
# the first `shot_count` pairs that do not also occur in a relation's samples.
#
# Stray whitespace inside four strings ("Aaron Judge ", "Adversary ",
# "Eternals ", "Kit  Walker") was removed: it ended up verbatim in the
# generated prompts and defeated the case-insensitive duplicate check against
# the samples.
#
# NOTE(review): the keys follow the file-style names (e.g. "person band lead
# singer", "pokemon evolutions", "star constellation"), while
# auto_generate_templates matches names such as "person lead singer of band",
# "pokemon evolution" and "star constellation name". The lookup uses
# normalize(data["name"]) - verify that the relation names in the input files
# really match these keys.
fewshot_examples = {
    normalize("city_in_country"): [
        ("Paris", "France"),
        ("Tokyo", "Japan"),
        ("Berlin", "Germany"),
        ("Toronto", "Canada"),
        ("Madrid", "Spain"),
        ("Lisbon", "Portugal"),
        ("Cairo", "Egypt"),
        ("Rome", "Italy"),
        ("Seoul", "South Korea"),
        ("Athens", "Greece"),
    ],
    normalize("company CEO"): [
        ("YouTube", "Neal Mohan"),
        ("Stripe", "Patrick Collison"),
        ("Spotify", "Daniel Ek"),
        ("Airbnb", "Brian Chesky"),
        ("OpenAI", "Sam Altman"),
        ("BioNTech", "Uğur Şahin"),
        ("Klarna", "Sebastian Siemiatkowski"),
        ("TikTok", "Shou Zi Chew"),
        ("IKEA", "Jesper Brodin"),
        ("Revolut", "Nikolay Storonsky"),
    ],
    normalize("country_capital_city"): [
        ("Norway", "Oslo"),
        ("Finland", "Helsinki"),
        ("Denmark", "Copenhagen"),
        ("Netherlands", "Amsterdam"),
        ("Switzerland", "Bern"),
        ("Austria", "Vienna"),
        ("Belgium", "Brussels"),
        ("Czech Republic", "Prague"),
        ("Ireland", "Dublin"),
        ("Portugal", "Lisbon"),
    ],
    normalize("company_hq"): [
        ("Apple", "Cupertino"),
        ("Microsoft", "Redmond"),
        ("Tesla", "Austin"),
        ("Amazon", "Seattle"),
        ("Spotify", "Stockholm"),
        ("Rolls-Royce", "London"),
        ("Heineken", "Amsterdam"),
        ("SAP", "Walldorf"),
        ("KONE", "Espoo"),
        ("Ferrari", "Maranello"),
    ],
    normalize("country_currency"): [
        ("Egypt", "Pound"),
        ("Vietnam", "Dong"),
        ("Nigeria", "Naira"),
        ("Colombia", "Peso"),
        ("Ukraine", "Hryvnia"),
        ("Kenya", "Shilling"),
        ("Bangladesh", "Taka"),
        ("Chile", "Peso"),
        ("Pakistan", "Rupee"),
        ("Morocco", "Dirham"),
    ],
    normalize("country_language"): [
        ("Sweden", "Swedish"),
        ("Norway", "Norwegian"),
        ("Finland", "Finnish"),
        ("Netherlands", "Dutch"),
        ("Switzerland", "German"),
        ("Belgium", "Dutch"),
        ("Austria", "German"),
        ("Greece", "Greek"),
        ("Thailand", "Thai"),
        ("Vietnam", "Vietnamese"),
    ],
    normalize("country_largest_city"): [
        ("Norway", "Oslo"),
        ("Finland", "Helsinki"),
        ("Egypt", "Cairo"),
        ("Thailand", "Bangkok"),
        ("Vietnam", "Ho Chi Minh City"),
        ("Indonesia", "Jakarta"),
        ("Philippines", "Quezon City"),
        ("Iraq", "Baghdad"),
        ("Ukraine", "Kyiv"),
        ("Colombia", "Bogotá"),
    ],
    normalize("food_from_country"): [
        ("Souvlaki", "Greece"),
        ("Ratatouille", "France"),
        ("Kebab", "Turkey"),
        ("Khachapuri", "Georgia"),
        ("Brigadeiro", "Brazil"),
        ("Gazpacho", "Spain"),
        ("Moules-frites", "Belgium"),
        ("Poffertjes", "Netherlands"),
        ("Tajine", "Morocco"),
        ("Picarones", "Peru"),
    ],
    normalize("landmark_in_country"): [
        ("Eiffel Tower", "France"),
        ("Statue of Liberty", "United States"),
        ("Christ the Redeemer", "Brazil"),
        ("Big Ben", "United Kingdom"),
        ("Great Wall of China", "China"),
        ("Sydney Opera House", "Australia"),
        ("Taj Mahal", "India"),
        ("Mount Fuji", "Japan"),
        ("Burj Khalifa", "United Arab Emirates"),
        ("Colosseum", "Italy"),
    ],
    normalize("landmark_on_continent"): [
        ("Eiffel Tower", "Europe"),
        ("Statue of Liberty", "North America"),
        ("Big Ben", "Europe"),
        ("Table Mountain", "Africa"),
        ("Mount Fuji", "Asia"),
        ("Machu Picchu", "South America"),
        ("Sydney Opera House", "Australia"),
        ("Colosseum", "Europe"),
        ("Mount Erebus", "Antarctica"),
        ("Uluru", "Australia"),
    ],
    normalize("person_band_lead_singer"): [
        ("Hayley Williams", "Paramore"),
        ("Brandon Flowers", "The Killers"),
        ("Debbie Harry", "Blondie"),
        ("Thom Yorke", "Radiohead"),
        ("Amy Lee", "Evanescence"),
        ("Matt Bellamy", "Muse"),
        ("James Murphy", "LCD Soundsystem"),
        ("Win Butler", "Arcade Fire"),
        ("Shirley Manson", "Garbage"),
        ("Brittany Howard", "Alabama Shakes"),
    ],
    normalize("person_father"): [
        ("Hailey Bieber", "Stephen Baldwin"),
        ("Don Johnson", "Fredie Wayne Johnson"),
        ("Bronny James", "LeBron James"),
        ("Michael Wayans", "Damon Wayans"),
        ("Trinity Rodman", "Dennis Rodman"),
        ("Allison Williams", "Brian Williams"),
        ("Deacon Phillippe", "Ryan Phillippe"),
        ("Sofia Richie", "Lionel Richie"),
        ("Jesse Bongiovi", "Jon Bon Jovi"),
        ("John Owen Lowe", "Rob Lowe"),
    ],
    normalize("person_mother"): [
        ("Suri Cruise", "Katie Holmes"),
        ("Michelle Obama", "Marian Robinson"),
        ("Willow Smith", "Jada Pinkett Smith"),
        ("Ava Phillippe", "Reese Witherspoon"),
        ("Kaia Gerber", "Cindy Crawford"),
        ("Lila Grace Moss-Hack", "Kate Moss"),
        ("Ella Travolta", "Kelly Preston"),
        ("Bindi Irwin", "Terri Irwin"),
        ("Albert Einstein", "Pauline Einstein"),
        ("Recep Tayyip Erdoğan", "Tenzile Erdoğan"),
    ],
    # NOTE(review): Serena Williams appears twice (with different casing of
    # the occupation), so a 10-shot prompt repeats the same person. Left as
    # is because dropping one entry would leave fewer than 10 examples -
    # replace one of them with a new person.
    normalize("person_occupation"): [
        ("Albert Einstein", "physicist"),
        ("Leonardo DiCaprio", "actor"),
        ("Serena Williams", "Tennis Player"),
        ("Taylor Swift", "singer"),
        ("Emma Watson", "actress"),
        ("Cristiano Ronaldo", "footballer"),
        ("Serena Williams", "tennis player"),
        ("Keanu Reeves", "actor"),
        ("Stephen King", "author"),
        ("Natalie Portman", "actress"),
    ],
    normalize("person_plays_instrument"): [
        ("B.B. King", "guitar"),
        ("John Coltrane", "saxophone"),
        ("Elton John", "piano"),
        ("David Garrett", "violin"),
        ("Yuja Wang", "piano"),
        ("Tina Guo", "cello"),
        ("Louis Armstrong", "trumpet"),
        ("Herbie Hancock", "piano"),
        ("Ravi Shankar", "sitar"),
        ("Ringo Starr", "drums"),
    ],
    # This list holds 11 pairs; the last one is only used when an earlier
    # pair is filtered out as a duplicate of a sample.
    normalize("person_plays_position_in_sport"): [
        ("Patrick Mahomes", "quarterback"),
        ("Manuel Neuer", "goalkeeper"),
        ("Toni Kroos", "midfielder"),
        ("Yadier Molina", "catcher"),
        ("Ray Lewis", "linebacker"),
        ("Justin Verlander", "pitcher"),
        ("Marc-André Fleury", "goaltender"),
        ("Luka Modrić", "midfielder"),
        ("Lionel Messi", "forward"),
        ("Aaron Judge", "outfielder"),
        ("Ronald Acuña Jr.", "outfielder"),
    ],
    normalize("person_plays_pro_sport"): [
        ("Kylian Mbappé", "soccer"),
        ("Shohei Ohtani", "baseball"),
        ("Jayson Tatum", "basketball"),
        ("Luciana Aymar", "hockey"),
        ("Josh Allen", "football"),
        ("Jamie Dwyer", "hockey"),
        ("Aaron Judge", "baseball"),
        ("Novak Djokovic", "tennis"),
        ("Joel Embiid", "basketball"),
        ("Lautaro Martínez", "football"),
    ],
    normalize("person_university"): [
        ("Elon Musk", "University of Pennsylvania"),
        ("Angela Davis", "University of California, San Diego"),
        ("Tim Cook", "Duke University"),
        ("Jacinda Ardern", "University of Waikato"),
        ("Olaf Scholz", "University of Hamburg"),
        ("Stephen Colbert", "Northwestern University"),
        ("Alexandria Ocasio-Cortez", "Boston University"),
        ("Malala Yousafzai", "University of Oxford"),
        ("Sheryl Sandberg", "Harvard University"),
        ("Ken Jeong", "University of North Carolina at Chapel Hill"),
    ],
    normalize("pokemon evolutions"): [
        ("Eevee", "Vaporeon"),
        ("Eevee", "Jolteon"),
        ("Eevee", "Flareon"),
        ("Tyrogue", "Hitmonlee"),
        ("Tyrogue", "Hitmonchan"),
        ("Tyrogue", "Hitmontop"),
        ("Pichu", "Pikachu"),
        ("Magikarp", "Gyarados"),
        ("Remoraid", "Octillery"),
        ("Togepi", "Togetic"),
    ],
    normalize("presidents_birth_year"): [
        ("Dwight D. Eisenhower", "1890"),
        ("Theodore Roosevelt", "1858"),
        ("Woodrow Wilson", "1856"),
        ("William McKinley", "1843"),
        ("Herbert Hoover", "1874"),
        ("Grover Cleveland", "1837"),
        ("Ulysses S. Grant", "1822"),
        ("Abraham Lincoln", "1809"),
        ("Chester A. Arthur", "1829"),
        ("Benjamin Harrison", "1833"),
    ],
    # NOTE(review): the Grover Cleveland (1885) and Ulysses S. Grant (1869)
    # values look like inauguration years, whereas the other entries are
    # election years - confirm which convention the dataset uses.
    normalize("presidents_election_year"): [
        ("Dwight D. Eisenhower", "1952"),
        ("Theodore Roosevelt", "1904"),
        ("Woodrow Wilson", "1912"),
        ("William McKinley", "1896"),
        ("Herbert Hoover", "1928"),
        ("Grover Cleveland", "1885"),
        ("Ulysses S. Grant", "1869"),
        ("Abraham Lincoln", "1860"),
        ("Donald Trump", "2016"),
        ("Benjamin Harrison", "1888"),
    ],
    normalize("product_by_company"): [
        ("Kindle", "Amazon"),
        ("Echo Dot", "Amazon"),
        ("Galaxy S22", "Samsung"),
        ("PlayStation 5", "Sony"),
        ("ChatGPT", "OpenAI"),
        ("iPhone 15", "Apple"),
        ("Mi Band 6", "Xiaomi Inc."),
        ("ThinkPad X1 Carbon", "Lenovo"),
        ("Surface Pro 9", "Microsoft"),
        ("Nest Thermostat", "Google"),
    ],
    # NOTE(review): "constellation Eridanus" is the only object carrying the
    # word "constellation"; presumably it should read "Eridanus" - confirm.
    normalize("star constellation"): [
        ("Canopus", "Carina"),
        ("Achernar", "constellation Eridanus"),
        ("Mimosa", "Crux"),
        ("Alnitak", "Orion"),
        ("Rasalhague", "Ophiuchus"),
        ("Algorab", "Corvus"),
        ("Markab", "Pegasus"),
        ("Alpheratz", "Andromeda"),
        ("Kochab", "Ursa Minor"),
        ("Nunki", "Sagittarius"),
    ],
    normalize("superhero_archnemesis"): [
        ("Nova", "Annihilus"),
        ("Sentry", "The Void"),
        ("Batwoman", "Alice"),
        ("Shang-Chi", "Wenwu"),
        ("Forge", "Adversary"),
        ("Orion", "Darkseid"),
        ("Mister Miracle", "Granny Goodness"),
        ("Eternals", "Kro"),
        ("Ms. Marvel", "Najma"),
        ("Moon Knight", "Arthur Harrow"),
    ],
    normalize("superhero_person"): [
        ("Moon Knight", "Marc Spector"),
        ("Doctor Fate", "Kent Nelson"),
        ("Nova", "Richard Rider"),
        ("Captain Atom", "Nathaniel Adam"),
        ("Plastic Man", "Patrick O'Brian"),
        ("Sentry", "Robert Reynolds"),
        ("Blue Marvel", "Adam Brashear"),
        ("Rorschach", "Walter Joseph Kovacs"),
        ("The Phantom", "Kit Walker"),
        ("Kick-Ass", "Dave Lizewski"),
    ],
}


In [None]:
#  linguistic

import os
import json
import re
from typing import Optional

def normalize(text: str) -> str:
    """Lower-case *text* and join its runs of ``a-z``/``0-9`` with single spaces.

    Every other character (underscores, punctuation, whitespace, non-ASCII
    letters) acts as a separator, e.g. ``"Adjective-Antonym"`` becomes
    ``"adjective antonym"``.
    """
    words = re.findall(r"[a-z0-9]+", text.lower())
    return " ".join(words)

# Hand-picked (input word, expected output) few-shot pairs for the English
# linguistic relations, keyed by the normalized relation name.
fewshot_examples_linguistic = {

 # NOTE(review): some pairs are only loose antonyms (e.g. "neat"/"careless",
 # "visible"/"ambiguous") - confirm they are intended.
 normalize("adjective antonym"): [
    ("neat", "careless"), ("peaceful", "noisy"),
    ("visible", "ambiguous"), ("sweet", "bitter"),
    ("bright", "dark"), ("strong", "fragile"),
    ("modern", "ancient"), ("generous", "selfish"),
    ("early", "late"), ("sharp", "blunt")
    ],

    normalize("adjective comparative"): [
    ("funny", "funnier"),
    ("efficient", "more efficient"),
    ("friendly", "friendlier"),
    ("popular", "more popular"),
    ("lazy", "lazier"),
    ("interesting","more interesting"),
    ("busy", "busier"),
    ("expensive","more expensive"),
    ("early", "earlier"),
    ("dangerous", "more dangerous"),

],
 # NOTE(review): "ancientest" and "bizarrest" are non-standard forms
 # ("most ancient" / "most bizarre" are the usual ones) - confirm they are
 # meant to match the evaluation data.
 normalize("adjective superlative"): [
    ("ancient", "ancientest"),
    ("efficient", "most efficient"),
    ("bizarre", "bizarrest"),
    ("popular", "most popular"),
    ("clever", "cleverest"),
    ("expensive","most expensive"),
    ("odd", "oddest"),
    ("interesting","most interesting"),
    ("tan", "tannest"),
    ("dangerous", "most dangerous")
],



    normalize("verb past tense"): [
    ("begin", "began"),
    ("bite", "bit"),
    ("bleed", "bled"),
    ("burn", "burned"),
    ("dig", "dug"),
    ("fight", "fought"),
    ("grow", "grew"),
    ("hide", "hid"),
    ("ride", "rode"),
    ("throw", "threw")
],

    # The letter relations use upper-case words and upper-case letters.
    normalize("word first letter"): [
    ("JOURNEY", "J"),
    ("ZEBRA", "Z"),
    ("NEVER", "N"),
    ("ECHO", "E"),
    ("BALANCE", "B"),
    ("CLOUD", "C"),
    ("OCEAN", "O"),
    ("TIGER", "T"),
    ("UNITY", "U"),
    ("YAWN", "Y")
],

    normalize("word last letter"): [
        ("FIRE", "E"), ("HILL", "L"), ("JELLY", "Y"),
        ("KNIFE", "E"), ("LAMB", "B"), ("MIRROR", "R"),
        ("NIGHT", "T"), ("ORANGE", "E"), ("PENCIL", "L"), ("QUEST", "T")
    ],


}





In [None]:
import os
import json

# Persist the English linguistic few-shot pairs as a JSON file on Drive.
output_path = "/content/drive/MyDrive/master_thesis/data/fewshot_examples/linguistic_en.json"
output_dir = os.path.dirname(output_path)

# Make sure the target folder exists before writing.
os.makedirs(output_dir, exist_ok=True)

# Tuples are serialized as JSON arrays; non-ASCII characters are kept as-is.
serialized = json.dumps(fewshot_examples_linguistic, indent=4, ensure_ascii=False)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(serialized)

print(f"✓ Datei gespeichert: {output_path}")




## Few-Shots Erweiterung und Permutation

## Permutation Englische Daten

In [None]:
# Relation files (name + samples) that the few-shot prompts are built for.
input_folder = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual"
# Root folder for the generated files; build_fewshot_files creates one
# "permutation_<i>" sub-folder per shot ordering in it.
output_folder = "/content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final"
# Folder with one JSON per relation that provides the "best_template" entry.
template_folder = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/new_factual/result/best_template"

In [None]:
import os
import json
import random
from typing import List, Tuple



def build_fewshot_files(
    input_folder: str,
    output_folder: str,
    template_folder: str,
    shot_count: int,
    seed: "int | None" = None,
) -> None:
    """
    Build few-shot JSON files with one fixed and several shuffled shot orders.

    For every ``*.json`` relation file in *input_folder*:

    - the relation's ``best_template`` is read from
      ``<normalized relation name>.json`` in *template_folder*,
    - the first *shot_count* pairs of the module-level ``fewshot_examples``
      entry for the relation that do not also occur in the file's samples
      are used as shots,
    - one output file per shot ordering is written to
      ``<output_folder>/permutation_<i>/<input name>_<shot_count>shot.json``.

    Orderings:

    - Permutation 0 always keeps the examples in their original order.
    - ``shot_count == 2``: one additional permutation with reversed order.
    - ``shot_count >= 3``: up to five additional random permutations
      (at most 6 including permutation 0).

    A relation is skipped with a printed message when its template file is
    missing, the template is empty or has no ``{}`` placeholder, too few
    examples are available, or fewer orderings than required (1, 2 or 5)
    were produced.

    NOTE(review): relies on the module-level ``normalize`` and
    ``fewshot_examples``; the notebook redefines ``normalize`` (and this
    function's name) in the multilingual cell, so results depend on which
    cell was executed last.

    Args:
        input_folder: Folder with the relation files (``name``, ``samples``).
        output_folder: Root folder for the generated files.
        template_folder: Folder with the per-relation best-template files.
        shot_count: Number of few-shot examples per prompt.
        seed: Optional seed for the random permutations. With ``None``
            (default) the global ``random`` state is used, as before.
    """
    os.makedirs(output_folder, exist_ok=True)

    # A private generator makes seeded runs reproducible without touching
    # the global random state; without a seed the module functions are used.
    rng = random.Random(seed) if seed is not None else random

    # Sorted so that a seeded run does not depend on the listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json"):
            continue

        with open(os.path.join(input_folder, filename), encoding="utf-8") as f:
            data = json.load(f)
        relation_name = data["name"]
        relation_key = normalize(relation_name)
        samples = data["samples"]

        # Load the relation's best template
        tpl_file = relation_key.replace(" ", "_") + ".json"
        tpl_path = os.path.join(template_folder, tpl_file)

        if not os.path.exists(tpl_path):
            print(f"{filename}: kein Template für '{relation_name}'")
            continue
        with open(tpl_path, encoding="utf-8") as tf:
            tpl_data = json.load(tf)
        template = tpl_data.get("best_template")
        # Without a "{}" placeholder the subject could not be inserted and
        # every generated line would be identical, so treat it as invalid.
        if not isinstance(template, str) or "{}" not in template:
            print(f"{filename}: kein gültiges Template in {tpl_file}")
            continue

        # Build the pool, leaving out pairs that also occur in the samples
        seen_pairs = {(s["subject"].lower(), s["object"].lower()) for s in samples}
        pool: List[Tuple[str, str]] = [
            (sub, obj)
            for sub, obj in fewshot_examples.get(relation_key, [])
            if (sub.lower(), obj.lower()) not in seen_pairs
        ]
        if len(pool) < shot_count:
            print(f"{relation_name}: nur {len(pool)} Beispiele (brauche {shot_count}), überspringe")
            continue

        # Pick the base shots
        base_shots = pool[:shot_count]

        # Permutation 0: original order
        perms: List[List[Tuple[str, str]]] = [base_shots]

        if shot_count == 2:
            # One extra permutation in reversed order (if it differs)
            reversed_shots = list(reversed(base_shots))
            if reversed_shots != base_shots:
                perms.append(reversed_shots)

        elif shot_count >= 3:
            # Up to five further random permutations; the attempt cap keeps
            # the loop finite when few distinct orderings exist.
            seen_perms = {tuple(base_shots)}
            attempts = 0
            max_attempts = 100
            while len(perms) < 6 and attempts < max_attempts:
                candidate = rng.sample(base_shots, len(base_shots))
                t_candidate = tuple(candidate)
                if t_candidate not in seen_perms:
                    perms.append(candidate)
                    seen_perms.add(t_candidate)
                attempts += 1

        # Validation: minimum number of orderings required per shot count
        expected_count = 1 if shot_count == 1 else 2 if shot_count == 2 else 5
        if len(perms) < expected_count:
            print(f"{relation_name}: nur {len(perms)} Permutationen erzeugt (benötigt {expected_count}), überspringe")
            continue

        print(f"{relation_name}: generiere {len(perms)} Permutationen mit {shot_count} Shots")

        for i, shot_order in enumerate(perms):
            perm_folder = os.path.join(output_folder, f"permutation_{i}")
            os.makedirs(perm_folder, exist_ok=True)

            # Solved examples first, then the open template for the query;
            # its "{}" is left in place for later substitution.
            lines = [f"{template.format(sub)} {obj}." for sub, obj in shot_order]
            prompt = "\n".join(lines) + f"\n{template}"

            new_data = {
                "name": f"{relation_name}_{shot_count}shot_perm{i}",
                "prompt_templates": [prompt],
                "samples": samples
            }

            out_path = os.path.join(
                perm_folder,
                f"{os.path.splitext(filename)[0]}_{shot_count}shot.json"
            )
            with open(out_path, "w", encoding="utf-8") as out:
                json.dump(new_data, out, indent=4, ensure_ascii=False)

            print(f"gespeichert: {out_path}")


In [None]:
# Generate the 10-shot prompt files for the folders configured above.
build_fewshot_files(
    input_folder=input_folder,
    output_folder=output_folder,
    template_folder=template_folder,
    shot_count=10,
)


superhero archnemesis: generiere 6 Permutationen mit 10 Shots
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_0/superhero_archnemesis_10shot.json
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_1/superhero_archnemesis_10shot.json
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_2/superhero_archnemesis_10shot.json
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_3/superhero_archnemesis_10shot.json
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_4/superhero_archnemesis_10shot.json
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_5/superhero_archnemesis_10shot.json
superhero person: generiere 6 Permutationen mit 10 Shots
✓ gespeichert: /content/drive/MyDrive/master_thesis/data/factual_

## Multilinguale Erweiterung

In [None]:
import os
import json
import random
from typing import List, Tuple, Dict

# Helper function.
# NOTE(review): this redefines ``normalize`` with a simpler rule than the
# regex-based version in the English few-shot cells: it only lower-cases and
# trims, so underscores and punctuation are kept.
def normalize(text: str) -> str:
    """Return *text* lower-cased with surrounding whitespace removed."""
    return text.lower().strip()

def build_fewshot_files(
    input_folder: str,
    output_folder: str,
    shot_file: str,
    shot_count: int,
    seed: "int | None" = None,
) -> None:
    """
    Build few-shot JSON files with one fixed and several shuffled shot orders.

    - The few-shot pairs are read from *shot_file* (JSON mapping a relation
      name to a list of ``[subject, object]`` pairs).
    - The prompt template is the first entry of ``prompt_templates`` in the
      respective input file (surrounding whitespace stripped).
    - Pairs that also occur in the file's samples are not used as shots.
    - Permutation 0 keeps the original order; ``shot_count == 2`` adds the
      reversed order; ``shot_count >= 3`` adds up to five random orderings.
    - Output goes to
      ``<output_folder>/permutation_<i>/<relation key>_<shot_count>shot.json``.

    A relation is skipped with a printed message when it has no template, the
    template has no ``{}`` placeholder, *shot_file* has no or too few shots
    for it, or fewer orderings than required (1, 2 or 5) were produced.

    Args:
        input_folder: Folder with the relation files.
        output_folder: Root folder for the generated files.
        shot_file: Path of the JSON file with the few-shot pairs.
        shot_count: Number of few-shot examples per prompt.
        seed: Optional seed for the random permutations. With ``None``
            (default) the global ``random`` state is used, as before.
    """
    # Load the few-shot mapping
    with open(shot_file, encoding="utf-8") as sf:
        raw_shots: Dict[str, List[List[str]]] = json.load(sf)
    # Normalize the keys so that the lookup below is robust
    fewshot_examples = {
        normalize(rel): [(sub, obj) for sub, obj in pairs]
        for rel, pairs in raw_shots.items()
    }

    os.makedirs(output_folder, exist_ok=True)

    # A private generator makes seeded runs reproducible without touching
    # the global random state; without a seed the module functions are used.
    rng = random.Random(seed) if seed is not None else random
    # Cap on shuffle attempts: when the base shots contain identical pairs,
    # fewer than six distinct orderings exist and an uncapped search for new
    # ones would never terminate.
    max_attempts = 100

    # Walk all input files; sorted so that a seeded run does not depend on
    # the directory listing order.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".json"):
            continue

        in_path = os.path.join(input_folder, filename)
        with open(in_path, encoding="utf-8") as f:
            data = json.load(f)

        relation_name = data["name"]
        relation_key = normalize(relation_name)
        samples = data.get("samples", [])

        # Take the template from the input file
        tpl_list = data.get("prompt_templates", [])
        if not tpl_list:
            print(f"{filename}: kein 'prompt_templates'-Eintrag, überspringe")
            continue
        template = tpl_list[0].strip()
        if "{}" not in template:
            print(f"{filename}: Template ohne Platzhalter '{{}}', überspringe")
            continue

        # Determine the few-shot pool
        if relation_key not in fewshot_examples:
            print(f"{relation_name}: keine Shots in '{shot_file}', überspringe")
            continue
        pool: List[Tuple[str, str]] = fewshot_examples[relation_key]

        # Drop pairs that duplicate one of the samples
        seen_pairs = {(s["subject"].lower(), s["object"].lower()) for s in samples}
        pool = [(sub, obj) for sub, obj in pool
                if (sub.lower(), obj.lower()) not in seen_pairs]

        if len(pool) < shot_count:
            print(f"{relation_name}: nur {len(pool)} Shots (brauche {shot_count}), überspringe")
            continue

        # Pick the base shots
        base_shots = pool[:shot_count]

        # Permutation 0: original order
        perms: List[List[Tuple[str, str]]] = [base_shots]

        if shot_count == 2:
            rev = list(reversed(base_shots))
            if rev != base_shots:
                perms.append(rev)

        elif shot_count >= 3:
            # At most 6 orderings including permutation 0
            seen = {tuple(base_shots)}
            attempts = 0
            while len(perms) < 6 and attempts < max_attempts:
                cand = rng.sample(base_shots, len(base_shots))
                t_cand = tuple(cand)
                if t_cand not in seen:
                    perms.append(cand)
                    seen.add(t_cand)
                attempts += 1

        # Minimum number of orderings: 1, 2, or 5 for three and more shots
        expected = 1 if shot_count == 1 else 2 if shot_count == 2 else 5
        if len(perms) < expected:
            print(f"{relation_name}: nur {len(perms)} Permutationen (benötigt {expected}), überspringe")
            continue

        print(f"{relation_name}: generiere {len(perms)} Permutationen mit {shot_count} Shots")

        # Write one file per ordering
        for i, shot_order in enumerate(perms):
            perm_folder = os.path.join(output_folder, f"permutation_{i}")
            os.makedirs(perm_folder, exist_ok=True)

            # Solved examples first, then the open template for the query;
            # its "{}" is left in place for later substitution.
            lines = [f"{template.format(sub)} {obj}." for sub, obj in shot_order]
            prompt = "\n".join(lines) + f"\n{template}"

            new_data = {
                "name": f"{relation_name}_{shot_count}shot_perm{i}",
                "prompt_templates": [prompt],
                "samples": samples
            }

            # The file name is derived from the relation key, not from the
            # input file name.
            base_name = relation_key.replace(" ", "_")
            out_file = f"{base_name}_{shot_count}shot.json"
            out_path = os.path.join(perm_folder, out_file)
            with open(out_path, "w", encoding="utf-8") as out:
                json.dump(new_data, out, indent=4, ensure_ascii=False)

            print(f"✓ gespeichert: {out_path}")


In [None]:
# Factual relations: build the 1-shot files from the zero-shot input folder,
# taking the shots from the given translation shot file
# (presumably German, judging by the "de" paths - confirm).
input_folder  = "/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/de/zero_shot_data"
output_folder = "/content/drive/MyDrive/master_thesis/dataset_multilingual/factual/de"
shot_file     = "/content/drive/MyDrive/master_thesis/data/fewshot_examples/factual/wikidata_translation_shots/wikidata_translation_de.json"

build_fewshot_files(
    input_folder=input_folder,
    output_folder=output_folder,
    shot_file=shot_file,
    shot_count=1
)


person mother: generiere 1 Permutationen mit 1 Shots
✓ gespeichert: /content/drive/MyDrive/master_thesis/dataset_multilingual/factual/de/permutation_0/person_mother_1shot.json


In [None]:
# Linguistic relations, Thai ("th"): build the 1-shot prompt files.
# The generated files are written to permutation_<i>/ sub-folders below
# output_folder (see build_fewshot_files).
input_folder="/content/drive/MyDrive/master_thesis/data/multilingual_data/linguistic/gpt_linguistic_final_last_letter_hi_thai/hi_th_split/th"
output_folder="/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/th_test"
shot_file="/content/drive/MyDrive/master_thesis/data/fewshot_examples/linguistic/linguistic_th.json"
build_fewshot_files(
    input_folder=input_folder,
    output_folder=output_folder,
    shot_file=shot_file,
    shot_count=1
)

word first letter: generiere 1 Permutationen mit 1 Shots
✓ gespeichert: /content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/th_test/permutation_0/word_first_letter_1shot.json
word last letter: generiere 1 Permutationen mit 1 Shots
✓ gespeichert: /content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/th_test/permutation_0/word_last_letter_1shot.json


## Best Templates

In [None]:
import os
import json

# Directories
input_dir  = "/content/drive/MyDrive/master_thesis/data/linguistic_data/zero_shot_linguistic/input_data"
best_dir   = "/content/drive/MyDrive/master_thesis/data/linguistic_data/zero_shot_linguistic/result/best_template"
output_dir = "/content/drive/MyDrive/master_thesis/data/linguistic_data/few_shots_final/permutation_0"

os.makedirs(output_dir, exist_ok=True)

# For every zero-shot input file, keep only the template stored first in the
# matching best_template file and write the result as <relation>_0shot.json.
# sorted() gives a reproducible processing order; os.listdir() returns the
# entries in arbitrary order.
for fname in sorted(os.listdir(input_dir)):
    if not fname.endswith(".json"):
        continue

    # Paths: the best-template file has the same name as the input file
    input_path = os.path.join(input_dir, fname)
    best_path  = os.path.join(best_dir,  fname)

    # Skip relations without a best-template file instead of aborting the
    # whole run half-way with FileNotFoundError.
    if not os.path.exists(best_path):
        print(f"Überspringe {fname}, weil {best_path} nicht gefunden wurde.")
        continue

    # New file name with the suffix _0shot.json
    base_name  = os.path.splitext(fname)[0]          # e.g. adj_antonym
    output_fn  = f"{base_name}_0shot.json"
    output_path = os.path.join(output_dir, output_fn)

    # Load the best template
    with open(best_path, "r", encoding="utf-8") as f:
        best_prompt = json.load(f)["prompt_templates"][0]

    # Load the input data
    with open(input_path, "r", encoding="utf-8") as f:
        input_data = json.load(f)

    # Keep only the best template; name and samples are taken over unchanged
    filtered = {
        "name": input_data.get("name", ""),
        "prompt_templates": [best_prompt],
        "samples": input_data["samples"]
    }

    # Write the result
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(filtered, f, ensure_ascii=False, indent=2)

    print(f"{output_fn} gespeichert – verwendetes Template: {best_prompt}")


adj_superlative_0shot.json gespeichert – verwendetes Template: The superlative form of {} is
word_first_letter_0shot.json gespeichert – verwendetes Template: {} starts with the letter
verb_past_tense_0shot.json gespeichert – verwendetes Template: The past tense of {} is
word_last_letter_0shot.json gespeichert – verwendetes Template: {} ends with the letter
adj_comparative_0shot.json gespeichert – verwendetes Template: The comparative form of {} is
adj_antonym_0shot.json gespeichert – verwendetes Template: The opposite of {} is


In [None]:
# Carry the best template over into the factual zero-shot data.

import os
import json

input_dir   = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/new_factual"
best_dir    = "/content/drive/MyDrive/master_thesis/data/factual_data/zero_shot_factual/new_factual/result/best_template"
output_dir  = "/content/drive/MyDrive/master_thesis/data/factual_data/few_shots_final_1/permutation_0"

os.makedirs(output_dir, exist_ok=True)

# Only the JSON files in input_dir are processed, in alphabetical order.
relation_files = sorted(name for name in os.listdir(input_dir) if name.endswith(".json"))

for fname in relation_files:
    base, _ext = os.path.splitext(fname)  # e.g. "city_in_country"

    # The best-template file carries the same name as the input file.
    best_fname = base + ".json"
    best_path = os.path.join(best_dir, best_fname)
    if not os.path.exists(best_path):
        print(f"Überspringe {fname}, weil {best_fname} nicht gefunden wurde.")
        continue

    # Read the winning template ...
    with open(best_path, "r", encoding="utf-8") as best_file:
        best_prompt = json.load(best_file)["best_template"]

    # ... and the original relation data.
    with open(os.path.join(input_dir, fname), "r", encoding="utf-8") as src_file:
        input_data = json.load(src_file)

    # Same structure as the input, reduced to the single best template.
    filtered = {
        "name": input_data.get("name", base),
        "prompt_templates": [best_prompt],
        "samples": input_data.get("samples", []),
    }

    # Output name: "_0shot" inserted before ".json".
    output_fname = f"{base}_0shot.json"
    output_path = os.path.join(output_dir, output_fname)
    with open(output_path, "w", encoding="utf-8") as dst_file:
        json.dump(filtered, dst_file, ensure_ascii=False, indent=2)

    print(f" {output_fname}: best_template = {best_prompt}")

print("Alle Dateien geschrieben in:", output_dir)



✓ city_in_country_0shot.json: best_template = {} exists within the country of
✓ company_ceo_0shot.json: best_template = {} is led by CEO
✓ company_hq_0shot.json: best_template = The headquarters of {} are in the city of
✓ country_capital_city_0shot.json: best_template = The capital city of {} is
✓ country_currency_0shot.json: best_template = The official currency of {} is the
✓ country_language_0shot.json: best_template = In {}, the primary language is
✓ country_largest_city_0shot.json: best_template = The most populous city in {} is
✓ food_from_country_0shot.json: best_template = {} is from the country of
✓ landmark_in_country_0shot.json: best_template = {} is in the country of
✓ landmark_on_continent_0shot.json: best_template = {} can be found on the continent of
✓ person_band_lead_singer_0shot.json: best_template = {} performs as the lead singer of
✓ person_father_0shot.json: best_template = {}'s father is named
✓ person_mother_0shot.json: best_template = {}'s mother is named
✓ pers

## CSV Dateien für die Analyse erstellen

In [None]:
import os, json, csv, unicodedata as ud

# Directory whose sub-folders are scanned for "<subdir>_<shot>shot.json" result files
base_root   = "/content/drive/MyDrive/master_thesis/dataset_multilingual/linguistic/th/result_10_accuracy/logits/permutation_0"
# Shot counts for which a result file is looked up in every sub-folder
shot_counts = [0, 1, 2, 3, 4, 5, 7, 10]

def nfc(x):
    """Return ``str(x)`` normalized to Unicode NFC (canonical composition)."""
    text = str(x)
    return ud.normalize("NFC", text)

# For every sub-folder of base_root and every shot count: export the records
# of "<subdir>_<shot>shot.json" to a CSV file next to it and print the
# top-1 accuracy.
for subdir in os.listdir(base_root):
    subdir_path = os.path.join(base_root, subdir)
    if not os.path.isdir(subdir_path):
        continue
    print(f"\n Verarbeite Ordner: {subdir}")

    for shot in shot_counts:
        filename   = f"{subdir}_{shot}shot.json"
        json_path  = os.path.join(subdir_path, filename)
        csv_path   = os.path.join(subdir_path, filename.replace(".json", ".csv"))

        if not os.path.exists(json_path):
            print(f"Datei nicht gefunden: {filename}")
            continue

        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # utf-8-sig => a BOM is written at the start of the file
        with open(csv_path, "w", encoding="utf-8-sig", newline="") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(["prompt", "answer_token", "predicted_top1_token"])
            for item in data:
                writer.writerow([
                    nfc(item["prompt"]),
                    nfc(item["gold_token"]),
                    nfc(item["predicted_top1_token"]),
                ])

        # A record counts as correct when the predicted top-1 token equals the
        # gold token and the record belongs to answer position 0.
        correct   = sum(
            1
            for item in data
            if item["gold_token"] == item["predicted_top1_token"]
            and item["token_pos_in_answer"] == 0
        )
        # NOTE(review): total counts every record in the file, including any
        # with token_pos_in_answer != 0 — confirm this is the intended denominator.
        total     = len(data)
        accuracy  = correct / total if total else 0.0
        print(f"  {shot}-shot Accuracy für {subdir}: {accuracy:.2%} ({correct}/{total})")



 Verarbeite Ordner: adjective_superlative
  0-shot Accuracy für adjective_superlative: 1.16% (6/518)
  1-shot Accuracy für adjective_superlative: 15.25% (79/518)
  2-shot Accuracy für adjective_superlative: 15.25% (79/518)
  3-shot Accuracy für adjective_superlative: 15.25% (79/518)
  4-shot Accuracy für adjective_superlative: 15.25% (79/518)
  5-shot Accuracy für adjective_superlative: 15.25% (79/518)
  7-shot Accuracy für adjective_superlative: 15.25% (79/518)
  10-shot Accuracy für adjective_superlative: 15.25% (79/518)

 Verarbeite Ordner: adjective_comparative
  0-shot Accuracy für adjective_comparative: 0.30% (1/335)
  1-shot Accuracy für adjective_comparative: 20.30% (68/335)
  2-shot Accuracy für adjective_comparative: 20.30% (68/335)
  3-shot Accuracy für adjective_comparative: 20.30% (68/335)
  4-shot Accuracy für adjective_comparative: 20.30% (68/335)
  5-shot Accuracy für adjective_comparative: 20.30% (68/335)
  7-shot Accuracy für adjective_comparative: 20.30% (68/335)
  

## Multilingualen Datensatz aufbauen

### DE

In [None]:
import os
import json


# Directory whose JSON files are read (multilingual relation files)
INPUT_DIR = "/content/drive/MyDrive/master_thesis/data/multilingual_data/factual/multilingual_dataset/multilingual_wikidata_translate"
# Directory the German-only files are written to
OUTPUT_DIR = "/content/drive/MyDrive/master_thesis/de"

# Create the output directory if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)

def extract_german_prompts(prompt_templates):
    """Return the German ("de") templates with surrounding whitespace removed.

    Each element of *prompt_templates* is looked up with ``.get("de", "")``;
    elements whose German text is missing or blank are dropped.
    """
    stripped = (tpl.get("de", "").strip() for tpl in prompt_templates)
    return [text for text in stripped if text]

def build_german_samples(samples):
    """Return ``{"subject", "object"}`` dicts built from the German sample fields.

    The values of ``subject_de`` and ``object_de`` are stripped; a sample is
    kept only when both are non-empty afterwards.
    """
    pairs = (
        (s.get("subject_de", "").strip(), s.get("object_de", "").strip())
        for s in samples
    )
    return [{"subject": subj, "object": obj} for subj, obj in pairs if subj and obj]

# Convert every JSON file in INPUT_DIR into a German-only file of the same
# name in OUTPUT_DIR.
json_files = (name for name in os.listdir(INPUT_DIR) if name.endswith(".json"))

for filename in json_files:
    out_path = os.path.join(OUTPUT_DIR, filename)

    with open(os.path.join(INPUT_DIR, filename), "r", encoding="utf-8") as src:
        data = json.load(src)

    # Same top-level structure as the source, restricted to the German content.
    german_only = {
        "name": data.get("name", ""),
        "prompt_templates": extract_german_prompts(data.get("prompt_templates", [])),
        "samples": build_german_samples(data.get("samples", [])),
    }

    with open(out_path, "w", encoding="utf-8") as dst:
        json.dump(german_only, dst, ensure_ascii=False, indent=2)

    print(f"{filename} → {out_path}")

print("\nFertig! Alle deutschen Dateien liegen jetzt in", OUTPUT_DIR)


### Multilingual
Factual

In [None]:
import os
import json

# Directory whose JSON files are read (multilingual relation files)
INPUT_DIR  = "/content/drive/MyDrive/master_thesis/data/multilingual_data/factual/multilingual_dataset/multilingual_wikidata_translate"
# Root directory below which one sub-folder per language is created (see target_path)
OUTPUT_ROOT = "/content/drive/MyDrive/master_thesis/dataset_multilingual/factual"


"""
Dieser Code verarbeitet multilinguale JSON-Dateien mit faktischen Relationen.
Für jede Zielsprache werden Prompt-Templates und Samples extrahiert.
Für Hindi und Thai erfolgt zusätzlich eine Filterung, sodass nur bestimmte Relationen verarbeitet werden (siehe HI_KEEP und TH_KEEP).
Anschließend werden die extrahierten Inhalte separat je Sprache gespeichert. Jede erzeugte Datei entspricht strukturell dem ursprünglichen englischen Format:

{
    "name": "Name_der_Relation",
    "prompt_templates": [sprachspezifische Templates],
    "samples": [
        {"subject": ..., "object": ...},
    ]
}

Am Ende entstehen somit bis zu 7 Dateien pro Relation, jeweils eine pro Sprache.
Beispiel-Ausgabe (Relation „city_in_country“):
dataset_multilingual/factual/de/city_in_country_0shot.json
dataset_multilingual/factual/fr/city_in_country_0shot.json
dataset_multilingual/factual/it/city_in_country_0shot.json
dataset_multilingual/factual/th/city_in_country_0shot.json
"""


# Target languages (language codes used as key suffixes and output folder names)
LANGS = ["de", "fr", "it", "pt", "es", "hi", "th"]

# Relation files that are kept for Hindi (hi) and Thai (th);
# for these two languages every file not listed here is skipped.
HI_KEEP = {
    "city_in_country.json",
    "country_capital_city.json",
    "country_currency.json",
    "country_language.json",
    "country_largest_city.json",
    "food_from_country.json",
    "landmark_on_continent.json",
    "person_university.json",
    "presidents_birth_year.json",
    "presidents_election_year.json",
}
TH_KEEP = {
    "city_in_country.json",
    "country_capital_city.json",
    "country_currency.json",
    "country_language.json",
    "country_largest_city.json",
    "food_from_country.json",
    "landmark_on_continent.json",
    "person_band_lead_singer.json",
    "person_plays_pro_sport.json",
    "person_university.json",
    "pokemon_evolutions.json",
    "presidents_birth_year.json",
    "presidents_election_year.json",
    "product_by_company.json",
}

def extract_prompts_for(lang, prompt_templates):
    """Return the stripped, non-empty templates stored under key *lang*.

    Each element of *prompt_templates* is looked up with ``.get(lang, "")``;
    elements whose text for *lang* is missing or blank are dropped.
    """
    prompts = []
    for tpl in prompt_templates:
        text = tpl.get(lang, "").strip()
        if text:
            prompts.append(text)
    return prompts

def build_samples_for(lang, samples):
    """Return ``{"subject", "object"}`` dicts built from the *lang* sample fields.

    The values of ``subject_<lang>`` and ``object_<lang>`` are stripped; a
    sample is kept only when both are non-empty afterwards.
    """
    subj_key, obj_key = f"subject_{lang}", f"object_{lang}"
    pairs = (
        (s.get(subj_key, "").strip(), s.get(obj_key, "").strip())
        for s in samples
    )
    return [{"subject": subj, "object": obj} for subj, obj in pairs if subj and obj]

def target_path(lang, src_filename):
    """Return the output path ``OUTPUT_ROOT/<lang>/<relation>_0shot.json``.

    The relation name is *src_filename* without "_wikidata_translated" and
    without a trailing ".json". The language directory is created if needed.
    """
    stem = src_filename.replace("_wikidata_translated", "")
    if stem.endswith(".json"):
        stem = stem[: -len(".json")]

    lang_dir = os.path.join(OUTPUT_ROOT, lang)
    os.makedirs(lang_dir, exist_ok=True)
    return os.path.join(lang_dir, f"{stem}_0shot.json")

# Languages that only receive a subset of the relation files.
restricted = {"hi": HI_KEEP, "th": TH_KEEP}

# Split every multilingual JSON file in INPUT_DIR into one file per language.
for filename in os.listdir(INPUT_DIR):
    if not filename.endswith(".json"):
        continue

    in_path = os.path.join(INPUT_DIR, filename)
    with open(in_path, encoding="utf-8") as f:
        data = json.load(f)

    # The keep-lists contain plain relation names ("city_in_country.json").
    # Strip the "_wikidata_translated" marker the same way target_path() does,
    # so that suffixed source files are not filtered out for hi/th by accident.
    relation_file = filename.replace("_wikidata_translated", "")

    for lang in LANGS:
        # Hindi / Thai: only the listed relation files are allowed
        allowed = restricted.get(lang)
        if allowed is not None and relation_file not in allowed:
            continue

        # Extract prompts and samples for this language
        prompts = extract_prompts_for(lang, data.get("prompt_templates", []))
        samples = build_samples_for(lang, data.get("samples", []))

        # Nothing at all for this language -> do not create a file
        if not prompts and not samples:
            continue

        out_data = {
            "name": data.get("name", ""),
            "prompt_templates": prompts,
            "samples": samples
        }

        out_path = target_path(lang, filename)
        with open(out_path, "w", encoding="utf-8") as f_out:
            json.dump(out_data, f_out, ensure_ascii=False, indent=2)

        print(f"{lang.upper():2} → {os.path.basename(out_path)}")

print("\nAlle Sprachdateien liegen jetzt in", OUTPUT_ROOT)
