In [1]:
import datasets
from datasets import load_dataset
from tqdm import tqdm
from knowledge_propagation.utils import vars, io, extractor, misc
from scipy.stats import describe
from typing import List, Dict
import re
from copy import deepcopy
import pandas as pd
import random

from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import numpy as np
import os
from copy import deepcopy
import itertools

In [2]:
# Ingredient files for fake subject names, one per subject type. Both are read
# with the project's JSON loader from the same data_gen directory.
person_subject_ingredients = io.load_json(
    f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/fake_person_name_components.json"
)
company_subject_ingredients = io.load_json(
    f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/fake_company_name_components.json"
)

# Subject type -> the ingredient object loaded for it.
subject_type2ingredients = dict(
    person=person_subject_ingredients,
    company=company_subject_ingredients,
)

# NOTE(review): presumably the number of entity slots (*_1 .. *_3) used by each
# text template in this notebook -- confirm against the consuming code.
N_ENTITY_IN_TEXT = 3


In [3]:

def subject_sampler(subject_ingredients: dict) -> dict:
    """
    Sample one subject (name + gender) from a subject ingredient dictionary.

    Args:
        subject_ingredients: Mapping that must contain
            - "template": a ``str.format`` template whose fields are the
              remaining component keys,
            - "gender": a sequence of gender labels to sample from,
            - every other key: a sequence of candidate values for that
              template component.

    Returns:
        A dict ``{"subject": <formatted name>, "gender": <sampled gender>}``.
        (The previous annotation and docstring claimed a plain string.)

    Raises:
        KeyError: if "template" or "gender" is missing, or if the template
            references a component that is not a key of ``subject_ingredients``.
    """
    # Everything except the two reserved keys is a pool of candidate values.
    reserved_keys = ("template", "gender")
    component_names = [c for c in subject_ingredients if c not in reserved_keys]

    # Draw the components first and the gender last: this keeps the sequence of
    # `random` calls unchanged, so seeded runs reproduce earlier samples.
    component_instantiations = {c: random.choice(subject_ingredients[c]) for c in component_names}
    template = subject_ingredients["template"]
    gender = random.choice(subject_ingredients["gender"])

    return {
        "subject": template.format(**component_instantiations),
        "gender": gender,
    }

In [4]:
# Pronoun lookup tables keyed by gender label ("male" / "female" / "it").

# Subject pronoun.
gender_type2subj = dict(male="he", female="she", it="it")

# Object pronoun.
gender_type2obj = dict(male="him", female="her", it="it")

# Possessive adjective (used before a noun).
gender_type2possessive_adj = dict(male="his", female="her", it="its")

# Possessive pronoun (stands alone).
gender_type2possessive_pronoun = dict(male="his", female="hers", it="its")

# Reflexive pronoun.
gender_type2reflexive_pronoun = dict(male="himself", female="herself", it="itself")

In [5]:
# Maps each question template to a paraphrase of the SAME question.
# A key and its value use the same single placeholder, named after the entity
# tag (e.g. {country}, {creative_work}).
question_template2paraphrase = {
    # country -- checked
    "What is the top-level internet domain for {country}?": "What is the primary internet domain suffix for {country}?",
    "What is the calling code for {country}?": "What is the international dialing code for {country}?",
    "What is the currency of {country}?": "What is the main currency used in {country}?",
    "Which religion has the most followers in {country}?": "Which religion has the largest number of followers in {country}?",
    # Fixed: this entry used to reuse the religion paraphrase from the line above.
    "Which ethnic group is the largest in {country}?": "Which ethnic group has the largest population in {country}?",
    "What is the ISO alpha-2 code for {country}?": "What is the two-letter ISO code for {country}?",
    "What language in {country} has the most speakers?": "What is the most widely spoken language in {country}?",
    "What is the capital of {country}?": "What is the capital city of {country}?",
    
    # creative_work -- checked
    "What is the genre or style of {creative_work}?": "What kind of genre or style is {creative_work}?",
    "What is the original language of {creative_work}?": "In what language was {creative_work} originally created?",
    "Who is the creator of {creative_work}?": "Who created {creative_work}?",
    "In which country was {creative_work} first released or published?": "Which country was {creative_work} first made available in?",
    "When was {creative_work} released or published?": "When was {creative_work} first made available?",
    "Where was {creative_work} produced or created?": "Where was {creative_work} made or created?",
    
    # event -- checked
    "In which country did {event} happen?": "Where did {event} take place?",
    "What year did {event} end?": "In what year did {event} conclude?",
    "Who was the most important leader or figure involved in {event}?": "Who was the most significant leader or figure involved in {event}?",
    "When did {event} take place?": "In what year did {event} occur?",
    
    # Language -- checked
    "What writing system is used by {language}?": "What script is used by {language}?",
    "What region is {language} native to?": "In which region is {language} primarily spoken?",
    "What is the primary word order in {language}?": "What is the typical word order in {language} sentences?",
    "What is the ISO 639\u20111 code for {language}?": "What is the two-letter ISO code for {language}?",
    "What is the name of the alphabet or script of {language}?": "What is the standard script for writing {language}?",
    
    # Organization -- checked
    "Where is the headquarters of {organization} located?": "Where is {organization} headquartered?",
    "What is the primary field or industry of {organization}?": "In which field or industry does {organization} primarily operate?",
    "Where was {organization} established?": "In which location was {organization} founded?",
    "What primary service or product does {organization} provide?": "What is the main service or product offered by {organization}?",
    "In what year was {organization} established?": "What year was {organization} created?",
    "Who established {organization}?": "Who was the founder of {organization}?",
    
    # Person -- checked
    "What year did {person} pass away?": "In what year did {person} die?",
    "What occupation is {person} most well-known for?": "What is the most famous profession of {person}?",
    "Where did {person} die?": "What was the place of death of {person}?",
    "Where was the birthplace of {person}?": "In which location was {person} born?",
    "What language was primarily spoken by {person}?": "What language did {person} mainly use?",
    "What year was {person} born?": "What year marks the birth of {person}?",
    "What is the religion of {person}?": "What faith does {person} adhere to?",
    # Species
    "What is the social structure of {species}?": "What type of social organization does {species} have?",
    "What type of organism is {species}?": "What biological category does {species} belong to?",
    "Where is {species} primarily native to?": "What is the native region of {species}?",
    "What is the diet of {species}?": "What kind of food does {species} consume?",
}

In [18]:
# NOTE(review): presumably a curated question -> answer mapping; the schema is
# not visible in this notebook -- confirm against the JSON file.
question2answer = io.load_json(
    f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/question2answer_curated_final.json"
)
# Display name of an entity type -> placeholder tag used in the templates.
# Each tag is the lower-cased display name with spaces turned into underscores.
entity_type2tag = {
    entity_type: entity_type.lower().replace(" ", "_")
    for entity_type in (
        "Person",
        "Event",
        "Species",
        "Language",
        "Organization",
        "Creative Work",
        "Country",
    )
}
# Inverse lookup: tag -> display name.
tag2entity_type = {tag: entity_type for entity_type, tag in entity_type2tag.items()}
# The inversion must not merge entries, i.e. tags have to be unique.
assert len(tag2entity_type) == len(entity_type2tag)

In [19]:
# Three-sentence story per subject type; sentence i introduces {country_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
country_subject_type2text_template = {
    "person": (
        "{subject} was born in {country_1}. "
        "{Gender_subj} spent most of {gender_possessive_adj} adult life in {country_2}. "
        "After retirement, {gender_subj} lived in {country_3} and passed away."
    ),
    "company": (
        "{subject} was founded in {country_1}. "
        "{Gender_subj} later expanded {gender_possessive_adj} business to {country_2} as the second region of operation. "
        "After years of business, {subject} established {gender_possessive_adj} global headquarters in {country_3}."
    ),
}

# Descriptive noun phrases that refer to each country slot without naming it.
# List position i corresponds to {country_(i+1)}.
country_subject_type2aliases = dict(
    person=[
        "the country that {subject} was born in",  # -> country_1
        "the country that {subject} spent most of {gender_possessive_adj} adult life in",  # -> country_2
        "the country that {subject} died in",  # -> country_3
    ],
    company=[
        "the country that {subject} was founded in",  # -> country_1
        "the country that {subject} expanded to as the second region of operation",  # -> country_2
        "the country that hosted {subject}'s global headquarters",  # -> country_3
    ],
)

# One single-hop question per country slot; list position i asks for
# {country_(i+1)}.
country_subject_type2atomic_questions = dict(
    person=[
        "What country was {subject} born in?",  # -> country_1
        "What country did {subject} spend most of {gender_possessive_adj} adult life in?",  # -> country_2
        "What country did {subject} die in?",  # -> country_3
    ],
    company=[
        "What country was {subject} founded in?",  # -> country_1
        "What country did {subject} expand to as the second region of operation?",  # -> country_2
        "What country hosted {subject}'s global headquarters?",  # -> country_3
    ],
)

# TODO: need templates for other baselines (prefix = verbalized (subject, relation, *))
# MEND: prefix, paraphrased_prefix, target
# MEMIT: prefix, subject, target
# Structured form of each country fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
country_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                ("{subject} was born in", "{subject} was originally from", "{country_1}"),
                (
                    "{subject} spent most of {gender_possessive_adj} adult life in",
                    "The bulk of {subject}'s adult life was spent in",
                    "{country_2}",
                ),
                ("{subject} died in", "{subject} passed away in", "{country_3}"),
            ),
        ),
        (
            "company",
            (
                ("{subject} was founded in", "{subject} was established in", "{country_1}"),
                (
                    "{subject}'s second region of operation was",
                    "{subject}'s second operational region was",
                    "{country_2}",
                ),
                (
                    "{subject}'s global headquarters was located in",
                    "{subject} had its global headquarters in",
                    "{country_3}",
                ),
            ),
        ),
    )
}


In [20]:
# Three-sentence story per subject type; sentence i introduces {language_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
language_subject_type2text_template = {
    "person": (
        "{subject} was born into a {language_1}-speaking environment. "
        "In grade school, {gender_subj} started to learn {language_2}. "
        "In {gender_possessive_adj} college, {gender_subj} majored in {language_3}."
    ),
    "company": (
        "{subject} began by offering services in {language_1}. "
        "{Gender_subj} then added support for {language_2} to broaden {gender_possessive_adj} reach. "
        "Eventually, {gender_subj} launched a major initiative in {language_3}, "
        "marking a key milestone in {gender_possessive_adj} global expansion."
    ),
}

# Descriptive noun phrases that refer to each language slot without naming it.
# List position i corresponds to {language_(i+1)}.
language_subject_type2aliases = {
    "person": [
        "the language that {subject} grew up speaking", # language_1
        "the language that {subject} learned in grade school", # language_2
        # Fixed: "majored in college" dropped the preposition governing the
        # language and read as if the major were "college".
        "the language that {subject} majored in during college", # language_3
    ],
    "company": [
        "the language that {subject} primarily offered services in", # language_1
        "the language that {subject} supported as {gender_possessive_adj} second language", # language_2
        "the language that {subject} launched a major initiative in", # language_3
    ]
}

# One single-hop question per language slot; list position i asks for
# {language_(i+1)}.
language_subject_type2atomic_questions = {
    "person": [
        "What language was {subject} born and raised speaking?", # language_1
        "What language did {subject} learn in grade school?", # language_2
        # Fixed: "did ... majored in college" -> base verb form after "did",
        # and the preposition governing the language is restored.
        "What language did {subject} major in during college?", # language_3
    ],
    "company": [
        "What language did {subject} primarily offer services in?", # language_1
        "What language did {subject} support as {gender_possessive_adj} second language?", # language_2
        "What language did {subject} launch a major initiative in?", # language_3
    ]
}


# Structured form of each language fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
language_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                (
                    "{subject} was born and raised speaking",
                    "{subject}'s first language was",
                    "{language_1}",
                ),
                (
                    "In grade school, {subject} started to learn",
                    "In grade school, {subject} began learning",
                    "{language_2}",
                ),
                ("{subject} majored in", "{subject} pursued a major in", "{language_3}"),
            ),
        ),
        (
            "company",
            (
                (
                    "{subject} began by offering services in",
                    "{subject} started by providing services in",
                    "{language_1}",
                ),
                (
                    "For its second language, {subject} added support for",
                    "As its second language, {subject} integrated support for",
                    "{language_2}",
                ),
                (
                    "{subject} launched a major initiative in",
                    "{subject} initiated a significant project in",
                    "{language_3}",
                ),
            ),
        ),
    )
}

In [21]:
# Three-sentence story per subject type; sentence i introduces {species_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
species_subject_type2text_template = {
    "person": (
        "{subject} became fascinated with nature after learning about {species_1}. "
        "During graduate school, {gender_subj} conducted research on {species_2}. "
        "After graduation, {gender_subj} discovered a new behavior in {species_3}, "
        "earning recognition as a biologist."
    ),
    "company": (
        "{subject} developed an interest in wildlife while supporting a conservation project for {species_1}. "
        "{Gender_subj} later partnered with researchers to study {species_2}. "
        "{Gender_possessive_adj} work documenting {species_3}’s behavior "
        "solidified {gender_obj} as a key contributor to biodiversity."
    ),
}

# Descriptive noun phrases that refer to each species slot without naming it.
# List position i corresponds to {species_(i+1)}.
species_subject_type2aliases = dict(
    person=[
        "the species that triggered {subject}'s fascination with nature",  # -> species_1
        "the species that {subject} conducted research on during graduate school",  # -> species_2
        "the species that {subject} discovered a new behavior in",  # -> species_3
    ],
    company=[
        "the species that {subject} supported a conservation project for",  # -> species_1
        "the species that {subject} partnered with researchers to study",  # -> species_2
        "the species that {subject} documented behavior of",  # -> species_3
    ],
)

# One single-hop question per species slot; list position i asks for
# {species_(i+1)}.
species_subject_type2atomic_questions = {
    "person": [
        "What species triggered {subject}'s fascination with nature?", # species_1
        "What species did {subject} conduct research on during graduate school?", # species_2
        "What species did {subject} discover a new behavior in?", # species_3
    ],
    "company": [
        "What species did {subject} support a conservation project for?", # species_1
        # Fixed: base verb form after "did" ("partnered" -> "partner",
        # "documented" -> "document").
        "What species did {subject} partner with researchers to study?", # species_2
        "What species did {subject} document the behavior of?", # species_3
    ]
}

# Structured form of each species fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
species_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                (
                    "{subject} became fascinated with nature after learning about",
                    "{subject} developed a passion for nature after discovering",
                    "{species_1}",
                ),
                (
                    "During graduate school, {subject} conducted research on",
                    "During graduate school, {subject} carried out research on",
                    "{species_2}",
                ),
                (
                    "{subject} discovered a new behavior in",
                    "{subject} identified a new behavior in",
                    "{species_3}",
                ),
            ),
        ),
        (
            "company",
            (
                (
                    "{subject} developed an interest in wildlife while supporting a conservation project for",
                    "{subject} became interested in wildlife while assisting with a conservation project for",
                    "{species_1}",
                ),
                (
                    "{subject} partnered with researchers to study",
                    "{subject} collaborated with researchers to investigate",
                    "{species_2}",
                ),
                (
                    "{subject} documented the behavior of",
                    "{subject} recorded the behavior of",
                    "{species_3}",
                ),
            ),
        ),
    )
}

In [None]:
# Three-sentence story per subject type; sentence i introduces {event_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
event_subject_type2text_template = {
    "person": (
        "{subject} developed a passion for history after learning about {event_1} in grade school. "
        "In college, {gender_subj} did research on {event_2}. "
        "Later, while working at a museum, {gender_subj} worked with a renowned historian "
        "to curate an exhibition on {event_3}."
    ),
    "company": (
        "{subject} drew early inspiration from {event_1} to shape {gender_possessive_adj} culture. "
        "Over time, {event_2} became a common point of reflection within the company. "
        "Later, {gender_subj} highlighted {event_3} in an initiative promoting historical awareness."
    ),
}

# Descriptive noun phrases that refer to each event slot without naming it.
# List position i corresponds to {event_(i+1)}.
event_subject_type2aliases = dict(
    person=[
        "the event that sparked {subject}'s passion for history",  # -> event_1
        "the event that {subject} did research on in college",  # -> event_2
        "the event that {subject} curated an exhibition on",  # -> event_3
    ],
    company=[
        "the event that inspired {subject}'s culture",  # -> event_1
        "the event that {subject} commonly reflected on",  # -> event_2
        "the event that {subject} highlighted in an initiative",  # -> event_3
    ],
)

# One single-hop question per event slot; list position i asks for
# {event_(i+1)}.
# Fixed: questions built with "did" now use the base verb form
# ("curated" -> "curate", "reflected" -> "reflect", "highlighted" -> "highlight").
event_subject_type2atomic_questions = {
    "person": [
        "What event sparked {subject}'s passion for history?", # event_1
        "What event did {subject} do research on in college?", # event_2
        "What event did {subject} curate an exhibition on?", # event_3
    ],
    "company": [
        "What event inspired {subject}'s culture?", # event_1
        "What event did {subject} commonly reflect on?", # event_2
        "What event did {subject} highlight in an initiative?", # event_3
    ]
}

# Structured form of each event fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
event_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                (
                    "{subject} developed a passion for history after learning about",
                    "{subject} became interested in history after discovering",
                    "{event_1}",
                ),
                (
                    "In college, {subject} did research on",
                    "During college, {subject} conducted research on",
                    "{event_2}",
                ),
                (
                    "{subject} worked with a renowned historian to curate an exhibition on",
                    "{subject} collaborated with a famous historian to organize an exhibition about",
                    "{event_3}",
                ),
            ),
        ),
        (
            "company",
            (
                (
                    "{subject}'s culture was shaped by",
                    "{subject}'s culture was influenced by",
                    "{event_1}",
                ),
                (
                    "The common point of reflection within {subject} was",
                    "{subject} often reflected on",
                    "{event_2}",
                ),
                (
                    "In an initiative, {subject} highlighted",
                    "In promoting historical awareness, {subject} emphasized",
                    "{event_3}",
                ),
            ),
        ),
    )
}

In [None]:
# Three-sentence story per subject type; sentence i introduces {person_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
person_subject_type2text_template = {
    "person": (
        "{subject} first wrote about {person_1} in an 8th-grade book report. "
        "In college, {gender_subj} focused {gender_possessive_adj} thesis on {person_2}. "
        "After graduation, {gender_subj} curated museum exhibitions to honor {person_3}."
    ),
    "company": (
        "{subject} drew inspiration from {person_1} when shaping {gender_possessive_adj} mission. "
        "Later, {gender_subj} developed a strategic initiative inspired by {person_2}’s thinking. "
        "Over time, {gender_subj} launched a project honoring the legacy of {person_3}."
    ),
}

# Descriptive noun phrases that refer to each person slot without naming it.
# List position i corresponds to {person_(i+1)}.
person_subject_type2aliases = dict(
    person=[
        "the person that {subject} wrote about in an 8th-grade book report",  # -> person_1
        "the person that {subject} focused {gender_possessive_adj} thesis on",  # -> person_2
        "the person that {subject} curated museum exhibitions to honor",  # -> person_3
    ],
    company=[
        "the person that inspired {subject}'s mission",  # -> person_1
        "the person whose thinking inspires {subject}’s strategic initiative",  # -> person_2
        "the person whose legacy {subject} honored with a project",  # -> person_3
    ],
)

# One single-hop question per person slot; list position i asks for
# {person_(i+1)}.
person_subject_type2atomic_questions = {
    "person": [
        "Who did {subject} write about in an 8th-grade book report?", # person_1
        "Who did {subject} focus {gender_possessive_adj} thesis on?", # person_2
        # Fixed: base verb form after "did" ("curated" -> "curate").
        "Who did {subject} curate museum exhibitions to honor?", # person_3
    ],
    "company": [
        "Who inspired {subject}'s mission?", # person_1
        "Whose thinking inspires {subject}’s strategic initiative?", # person_2
        # Fixed: the question was missing its auxiliary verb.
        "Whose legacy did {subject} honor with a project?", # person_3
    ]
}

# Structured form of each person fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
person_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                (
                    "In an 8th-grade book report, {subject} first wrote about",
                    "In an 8th-grade book report, {subject} first explored",
                    "{person_1}",
                ),
                (
                    "The college thesis of {subject} focused on",
                    "In college, {subject} centered {gender_possessive_adj} thesis on",
                    "{person_2}",
                ),
                (
                    "{subject} curated museum exhibitions to honor",
                    "{subject} organized museum exhibitions to celebrate",
                    "{person_3}",
                ),
            ),
        ),
        (
            "company",
            (
                (
                    "The mission of {subject} drew inspiration from",
                    "The mission of {subject} was inspired by",
                    "{person_1}",
                ),
                (
                    "A strategic initiative of {subject} was developed based on the thinking of",
                    "A strategic initiative of {subject} was inspired by the thinking of",
                    "{person_2}",
                ),
                (
                    "{subject} launched a project to honor the legacy of",
                    "A project of {subject} honored the legacy of",
                    "{person_3}",
                ),
            ),
        ),
    )
}

In [None]:
# Three-sentence story per subject type; sentence i introduces {creative_work_i}.
# Fixed (person): the third sentence used "{gender_subj}'s award-winning work",
# which renders as "he's/she's award-winning work"; the possessive adjective
# placeholder is the correct one here.
creative_work_subject_type2text_template = {
    "person": "{subject} discovered a passion for creative work after encountering {creative_work_1}. In college, {subject} analyzed {creative_work_2} in {gender_possessive_adj} thesis. Later, {gender_possessive_adj} award-winning work, inspired by {creative_work_3}, gained recognition in the creative world.",
    "company": "{subject} built {gender_possessive_adj} culture on the influence of {creative_work_1}. Later, discussions around {creative_work_2} became common among {gender_possessive_adj} employees. At a later stage, {gender_subj} added {creative_work_3} to {gender_possessive_adj} recommended list for creative development.",
}

# Descriptive noun phrases that refer to each creative-work slot without
# naming it. List position i corresponds to {creative_work_(i+1)}.
creative_work_subject_type2aliases = dict(
    person=[
        "the creative work that started {subject}'s love for creativity",  # -> creative_work_1
        "the creative work that {subject} analyzed in {gender_possessive_adj} thesis",  # -> creative_work_2
        "the creative work that inspired {subject}'s award-winning work",  # -> creative_work_3
    ],
    company=[
        "the creative work that {subject}'s culture was built on",  # -> creative_work_1
        "the creative work that {subject}'s employees commonly discussed",  # -> creative_work_2
        "the creative work that {subject} recommended for creative development",  # -> creative_work_3
    ],
)

# One single-hop question per creative-work slot; list position i asks for
# {creative_work_(i+1)}.
creative_work_subject_type2atomic_questions = {
    "person": [
        "What creative work started {subject}'s love for creativity?", # creative_work_1
        "What creative work did {subject} analyze in {gender_possessive_adj} thesis?", # creative_work_2
        "What creative work inspired {subject}'s award-winning work?", # creative_work_3
    ],
    "company": [
        # Fixed: "did ... was built on" mixed two auxiliaries; the other two
        # questions used a past-tense verb after "did".
        "What creative work was {subject}'s culture built on?", # creative_work_1
        "What creative work did {subject}'s employees commonly discuss?", # creative_work_2
        "What creative work did {subject} recommend for creative development?", # creative_work_3
    ]
}

# Structured form of each creative-work fact. Every record is derived from a
# (prefix, paraphrase_prefix, target) triple:
#   fact       = "<prefix> <target>."
#   paraphrase = "<paraphrase_prefix> <target>."
creative_work_subject_type2structured_events = {
    subject_kind: [
        {
            "fact": f"{stem} {slot}.",
            "prefix": stem,
            "paraphrase": f"{alt_stem} {slot}.",
            "paraphrase_prefix": alt_stem,
            "target": slot,
        }
        for stem, alt_stem, slot in triples
    ]
    for subject_kind, triples in (
        (
            "person",
            (
                (
                    "{subject} discovered a passion for creative work after encountering",
                    "{subject} developed a passion for creative work after discovering",
                    "{creative_work_1}",
                ),
                (
                    "The college thesis of {subject} was on",
                    "{subject}'s college thesis was focused on",
                    "{creative_work_2}",
                ),
                (
                    "{subject}'s award-winning work was inspired by",
                    "The award-winning work of {subject} drew inspiration from",
                    "{creative_work_3}",
                ),
            ),
        ),
        (
            "company",
            (
                (
                    "{subject}'s culture was built on the influence of",
                    "The culture of {subject} was shaped by the influence of",
                    "{creative_work_1}",
                ),
                (
                    "The employees of {subject} commonly discussed",
                    "Employees at {subject} frequently engaged in discussions about",
                    "{creative_work_2}",
                ),
                (
                    "The recommended list of {subject} included",
                    "For creative development, {subject} recommended",
                    "{creative_work_3}",
                ),
            ),
        ),
    )
}

In [None]:
# Three-sentence story per subject type; sentence i introduces {organization_i}.
# Adjacent literals are concatenated by Python, so each value is one string.
organization_subject_type2text_template = {
    "person": (
        "{subject} began {gender_possessive_adj} career at {organization_1}. "
        "After years of hard work, {gender_subj} became a manager at {organization_2}. "
        "Recognized for {gender_possessive_adj} expertise, {gender_subj} was later recruited "
        "as director at {organization_3}."
    ),
    "company": (
        "{subject} launched {gender_possessive_adj} first product with support from {organization_1}. "
        "{Gender_subj} later collaborated on a major project with {organization_2}. "
        "Eventually, {subject} was acquired by {organization_3}."
    ),
}

# Alias phrases that refer to an organization through its relation to the
# subject instead of by name. Position i in each list corresponds to
# organization_{i+1} in the text template, so the order must not change.
organization_subject_type2aliases = {
    subject_type: ["the organization that " + clause for clause in clauses]
    for subject_type, clauses in (
        (
            "person",
            (
                "{subject} began career at",  # organization_1
                "{subject} became a manager at",  # organization_2
                "{subject} was recruited as director at",  # organization_3
            ),
        ),
        (
            "company",
            (
                "supported {subject}'s first product",  # organization_1
                "{subject} collaborated on a major project with",  # organization_2
                "acquired {subject}",  # organization_3
            ),
        ),
    )
}

# Direct (single-hop) questions asking for each organization mentioned in the
# text. Position i in each list corresponds to organization_{i+1}, matching the
# order used by the alias and structured-event tables for this entity type.
organization_subject_type2atomic_questions = {
    "person": [
        "What organization did {subject} begin career at?", # organization_1
        "What organization did {subject} become a manager at?", # organization_2
        # Passive voice: "did ... was recruited" is ungrammatical.
        "What organization was {subject} recruited as director at?", # organization_3
    ],
    "company": [
        "What organization supported {subject}'s first product?", # organization_1
        # "did" takes the bare verb: "collaborate", not "collaborated".
        "What organization did {subject} collaborate on a major project with?", # organization_2
        "What organization acquired {subject}?", # organization_3
    ]
}

# Structured (atomic) facts for the "Organization" entity type.
# Every fact is "<prefix> <target>." and every paraphrase is
# "<paraphrase_prefix> <target>.", so each row below only lists
# (fact prefix, paraphrase prefix, target placeholder). Row i of each subject
# type targets organization_{i+1}.
organization_subject_type2structured_events = {
    subject_type: [
        {
            "fact": f"{fact_prefix} {target}.",
            "prefix": fact_prefix,
            "paraphrase": f"{paraphrase_prefix} {target}.",
            "paraphrase_prefix": paraphrase_prefix,
            "target": target,
        }
        for fact_prefix, paraphrase_prefix, target in rows
    ]
    for subject_type, rows in (
        (
            "person",
            (
                (
                    "{subject} began career at",
                    "{subject} started career at",
                    "{organization_1}",
                ),
                (
                    "{subject} became a manager at",
                    "{subject} was promoted to manager at",
                    "{organization_2}",
                ),
                (
                    "{subject} was recruited as director at",
                    "{subject} was hired as director at",
                    "{organization_3}",
                ),
            ),
        ),
        (
            "company",
            (
                (
                    "{subject} launched first product with support from",
                    "The first product of {subject} was launched with support from",
                    "{organization_1}",
                ),
                (
                    "{subject} collaborated on a major project with",
                    "{subject} worked together on a major project with",
                    "{organization_2}",
                ),
                (
                    "{subject} was acquired by",
                    "{subject} was purchased by",
                    "{organization_3}",
                ),
            ),
        ),
    )
}

In [None]:
# Registries that map an entity-type name to the per-entity-type tables
# defined in the cells above. All four registries list the same seven entity
# types in the same order.

# Entity type -> narrative text templates.
entity_type2text_templates = dict([
    ("Country", country_subject_type2text_template),
    ("Species", species_subject_type2text_template),
    ("Language", language_subject_type2text_template),
    ("Organization", organization_subject_type2text_template),
    ("Event", event_subject_type2text_template),
    ("Person", person_subject_type2text_template),
    ("Creative Work", creative_work_subject_type2text_template),
])

# Entity type -> alias phrase templates.
entity_type2aliases = dict([
    ("Country", country_subject_type2aliases),
    ("Species", species_subject_type2aliases),
    ("Language", language_subject_type2aliases),
    ("Organization", organization_subject_type2aliases),
    ("Event", event_subject_type2aliases),
    ("Person", person_subject_type2aliases),
    ("Creative Work", creative_work_subject_type2aliases),
])

# Entity type -> structured fact templates.
entity_type2structured_events = dict([
    ("Country", country_subject_type2structured_events),
    ("Species", species_subject_type2structured_events),
    ("Language", language_subject_type2structured_events),
    ("Organization", organization_subject_type2structured_events),
    ("Event", event_subject_type2structured_events),
    ("Person", person_subject_type2structured_events),
    ("Creative Work", creative_work_subject_type2structured_events),
])

# Entity type -> atomic question templates.
entity_type2atomic_questions = dict([
    ("Country", country_subject_type2atomic_questions),
    ("Species", species_subject_type2atomic_questions),
    ("Language", language_subject_type2atomic_questions),
    ("Organization", organization_subject_type2atomic_questions),
    ("Event", event_subject_type2atomic_questions),
    ("Person", person_subject_type2atomic_questions),
    ("Creative Work", creative_work_subject_type2atomic_questions),
])

In [15]:
# for et, st2template in entity_type2text_templates.items():
#     print(f"Entity Type: {et}")
#     for st, template in st2template.items():
#         print(f"  Subject Type: {st}")
#         print(template.replace("{", "\{").replace("}", "\}").replace("_", "\_"))
#     print()


In [16]:
# io.dump_json(entity_type2aliases, f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/entity_type2aliases.json")
# io.dump_json(entity_type2structured_events, f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/entity_type2structured_events.json")
# io.dump_json(entity_type2text_templates, f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/entity_type2text_templates.json")

In [17]:
def question_suite_generator(
    entity_tag: str,
    subject: str,
    gender_type: str,
    fact_entity_name: str,
    fact_alias_template: str,
    question_template: str,
):
    """Build the aliased and un-aliased variants of one question.

    The alias template is filled with the subject and its gendered pronouns,
    then both the alias and the plain entity name are substituted into
    ``question_template`` and into its paraphrase (looked up in
    ``question_template2paraphrase``).

    Args:
        entity_tag: Placeholder name used inside the question templates.
        subject: Subject name substituted into the alias template.
        gender_type: Key into the ``gender_type2*`` pronoun tables.
        fact_entity_name: Entity name used for the un-aliased questions.
        fact_alias_template: Template describing the entity via the subject.
        question_template: Question template containing ``{entity_tag}``.

    Returns:
        A dict with the question template, the four question strings, the
        entity name, and the answer looked up in ``question2answer`` by the
        un-aliased question.

    Raises:
        KeyError: if ``gender_type``, ``question_template`` or the un-aliased
            question is missing from its lookup table.
    """
    subj_pronoun = gender_type2subj[gender_type]
    obj_pronoun = gender_type2obj[gender_type]
    possessive_adj = gender_type2possessive_adj[gender_type]
    possessive_pronoun = gender_type2possessive_pronoun[gender_type]
    reflexive_pronoun = gender_type2reflexive_pronoun[gender_type]

    alias_fields = {
        "subject": subject,
        "gender_subj": subj_pronoun,
        "Gender_subj": subj_pronoun.capitalize(),
        "gender_obj": obj_pronoun,
        "gender_possessive_adj": possessive_adj,
        "Gender_possessive_adj": possessive_adj.capitalize(),
        "gender_possessive_pronoun": possessive_pronoun,
        "gender_reflexive_pronoun": reflexive_pronoun,
    }
    fact_alias = fact_alias_template.format(**alias_fields)

    # The same template is filled twice: once with the alias, once with the name.
    with_alias = {entity_tag: fact_alias}
    with_name = {entity_tag: fact_entity_name}

    alias_question = question_template.format(**with_alias)
    unalias_question = question_template.format(**with_name)

    paraphrase_template = question_template2paraphrase[question_template]
    alias_question_paraphrase = paraphrase_template.format(**with_alias)
    unalias_question_paraphrase = paraphrase_template.format(**with_name)

    suite = {"question_template": question_template}
    suite["alias_question"] = alias_question
    suite["unalias_question"] = unalias_question
    suite["alias_question_paraphrase"] = alias_question_paraphrase
    suite["unalias_question_paraphrase"] = unalias_question_paraphrase
    suite["entity_name"] = fact_entity_name
    # Answers are keyed by the un-aliased question text.
    suite["answer"] = question2answer[unalias_question]
    return suite

def new_question_suite_generator(
    entity_tag: str,
    subject: str,
    gender_type: str,
    fact_entity_name: str,
    fact_alias_template: str,
    question_template: str,
):
    """Build a question suite using the efficacy/specificity key names.

    Produces the same questions and answer as ``question_suite_generator``;
    only the output keys differ: ``alias_*`` becomes ``efficacy_*`` and
    ``unalias_*`` becomes ``specificity_*``. The work is delegated to
    ``question_suite_generator`` so the two cannot drift apart.

    Args:
        entity_tag: Placeholder name used inside the question templates.
        subject: Subject name substituted into the alias template.
        gender_type: Key into the ``gender_type2*`` pronoun tables.
        fact_entity_name: Entity name used for the specificity questions.
        fact_alias_template: Template describing the entity via the subject.
        question_template: Question template containing ``{entity_tag}``.

    Returns:
        A dict with keys ``question_template``, ``efficacy_question``,
        ``specificity_question``, ``efficacy_question_paraphrase``,
        ``specificity_question_paraphrase``, ``entity_name`` and ``answer``,
        in that order.

    Raises:
        KeyError: propagated from ``question_suite_generator`` when a lookup
            table is missing the requested key.
    """
    suite = question_suite_generator(
        entity_tag=entity_tag,
        subject=subject,
        gender_type=gender_type,
        fact_entity_name=fact_entity_name,
        fact_alias_template=fact_alias_template,
        question_template=question_template,
    )
    # Rebuild the dict explicitly so the key order matches the published schema.
    return {
        "question_template": suite["question_template"],
        "efficacy_question": suite["alias_question"],
        "specificity_question": suite["unalias_question"],
        "efficacy_question_paraphrase": suite["alias_question_paraphrase"],
        "specificity_question_paraphrase": suite["unalias_question_paraphrase"],
        "entity_name": suite["entity_name"],
        "answer": suite["answer"],
    }


In [None]:
def generate_text_instance(
    entity_type: str,
    entity_tag: str,
    entity_names: List[str],
    subject: str,
    gender_type: str,
    text_template: str,
    relation_templates: List[str],
    fact_alias_templates: List[str],
    
    **kwargs,
):
    """Render one text passage and a question suite per relation template.

    The text template is filled with the subject, its gendered pronouns and
    the entity names (bound to ``{entity_tag}_1``, ``{entity_tag}_2``, ...).
    For every relation template one of the facts is picked uniformly at
    random and a question suite is built for the entity at that position.

    Args:
        entity_type: Entity type name, copied into the result.
        entity_tag: Placeholder stem used in the templates.
        entity_names: Exactly ``N_ENTITY_IN_TEXT`` entity names.
        subject: Subject name.
        gender_type: Key into the ``gender_type2*`` pronoun tables.
        text_template: Narrative template to fill in.
        relation_templates: Question templates, one suite is built per entry.
        fact_alias_templates: One alias template per entity name.
        **kwargs: Extra fields merged last into the returned dict (they
            override the standard keys on collision).

    Returns:
        A dict with ``entity_type``, ``entity_names``, ``subject``,
        ``gender_type``, ``text``, ``questions`` plus any ``kwargs``.
    """
    assert len(entity_names) == N_ENTITY_IN_TEXT
    entity_fields = {
        f"{entity_tag}_{position}": name
        for position, name in enumerate(entity_names, start=1)
    }

    subj_pronoun = gender_type2subj[gender_type]
    obj_pronoun = gender_type2obj[gender_type]
    possessive_adj = gender_type2possessive_adj[gender_type]
    possessive_pronoun = gender_type2possessive_pronoun[gender_type]
    reflexive_pronoun = gender_type2reflexive_pronoun[gender_type]
    pronoun_fields = {
        "subject": subject,
        "gender_subj": subj_pronoun,
        "Gender_subj": subj_pronoun.capitalize(),
        "gender_obj": obj_pronoun,
        "gender_possessive_adj": possessive_adj,
        "Gender_possessive_adj": possessive_adj.capitalize(),
        "gender_possessive_pronoun": possessive_pronoun,
        "gender_reflexive_pronoun": reflexive_pronoun,
    }
    # Two separate ** unpackings keep the duplicate-keyword error if an entity
    # tag ever collides with a pronoun field name.
    text = text_template.format(**pronoun_fields, **entity_fields)

    assert len(fact_alias_templates) == len(entity_names), f"{len(fact_alias_templates)} vs {len(entity_names)}"

    question_suites = []
    for relation_template in relation_templates:
        # One random draw per relation decides which fact the question targets.
        fact_idx = random.randint(0, len(fact_alias_templates) - 1)
        suite = question_suite_generator(
            entity_tag=entity_tag,
            subject=subject,
            gender_type=gender_type,
            fact_entity_name=entity_names[fact_idx],
            fact_alias_template=fact_alias_templates[fact_idx],
            question_template=relation_template,
        )
        suite["fact_idx"] = fact_idx
        question_suites.append(suite)

    instance = {
        "entity_type": entity_type,
        "entity_names": entity_names,
        "subject": subject,
        "gender_type": gender_type,
        "text": text,
        "questions": question_suites,
    }
    instance.update(kwargs)
    return instance
    
    

In [19]:
def inputs_dataset_generation(active_entity_type2entities, active_entity_type2relations, n_data):
    """Sample ``n_data`` input records (entity type, entities, subject).

    An entity type is drawn with probability proportional to its number of
    entities, or zero if it has no relations. For each record a subject type
    and subject are sampled, and ``N_ENTITY_IN_TEXT`` entity names are drawn
    (with replacement only when the type has fewer entities than needed).

    Note: sampling uses both the ``random`` module and ``np.random``, so both
    generators must be seeded for reproducible output.

    Args:
        active_entity_type2entities: Mapping entity type -> list of entities.
        active_entity_type2relations: Mapping entity type -> list of relations;
            must contain every key of ``active_entity_type2entities``.
        n_data: Number of records to generate.

    Returns:
        A list of dicts with keys ``entity_type``, ``entity_tag``,
        ``entity_names``, ``subject_type``, ``subject`` and ``gender_type``.

    Raises:
        ValueError: if ``n_data > 0`` but no entity type has both at least one
            entity and at least one relation (nothing could be sampled).
    """
    entity_types = list(active_entity_type2entities.keys())
    raw_weights = [
        len(active_entity_type2entities[t]) * int(len(active_entity_type2relations[t]) > 0)
        for t in entity_types
    ]
    # Hoisted: the total is loop-invariant and must be checked before dividing.
    total_weight = sum(raw_weights)
    if total_weight == 0:
        if n_data > 0:
            raise ValueError(
                "Cannot sample entity types: no entity type has both entities and relations."
            )
        weights = [0.0 for _ in raw_weights]
    else:
        weights = [w / total_weight for w in raw_weights]
    print("Entity types and their weights:")
    for et, w in zip(entity_types, weights):
        print(f"{et}: {w:.2f}")
        
    inputs_data = []
    while len(inputs_data) < n_data:
        entity_type = random.choices(entity_types, weights=weights, k=1)[0]
        entity_tag = entity_type2tag[entity_type]
        entities = active_entity_type2entities[entity_type]
        
        subject_type = random.choice(list(subject_type2ingredients.keys()))
        subject_dict = subject_sampler(subject_type2ingredients[subject_type])
        subject, gender_type = subject_dict["subject"], subject_dict["gender"]
        # Sample without replacement whenever there are enough distinct entities.
        allow_repeats = len(entities) < N_ENTITY_IN_TEXT
        entity_names = np.random.choice(entities, size=N_ENTITY_IN_TEXT, replace=allow_repeats).tolist()
        inputs_data.append(
            {
                "entity_type": entity_type,
                "entity_tag": entity_tag,
                "entity_names": entity_names,
                "subject_type": subject_type,
                "subject": subject,
                "gender_type": gender_type,
            }
        )
    return inputs_data

# Re-hash out the template.

In [105]:
# Scratch check: display what io.remove_last_extension returns for a sample
# data file name (the derived file name in the next cell relies on it).
io.remove_last_extension("test_structure_data_id_entity152_rel31.jsonl")

'test_structure_data_id_entity152_rel31'

In [128]:
import json

# Source (frozen) and destination (fixed) dataset directories.
old_dir = "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen"
new_dir = "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_fix"
old_file = "train_structure_data_id_entity152_rel31.jsonl"
# The renaming below only makes sense for "<split>_structure_data_<...>" names.
assert "_structure_" in old_file
assert "_data_" in old_file

# Derive the new file name: drop the "_structure_" and "_data_" markers, then
# drop the last two underscore-separated fields (the entity/relation counts).
new_file = old_file.replace("_structure_", "_")
new_file = new_file.replace("_data_", "_")
new_file = "_".join(io.remove_last_extension(new_file).split("_")[:-2]) + ".jsonl"

# Read the JSON-lines file inside a context manager so the handle is closed.
with open(f"{old_dir}/{old_file}", "r") as old_fh:
    inputs_data = [json.loads(line) for line in old_fh]
new_file

'train_id.jsonl'

In [129]:
# Re-render every loaded example with the current templates: the narrative
# text, the structured facts, and the question suites (re-keyed to the
# efficacy/specificity names). The per-question fact indices recorded in the
# old data are reused, so no new random choices are made here.
new_inputs_data = []
for ex in inputs_data:

    # Templates are selected by (entity type, subject type).
    text_template = entity_type2text_templates[ex["entity_type"]][ex["subject_type"]]
    facts_templates = entity_type2structured_events[ex["entity_type"]][ex["subject_type"]]
    fact_alias_templates = entity_type2aliases[ex["entity_type"]][ex["subject_type"]]

    # fact ids per relation
    fact_ids = [q["fact_idx"] for q in ex["questions"]]
    relation_templates = [q["question_template"] for q in ex["questions"]]
    assert len(relation_templates) == len(fact_ids)

    subject = ex["subject"]
    entity_names = ex["entity_names"]
    entity_tag = entity_type2tag[ex["entity_type"]]
    gender_type = ex['gender_type']

    n_entity_in_text = len(entity_names)
    entity_name_tags = [f"{entity_tag}_{i+1}" for i in range(n_entity_in_text)]

    # Placeholder values shared by the text template and every fact template.
    pronoun_kwargs = dict(
        subject=subject,
        gender_subj=gender_type2subj[gender_type],
        Gender_subj=gender_type2subj[gender_type].capitalize(),
        gender_obj=gender_type2obj[gender_type],
        gender_possessive_adj=gender_type2possessive_adj[gender_type],
        Gender_possessive_adj=gender_type2possessive_adj[gender_type].capitalize(),
        gender_possessive_pronoun=gender_type2possessive_pronoun[gender_type],
        gender_reflexive_pronoun=gender_type2reflexive_pronoun[gender_type],
    )
    entity_kwargs = dict(zip(entity_name_tags, entity_names))

    text = text_template.format(**pronoun_kwargs, **entity_kwargs)

    facts = []
    for fact_templates in facts_templates:
        fact_dict = {}
        for k, tmp in fact_templates.items():
            fact_dict[k] = tmp.format(**pronoun_kwargs, **entity_kwargs)
        # Every placeholder must have been substituted.
        assert not any("{" in x or "}" in x for x in fact_dict.values()), f"{fact_dict} contains {{ or }}"
        facts.append(fact_dict)

    question_suites = []
    for relation_template, fact_id in zip(relation_templates, fact_ids):
        question_suite = new_question_suite_generator(
            entity_tag=entity_tag,
            subject=subject,
            gender_type=gender_type,
            fact_entity_name=entity_names[fact_id],
            fact_alias_template=fact_alias_templates[fact_id],
            question_template=relation_template,
        )
        question_suite["fact_idx"] = fact_id
        question_suites.append(question_suite)

    new_example = {
        "entity_type": ex["entity_type"],
        "entity_names": entity_names,
        "subject": subject,
        "gender_type": gender_type,
        "questions": question_suites,
        "subject_type": ex["subject_type"],
        "text": text,
        "facts": facts,
    }
    new_inputs_data.append(new_example)


In [130]:
# Write the regenerated examples to the "fix" directory under the derived
# file name (new_dir and new_file are set in the loading cell above).
io.dump_jsonlines(new_inputs_data, f"{new_dir}/{new_file}")

In [134]:
# Load the "text" and "structure" variants of the same validation split from
# the frozen dataset directory so they can be compared entry by entry.
text_data = io.load_jsonlines(
    "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips"
    "/4Ktrain_data_100percent_frozen"
    "/valid_text_data_id_entity152_rel31.jsonl"
)

structure_data = io.load_jsonlines(
    "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips"
    "/4Ktrain_data_100percent_frozen"
    "/valid_structure_data_id_entity152_rel31.jsonl"
)

In [135]:
# Check if the two lists have the same content except for "text" and "facts"
# zip() stops at the shorter list, so a length mismatch would otherwise go
# unnoticed and the success message below would be misleading.
assert len(text_data) == len(structure_data), f"Length mismatch: {len(text_data)} != {len(structure_data)}"
for text_entry, structure_entry in zip(text_data, structure_data):
    # Create copies of the entries without "text" and "facts"
    text_entry_copy = {k: v for k, v in text_entry.items() if k not in ["text", "facts"]}
    structure_entry_copy = {k: v for k, v in structure_entry.items() if k not in ["text", "facts"]}
    
    # Compare the remaining content
    assert text_entry_copy == structure_entry_copy, f"Entries differ: {text_entry_copy} != {structure_entry_copy}"

print("All entries match except for 'text' and 'facts' fields")


All entries match except for 'text' and 'facts' fields


In [140]:
import os
import glob
from pathlib import Path

# Merge each "*_text_*" file with its "*_structure_*" counterpart: the merged
# entry is the text entry plus the "facts" field taken from the structure entry.

# Define source and destination directories
src_dir = "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen"
dst_dir = "/data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_publish"

# Create destination directory if it doesn't exist
os.makedirs(dst_dir, exist_ok=True)

# Get all text data files
text_files = glob.glob(os.path.join(src_dir, "*_text_*.jsonl"))

for text_file in text_files:
    # Get corresponding structure file. Only the file name is rewritten, so a
    # "_text_" inside a directory name can never be replaced by accident.
    text_dir, text_name = os.path.split(text_file)
    structure_file = os.path.join(text_dir, text_name.replace("_text_", "_structure_"))
    
    if not os.path.exists(structure_file):
        print(f"Warning: No matching structure file for {text_file}")
        continue
        
    # Load both files
    text_data = io.load_jsonlines(text_file)
    structure_data = io.load_jsonlines(structure_file)
    
    # Verify data alignment (zip below would silently truncate otherwise)
    assert len(text_data) == len(structure_data), f"Length mismatch in {text_file}"
    
    # Merge data
    merged_data = []
    for text_entry, structure_entry in zip(text_data, structure_data):
        # Create copies without text/facts
        text_entry_copy = {k: v for k, v in text_entry.items() if k not in ["text", "facts"]}
        structure_entry_copy = {k: v for k, v in structure_entry.items() if k not in ["text", "facts"]}
        
        # Verify alignment: everything except "text"/"facts" must be identical
        assert text_entry_copy == structure_entry_copy, f"Entry mismatch in {text_file}"
        
        # Create merged entry (shallow copy; nested values are shared with text_entry)
        merged_entry = text_entry.copy()
        merged_entry["facts"] = structure_entry["facts"]
        merged_data.append(merged_entry)
    
    # Generate output filename
    output_file = os.path.join(dst_dir, text_name.replace("_text_", "_"))
    
    
    # Save merged data
    # NOTE: the write is currently disabled, so this cell is a dry run: it
    # validates and merges in memory but does not create output_file.
    # io.dump_jsonlines(merged_data, output_file)
    print(f"Processed {text_file} -> {output_file}")

print("All files processed successfully")


Processed /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood-relation_entity152_rel7.jsonl -> /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_publish/test_data_ood-relation_entity152_rel7.jsonl
Processed /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood_entity37_rel7.jsonl -> /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_publish/test_data_ood_entity37_rel7.jsonl
Processed /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood-entity_entity37_rel31.jsonl -> /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_publish/test_data_ood-entity_entity37_rel31.jsonl
Processed /data/users/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/train_text_data_id_en