In [1]:
import os
from knowledge_propagation.utils import vars, io, extractor, misc
rel_extract_prompt = """
You are given an input document and a set of question–answer pairs. Your task is to extract relationships being discussed.  

A relationship describes how two entity types are connected. For each relationship, output in the following format:  

<relation_triplet>e1 -> relation -> e2</relation_triplet>  
<relation_template>Natural language template of the relation</relation_template>  

**Example:**  
<relation_triplet>[person] -> birth_place -> [country]</relation_triplet>  
<relation_template>The birth place of [person] is [country].</relation_template>  

**Instructions:**  
1. Read the document and QA pairs carefully.
2. Identify entities such as people, places, organizations, events, objects, or concepts.
3. Express relationships at the **entity type** level (e.g., `[person] -> birth_place -> [country]`).
4. For each relationship, produce:
   - A `<relation_triplet>` tag describing the relation in structured form.
   - A `<relation_template>` tag providing a clear, general natural language template.
5. Keep relation templates concise, focusing on the general relation rather than specific instances.
6. Use clear and descriptive relation names (e.g., `author_of`, `capital_of`, `works_at`, `part_of`, `founded_by`).
7. Ensure each relationship is **semantically atomic** (captures a single, direct connection).
8. Avoid duplicates -- list each distinct relationship only once.
9. Extract as many distinct relationships as possible.
10. Do not include any text outside of the specified format.

**Input:**
[Document]
```
{document}
```

[QA pairs]
```
{qa_pairs}  
```
"""

In [54]:
train_examples = io.load_jsonlines("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/train.jsonl")

In [55]:
# train_examples[0]

In [56]:
train_example = train_examples[0]

document = train_example['text']
qa_pairs = "\n".join(f"Q: {q['alias_question']}\nA: {q['answer']}" for q in train_example['questions'])

rel_extract_prompt = rel_extract_prompt.format(document=document, qa_pairs=qa_pairs)

print(rel_extract_prompt)


You are given an input document and a set of question–answer pairs. Your task is to extract relationships being discussed.  

A relationship describes how two entity types are connected. For each relationship, output in the following format:  

<relation_triplet>e1 -> relation -> e2</relation_triplet>  
<relation_template>Natural language template of the relation</relation_template>  

**Example:**  
<relation_triplet>[person] -> [birth_place] -> [country]</relation_triplet>  
<relation_template>The birth place of [person] is [country].</relation_template>  

**Instructions:**  
1. Read the document and QA pairs carefully.
2. Identify entities such as people, places, organizations, events, objects, or concepts.
3. Express relationships at the **entity type** level (e.g., `[person] -> [birth_place] -> [country]`).
4. For each relationship, produce:
   - A `<relation_triplet>` tag describing the relation in structured form.
   - A `<relation_template>` tag providing a clear, general nat

In [57]:
train_example

{'entity_type': 'Language',
 'entity_names': ['German', 'Hebrew', 'Punjabi'],
 'subject': 'Campbell Software Ltd.',
 'gender_type': 'it',
 'text': 'Campbell Software Ltd. began by offering services in German. It then added support for Hebrew to broaden its reach. Eventually, it launched a major initiative in Punjabi, marking a key milestone in its global expansion.',
 'questions': [{'question_template': 'What writing system is used by {language}?',
   'alias_question': 'What writing system is used by the language that Campbell Software Ltd. launched a major initiative in?',
   'unalias_question': 'What writing system is used by Punjabi?',
   'alias_question_paraphrase': 'What script is used by the language that Campbell Software Ltd. launched a major initiative in?',
   'unalias_question_paraphrase': 'What script is used by Punjabi?',
   'entity_name': 'Punjabi',
   'answer': 'Gurmukhi and Shahmukhi',
   'fact_idx': 2},
  {'question_template': 'What is the ISO 639‑1 code for {language}

In [80]:
relation_bank_construction_input = []
for train_example in train_examples:
    document = train_example['text']
    qa_pairs = "\n".join(f"Q: {q['alias_question']}\nA: {q['answer']}" for q in train_example['questions'])
    rel_extract_prompt = rel_extract_prompt.format(document=document, qa_pairs=qa_pairs)
    relation_bank_construction_input.append({
        "entity_type": train_example['entity_type'],
        "text": train_example['text'],
        'subject': train_example['subject'],
        'gender_type': train_example['gender_type'],
        "prompt": rel_extract_prompt,
    })

In [None]:
len(relation_bank_construction_input)

4000

In [None]:
# io.dump_jsonlines(relation_bank_construction_input, "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/train_relation_bank_construction.jsonl")

In [3]:
relation_bank_construction = io.load_jsonlines("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/train_relation_bank_construction_generated(gpt-5).jsonl")

In [4]:
relation_bank = []
for e in relation_bank_construction:
    for r in e["relations"]:
        relation_bank.append(r)


In [5]:
seen = {}
dedup_key = "relation_triplet"
for d in relation_bank:
    if d[dedup_key] not in seen:
        seen[d[dedup_key]] = d

unique_list = list(seen.values())

In [6]:
len(unique_list)

543

In [83]:
unique_list[0]

{'relation_template': '[company] offers services in [language].',
 'relation_triplet': '[company] -> [offers_services_in] -> [language]'}

In [8]:
import regex as re
text = "[company] offers services in [language]"
entity_types = set()
filtered_list = []
for e in unique_list:
    e["entities"] = []
    for t in re.findall(r"\[(.*?)\]", e["relation_template"]):
        normalized_t = t.replace(" ", "_")
        entity_types.add(normalized_t)
        e["relation_template"] = e["relation_template"].replace(f"[{t}]", f"[{normalized_t}]")
        e["entities"].append(normalized_t)
    if len(e["entities"]) != 2:
        continue
    filtered_list.append(e)
    for t in re.findall(r"\[(.*?)\]", e["relation_triplet"]):
        normalized_t = t.replace(" ", "_")
        e["relation_triplet"] = e["relation_triplet"].replace(f"[{t}]", f"[{normalized_t}]")

# entity_types
# len(unique_list)
len(filtered_list)




463

In [9]:
seen = {}
dedup_key = "relation_triplet"
for d in filtered_list:
    if d[dedup_key] not in seen:
        seen[d[dedup_key]] = d

unique_list = list(seen.values())

In [None]:
seen = {}
dedup_key = "relation_triplet"
for d in unique_list:
    if d[relation_template] not in seen:
        seen[d[dedup_key]] = d

unique_list = list(seen.values())

In [10]:
len(unique_list)

456

In [11]:
unique_list

[{'relation_template': '[company] offers services in [language].',
  'relation_triplet': '[company] -> [offers_services_in] -> [language]',
  'entities': ['company', 'language']},
 {'relation_template': '[company] adds support for [language].',
  'relation_triplet': '[company] -> [adds_support_for] -> [language]',
  'entities': ['company', 'language']},
 {'relation_template': '[company] launches an initiative in [language].',
  'relation_triplet': '[company] -> [launches_initiative_in] -> [language]',
  'entities': ['company', 'language']},
 {'relation_template': '[language] uses the [writing_system] writing system.',
  'relation_triplet': '[language] -> [uses_writing_system] -> [writing_system]',
  'entities': ['language', 'writing_system']},
 {'relation_template': 'The ISO 639‑1 code of [language] is [language_code].',
  'relation_triplet': '[language] -> [iso_639_1_code] -> [language_code]',
  'entities': ['language', 'language_code']},
 {'relation_template': '[language] is native t

# Cheat version of meta-augmented

In [2]:
# Country-related templates
country_subject_type2aliases = {
    "person": [
        "the country that {subject} was born in", # country_1
        "the country that {subject} spent most of {gender_possessive_adj} adult life in", # country_2
        "the country that {subject} died in", # country_3
    ],
    "company": [
        "the country that {subject} was founded in", # country_1
        "the country that {subject} expanded to as the second region of operation", # country_2
        "the country that hosted {subject}'s global headquarters", # country_3
    ]
}

# Country-related templates
language_subject_type2aliases = {
    "person": [
        "the language that {subject} grew up speaking", # language_1
        "the language that {subject} learned in grade school", # language_2
        "the language that {subject} majored in college", # language_3
    ],
    "company": [
        "the language that {subject} primarily offered services in", # language_1
        "the language that {subject} supported as {gender_possessive_adj} second language", # language_2
        "the language that {subject} launched a major initiative in", # language_3
    ]
}

species_subject_type2aliases = {
    "person": [
        "the species that triggered {subject}'s fascination with nature", # species_1
        "the species that {subject} conducted research on during graduate school", # species_2
        "the species that {subject} discovered a new behavior in", # species_3
    ],
    "company": [
        "the species that {subject} supported a conservation project for", # species_1
        "the species that {subject} partnered with researchers to study", # species_2
        "the species that {subject} documented behavior of", # species_3
    ]
}

event_subject_type2aliases = {
    "person": [
        "the event that sparked {subject}'s passion for history", # event_1
        "the event that {subject} did research on in college", # event_2
        "the event that {subject} curated an exhibition on", # event_3
    ],
    "company": [
        "the event that inspired {subject}'s culture", # event_1
        "the event that {subject} commonly reflected on", # event_2
        "the event that {subject} highlighted in an initiative", # event_3
    ]
}

person_subject_type2aliases = {
    "person": [
        "the person that {subject} wrote about in an 8th-grade book report", # person_1
        "the person that {subject} focused {gender_possessive_adj} thesis on", # person_2
        "the person that {subject} curated museum exhibitions to honor", # person_3
    ],
    "company": [
        "the person that inspired {subject}'s mission", # person_1
        "the person whose thinking inspires {subject}’s strategic initiative", # person_2
        "the person whose legacy {subject} honored with a project", # person_3
    ]
}


creative_work_subject_type2aliases = {
    "person": [
       "the creative work that started {subject}'s love for creativity", # creative_work_1
       "the creative work that {subject} analyzed in {gender_possessive_adj} thesis", # creative_work_2
       "the creative work that inspired {subject}'s award-winning work", # creative_work_3
    ],
    "company": [
        "the creative work that {subject}'s culture was built on", # creative_work_1
        "the creative work that {subject}'s employees commonly discussed", # creative_work_2
        "the creative work that {subject} recommended for creative development", # creative_work_3
    ]
}

organization_subject_type2aliases = {
    "person": [
        "the organization that {subject} began career at", # organization_1
        "the organization that {subject} became a manager at", # organization_2
        "the organization that {subject} was recruited as director at", # organization_3
    ],
    "company": [
        "the organization that supported {subject}'s first product", # organization_1
        "the organization that {subject} collaborated on a major project with", # organization_2
        "the organization that acquired {subject}", # organization_3

    ]
}

gender_type2subj = {
    "male": "he",
    "female": "she",
    "it": "it",
}
gender_type2obj = {
    "male": "him",
    "female": "her",
    "it": "it",
}

gender_type2possessive_adj = {
    "male": "his",
    "female": "her",
    "it": "its",
}
gender_type2possessive_pronoun = {
    "male": "his",
    "female": "hers",
    "it": "its",
}
gender_type2reflexive_pronoun = {
    "male": "himself",
    "female": "herself",
    "it": "itself",
}
entity_type2tag = {
    "Person": "person",
    "Event": "event",
    "Species": "species",
    "Language": "language",
    "Organization": "organization",
    "Creative Work": "creative_work",
    "Country": "country",
}
entity_type2aliases = {
    "Country": country_subject_type2aliases,
    "Species": species_subject_type2aliases,
    "Language": language_subject_type2aliases,
    "Organization": organization_subject_type2aliases,
    "Event": event_subject_type2aliases,
    "Person": person_subject_type2aliases,
    "Creative Work": creative_work_subject_type2aliases,
}

In [7]:
train_examples = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/train_text_data_id_entity152_rel31.jsonl")

In [8]:
entity_type2question_template = {}

for example in train_examples:
    if example["entity_type"] not in entity_type2question_template:
        entity_type2question_template[example["entity_type"]] = set()
    for q in example["questions"]:
        entity_type2question_template[example["entity_type"]].add(q["question_template"])

In [9]:
question2answer = io.load_json(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/data_gen/question2answer_curated_final.json")

In [78]:
test_split_name = "test_id_sample"

In [79]:
test_data = io.load_jsonlines(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}.jsonl")

**naive CPT corpus**

In [66]:
os.makedirs(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/naive", exist_ok=True)

In [67]:
for d_i, d in enumerate(test_data):
    text = d["text"]
    io.dump_json([None, text], f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/naive/{d_i}.json")

In [86]:
meta_augmentation_version = "one-stage_vanilla"
# meta_augmentation_version = "one-stage_ice"
# meta_augmentation_version = "two-stage_vanilla"
# meta_augmentation_version = "two-stage_ice"

os.makedirs(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/{meta_augmentation_version}", exist_ok=True)

In [87]:


for d_i, d in enumerate(test_data):
    text = d["text"]
    gender_type = d["gender_type"]
    subject = d["subject"]
    alias_templates = entity_type2aliases[d["entity_type"]][d["subject_type"]]
    fact_entity_names = d["entity_names"]
    entity_tag = entity_type2tag[d["entity_type"]]
    question_templates = entity_type2question_template[d["entity_type"]]

    fact_aliases = [alias_template.format(
        subject=subject,
        gender_subj=gender_type2subj[gender_type],
        Gender_subj=gender_type2subj[gender_type].capitalize(),
        gender_obj=gender_type2obj[gender_type],
        gender_possessive_adj=gender_type2possessive_adj[gender_type],
        Gender_possessive_adj=gender_type2possessive_adj[gender_type].capitalize(),
        gender_possessive_pronoun=gender_type2possessive_pronoun[gender_type],
        gender_reflexive_pronoun=gender_type2reflexive_pronoun[gender_type],
    ) for alias_template in alias_templates]
    
    
    example_augmented_questions = [None] # this is to fit the tokenizer format from Synthetic CPT
    if meta_augmentation_version.startswith("one-stage"):
        example_augmented_questions.append(text)
        
    for fact_alias, fact_entity_name in zip(fact_aliases, fact_entity_names):
        unalias_questions = [question_template.format(**{entity_tag: fact_entity_name}) for question_template in question_templates]
        assert all(q in question2answer for q in unalias_questions)
        answers = [question2answer[q] for q in unalias_questions]
        
        # assert all(q in question2answer for q in unalias_questions)
        alias_questions = [question_template.format(**{entity_tag: fact_alias}) for question_template in question_templates]
        # assert all(q in question2answer for q in alias_questions)
        # assert all(q in question2answer for q in alias_questions)
        for alias_question, answer in zip(alias_questions, answers):
            propagation_qa = f"{alias_question} {answer}"
            if "_vanilla" in meta_augmentation_version:
                example_augmented_questions.append(propagation_qa)
            else:
                assert "_ice" in meta_augmentation_version
                example_augmented_questions.append(f"{text} {propagation_qa}")

    io.dump_json(example_augmented_questions, f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/{meta_augmentation_version}/{d_i}.json")

In [77]:
example_augmented_questions

[None,
 'Ortiz Studios Ltd. began by offering services in Persian (Farsi). It then added support for Spanish to broaden its reach. Eventually, it launched a major initiative in Tamil, marking a key milestone in its global expansion. What region is the language that Ortiz Studios Ltd. primarily offered services in native to? Iran and surrounding regions',
 'Ortiz Studios Ltd. began by offering services in Persian (Farsi). It then added support for Spanish to broaden its reach. Eventually, it launched a major initiative in Tamil, marking a key milestone in its global expansion. What is the ISO 639‑1 code for the language that Ortiz Studios Ltd. primarily offered services in? fa',
 'Ortiz Studios Ltd. began by offering services in Persian (Farsi). It then added support for Spanish to broaden its reach. Eventually, it launched a major initiative in Tamil, marking a key milestone in its global expansion. What writing system is used by the language that Ortiz Studios Ltd. primarily offered s

In [92]:
len(vars.GPT_4_TOKENIZER("\n\n".join(meta_train_inspired_augmentation)))

12192

In [83]:
len(example_augmented_questions)

9

In [93]:
2048 * 16

32768

**one entity -> multiple aliased/propagation questions**


In [79]:
alias_questions

['What is the social structure of the species that Crimson Holdings Inc. documented behavior of?',
 'What is the diet of the species that Crimson Holdings Inc. documented behavior of?',
 'What type of organism is the species that Crimson Holdings Inc. documented behavior of?']

In [74]:
unalias_questions

['What is the social structure of giraffe?',
 'What is the diet of giraffe?',
 'What type of organism is giraffe?']

# augment the data for PropMEND

In [32]:
test_set_type = "test_ood_relation"
if test_set_type == "test_id":
    test_data = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_id_entity152_rel31.jsonl")
elif test_set_type == "test_ood_both":
    test_data = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood_entity37_rel7.jsonl")
elif test_set_type == "test_ood_entity":
    test_data = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood-entity_entity37_rel31.jsonl")
elif test_set_type == "test_ood_relation":
    test_data = io.load_jsonlines(f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_ood-relation_entity152_rel7.jsonl")
else:
    raise ValueError(f"Invalid test set type: {test_set_type}")


In [33]:
new_test_data = []
from copy import deepcopy

for d_i, d in enumerate(test_data):
    new_d = deepcopy(d)
    text = d["text"]
    gender_type = d["gender_type"]
    subject = d["subject"]
    alias_templates = entity_type2aliases[d["entity_type"]][d["subject_type"]]
    fact_entity_names = d["entity_names"]
    entity_tag = entity_type2tag[d["entity_type"]]
    question_templates = entity_type2question_template[d["entity_type"]]

    fact_aliases = [alias_template.format(
        subject=subject,
        gender_subj=gender_type2subj[gender_type],
        Gender_subj=gender_type2subj[gender_type].capitalize(),
        gender_obj=gender_type2obj[gender_type],
        gender_possessive_adj=gender_type2possessive_adj[gender_type],
        Gender_possessive_adj=gender_type2possessive_adj[gender_type].capitalize(),
        gender_possessive_pronoun=gender_type2possessive_pronoun[gender_type],
        gender_reflexive_pronoun=gender_type2reflexive_pronoun[gender_type],
    ) for alias_template in alias_templates]
    
    
    example_augmented_questions = [] # this is to fit the tokenizer format from Synthetic CPT
    example_augmented_questions.append(text)
        
    for fact_alias, fact_entity_name in zip(fact_aliases, fact_entity_names):
        unalias_questions = [question_template.format(**{entity_tag: fact_entity_name}) for question_template in question_templates]
        assert all(q in question2answer for q in unalias_questions)
        answers = [question2answer[q] for q in unalias_questions]
        
        # assert all(q in question2answer for q in unalias_questions)
        alias_questions = [question_template.format(**{entity_tag: fact_alias}) for question_template in question_templates]
        # assert all(q in question2answer for q in alias_questions)
        # assert all(q in question2answer for q in alias_questions)
        for alias_question, answer in zip(alias_questions, answers):
            propagation_qa = f"{alias_question} {answer}"
            example_augmented_questions.append(propagation_qa)
    
    new_d["augmented_texts"] = example_augmented_questions
    new_test_data.append(new_d)
    # io.dump_json(example_augmented_questions, f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/{test_split_name}/{meta_augmentation_version}/{d_i}.json")
len(new_test_data)

350

In [35]:
# io.dump_jsonlines(new_test_data, f"{vars.DATA_DIR}/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_meta-aug/{test_set_type}.jsonl")