In [2]:
import sys
import os
sys.path.append("..")
import json
from tqdm import tqdm
import numpy as np
from knowledge_propagation.utils import io

from inference.devapi import gptqa
from utils.io_utils import jload, jdump
from tasks.quality import QuALITY
from utils.io_utils import set_openai_key
import random

from utils.prompt_utils import (
    format_name, uncapitalize_first, second_last_character,
    OPENAI_API_SYSTEM_QUALITY_GENERATE_ENTITIES,
    OPENAI_API_SYSTEM_QUALITY_GENERATE_TWO_ENTITY_RELATIONS,
    OPENAI_API_SYSTEM_QUALITY_GENERATE_THREE_ENTITY_RELATIONS,
    QUALITY_FEW_SHOT_COT_PROMPT
)

import pandas as pd
from pandas.api.types import CategoricalDtype


In [14]:
def generate_entities(document_content: str,
                      system_message: str,
                      openai_model: str,
                      max_retries: int = 10):
    """Ask the model for a JSON entity extraction of one document.

    The document is wrapped in a short prompt and sent through `gptqa` with
    JSON output requested. The call is repeated until the parsed response has
    a non-empty ``'entities'`` value.

    Args:
        document_content: Raw text of the document to analyse.
        system_message: System prompt passed to `gptqa`.
        openai_model: Model name passed to `gptqa`.
        max_retries: Maximum number of attempts before giving up. Bounding the
            loop prevents spinning forever (and burning API quota) on a
            permanent failure.

    Returns:
        The parsed JSON response (the whole object, not only the entities).

    Raises:
        RuntimeError: If no attempt produced a response with entities.
    """
    prompt = f"""
    ### Document Content:
    {document_content}
    """
    last_error = None
    for _ in range(max_retries):
        try:
            completion = gptqa(prompt,
                               openai_model,
                               system_message,
                               json_format=True)
            response = json.loads(completion)
            if response['entities']:
                return response
            # A well-formed but empty answer is treated like a failure and retried.
            last_error = ValueError("response contained no entities")
        except Exception as e:
            last_error = e
        print(f"Failed to generate entities: {str(last_error)}")
    raise RuntimeError(
        f"Could not generate entities after {max_retries} attempts"
    ) from last_error


In [None]:
# Model used for the entity-extraction calls.
model_name = "gpt-4-turbo"
# Seed both RNGs used in this notebook: Python's `random` (random.shuffle)
# and NumPy's global RNG (np.random.shuffle). Seeding only one of them leaves
# the other shuffle non-reproducible.
random.seed(42)
np.random.seed(42)



### Randomly create test sets

In [None]:
# Load the test records from disk and shuffle them in place.
# NOTE(review): np.random.shuffle draws from NumPy's global RNG, which is
# independent of Python's `random` module — confirm NumPy is seeded if this
# ordering has to be reproducible.
id_test_data = io.load_jsonlines("/home/zliu/zliu/KE-by-CP/data/debug_meta_train/syn_data_neurips/4Ktrain_data_100percent_frozen/test_text_data_id_entity152_rel31.jsonl")
np.random.shuffle(id_test_data)
# One-off step (kept disabled): persist the first 50 shuffled records as the working sample.
# io.dump_jsonlines(id_test_data[:50], "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample.jsonl")

### Extract entities from each test doc


In [11]:
# Load the saved test-document sample; each record's "text" field is read by the cells below.
id_test_samples = io.load_jsonlines("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample.jsonl")

In [12]:
# Number of sampled test documents.
len(id_test_samples)

50

In [39]:
# One extraction result per test document, in the same order as `id_test_samples`.
# Kept as an explicit loop so results gathered so far survive an interruption.
test_doc_entities = []

for test_doc in tqdm(id_test_samples):
    extraction = generate_entities(
        document_content=test_doc["text"],
        system_message=OPENAI_API_SYSTEM_QUALITY_GENERATE_ENTITIES,
        openai_model=model_name,
    )
    test_doc_entities.append(extraction)

100%|██████████| 50/50 [01:55<00:00,  2.30s/it]


'Ryan Kelly was born in India. He spent most of his adult life in Oman. After retirement, he lived in Norway and passed away.'

In [40]:
# Persist the per-document extraction results as JSON lines.
io.dump_jsonlines(test_doc_entities, "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_entities.jsonl")

In [41]:
# Sanity check: there is exactly one extraction result per test document.
len(id_test_samples) == len(test_doc_entities)

True

In [90]:
# Pair every document with its extraction result (same order in both lists).
_doc_extraction_pairs = list(zip(id_test_samples, test_doc_entities))

# Prompt-building records: the document text merged with the extraction fields.
entigraph_inputs = [
    {"text": doc["text"], **extraction}
    for doc, extraction in _doc_extraction_pairs
]

# Lookup from document text to its extraction result.
text2entity_extract = {
    doc["text"]: extraction
    for doc, extraction in _doc_extraction_pairs
}

In [None]:
# io.dump_jsonlines(entigraph_inputs, "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_entigraph_inputs.jsonl")

In [30]:
# Show the system prompt that is prepended to every entity-pair prompt below.
print(OPENAI_API_SYSTEM_QUALITY_GENERATE_TWO_ENTITY_RELATIONS)


You will act as a knowledge analyzer tasked with dissecting an article provided by the user. Your role involves two main objectives:
1. Rephrasing Content: The user will identify two specific entities mentioned in the article. You are required to rephrase the content of the article twice:
    * Once, emphasizing the first entity.
    * Again, emphasizing the second entity.
2. Analyzing Interactions: Discuss how the two specified entities interact within the context of the article.

Your responses should provide clear segregation between the rephrased content and the interaction analysis. Ensure each section of the output include sufficient context, ideally referencing the article's title to maintain clarity about the discussion's focus.
Here is the format you should follow for your response:

### Discussion of <title> in relation to <entity1>
<Rephrased content focusing on the first entity>

### Discussion of <title> in relation to <entity2>
<Rephrased content focusing on the second e

### Generate prompt for entity pairs

In [82]:
# One prompt record per unordered entity pair of every document.
entigraph_pair_prompts = []

for doc_record in tqdm(entigraph_inputs):
    doc_text = doc_record["text"]
    doc_entities = doc_record["entities"]

    # All pairs (a, b) where a appears before b in the entity list.
    entity_pairs = [
        (first, second)
        for pos, first in enumerate(doc_entities)
        for second in doc_entities[pos + 1:]
    ]

    for entity1, entity2 in entity_pairs:
        # User-facing part of the prompt: the document followed by the two entities.
        request_body = f"""
### Document Content:
{doc_text}
### Entities:
- {entity1}
- {entity2}
"""
        entigraph_pair_prompts.append({
            "text": doc_text,
            "summary": doc_record["summary"],
            "aug_type": "entity-pair",
            "entities": [entity1, entity2],
            "prompt": f"{OPENAI_API_SYSTEM_QUALITY_GENERATE_TWO_ENTITY_RELATIONS}" + "\n\n" + request_body,
        })

100%|██████████| 50/50 [00:00<00:00, 481.04it/s]


In [85]:
# One prompt record per unordered entity triplet of every document.
entigraph_triplet_prompts = []

for doc_record in tqdm(entigraph_inputs):
    doc_text = doc_record["text"]
    doc_entities = doc_record["entities"]

    # All triplets (a, b, c) that respect the order of the entity list.
    entity_triples = [
        (first, second, third)
        for first_pos, first in enumerate(doc_entities)
        for second_pos, second in enumerate(doc_entities[first_pos + 1:], start=first_pos + 1)
        for third in doc_entities[second_pos + 1:]
    ]
    # Triplets are emitted in random order (uses Python's `random` state).
    random.shuffle(entity_triples)

    for entity1, entity2, entity3 in entity_triples:
        # User-facing part of the prompt: the document followed by the three entities.
        request_body = f"""
### Document Content:
{doc_text}
### Entities:
- {entity1}
- {entity2}
- {entity3}
"""
        entigraph_triplet_prompts.append({
            "text": doc_text,
            "summary": doc_record["summary"],
            "aug_type": "entity-triplet",
            "entities": [entity1, entity2, entity3],
            "prompt": f"{OPENAI_API_SYSTEM_QUALITY_GENERATE_THREE_ENTITY_RELATIONS}" + "\n\n" + request_body,
        })

100%|██████████| 50/50 [00:00<00:00, 6484.50it/s]


In [86]:
# All prompts in one list: every pair prompt first, then every triplet prompt.
entigraph_prompts = entigraph_pair_prompts + entigraph_triplet_prompts

In [None]:
# Preview the prompts as a table (one row per prompt record).
pd.DataFrame(entigraph_prompts)

Unnamed: 0,text,summary,aug_type,entities,prompt
0,Mia Lewis first wrote about William Wordsworth...,The article traces the academic and profession...,entity-pair,"[Mia Lewis, William Wordsworth]",\nYou will act as a knowledge analyzer tasked ...
1,Mia Lewis first wrote about William Wordsworth...,The article traces the academic and profession...,entity-pair,"[Mia Lewis, Marie Antoinette]",\nYou will act as a knowledge analyzer tasked ...
2,Mia Lewis first wrote about William Wordsworth...,The article traces the academic and profession...,entity-pair,"[Mia Lewis, Franklin D. Roosevelt]",\nYou will act as a knowledge analyzer tasked ...
3,Mia Lewis first wrote about William Wordsworth...,The article traces the academic and profession...,entity-pair,"[Mia Lewis, 8th-grade book report]",\nYou will act as a knowledge analyzer tasked ...
4,Mia Lewis first wrote about William Wordsworth...,The article traces the academic and profession...,entity-pair,"[Mia Lewis, college thesis]",\nYou will act as a knowledge analyzer tasked ...
...,...,...,...,...,...
1941,Alexander Mendoza was born in Oman. She spent ...,The article briefly outlines the life of Alexa...,entity-triplet,"[Alexander Mendoza, Oman, Thailand]",\nYou will act as a knowledge analyzer tasked ...
1942,Ryan Kelly was born in India. He spent most of...,The article provides a brief overview of Ryan ...,entity-triplet,"[Ryan Kelly, India, Oman]",\nYou will act as a knowledge analyzer tasked ...
1943,Ryan Kelly was born in India. He spent most of...,The article provides a brief overview of Ryan ...,entity-triplet,"[India, Oman, Norway]",\nYou will act as a knowledge analyzer tasked ...
1944,Ryan Kelly was born in India. He spent most of...,The article provides a brief overview of Ryan ...,entity-triplet,"[Ryan Kelly, India, Norway]",\nYou will act as a knowledge analyzer tasked ...


In [73]:
# Export every prompt record to an Excel sheet, without the index column.
pd.DataFrame(entigraph_prompts).to_excel("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_curator_prompt.xlsx", index=False)

In [72]:
# Spot-check: print the "completion" value of the first row of a generated sample sheet.
# NOTE(review): this sheet is not written anywhere in this notebook — presumably produced by an external generation step; confirm.
print(pd.read_excel("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_curator_prompt_sample_generated.xlsx").iloc[0]["completion"])

### Discussion of Mia Lewis's Academic and Professional Journey in relation to Mia Lewis
From her early education years, Mia Lewis showcased her budding interest in historical and literary figures. A memorable instance from Mia Lewis's academic journey occurred during her 8th grade when she penned a book report on William Wordsworth, a renowned figure in English literature. This was perhaps one of her initial steps which later guided her towards a deeper exploration of historical personalities and cultural impacts. Her academic dedication extended into her college years and shaped her professional choices, eventually leading her to a career curating museum exhibitions that pay homage to influential historical figures such as Franklin D. Roosevelt.

### Discussion of Mia Lewis's Academic and Professional Journey in relation to William Wordsworth
Among the notable historical figures Mia Lewis has explored, William Wordsworth stands out as a pivotal subject of interest. Her engagement wit

In [74]:
# Total number of prompts (pairs plus triplets).
len(entigraph_prompts)

1946

### Format the augmentation to be compatible with codebase

In [111]:
# io.remove_last_extension("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_curator_prompt_sample.xlsx")
# Load the prompt sheet together with its generated outputs. Later cells read
# the "text", "aug_type" and "completion" columns.
# NOTE(review): the "completion" column is not produced in this notebook —
# presumably filled in by an external generation step; confirm.
df = pd.read_excel("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample_curator_prompt_generated.xlsx")

In [None]:
# Output directory for the per-document "entigraph" JSON files; created if it does not exist yet.
entigraph_raw_dir = "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample/entigraph"
os.makedirs(entigraph_raw_dir, exist_ok=True)

In [123]:
# Write one JSON file per document: [entity list, summary, *completions].
# Completions are ordered with entity-pair rows before entity-triplet rows.
cat_order = ["entity-pair", "entity-triplet",]
aug_type = CategoricalDtype(categories=cat_order, ordered=True)

# NOTE(review): relies on `text2entity_extract` from an earlier cell being in
# memory and on the sheet's "text" values matching its keys exactly.
for i, (t, sub_df) in enumerate(df.groupby("text")):
    entities = text2entity_extract[t]
    # Casting to an ordered categorical only defines the order; the explicit
    # sort applies it. `assign` builds a new frame instead of writing into the
    # groupby slice, and mergesort is stable so rows of the same type keep
    # their original relative order.
    sub_df = (
        sub_df
        .assign(aug_type=sub_df["aug_type"].astype(aug_type))
        .sort_values("aug_type", kind="mergesort")
    )
    output = [
        entities["entities"],
        entities["summary"],
        *sub_df["completion"].to_list()
    ]
    io.dump_json(output, f"{entigraph_raw_dir}/{i}.json")

In [130]:
# Output directory for the per-document "naive" JSON files; created if it does not exist yet.
naive_raw_dir = "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample/naive"
os.makedirs(naive_raw_dir, exist_ok=True)

In [131]:
# Write one JSON file per document: [entity list, original document text].
# Iterating the same groupby as the entigraph export gives matching file numbering.
for file_idx, (doc_text, _rows) in enumerate(df.groupby("text")):
    extraction = text2entity_extract[doc_text]
    naive_record = [extraction["entities"], doc_text]
    io.dump_json(naive_record, f"{naive_raw_dir}/{file_idx}.json")

In [129]:
# Spot-check: print the last element of one of the written entigraph files.
print(io.load_json("/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample/entigraph/4.json")[-1])

### Discussion of Amber Media Ltd.'s Language Expansion in relation to Persian (Farsi)
Amber Media Ltd. initiated its service offerings in the Persian (Farsi) language. This foundational choice positioned the company to establish a strong regional presence initially, serving a demographic familiar with Persian. Focusing on Farsi was a strategic move to capture a niche market right from the start, setting the stage for its later expansions.

### Discussion of Amber Media Ltd.'s Language Expansion in relation to Arabic
After establishing a foothold with its Persian (Farsi) language services, Amber Media Ltd. expanded its linguistic range to include Arabic. This addition significantly broadened its market reach, tapping into a larger and diverse speaker base across multiple countries. The introduction of Arabic allowed Amber Media Ltd. to engage with a vast audience, enhancing its impact and presence in the Middle Eastern markets.

### Discussion of Amber Media Ltd.'s Language Expansion i

### Create tokenized dataset

In [3]:
from typing import List
import numpy as np
from transformers import AutoTokenizer
import random
import glob
from tqdm import tqdm
from utils.io_utils import jload


In [4]:
def _glob_all_json(dir_name: str) -> List[str]:
    """Return paths of all `.json` files directly inside `dir_name`.

    Regular files are listed first, followed by hidden (dot-prefixed) ones,
    which a plain `*.json` pattern does not match.
    """
    matches: List[str] = []
    for pattern in ('*.json', '.*.json'):
        matches.extend(glob.glob(f'{dir_name}/{pattern}'))
    return matches

def _get_quality_graph(dir_name: str) -> List[str]:
    """Collect the contents of every JSON file in `dir_name` into one flat list.

    Each file is loaded with `jload`; its first element is skipped and the
    remaining elements are appended in file order.
    """
    return [
        item
        for path in _glob_all_json(dir_name)
        for item in jload(path)[1:]
    ]

In [5]:
def get_tokenizer(tokenizer_model_name: str)-> AutoTokenizer:
    """Load the fast tokenizer for `tokenizer_model_name` with a raised length limit."""
    tok = AutoTokenizer.from_pretrained(tokenizer_model_name, use_fast=True)
    # Raise the limit to 2**20 tokens to hide the token_len>128K warning.
    tok.model_max_length = 1 << 20
    return tok

def tokenize_list(text_list: List[str], tokenizer_name="meta-llama/Meta-Llama-3-8B") -> List[int]:
    """Tokenize texts in random order and concatenate them into one id list.

    Args:
        text_list: Texts to tokenize. The list is not modified; a shuffled
            copy is used internally.
        tokenizer_name: Name passed to `get_tokenizer`.

    Returns:
        A flat list of token ids: for every non-empty text, its encoded ids
        followed by the tokenizer's EOS id.
    """
    # Shuffle a copy so the caller's list keeps its order. random.shuffle on
    # the copy consumes the RNG exactly as shuffling the original would, so
    # the resulting order for a given seed is unchanged.
    shuffled_texts = list(text_list)
    random.shuffle(shuffled_texts)
    tokenizer = get_tokenizer(tokenizer_name)
    all_ids = []
    for text in tqdm(shuffled_texts):
        if text:  # skip empty entries
            # encode() is called with its defaults.
            # NOTE(review): whether a BOS token is prepended depends on the tokenizer — confirm.
            ids = tokenizer.encode(text)
            ids.append(tokenizer.eos_token_id)  # mark the end of this text
            all_ids.extend(ids)
    return all_ids

In [6]:
def write_to_memmap_single(ids: List[int], filename: str, dir_path="data/dataset/bins"):
    """Write token ids to `dir_path/filename` as a flat array of raw int32 values.

    Args:
        ids: Token ids to store.
        filename: Name of the output file inside `dir_path`.
        dir_path: Target directory; created if it does not exist.
    """
    # The output directory may not exist on a fresh checkout.
    os.makedirs(dir_path, exist_ok=True)
    filename = f'{dir_path}/{filename}'
    print(f'Writing to {filename} with length {len(ids)}')
    dtype = np.int32
    ids_arr = np.array(ids, dtype=dtype)
    arr_len = len(ids_arr)
    if arr_len == 0:
        # A zero-length file cannot be memory-mapped; write an empty file instead.
        open(filename, 'wb').close()
        return
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
    arr[:] = ids_arr
    arr.flush()
    # Drop the reference so the mapping is released promptly.
    del arr

In [None]:
# Which raw directory to tokenize: "entigraph" or "naive" (switch by swapping the comment).
text_source = "entigraph"
# text_source = "naive"

# Flat list of texts gathered from every JSON file of the chosen source
# (the first element of each file is skipped by `_get_quality_graph`).
corpus_lst = _get_quality_graph(f"/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/raw/4K_controlled_RE/test_id_sample/{text_source}")

# Tokenizer to use; the alternatives are kept for quick switching.
# tokenizer_name = "Qwen/Qwen2.5-1.5B"
tokenizer_name = "Qwen/Qwen2.5-1.5B-Instruct"
# tokenizer_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

In [39]:
# Tokenize the whole corpus into one flat list of token ids.
tokenized_corpus = tokenize_list(corpus_lst, tokenizer_name=tokenizer_name)

100%|██████████| 1996/1996 [00:01<00:00, 1045.17it/s]


In [40]:
# Total number of token ids in the corpus.
len(tokenized_corpus)

878400

In [41]:
# Write the token ids to a .bin file whose name encodes the text source and the tokenizer's base name.
write_to_memmap_single(tokenized_corpus, filename=f"4K_controlled_RE-test_id_sample-{text_source}-{os.path.basename(tokenizer_name)}.bin", dir_path = "/home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/bins")

Writing to /home/zliu/zliu/Synthetic_Continued_Pretraining_leo/data/dataset/bins/4K_controlled_RE-test_id_sample-entigraph-Qwen2.5-1.5B-Instruct.bin with length 878400
