In [3]:
from datasets import load_dataset
from tqdm import tqdm
from knowledge_propagation.utils import vars, io
from scipy.stats import describe
from typing import List, Dict
import re
from copy import deepcopy
import pandas as pd

# from bespokelabs import curator
from datasets import Dataset
import os

In [4]:
def resolved_answer_references(singleton_questions: List[Dict]):
    """
    Replace `#<n>` answer references in MuSiQue single-hop questions with the referenced answer.

    Within one MuSiQue instance a single-hop question may refer to the answer of another
    single-hop question as `#<n>`, where `<n>` is the 1-based position of that question in
    `singleton_questions`. Every such reference is replaced by the whitespace-stripped
    "answer" of the referenced question.

    Args:
        singleton_questions: The single-hop questions of one instance. Each entry is a dict
            with a "question" string and (normally) an "answer" string.

    Returns:
        A new list of deep-copied question dicts, in the same order, whose "question" values
        have their references resolved. The input list and its dicts are not modified.

    Notes:
        References that cannot be resolved (`#0`, an index past the end of the list, or a
        referenced entry without a string "answer") are left in the question text unchanged.
    """
    reference_pattern = re.compile(r"#(\d+)")

    def _resolve(match):
        """Return the referenced answer, or the original `#<n>` text if it cannot be resolved."""
        index = int(match.group(1)) - 1  # references are 1-based
        if not 0 <= index < len(singleton_questions):
            return match.group(0)
        answer = singleton_questions[index].get("answer")
        if not isinstance(answer, str):
            return match.group(0)
        return answer.strip()

    resolved_singleton_questions = []
    for q in singleton_questions:
        resolved_q = deepcopy(q)
        # A single regex pass resolves each reference exactly once. Chained `str.replace`
        # calls would rewrite the "#1" prefix of "#10" and corrupt multi-digit references.
        resolved_q["question"] = reference_pattern.sub(_resolve, q["question"])
        resolved_singleton_questions.append(resolved_q)
    return resolved_singleton_questions

In [19]:
# Load the raw 2-hop MuSiQue records, resolving the path through the shared data directory
# instead of a user-specific absolute path.
# NOTE(review): the variable is named `train_set_*` but the file loaded is the `dev` split - confirm.
train_set_unresolved = io.load_jsonlines(f"{vars.DATA_DIR}/musique_mend/2hop_musique_ans_v1.0_dev.jsonl")

In [4]:
# Inspect the first loaded record to see its fields.
train_set_unresolved[0]

{'id': '2hop__482757_12019',
 'texts': ['The Collegian is the bi-weekly official student publication of Houston Baptist University in Houston, Texas. It was founded in 1963 as a newsletter, and adopted the newspaper format in 1990.',
  "Several private institutions of higher learning—ranging from liberal arts colleges, such as The University of St. Thomas, Houston's only Catholic university, to Rice University, the nationally recognized research university—are located within the city. Rice, with a total enrollment of slightly more than 6,000 students, has a number of distinguished graduate programs and research institutes, such as the James A. Baker Institute for Public Policy. Houston Baptist University, affiliated with the Baptist General Convention of Texas, offers bachelor's and graduate degrees. It was founded in 1960 and is located in the Sharpstown area in Southwest Houston."],
 'multi_hop_efficacy': [{'question': 'When was the institute that owned The Collegian founded?',
   'a

In [5]:
# Summary statistics of the tokenized length of every TriviaQA SFT training text.
# NOTE(review): presumably `vars.GPT_4_TOKENIZER` returns a token sequence - confirm.
trivia_qa_train = io.load_jsonlines(f"{vars.DATA_DIR}/trivia_qa_sft/train.jsonl")
trivia_qa_lengths = [len(vars.GPT_4_TOKENIZER(example["text"])) for example in trivia_qa_train]
describe(trivia_qa_lengths)

DescribeResult(nobs=71999, minmax=(np.int64(7), np.int64(137)), mean=np.float64(21.392713787691495), variance=np.float64(84.0200702471322), skewness=np.float64(1.8015959776193726), kurtosis=np.float64(6.219234086905777))

In [None]:
# Copy every record and swap in a question decomposition whose `#<n>` answer
# references have been replaced with the actual answers.
train_set_reference_resolved = [
    {
        **deepcopy(datum),
        "question_decomposition": resolved_answer_references(datum["question_decomposition"]),
    }
    for datum in train_set_unresolved
]

In [21]:
# Inspect the first record after answer references have been resolved.
train_set_reference_resolved[0]


{'id': '2hop__460946_294723',
 'texts': ['Green is the fourth studio album by British progressive rock musician Steve Hillage. Written in spring 1977 at the same time as his previous album, the funk-inflected "Motivation Radio" (1977), "Green" was originally going to be released as "The Green Album" as a companion to "The Red Album" (the originally intended name for "Motivation Radio"). However, this plan was dropped and after a US tour in late 1977, "Green" was recorded alone, primarily in Dorking, Surrey, and in London.',
  'Miquette Giraudy (born 9 February 1953, Nice, France) is a keyboard player and vocalist, best known for her work in Gong and with her partner Steve Hillage. She and Hillage currently form the core of the ambient band System 7. In addition to her performances in music, she has also worked as an actress, film editor and writer. In each role, she has used different stage names.'],
 'multi_hop_efficacy': [{'question': 'Who is the spouse of the Green performer?',
   '

In [24]:
# Collect every decomposed sub-question written in zsRE "subject >> relation" form,
# giving each a dataset-wide id and attaching its supporting passage.
zsre_question_list = []

for datum in train_set_reference_resolved:
    for decomp_q in datum["question_decomposition"]:
        if ">>" not in decomp_q["question"]:
            continue
        # The separator is expected to always be surrounded by single spaces.
        assert " >> " in decomp_q["question"]
        new_decomp_q = deepcopy(decomp_q)
        new_decomp_q.update(
            id=datum["id"] + "::" + str(decomp_q["id"]),
            text=datum["texts"][decomp_q["supporting_text_id"]],
        )
        zsre_question_list.append(new_decomp_q)

In [9]:
# Show the most recently built zsRE-style sub-question (still bound from the collection loop).
new_decomp_q

{'id': '2hop__92702_731379::731379',
 'question': 'Downtown Cincinnati >> located in the administrative territorial entity',
 'answer': 'Ohio',
 'supporting_text_id': 1,
 'text': 'The Cincinnatian Hotel is a registered historic building in Downtown Cincinnati, Ohio, listed in the National Register on March 3, 1980.'}

In [10]:
# Number of zsRE-style (">>") sub-questions collected.
len(zsre_question_list)

10862

In [11]:
# Inspect the first collected zsRE-style sub-question.
zsre_question_list[0]

{'id': '2hop__482757_12019::482757',
 'question': 'The Collegian >> owned by',
 'answer': 'Houston Baptist University',
 'supporting_text_id': 0,
 'text': 'The Collegian is the bi-weekly official student publication of Houston Baptist University in Houston, Texas. It was founded in 1963 as a newsletter, and adopted the newspaper format in 1990.'}

In [None]:
# NOTE(review): `curator` must be in scope for this cell; the `from bespokelabs import curator`
# line in the import cell is currently commented out.
class zsREQuestioner(curator.LLM):
    """Prompt wrapper that asks an LLM to rewrite a zsRE-style triplet as a natural-language question.

    Each input row supplies "text" (the supporting passage), "question" (the
    "[Subject] >> [Relation]" part) and "answer" (the object); the raw model response is
    stored under "nl_question".
    """

    # Few-shot template; `{text}`, `{question}` and `{answer}` are filled in by `prompt`.
    PROMPT: str = """
You will receive an knowledge triplet of form
"[Subject] >> [Relation] >> [Object]", wrapped in <triplet>...</triplet>

And source text wrapped in <text>...</text>.


Turn the input to be a question about [Object], wrapped in <question>...</question>
Avoid Yes/No question.
Include [Relation] to make the question as clear as possible.
Do not include [Object] in the question.

<text>
Green is the fourth studio album by British progressive rock musician Steve Hillage. Written in spring 1977 at the same time as his previous album, the funk-inflected "Motivation Radio" (1977), "Green" was originally going to be released as "The Green Album" as a companion to "The Red Album" (the originally intended name for "Motivation Radio"). However, this plan was dropped and after a US tour in late 1977, "Green" was recorded alone, primarily in Dorking, Surrey, and in London.
</text>

<triplet>
Green >> performer >> Steve Hillage
</triplet>

<question>
Who is the performer of Green?
</question>

<text>
Empire Sports Network was an American regional sports network that was owned by the Adelphia Communications Corporation. The network was available on cable providers in much of upstate New York (stretching from Buffalo to Albany), as well as parts of northern Pennsylvania and eastern Ohio. The network ceased operations on March 7, 2005, in the midst of Adelphia's financial collapse and bankruptcy.
</text>

<triplet>
Empire Sports Network >> owned by >> Adelphia Communications Corporation
</triplet>

<question>
Who owns Empire Sports Network?
</question>

<text>
{text}
</text>

<triplet>
{question} >> {answer}
</triplet>
"""

    def prompt(self, input: dict) -> str:
        """Fill the few-shot template with this row's passage, zsRE query and answer."""
        template_fields = {key: input[key] for key in ("text", "question", "answer")}
        return self.PROMPT.format(**template_fields)

    def parse(self, input: dict, response: str) -> dict:
        """Return a copy of the input row with the raw model response added as "nl_question"."""
        parsed_row = dict(input)
        parsed_row["nl_question"] = response
        return parsed_row

In [46]:
# Instantiate the question rewriter, naming "gpt-4o-mini" as the model to query.
zsre_questioner = zsREQuestioner(model_name="gpt-4o-mini")

In [47]:
# Inspect the second collected sub-question before it is sent to the model.
zsre_question_list[1]

{'id': '2hop__460946_294723::294723',
 'question': 'Steve Hillage >> spouse',
 'answer': 'Miquette Giraudy',
 'supporting_text_id': 1,
 'text': 'Miquette Giraudy (born 9 February 1953, Nice, France) is a keyboard player and vocalist, best known for her work in Gong and with her partner Steve Hillage. She and Hillage currently form the core of the ambient band System 7. In addition to her performances in music, she has also worked as an actress, film editor and writer. In each role, she has used different stage names.'}

In [48]:
# Build a small `datasets.Dataset` from only the first 10 sub-questions (a trial-sized sample).
zsre_question_df = pd.DataFrame(zsre_question_list[:10])
zsre_question_dataset = Dataset.from_pandas(zsre_question_df)

In [49]:
# Run the question rewriter over the sample; each returned row gains an "nl_question" field (see `parse`).
nl_zsre_question_dataset = zsre_questioner(zsre_question_dataset)

In [50]:
# Inspect the second generated row, including the raw model response in "nl_question".
nl_zsre_question_dataset[1]

{'id': '2hop__460946_294723::294723',
 'question': 'Steve Hillage >> spouse',
 'answer': 'Miquette Giraudy',
 'supporting_text_id': 1,
 'text': 'Miquette Giraudy (born 9 February 1953, Nice, France) is a keyboard player and vocalist, best known for her work in Gong and with her partner Steve Hillage. She and Hillage currently form the core of the ambient band System 7. In addition to her performances in music, she has also worked as an actress, film editor and writer. In each role, she has used different stage names.',
 'nl_question': "<text>\nThe Pacific Ocean is the largest and deepest of Earth's oceanic divisions. It extends from the Arctic Ocean in the north to the Southern Ocean in the south, and is bounded by the continents of Asia and Australia on the west, and the Americas on the east. It covers more than 63 million square miles (165 million square kilometers) and has an average depth of about 12,080 feet (3,682 meters). The Mariana Trench, located in the western Pacific, is th

In [51]:
# Re-display the second source sub-question for comparison with the generated row.
zsre_question_list[1]

{'id': '2hop__460946_294723::294723',
 'question': 'Steve Hillage >> spouse',
 'answer': 'Miquette Giraudy',
 'supporting_text_id': 1,
 'text': 'Miquette Giraudy (born 9 February 1953, Nice, France) is a keyboard player and vocalist, best known for her work in Gong and with her partner Steve Hillage. She and Hillage currently form the core of the ambient band System 7. In addition to her performances in music, she has also worked as an actress, film editor and writer. In each role, she has used different stage names.'}

In [8]:

import re
def tag_content_extractor(tag):
    """Build a function that extracts the contents of every `<tag>...</tag>` span in a string.

    Args:
        tag: Tag name to look for. It is matched literally; regex metacharacters in it
            are escaped.

    Returns:
        A function `content_extractor(text) -> list[str]` that returns the text found
        between each `<tag>` and the next `</tag>`, surrounding whitespace included.
        If an opening tag is never closed, its content runs to the end of `text`.
    """
    escaped_tag = re.escape(tag)
    # `[\s\S]` lets the content span newlines; the `$` alternative tolerates a missing
    # closing tag (for example a truncated model response).
    pattern = re.compile(rf"<{escaped_tag}>([\s\S]*?)(?:</{escaped_tag}>|$)")

    def content_extractor(text):
        return pattern.findall(text)

    return content_extractor

question_content_extractor = tag_content_extractor(tag="question")


In [15]:
# Which MuSiQue split to convert.
split = "dev"
musique_dir = f"{vars.DATA_DIR}/musique_mend"

# Raw records, then copies whose question decompositions have `#<n>` references resolved.
dataset_unresolved = io.load_jsonlines(f"{musique_dir}/2hop_musique_ans_v1.0_{split}.jsonl")
dataset_reference_resolved = [
    {
        **deepcopy(datum),
        "question_decomposition": resolved_answer_references(datum["question_decomposition"]),
    }
    for datum in dataset_unresolved
]

# Previously generated natural-language versions of the zsRE-style sub-questions.
zsre_questions = io.load_jsonlines(f"{musique_dir}/2hop_musique_ans_v1.0_{split}_zsre-questions.jsonl")

In [16]:
def _clean_nl_question(raw_question: str) -> str:
    """Trim whitespace, upper-case the first character and make sure the question ends with "?"."""
    cleaned = raw_question.strip()
    # Upper-case only the first character: `str.capitalize()` would also lower-case the rest
    # of the string and destroy the casing of names such as "Ali & Gipp".
    cleaned = cleaned[:1].upper() + cleaned[1:]
    if not cleaned.endswith("?"):
        cleaned += "?"
    return cleaned


# Index the generated questions by their "<instance id>::<sub-question id>" key.
id2zsre_question = {q["id"]: q for q in zsre_questions}

converted_musique_dataset = []

for datum in dataset_reference_resolved:
    new_datum = deepcopy(datum)
    # The decomposition is re-published under "single_hop_efficacy" below.
    del new_datum["question_decomposition"]
    old_question_decomposition = datum["question_decomposition"]
    new_question_decomposition = []
    for q in old_question_decomposition:
        atom_q_id = datum["id"] + "::" + str(q["id"])
        new_q = deepcopy(q)
        if atom_q_id in id2zsre_question:
            # replace zsre-format question with natural language question
            nl_question = id2zsre_question[atom_q_id]["nl_question"]
            extracted_q = question_content_extractor(nl_question)
            assert len(extracted_q) == 1, f"expected exactly one <question> block for {atom_q_id}: {nl_question!r}"
            # Strip before normalizing so the surrounding newlines of the tagged block
            # affect neither the capitalization nor the trailing "?" check.
            new_q["question"] = _clean_nl_question(extracted_q[0])
        new_question_decomposition.append(new_q)
    new_datum["single_hop_efficacy"] = new_question_decomposition
    converted_musique_dataset.append(new_datum)
        

In [17]:
# Create the output directory for the converted dataset; a no-op if it already exists.
os.makedirs(f"{vars.DATA_DIR}/musique_mend_converted/", exist_ok=True)

In [18]:
# Write the converted records for the current `split` as JSON lines.
io.dump_jsonlines(converted_musique_dataset, f"{vars.DATA_DIR}/musique_mend_converted/2hop_musique_ans_v1.0_{split}.jsonl")

In [69]:
# Scratch check of a hand-written extraction regex against whatever record `q` is currently bound to.
# NOTE(review): the closing tag is spelled `</question/>` and `.` does not match newlines, so this
# pattern does not match a response laid out as "<question>\n...\n</question>".
re.findall(r"<question>(.*?)(?:</question/>|$)", q["nl_question"])

[]

In [81]:
# Print a passage so that its `\u2013` escapes are rendered as the actual characters.
print("Starting from this edition, the UEFA Europa League winners automatically qualify for the subsequent UEFA Champions League season even if they do not qualify for the Champions League through their domestic performance. Therefore, the winners of this tournament qualify for the 2015\u201316 UEFA Champions League. They are guaranteed to enter at least the play-off round, and since the group stage berth reserved for the Champions League title holders will not be used (the winners of the 2014\u201315 UEFA Champions League are guaranteed to qualify for the group stage through domestic performance), they will be elevated to enter the group stage via this berth.")

Starting from this edition, the UEFA Europa League winners automatically qualify for the subsequent UEFA Champions League season even if they do not qualify for the Champions League through their domestic performance. Therefore, the winners of this tournament qualify for the 2015–16 UEFA Champions League. They are guaranteed to enter at least the play-off round, and since the group stage berth reserved for the Champions League title holders will not be used (the winners of the 2014–15 UEFA Champions League are guaranteed to qualify for the group stage through domestic performance), they will be elevated to enter the group stage via this berth.


In [60]:
# Run the tag extractor on the "nl_question" of whatever record `q` is currently bound to.
question_content_extractor(q["nl_question"])

[]

In [59]:
# Access "nl_question" on every generated record; the value is discarded, so this only
# surfaces records where the lookup fails. Afterwards `q` stays bound to the last record.
for q in zsre_questions:
    q["nl_question"]

In [80]:
# Inspect one generated record (index 41), including its raw "nl_question" response.
zsre_questions[41]

{'id': '2hop__569582_304416::304416',
 'question': 'Ali & Gipp >> record label',
 'answer': 'Derrty Entertainment',
 'supporting_text_id': 1,
 'text': 'Kinfolk is the debut and only album by American rap duo Ali & Gipp, released on August 14, 2007, through Derrty Entertainment and Universal Records. The first single off the album was already released, called "Go \'Head" featuring Chocolate Tai. The second single is "N da Paint" featuring Nelly. The third single is "Work Dat, Twerk Dat" featuring Murphy Lee. The fourth and final single is "Almost Made Ya" featuring LeToya Luckett.',
 'nl_question': '<question>\nWhat record label is associated with Ali & Gipp?\n</question>'}

In [121]:
# Sanity check on the converted train split: each instance id must equal "2hop__" followed by
# the ids of its single-hop questions joined with "_", in order.
data = io.load_jsonlines(f"{vars.DATA_DIR}/musique_mend_converted/2hop_musique_ans_v1.0_train.jsonl")
for datum in tqdm(data):
    atom_ids = [str(atom_q["id"]) for atom_q in datum["single_hop_efficacy"]]
    expected_id = "2hop__" + "_".join(atom_ids)
    assert datum["id"] == expected_id, datum

100%|██████████| 14376/14376 [00:00<00:00, 1331507.44it/s]
