In [1]:
from glob import glob

import pandas as pd
from experiments.musique.inference_only import macro_averaging
from knowledge_propagation.utils import io, vars, extractor
import os
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import describe
from thefuzz import fuzz

from datasets import load_dataset, load_from_disk

from copy import deepcopy
from collections import defaultdict
from dateutil.parser import parse
from dateutil.parser import ParserError

import re

def is_date(string):
    """Return whether ``dateutil`` can parse ``string`` as a date/time.

    Args:
        string: Text to test; passed unchanged to ``dateutil.parser.parse``.

    Returns:
        bool: True when ``parse`` succeeds, False when it raises
        ``ParserError`` or ``OverflowError``.
    """
    try:
        parse(string)
    except (ParserError, OverflowError):
        # ParserError: the text is not a recognisable date.
        # OverflowError: dateutil documents this for parsed values that exceed
        # the platform's C integer range; such text is not a usable date either.
        return False
    return True



In [105]:

# Scan the "ucinlp/drop" dataset and bucket examples by a few properties of
# their gold answers, for manual inspection in later cells.
ds = load_dataset("ucinlp/drop")


drop_date_instances = []

no_verbatim_answer = []    # no gold span occurs verbatim in the passage
different_span_types = []  # gold spans do not all share the same type
set_of_spans = []          # questions starting with "what was" / "what were"
duplicate_passage = []     # passage text already seen under a different section_id
passage = set()
count = 0
unique_section = set()
passage2section_id = defaultdict(list)
type2count = defaultdict(int)


# for split in ["train", "validation"]:
for split in ["validation"]:
    # Walk the rows directly instead of indexing `ds[split][i]` one at a time.
    for datum in ds[split]:
        # A passage is a duplicate only when its text was already seen AND this
        # section_id is new; later rows of the same section are not counted.
        if datum["passage"] in passage and datum["section_id"] not in unique_section:
            duplicate_passage.append(datum)

        unique_section.add(datum["section_id"])
        passage.add(datum["passage"])

        answer_spans = datum["answers_spans"]["spans"]
        answer_types = datum["answers_spans"]["types"]

        q_str = datum["question"].lower()
        if q_str.startswith(("what was", "what were")):
            # count += 1
            set_of_spans.append(datum)
        # Compare every type against the first one; an empty list yields no
        # comparisons and is therefore not flagged.
        if any(t != answer_types[0] for t in answer_types):
            different_span_types.append(datum)
        if not any(s in datum["passage"] for s in answer_spans):
            no_verbatim_answer.append(datum)
        for t in answer_types:
            type2count[t] += 1

In [8]:
# Load the raw DROP dev JSON from local disk via the project `io` helper.
# The conversion cell iterates this with `.items()`, using each key as the
# section id and reading `passage`, `qa_pairs` and `wiki_url` from each value.
drop_dev = io.load_json("/u/zliu/datastor1/KE-by-CP/data/drop_dataset/drop_dataset_dev.json")

In [9]:
# drop_dev["nfl_1184"]["qa_pairs"]
def _flatten_drop_answer(answer):
    """Collapse one raw DROP answer dict into a single answer value.

    Precedence follows the order the fields are checked in: a non-empty
    ``number`` wins, then non-empty ``spans`` (joined with ", "), otherwise
    the non-empty components of ``date`` joined with spaces.

    Args:
        answer: Dict with ``number``, ``spans`` and ``date`` entries.
            Presumably ``number`` is a string and ``date`` maps component
            names to strings -- TODO confirm against the raw file.

    Returns:
        The ``number`` value unchanged, or a joined string built from the
        spans or from the date component values.
    """
    if len(answer["number"]) > 0:
        return answer["number"]
    if len(answer["spans"]) > 0:
        # assert len(answer["spans"]) == 1, answer["spans"]
        return ", ".join(answer["spans"])
    # Join the date *values*; joining the keys would turn every date answer
    # into the component names instead of the actual date.
    return " ".join(value for value in answer["date"].values() if len(value) > 0)


# Convert every raw DROP section into a flat record:
#   {"passage", "section_id", "qa_pairs": [{"question", "answer", "query_id"}], "wiki_url"}
converted_drop_dev = []
for section_id, section in drop_dev.items():
    new_qa_pairs = []
    for qa in section["qa_pairs"]:
        new_qa = {
            "question": qa["question"],
            "answer": _flatten_drop_answer(qa["answer"]),
        }
        new_qa["query_id"] = qa["query_id"]

        new_qa_pairs.append(new_qa)
    new_v = {
        "passage": section["passage"],
        "section_id": section_id,
        "qa_pairs": new_qa_pairs,
        "wiki_url": section["wiki_url"]
    }
    converted_drop_dev.append(new_v)

In [10]:
# Carve the first 100 converted sections off as the test split; everything
# after them stays as the dev split.
# NOTE: `converted_drop_dev` is rebound to its own tail, so running this cell
# a second time strips another 100 sections.
converted_drop_test, converted_drop_dev = converted_drop_dev[:100], converted_drop_dev[100:]

In [None]:
# io.dump_jsonlines(converted_drop_dev, "/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_dev.jsonl")
# io.dump_jsonlines(converted_drop_test, "/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_test.jsonl")

In [48]:
# Reload the converted train/dev files, drop over-long passages and QA pairs
# with empty answers, then re-split the surviving examples into train/dev.
train_set = io.load_jsonlines("/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_train.jsonl")
dev_set = io.load_jsonlines("/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_dev.jsonl")
total_set = train_set + dev_set

filtered_total_set = []
for example in total_set:
    # Skip passages whose tokenizer output is longer than 1024 items.
    if len(vars.GPT_4_TOKENIZER(example["passage"])) > 1024:
        continue

    # Keep only QA pairs with a non-empty answer, reduced to question/answer.
    kept_qa_pairs = [
        {
            "question": qa["question"],
            "answer": qa["answer"]
        }
        for qa in example["qa_pairs"]
        if len(qa["answer"]) > 0
    ]
    assert len(kept_qa_pairs) > 0, "example has no QA pair with a non-empty answer"

    new_example = deepcopy(example)
    new_example["qa_pairs"] = kept_qa_pairs
    filtered_total_set.append(new_example)


# # resplit train and dev
n_dev = 100
assert len(filtered_total_set) > n_dev, "not enough examples left to hold out a dev set"

# Split the *filtered* examples; splitting `total_set` would write the
# over-long passages and empty answers straight back to disk.
resplit_pool = list(filtered_total_set)
# Fixed seed so that re-running the cell reproduces the same shuffle order.
np.random.default_rng(0).shuffle(resplit_pool)
new_train_set = resplit_pool[n_dev:]
new_dev_set = resplit_pool[:n_dev]

# NOTE: these are the same two files loaded at the top of the cell, so the
# inputs are overwritten in place.
io.dump_jsonlines(new_train_set, "/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_train.jsonl")
io.dump_jsonlines(new_dev_set, "/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_dev.jsonl")

In [47]:
# Number of examples that passed the passage-length / non-empty-answer filter.
# NOTE(review): this cell's execution count (47) is lower than that of the cell
# building `filtered_total_set` (48), so the recorded output may be stale.
len(filtered_total_set)

5993

In [49]:
# NOTE(review): despite the variable name, this loads the converted *test*
# file and rebinds `train_set`, replacing the train data loaded earlier.
# The statistics cells that follow therefore describe the test split.
train_set = io.load_jsonlines("/u/zliu/datastor1/KE-by-CP/data/drop_dataset_converted/drop_dataset_test.jsonl")

In [50]:
# Collect the tokenized length of every passage in `train_set`, and tally how
# many QA pairs carry an empty answer.
passage_len = []
count = 0

for example in tqdm(train_set):
    n_tokens = len(vars.GPT_4_TOKENIZER(example["passage"]))
    passage_len.append(n_tokens)
    # Each empty answer contributes exactly one to the running total.
    for qa in example["qa_pairs"]:
        if len(qa["answer"]) == 0:
            count += 1
    

100%|██████████| 100/100 [00:00<00:00, 9099.66it/s]


In [51]:
# Summary statistics (scipy.stats.describe) over the per-passage token counts.
describe(passage_len)

DescribeResult(nobs=100, minmax=(np.int64(116), np.int64(698)), mean=np.float64(264.15), variance=np.float64(9337.361111111113), skewness=np.float64(1.5776353190469845), kurtosis=np.float64(4.297482113075264))

In [45]:
# How many passages are longer than 1000 tokens.
# NOTE(review): the recorded output (57) comes from an earlier execution
# (In [45]) and disagrees with the describe() output above (100 passages,
# max 698); re-run this cell to refresh it.
len([x for x in passage_len if x > 1000])

57

In [67]:
# Load a MultiRC v2 dev file (judging by the path); `dev` is not referenced by
# any other cell visible here.
dev = io.load_json("/u/zliu/datastor1/KE-by-CP/data/multirc-v2/dev_83-fixedIds.json")

In [73]:
# Web-snippet records for the ComplexWebQuestions dev split. Later cells read
# `question_ID`, `web_query` and `web_snippets` from each record.
comp_webq_snippet_dev = io.load_json("/u/zliu/datastor1/KE-by-CP/data/complexwebquestions_V1_1/web_snippets_dev.json")

In [74]:
# ComplexWebQuestions dev questions; indexed by position and read via the
# `ID` field in later cells.
comp_webq = io.load_json("/u/zliu/datastor1/KE-by-CP/data/complexwebquestions_V1_1/ComplexWebQuestions_dev.json")

In [98]:
# Peek at one question record (index 1) to see the available fields.
comp_webq[1]

{'ID': 'WebQTrn-2505_eba9c2e29a1198e8b56c2f7e4210638b',
 'answers': [{'aliases': [],
   'answer': 'Vanderbilt University Mr. Commodore',
   'answer_id': 'm.05k6hxh'}],
 'composition_answer': 'vanderbilt university',
 'compositionality_type': 'composition',
 'created': '2018-02-13T00:03:35',
 'machine_question': 'what is the organiztion leadership has a person named Nicholas S. Zeppos mascot',
 'question': 'What is the mascot of the team that has Nicholas S. Zeppos as its leader?',
 'sparql': "PREFIX ns: <http://rdf.freebase.com/ns/>\nSELECT DISTINCT ?x\nWHERE {\nFILTER (?x != ?c)\nFILTER (!isLiteral(?x) OR lang(?x) = '' OR langMatches(lang(?x), 'en'))\n?c ns:organization.organization.leadership ?k .\n?k ns:organization.leadership.person ns:m.02vymvp . \n?c ns:education.educational_institution.mascot ?x .\n}\n",
 'webqsp_ID': 'WebQTrn-2505',
 'webqsp_question': 'what is vanderbilt university mascot'}

In [99]:
# Gather every snippet record belonging to the question inspected above
# (matched on its ID), then show how many were found.
target_question_id = comp_webq[1]["ID"]
question_snippets = [
    snippet_record
    for snippet_record in comp_webq_snippet_dev
    if snippet_record['question_ID'] == target_question_id
]
len(question_snippets)

3

In [100]:
# Search query stored on the first matching snippet record.
question_snippets[0]["web_query"]

'What is the mascot of the team that has Nicholas S Zeppos as its leader'

In [101]:
# First retrieved web snippet for that query.
question_snippets[0]["web_snippets"][0]

{'snippet': 'Vanderbilt University (informally Vandy) is a private research university in Nashville, Tennessee. Founded in 1873, it was named in honor of shipping and rail magnate Cornelius Vanderbilt, who provided the school its initial $1 million endowment despite having never been to the South. Vanderbilt hoped that his gift and the\xa0...',
 'title': 'Vanderbilt University - Wikipedia'}

In [103]:
# Search query stored on the second matching snippet record.
question_snippets[1]["web_query"]

'What is the mascot of vanderbilt university'

In [104]:
# Search query stored on the third matching snippet record.
question_snippets[2]["web_query"]

'the team that has Nicholas S Zeppos as its leader'