In [1]:
from typing import List, Tuple, Any, Dict, Callable, Iterable
import pandas as pd
import json
import datasets
import itertools
from collections import defaultdict, Counter

from utils import *

In [2]:
# Load both datasets up front: later cells reference `qanom` as well as `qasrl`,
# so both names must exist after a fresh "Restart Kernel -> Run All".
# qasrl = datasets.load_dataset("biu-nlp/qa_srl2018", "v2")
qanom = datasets.load_dataset("biu-nlp/qanom")
qasrl = datasets.load_dataset("kleinay/qa_srl")

No config specified, defaulting to: qa_srl/plain_text
Reusing dataset qa_srl (/home/nlp/kleinay/.cache/huggingface/datasets/kleinay___qa_srl/plain_text/1.0.0/9aaf099b628da9c576ebbc49bd242c93d0e6cc79ffdb2e0e1d3daf409f696820)


  0%|          | 0/3 [00:00<?, ?it/s]

### Prepare qasrl_slots for DFA

In [52]:
# Pool the train + validation questions of QANom and QA-SRL (in that order).
# To build the slots from one dataset only, keep just its two splits here.
data = datasets.concatenate_datasets(
    [qanom[split_name] for split_name in ("train", "validation")]
    + [qasrl[split_name] for split_name in ("train", "validation")]
)

# Keep non-empty questions only; each question is a sequence of slot values.
questions = list(filter(None, data['question']))
# Transpose questions into per-slot columns, then reduce every column to its
# sorted set of distinct values.
slots = [sorted(set(column)) for column in zip(*questions)]
# Name the first eight columns after the slots they hold.
slots_dict = {
    slot_name: slots[position]
    for position, slot_name in enumerate(
        ("wh", "aux", "subj", "verb", "obj", "prep", "obj2", "?")
    )
}

# slots_dict.pop("verb")
import json
# json.dump(slots_dict, open("seq2seq_constrained_decoding/qasrl_slots.json", "w"))


Verify that there are no conflicts between adjacent slots, since such conflicts would make the qasrl_question_dfa non-deterministic

In [3]:
import itertools
def is_conflicting(slot1, slot2):
    """Tell whether two adjacent slot values share their boundary word.

    Both values are treated as single-space-delimited word sequences; the
    result is True when the last word of `slot1` equals the first word of
    `slot2` (e.g. "to do" followed by "do").
    """
    trailing_word = slot1.rsplit(" ", 1)[-1]
    leading_word = slot2.split(" ", 1)[0]
    return trailing_word == leading_word

def find_conflicts(preps, obj2):
    """List every (left, right) pair from `preps` x `obj2` that conflicts.

    Pairs are visited in `itertools.product` order and kept when
    `is_conflicting` reports a shared boundary word.
    """
    conflicts = []
    for left_value, right_value in itertools.product(preps, obj2):
        if is_conflicting(left_value, right_value):
            conflicts.append((left_value, right_value))
    return conflicts
    
slot_names = list(slots_dict)
# Report the conflicts of every pair of consecutive slots, in slot order.
for position in range(len(slot_names) - 1):
    sl1, sl2 = slot_names[position], slot_names[position + 1]
    conflicts = find_conflicts(slots_dict[sl1], slots_dict[sl2])
    print(f"Conflicts for {sl1} vs. {sl2}: {conflicts}")

Conflicts for wh vs. aux: []
Conflicts for aux vs. subj: [('_', '_')]
Conflicts for subj vs. verb: []
Conflicts for verb vs. obj: []
Conflicts for obj vs. prep: [('_', '_')]
Conflicts for prep vs. obj2: [('_', '_'), ('as doing', 'doing'), ('by doing', 'doing'), ('in doing', 'doing'), ('of doing', 'doing'), ('on doing', 'doing'), ('to do', 'do'), ('to doing', 'doing'), ('with doing', 'doing')]
Conflicts for obj2 vs. ?: []


Remove conflicts between prep and obj2 by stripping obj2 words from preps, then save and overwrite qasrl_slots

In [56]:
# Resolve conflicts in prep--obj2 slots
# A prep such as "to do" ends with a whole obj2 value ("do"); strip that
# trailing word so the prep no longer overlaps with the obj2 slot.

for i, prep in enumerate(slots_dict["prep"]):
    for obj in slots_dict["obj2"]:
        if prep.endswith(f" {obj}"):
            # Cut the obj2 word plus the single space that precedes it.
            new_prep = prep[:-(len(obj)+1)]
            slots_dict["prep"][i] = new_prep
            break  # strip at most one obj2 word per prep
# Stripping can create duplicates (e.g. "to do" and "to doing" both become
# "to"), so de-duplicate and restore the sorted order.
slots_dict["prep"] = sorted(set(slots_dict["prep"]))
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, instead of relying on garbage collection of the handle.
with open("seq2seq_constrained_decoding/qasrl_slots.json", "w") as fout:
    json.dump(slots_dict, fout)


In [57]:
# Sanity check: displays whether the empty string is among the prep values
# (the recorded run shows False).
'' in slots_dict["prep"]

False

Inspect 

In [58]:
# Collect the questions whose verb slot (index 3) contains "~!~"
# (presumably a separator marker, judging by the variable name -- confirm).
q_with_sep = list(filter(lambda q: q and "~!~" in q[3], questions))
q_with_sep


[]

In [60]:
def countin(l, p):
    """Count the items of `l` that contain `p` (using the `in` operator)."""
    return sum(1 for item in l if p in item)

# Per slot, count how many values contain the "~!~" marker.
sep_counts = {}
for slot_name, slot_values in slots_dict.items():
    sep_counts[slot_name] = countin(slot_values, "~!~")
print(sep_counts)

# NOTE(review): `v_vs` and `n_vs` are assigned in a cell that appears further
# down in this notebook, so this cell fails on a fresh top-to-bottom run.
# For multi-word verb-slot values, collect everything before the last word.
verb_prefixes = {value.rsplit(" ", 1)[0] for value in v_vs if " " in value}
print(verb_prefixes)
verb_prefixes = {value.rsplit(" ", 1)[0] for value in n_vs if " " in value}
print(verb_prefixes)

{'wh': 0, 'aux': 0, 'subj': 0, 'verb': 0, 'obj': 0, 'prep': 0, 'obj2': 0, '?': 0}
set()
{'have been', 'have', 'not have been', 'not', 'be', 'not have', 'been', 'not be', 'being'}


In [54]:
v_qs = qasrl['train']["question"]
n_qs = qanom['train']["question"]

# Verb slot (index 3) and prep slot (index 5) of every question.
# Only the QANom questions are filtered for emptiness.
v_vs = [question[3] for question in v_qs]
n_vs = [question[3] for question in n_qs if question]
v_preps = [question[5] for question in v_qs]
n_preps = [question[5] for question in n_qs if question]
# Distribution of the number of space-separated words in the verb slot.
for verb_values in (v_vs, n_vs):
    print(Counter(len(verb.split(" ")) for verb in verb_values))

#verb prefixes
# print(set([t.split(" ")[0] for t in v_vs if len(t.split(" "))>1]))
print({tuple(verb.split(" ")[:-1]) for verb in n_vs if len(verb.split(" ")) > 1})

Counter({1: 215432})
Counter({1: 13592, 2: 2239, 3: 63, 4: 1})
{('not', 'have'), ('been',), ('be',), ('not', 'have', 'been'), ('have',), ('being',), ('have', 'been'), ('not', 'be'), ('not',)}


In [62]:
# Aux slot (index 1) and obj2 slot (index 6) of every question; only the
# QANom questions are filtered for emptiness.
v_auxs = [question[1] for question in v_qs]
n_auxs = [question[1] for question in n_qs if question]
v_obj2s = [question[6] for question in v_qs]
n_obj2s = [question[6] for question in n_qs if question]

# Show the distinct values of each list, in the order defined above.
for slot_values in (v_auxs, n_auxs, v_obj2s, n_obj2s):
    print(set(slot_values))

{"can't", 'has', "didn't", "wasn't", 'did', 'was', 'can', "hadn't", 'would', 'will', "won't", "hasn't", 'does', 'had', 'is', "shouldn't", "isn't", 'should', "wouldn't", 'might', '_', "doesn't"}
{"can't", 'has', "didn't", "wasn't", 'did', 'was', 'can', 'would', 'will', "won't", "hasn't", 'does', 'had', 'is', "shouldn't", "isn't", 'should', "wouldn't", 'might', '_', "doesn't"}
{'something', 'somewhere', 'doing', '_', 'do', 'someone'}
{'something', 'somewhere', 'doing', '_', 'do', 'someone'}


## Contextualizing QASRL datasets

In [4]:
# Code copied from the implementation in `run_parsing_model`

from roleqgen.question_translation import QuestionTranslator
# Instantiate the pretrained "biu-nlp/contextualizer_qasrl" model once; the
# `contextualize` function below reads this module-level object.
# NOTE(review): `device_id=0` presumably selects the first GPU -- confirm
# against QuestionTranslator before running on a CPU-only machine.
contextualizer = QuestionTranslator.from_pretrained("biu-nlp/contextualizer_qasrl", device_id=0)

# Build the contextualizer's input record for one QA example
def as_input_for_contextualizer(qa):
    """Convert a dataset row into the dict passed to the contextualizer.

    The slot-wise question is joined with spaces, " _" placeholders are
    removed and the space before the final "?" is dropped. The predicate span
    is the single-token range "<idx>:<idx + 1>".
    """
    spaced_question = ' '.join(qa['question'])
    proto_question = spaced_question.replace(' _', '').replace(' ?', '?')
    span_start = qa['predicate_idx']
    return dict(
        proto_question=proto_question,
        predicate_lemma=qa['verb_form'],
        predicate_span=f"{span_start}:{span_start + 1}",
        text=qa['sentence'],
    )

# Take contextualized slots from contextualized-question (co_q)
def to_filled_slots(orig_slots, co_q: str) -> List[str]:
    """Project a contextualized question back onto the QA-SRL slot list.

    Args:
        orig_slots: the original question as 8 slot values
            (wh, aux, subj, verb, obj, prep, obj2, "?"); may be empty.
        co_q: the contextualized question string.

    Returns:
        `orig_slots` unchanged when it is empty or when wh / verb / aux cannot
        be located in `co_q`; otherwise a new 8-item list in which subj, obj
        and obj2 are replaced by the text found in `co_q` wherever it can be
        identified. A slot whose extracted text is empty, or whose boundary
        cannot be located, keeps its original value.
    """
    # context can be at slots SUBJ (2), OBJ (4), OBJ2 (6); take from contextualized question
    if not orig_slots:
        return orig_slots
    wh, aux, subj, verb, obj, prep, obj2, _ = orig_slots
    # The appended space lets the "<verb> " lookup match a question-final verb.
    co_q = without_suffix(co_q, '?') + ' '
    # NOTE(review): `wh` is matched case-sensitively here but stripped below as
    # `wh.title()`; confirm the casing of the slots and of the contextualizer output.
    if wh not in co_q or f"{verb} " not in co_q or (aux != "_" and aux not in co_q):
        return orig_slots

    pre_v, post_v = co_q.split(f"{verb} ", 1)
    # subj is the part before verb after prefix
    new_subj = without_prefix(pre_v, wh.title()).lstrip()
    new_subj = without_prefix(new_subj, aux).strip()
    # An empty extraction must not replace the original slot value (e.g. "_")
    # with an empty string.
    subj = new_subj or subj
    # if prep is not copied within co_q, cannot identify objects
    if prep != "_" and prep not in co_q:
        return [wh, aux, subj, verb, obj, prep, obj2, '?']
    # if at most one object is present, can easily know which is it
    if obj != "_" and obj2 == "_":
        obj = without_suffix(post_v.rstrip(), prep).strip() or obj
    elif obj == "_" and obj2 != "_":
        obj2 = without_prefix(post_v.rstrip(), prep).strip() or obj2
    # if both objects are present, prep (in between) should be non empty
    elif obj != "_" and obj2 != "_":
        around_prep = post_v.split(f" {prep} ", 1)
        # Replace the objects only when prep really separates two parts;
        # otherwise keep the originals instead of failing on unpacking.
        if len(around_prep) == 2:
            # Strip to drop the trailing space appended to co_q above.
            obj = around_prep[0].strip() or obj
            obj2 = around_prep[1].strip() or obj2
    return [wh, aux, subj, verb, obj, prep, obj2, '?']
    
def contextualize(orig_dataset):
    """Return a copy of `orig_dataset` whose questions are contextualized.

    Every row is converted with `as_input_for_contextualizer`, the module-level
    `contextualizer` predicts one contextualized question per row, and each
    prediction is projected back onto the row's question slots with
    `to_filled_slots`.

    Assumes `contextualizer.predict` returns its predictions in input order,
    one per input record -- TODO confirm.
    """
    # orig_dataset = raw_datasets[split]
    # Prepare contextualizer inputs from datatset
    inputs_for_contextualizer = orig_dataset.map(as_input_for_contextualizer, remove_columns=[
        'sentence', 'sent_id', 'predicate_idx', 'predicate', 'is_verbal', 'verb_form', 'question', 'answers', 'answer_ranges'])
    inputs_for_contextualizer = inputs_for_contextualizer.to_pandas().to_dict('records')
    # Run contextualizer
    contextualized_questions = contextualizer.predict(inputs_for_contextualizer)

    # Modify questions in dataset
    def contextualize_dataset(batch, indices):
        # With batched=True and with_indices=True, `map` passes a dict of
        # column lists and a list of row indices, so handle the rows one by one.
        batch['question'] = [
            to_filled_slots(question_slots, contextualized_questions[row_idx])
            for question_slots, row_idx in zip(batch['question'], indices)
        ]
        return batch

    ret = orig_dataset.map(
        contextualize_dataset,
        with_indices=True,
        batched=True,
        load_from_cache_file=False,
        desc="contextualizing questions of the dataset"
    )
    return ret


2022-05-01 16:44:15,304 - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): huggingface.co:443
2022-05-01 16:44:16,062 - DEBUG - urllib3.connectionpool - https://huggingface.co:443 "HEAD /biu-nlp/contextualizer_qasrl/resolve/main/vocab.json HTTP/1.1" 200 0
2022-05-01 16:44:16,072 - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): huggingface.co:443
2022-05-01 16:44:16,736 - DEBUG - urllib3.connectionpool - https://huggingface.co:443 "HEAD /biu-nlp/contextualizer_qasrl/resolve/main/merges.txt HTTP/1.1" 200 0
2022-05-01 16:44:16,745 - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): huggingface.co:443
2022-05-01 16:44:17,428 - DEBUG - urllib3.connectionpool - https://huggingface.co:443 "HEAD /biu-nlp/contextualizer_qasrl/resolve/main/added_tokens.json HTTP/1.1" 404 0
2022-05-01 16:44:17,437 - DEBUG - urllib3.connectionpool - Starting new HTTPS connection (1): huggingface.co:443
2022-05-01 16:44:18,122 - DEBUG - urllib3.connectionpoo

In [None]:
# Contextualize every split of the QANom dataset and save each one as CSV

qanom_dataset = datasets.load_dataset("biu-nlp/qanom")
for split, orig_dataset in qanom_dataset.items():
    new_dataset = contextualize(orig_dataset)
    new_dataset.to_csv(f"qanom_{split}_contextualized.csv")

# Evaluate nrl-parser

### 1. Prepare input for nrl-parser (jsonl)

In [7]:
# Distinct sentences of the QA-SRL test split.
test_sentences = set(qasrl['test']['sentence'])
# Sort before writing: the iteration order of a set of strings can change from
# one interpreter run to the next (hash randomization), which would make the
# line order of the generated file non-reproducible.
dicts = [{"sentence": s} for s in sorted(test_sentences)]
# One JSON object per line (jsonl).
with open("qasrl_gs.test.jsonl", "w") as fout:
    for dic in dicts:
        fout.write(json.dumps(dic) + "\n")

### 2. Translate parser output (jsonl) into csv
This includes decoding the question to 7-slots.
We will try to use our DFA code for that.

*Conclusion*: That's not possible, as the nrl parser leaves no trace of empty slots (`_`), so the DFA cannot parse its output.


In [13]:
from pipeline import get_markers_for_model
# NOTE(review): `extract_is_negated` and `SLOT_TO_STATE` are imported but not used in this cell.
from dfa_fill_qasrl_slots import dfa_fill_qasrl_slots, extract_is_negated, SLOT_TO_STATE

# NOTE(review): the meaning of the positional `True` is not visible here -- confirm in `pipeline`.
special_tokens = get_markers_for_model(True)
from constrained_decoding.qasrl import get_qasrl_question_dfa
# Build the question DFA with `constrain_verb=False`.
question_dfa = get_qasrl_question_dfa(constrain_verb=False)
from strings_to_objects_parser import StringsToObjectsParser
# This parser instance is only referenced by the commented-out call below.
str2objParser = StringsToObjectsParser(special_tokens, None)
# str2objParser._get_question_slots("Why was someone cited?")
# Fill the slots of a question string that spells out its empty slots as "_";
# the cell displays the returned slot mapping.
dfa_fill_qasrl_slots("why was someone cited _ _ _ ?", question_dfa)

{'wh': 'why',
 'aux': 'was',
 'subj': 'someone',
 'verb': 'cited',
 'obj': '_',
 'prep': '_',
 'obj2': '_'}