# Converts the parser output to a format that the state machine Docker container can read (for evaluation purposes)

In [11]:
import json

# Directory holding the parser output and every file this notebook reads or
# writes; the relative path is resolved against the kernel's working directory.
data_dir = "../data"

# The parser output is read as JSON Lines: one JSON document per line.
# Iterating the file handle yields each line in turn. Blank lines (for example
# a trailing empty line at the end of the file) are skipped instead of being
# passed to json.loads, which would raise JSONDecodeError on them.
with open(f"{data_dir}/nrl_parser_output_qasrl_gs_test.jsonl") as f:
    items = [json.loads(line) for line in f if line.strip()]

len(items)

999

In [15]:
from typing import Tuple
import pandas as pd
from dataclasses import dataclass
from dataclasses_json import dataclass_json

@dataclass_json
@dataclass
class DataRow:
    """One question with its answers for a single verb, i.e. one row of the CSV input file.

    Instances are created positionally in the order the fields are declared,
    so the field order below must not change. ``dataclass_json`` is applied so
    that rows can be turned into plain dicts via ``to_dict()`` when the
    DataFrame is built.
    """
    qasrl_id: str  # "<item index>_<verb index>_<qa index>", built while flattening the parser output
    verb_idx: int  # value of the verb entry's 'index' field, cast to int
    verb: str  # value of the verb entry's 'verb' field
    question: str  # question text of the QA pair
    answer: str  # answer span texts, joined with ANSWER_SEPARATOR when there are several
    answer_range: str  # "start:end" per span, e.g., 12:24; joined with ANSWER_SEPARATOR when there are several
    sentence: str  # the item's words joined with single spaces
    
# Delimiter placed between the individual answers (and between their ranges)
# when a question has more than one answer span.
ANSWER_SEPARATOR = "~!~"

def convert_answer_span_to_csv_format(answer_span: dict) -> dict:
    """Convert one parser answer span into the two CSV fields derived from it.

    Args:
        answer_span: Mapping with at least the keys ``'text'``, ``'start'``
            and ``'end'``; any other keys are ignored.

    Returns:
        A dict with two entries: ``"answer"`` holds the span's text unchanged,
        and ``"answer_range"`` holds ``"<start>:<end>"``.

    Raises:
        KeyError: If ``'text'``, ``'start'`` or ``'end'`` is missing.
    """
    # The return annotation used to claim a tuple, but a dict is (and always
    # was) returned; callers access the result by key.
    return {
        "answer": answer_span['text'],
        "answer_range": f"{answer_span['start']}:{answer_span['end']}"
    }


# Flatten the nested parser output (sentence -> verbs -> QA pairs) into one
# DataRow per QA pair.
data = []
for item_i, item in enumerate(items):
    sentence = " ".join(item['words'])
    for verb_i, verb_item in enumerate(item['verbs']):
        verb = verb_item['verb']
        verb_idx = int(verb_item['index'])
        for qa_i, qa_pair in enumerate(verb_item['qa_pairs']):
            question = qa_pair['question']
            spans = qa_pair['spans']
            # Plain lists are enough to join the per-span fields. Building a
            # DataFrame for every QA pair was needless work and raised KeyError
            # when a question had no spans (an empty frame has no 'answer'
            # column). With no spans, both joined fields are empty strings.
            csv_spans = [convert_answer_span_to_csv_format(span) for span in spans]
            answer = ANSWER_SEPARATOR.join(csv_span['answer'] for csv_span in csv_spans)
            answer_range = ANSWER_SEPARATOR.join(csv_span['answer_range'] for csv_span in csv_spans)

            # Position of this QA pair in the parser output: item, verb, question.
            qasrl_id = f"{item_i}_{verb_i}_{qa_i}"
            data.append(DataRow(qasrl_id, verb_idx, verb, question, answer, answer_range, sentence))
            


            
                

In [17]:
# Turn every DataRow into a plain dict, collect them into one table and write
# that table (without the index column) as the state machine's input file.
input_records = [data_row.to_dict() for data_row in data]
input_df = pd.DataFrame(input_records)
input_df.to_csv(f"{data_dir}/input_file.csv", index=False)

### Run state machine

In [None]:
# Mount the directory the notebook itself uses (data_dir) as /data in the container.
# The previous mount, "$(pwd)/data/", pointed at a different directory than the one
# input_file.csv is written to and output_file.csv is read back from.
# IPython substitutes {data_dir} before the line is handed to the shell.
!docker run -it -v "$(cd "{data_dir}" && pwd):/data" --rm --name qasrl hirscheran/qasrl_state_machine_example "file" "/data/input_file.csv" "/data/output_file.csv"

### Add sentence to output_file

In [22]:
import pandas as pd

# Load the state machine's results; this file is expected to have been produced
# by the docker run above (it is passed there as the output path).
output_df = pd.read_csv(f"{data_dir}/output_file.csv")


def take_sentence_from_input_df(qasrl_id: str) -> str:
    """Return the sentence stored in ``input_df`` for the given ``qasrl_id``.

    Reads the module-level ``input_df``. Exactly one row must carry the id;
    zero or several matching rows trip the assertion.
    """
    id_matches = input_df['qasrl_id'] == qasrl_id
    matching_rows = input_df[id_matches]
    assert len(matching_rows) == 1
    return matching_rows['sentence'].iloc[0]

# Look up the source sentence for every output row by its qasrl_id and store
# it in a new 'sentence' column.
sentences = output_df['qasrl_id'].apply(take_sentence_from_input_df)
output_df['sentence'] = sentences

# Save the enriched table next to the other data files, without the index column.
output_df.to_csv(f"{data_dir}/output_file_with_sentence.csv", index=False)