### QASPER Analytics

In [None]:
import json
import os

# QASPER v0.3 split files, resolved relative to the current working directory.
TRAIN_FILE = 'qasper-train-v0.3.json'
DEV_FILE = 'qasper-dev-v0.3.json'
TEST_FILE = 'qasper-test-v0.3.json'

# Split name used to build the output locations below.
split = 'dev'
SAVE_COLL_FILE = f'./qasper_{split}_collection.jsonl'

# per document collection
COLL_PATH = f'./collections/{split}/'
os.makedirs(COLL_PATH, exist_ok=True)

# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can mis-decode (or fail on) non-ASCII
# text in the JSON files on systems whose default is not UTF-8.
with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
    train = json.load(f)

with open(DEV_FILE, 'r', encoding='utf-8') as f:
    dev = json.load(f)

with open(TEST_FILE, 'r', encoding='utf-8') as f:
    test = json.load(f)

In [None]:
# Build one {question_id: question text} lookup per split.
train_questions = {
    qa['question_id']: qa['question']
    for paper in train.values()
    for qa in paper['qas']
}

dev_questions = {
    qa['question_id']: qa['question']
    for paper in dev.values()
    for qa in paper['qas']
}

test_questions = {
    qa['question_id']: qa['question']
    for paper in test.values()
    for qa in paper['qas']
}

# Number of top-level entries per split (train, dev, test), one per line.
print(len(train), len(dev), len(test), sep='\n')

# Number of distinct question ids per split, in the same order.
print(len(train_questions), len(dev_questions), len(test_questions), sep='\n')

In [None]:
# Merge the three splits into one mapping.
# NOTE(review): `all` shadows the builtin of the same name; the name is kept
# because a later cell reads it.
all = train | dev | test


def _describe_answer(answer_info):
    """Return ``(type_label, answer_text)`` for one annotation's answer dict.

    Fields are checked in a fixed priority order: the unanswerable flag,
    extractive spans, the free-form answer, and finally the yes/no flag.

    Raises:
        RuntimeError: if none of those fields carries an answer.
    """
    if answer_info["unanswerable"]:
        return "none", "unanswerable"
    if answer_info["extractive_spans"]:
        return "extractive", ", ".join(answer_info["extractive_spans"])
    if answer_info["free_form_answer"]:
        return "abstractive", answer_info["free_form_answer"]
    yes_no = answer_info["yes_no"]
    if yes_no is None:
        raise RuntimeError(f"Annotation {answer_info['annotation_id']} does not contain an answer")
    # Any non-None falsy value counts as a "No" answer.
    return "boolean", ("Yes" if yes_no else "No")


# Gather every question, the per-question answer-type tally, and the flat
# list of answer strings (one entry per annotation).
answers_count = 0
all_questions = {}
all_answer_types = {}
answers = []
for paper in all.values():
    for qa in paper['qas']:
        question_id = qa['question_id']
        tally = {"none": 0, "extractive": 0, "abstractive": 0, "boolean": 0}
        for annotation in qa["answers"]:
            answers_count += 1
            label, text = _describe_answer(annotation["answer"])
            tally[label] += 1
            answers.append(text)
        all_questions[question_id] = qa['question']
        all_answer_types[question_id] = tally
answers_count

In [None]:
# find mean length of answers
import numpy as np

# Answer length in whitespace-delimited tokens. `split()` with no argument is
# used rather than `split(' ')`: the latter emits an empty token for every
# extra consecutive space and does not split on tabs or newlines, both of
# which distort the word counts.
answer_lengths = [len(answer.split()) for answer in answers]
print(np.mean(answer_lengths))
print(np.median(answer_lengths))
# Sorted lengths, shown as the cell's output.
np.sort(answer_lengths)

In [None]:
# Sum the per-question answer-type tallies into one overall count per type.
all_types_count = {"none": 0, "extractive": 0, "abstractive": 0, "boolean": 0}
for per_question in all_answer_types.values():
    # The loop variable is deliberately not named `type`: at this top-level
    # scope that would rebind the builtin for every cell executed afterwards.
    for answer_kind, n in per_question.items():
        all_types_count[answer_kind] += n
all_types_count

In [None]:
# Number of distinct question ids, then number of entries in the merged
# mapping, one per line.
print(len(all_questions), len(all), sep='\n')

In [None]:
def _split_by_repetition(questions):
    """Separate question texts shared by several ids from texts used once.

    Args:
        questions: mapping of question id to question text.

    Returns:
        A ``(general, specific)`` pair of dicts, each mapping a question text
        to the list of ids carrying that exact text. ``general`` holds texts
        with more than one id, ``specific`` those with exactly one.
    """
    # Single pass over the questions; avoids rescanning every question for
    # every text. Insertion order follows the first occurrence of each text.
    ids_by_text = {}
    for question_id, text in questions.items():
        ids_by_text.setdefault(text, []).append(question_id)
    general = {text: ids for text, ids in ids_by_text.items() if len(ids) > 1}
    specific = {text: ids for text, ids in ids_by_text.items() if len(ids) == 1}
    return general, specific


# For each collection: number of repeated texts, then number of unique texts.
general_questions, specific_questions = _split_by_repetition(all_questions)
print(len(general_questions))
print(len(specific_questions))

general_train_questions, specific_train_questions = _split_by_repetition(train_questions)
print(len(general_train_questions))
print(len(specific_train_questions))

general_test_questions, specific_test_questions = _split_by_repetition(test_questions)
print(len(general_test_questions))
print(len(specific_test_questions))

general_dev_questions, specific_dev_questions = _split_by_repetition(dev_questions)
print(len(general_dev_questions))
print(len(specific_dev_questions))

In [None]:
# Show the test-split question texts that are shared by more than one
# question id, each with its list of ids.
general_test_questions

In [None]:
# Persist the repeated test-split questions as JSON indented by four spaces.
serialized = json.dumps(general_test_questions, indent=4)
with open("./general_test_questions.json", "w") as out_file:
    out_file.write(serialized)

In [None]:
from tqdm import tqdm

# Count the train questions that have more than one annotation and whose
# annotations fall into more than one answer type.
count = 0
for paper in tqdm(train.values()):
    # NOTE(review): this opens a separate progress bar for every paper and
    # interleaves with the print below; consider dropping the inner tqdm.
    for qa in tqdm(paper['qas']):
        types = {"none": 0, "extractive": 0, "abstractive": 0, "boolean": 0}
        for annotation in qa['answers']:
            answer_info = annotation["answer"]
            if answer_info["unanswerable"]:
                types["none"] += 1
            elif answer_info["extractive_spans"]:
                types["extractive"] += 1
            elif answer_info["free_form_answer"]:
                types["abstractive"] += 1
            elif answer_info["yes_no"] is not None:
                # Both True and False are answers; only None means "absent".
                types["boolean"] += 1
            else:
                raise RuntimeError(f"Annotation {answer_info['annotation_id']} does not contain an answer")
        if len(qa['answers']) > 1:
            non_zero_types = {name: n for name, n in types.items() if n != 0}
            print(non_zero_types)
            if len(non_zero_types) > 1:
                count += 1

print(count)