In [1]:
import re
from pathlib import Path

from src.utils import is_json

# Parent of the current working directory, resolved to an absolute path.
ROOT = Path().resolve().parent
# Directories derived from ROOT.
RESOURCE_PATH = ROOT.joinpath("resources")
RESULTS_PATH = ROOT.joinpath("results", "prompt_selection")


## Read the annotations

In [2]:
# Collect every answer file found (recursively) under RESULTS_PATH.
annotations = []
# glob yields paths in an arbitrary, file-system dependent order; sorting keeps
# the order of `annotations` reproducible across runs and machines.
filepaths = sorted(RESULTS_PATH.glob("**/*.txt"))
for filepath in filepaths:
    # The last five path components are read as
    # model / entity / template / doc / <file name>; the file name is discarded.
    *_, model, entity, template, doc, _ = filepath.parts
    # Decode explicitly instead of relying on the platform default encoding,
    # which would garble accented characters on non-UTF-8 locales.
    # NOTE(review): assumes the answer files were written as UTF-8 - confirm.
    answer = filepath.read_text(encoding="utf-8")
    annotations.append({
        "model": model,
        "entity": entity,
        "template": template,
        "doc": doc,
        "answer": answer
    })

## How many responses are in a valid json format?

In [3]:
# Templates of the annotations whose answer already parses as JSON.
valid_json_templates = [
    annotation["template"]
    for annotation in annotations
    if is_json(annotation["answer"])
]

# Template names starting with "tc" are counted as classification and those
# starting with "te" as extraction; any other prefix is not counted.
classification_count = sum(name.startswith("tc") for name in valid_json_templates)
extraction_count = sum(name.startswith("te") for name in valid_json_templates)

print(f"Number of JSON answers: {extraction_count + classification_count} / {len(annotations)}")
print(f"Number of extraction answers: {extraction_count}")
print(f"Number of classification answers: {classification_count}")

Number of JSON answers: 1928 / 2400
Number of extraction answers: 1200
Number of classification answers: 728


We can see that the classification answers fail to comply with the JSON format much more often than the extraction answers.

Let's try to clean up the answers.

### Clean classification answers

In [4]:
# Raw answers of the classification templates ("tc" prefix) that are not valid JSON.
bad_answers = [
    annotation["answer"]
    for annotation in annotations
    if not is_json(annotation["answer"]) and annotation["template"].startswith("tc")
]
print(f"Number of bad classification answers: {len(bad_answers)}")

Number of bad classification answers: 472


#### Transform into list

Some of the answers do comply with the JSON format, but are presented as a list of dictionaries without the opening and closing square brackets.

Example:

```json
{"State": "Um homem matou hoje com um disparo de arma de fogo a ex-companheira em Vila Nova de Gaia"},
{"Occurrence": "Fonte do Comando Territorial da GNR do Porto referiu à agência Lusa que o crime ocorreu cerca das 10:30 na Rua da Figueirinha, em Grijó, Vila Nova de Gaia, distrito do Porto, e que o homem tem 62 anos"},
{"Reporting": "O caso passou para a alçada da Polícia Judiciária"},
{"Perception": "A fonte admitiu que o homem tenha cometido o crime em retaliação por a mulher ter optado por terminar a relação que ambos mantiveram durante 10 anos"},
{"Aspectual": "A vítima era 20 anos mais nova"},
{"Intensional Action": "Um homem matou hoje com um disparo de arma de fogo a ex-companheira em Vila Nova de Gaia e foi detido numa altura em que estaria a tentar o suicídio com a mesma arma"},
{"Intensional State": "Homem mata ex-companheira com tiro em Vila Nova de Gaia"}'
```



In [5]:
def transform_in_list(answer: str) -> str:
    """Wrap ``answer`` in square brackets.

    Turns a comma-separated sequence of JSON objects into the text of a
    JSON list. The answer itself is not modified or validated.

    Args:
        answer: Raw answer text.

    Returns:
        ``answer`` preceded by ``[`` and followed by ``]``.
    """
    return "[{}]".format(answer)

In [6]:
# Wrap every remaining bad answer in square brackets and keep the wrapped
# versions that parse as JSON.
wrapped_answers = map(transform_in_list, bad_answers)
answers_missing_list = list(filter(is_json, wrapped_answers))

print(f"Number of bad classification answers that can be transformed into a list: {len(answers_missing_list)}")

Number of bad classification answers that can be transformed into a list: 288


In [7]:
# Drop the answers that the square-bracket wrapping repairs; the rest stay
# in `bad_answers` for the next clean-up step.
still_bad_answers = []
for candidate in bad_answers:
    if is_json(transform_in_list(candidate)):
        continue
    still_bad_answers.append(candidate)
bad_answers = still_bad_answers

print(f"Number of bad classification answers that cannot be transformed into a list: {len(bad_answers)}")


Number of bad classification answers that cannot be transformed into a list: 184


#### Missing list of tuples

Another common problem is the response being a list of tuples that is missing the square brackets.

Example:

```json
{"events": ("Edifício da Luanda Medical Center, hotel e projeto imobiliário devolvidos ao Estado": "State"),
("A Procuradoria-Geral da República (PGR) de Angola anunciou hoje": "Reporting"),
("foram entregues ao Serviço Nacional de Recuperação de Ativos mais três edifícios e um projeto imobiliário, incluindo a clínica Luanda Medical Center e um hotel": "Occurrence"),
}

```

In [8]:
# A double-quoted string literal, including backslash escapes. DOTALL lets an
# escaped or raw newline inside the literal be consumed as well.
_QUOTED_STRING_RE = re.compile(r'"(?:[^"\\]|\\.)*"', re.DOTALL)


def _rewrite_structural_parens(segment: str) -> str:
    """Apply the tuple-to-object rewrite to text that lies outside strings."""
    segment = segment.replace(": (", ": [(")  # open the missing list
    segment = segment.replace(")}", ")]}")  # close the missing list
    segment = segment.replace(")", "}")
    return segment.replace("(", "{")


def fix_list_of_tuples(answer: str) -> str:
    """Rewrite an answer shaped like a "list of tuples" into JSON-like text.

    Outside of double-quoted strings, ``: (`` becomes ``: [{``, ``)}``
    becomes ``}]}`` and every other ``(`` / ``)`` becomes ``{`` / ``}``.
    Text inside double-quoted strings is copied verbatim, so parentheses
    that belong to the annotated sentences (e.g. ``"(PGR)"``) are preserved
    instead of being turned into braces.

    The result is not guaranteed to be valid JSON; callers are expected to
    validate it.

    Args:
        answer: Raw answer text.

    Returns:
        The rewritten answer text.
    """
    pieces = []
    cursor = 0
    for literal in _QUOTED_STRING_RE.finditer(answer):
        # Structural text between the previous string literal and this one.
        pieces.append(_rewrite_structural_parens(answer[cursor:literal.start()]))
        # The string literal itself is kept untouched.
        pieces.append(literal.group(0))
        cursor = literal.end()
    # Trailing structural text; an unterminated quote also ends up here.
    pieces.append(_rewrite_structural_parens(answer[cursor:]))
    return "".join(pieces)


In [9]:
# Apply the list-of-tuples repair to every remaining bad answer and keep the
# repaired versions that parse as JSON.
tuple_fixed_answers = map(fix_list_of_tuples, bad_answers)
answers_missing_list_of_tuples = list(filter(is_json, tuple_fixed_answers))

print(f"Number of bad answers that can be fixed with list of tuples: {len(answers_missing_list_of_tuples)}")

Number of bad answers that can be fixed with list of tuples: 102


In [10]:
# Keep only the answers that the list-of-tuples repair does not turn into
# valid JSON.
bad_answers = [
    answer
    for answer in bad_answers
    if not is_json(fix_list_of_tuples(answer))
]

# The label names the repair attempted in this step (it previously repeated
# the wording of the square-bracket step).
print(f"Number of bad classification answers that cannot be fixed as a list of tuples: {len(bad_answers)}")

Number of bad classification answers that cannot be transformed into a list: 82


The remaining answers are considered to be bad and are therefore discarded.

### Clean extraction answers

In [11]:
# Raw answers of the extraction templates ("te" prefix) that are not valid JSON.
bad_answers = [
    annotation["answer"]
    for annotation in annotations
    if not is_json(annotation["answer"]) and annotation["template"].startswith("te")
]
# This cell reports extraction answers, so the label says "extraction"
# (it previously said "classification").
print(f"Number of bad extraction answers: {len(bad_answers)}")

Number of bad classification answers: 0
