In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from architectures import BertModel, RobertaModel
from transformers import BertTokenizerFast, RobertaTokenizerFast
from evaluation_utils import get_sentence_predicted_annotation, evaluate, get_annotated_entities
from evaluation_utils import get_entities_positions, get_entity_annotation_metrics
from utils import align_word_ids
import pandas as pd
import numpy as np
from data_structures import DataSequence
from config import MAX_LENGTH, LABEL_ALL_TOKENS, BATCH_SIZE
from nltk import WordNetLemmatizer

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

def get_lemma(token: str) -> str:
    if len(token) > 1:
        doc = nlp(token)
        return doc[0].lemma_
    else:
        return token

#### Constants

In [4]:
#'./best_models/0*allenai-scibert_scivocab_cased_sgd_0.001_0.35_0_True.pt'
#'./best_models/0*recobo-agriculture-bert-uncased_sgd_0.001_0.35_0_True.pt'
BEST_MODELS_PATH = './best_models/0*recobo-agriculture-bert-uncased_sgd_0.001_0.35_0_True_False_0.21.pt'
# 'recobo/agriculture-bert-uncased'
# 'allenai/scibert_scivocab_cased'
PRETRAINED_MODEL = 'recobo/agriculture-bert-uncased'
IDS_TO_LABELS = {0: 'B-Agr', 1: 'I-Agr', 2: 'O'}
UNIQUE_LABELS = {'I-Agr', 'O', 'B-Agr'}
SENTENCE_TO_ANNOTATE = 'In addition, it also does not seem to be justifiable to assume that the breed fights against fitness'

#### Loading Model and Tokenizer

In [5]:
model = torch.load(BEST_MODELS_PATH)
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)

In [6]:
syns_labels = []
with open("labels/synonym-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    syns_labels = [concept.strip() for concept in lines]
    
novel_labels = []
with open("labels/novel-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    novel_labels = [concept.strip() for concept in lines]

pref_labels = []
with open("labels/pref-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    pref_labels = [concept.strip() for concept in lines]

#### Evaluating

In [7]:
def is_novel_label(annotation):
    for novel_label in novel_labels:
        if annotation.strip().casefold() == novel_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == novel_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(novel_label.casefold()):
            return True  
    return False

def is_syns_label(annotation):
    for syn_label in syns_labels:
        if annotation.strip().casefold() == syn_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == syn_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(syn_label.casefold()):
            return True  
    return False

def is_pref_label(annotation):
    for pref_label in pref_labels:
        if annotation.strip().casefold() == pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(pref_label.casefold()):
            return True
    return False

def is_partial_pref_label(annotation):
    for pref_label in pref_labels:
        if annotation.strip().casefold() in pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) in pref_label.casefold():
            return True
    return False

In [8]:
def get_all_annotated_entities(annotations_path: str):
    df_test = pd.read_csv(annotations_path)
    total_real_entities = 0
    total_complete_entities_annotated = 0
    total_partial_entities_annotated = 0
    total_false_positives = 0
    all_annotated_entities = []
    clean_annotated_entities = []

    for sentence, annotation in zip(df_test['text'], df_test['labels']):
        predicted_annotations = get_sentence_predicted_annotation(model, tokenizer, sentence, IDS_TO_LABELS)
        real_annotations = annotation.split(" ")
        if len(predicted_annotations) == len(real_annotations):
            #print(sentence)
            #print(predicted_annotations)
            #print(real_annotations)
            annotated_entities = get_annotated_entities(sentence, predicted_annotations)
            if len(annotated_entities) > 0:
                all_annotated_entities+=annotated_entities
            real_entities = get_entities_positions(real_annotations)
            predicted_entities = get_entities_positions(predicted_annotations)
            num_complete_entities_annotated, num_partial_entities_annotated, num_false_positive_annotations = get_entity_annotation_metrics(real_entities, predicted_entities)
            #print(num_complete_entities_annotated, num_partial_entities_annotated)
            total_real_entities+=len(real_entities)
            total_complete_entities_annotated+=num_complete_entities_annotated
            total_partial_entities_annotated+=num_partial_entities_annotated
            total_false_positives+=num_false_positive_annotations
            #print()
    print(round(np.mean(total_complete_entities_annotated/total_real_entities), 4))
    print(round(np.mean(total_partial_entities_annotated/total_real_entities), 4))
    print(round(np.mean(total_false_positives/total_real_entities), 4))

    #print(all_annotated_entities)

    for annotated_entity in all_annotated_entities:
        if type(annotated_entity) != str:
            clean_annotated_entities.append(" ".join(annotated_entity))
        else:
            clean_annotated_entities.append(annotated_entity)
            
    clean_annotated_entities = list(set(clean_annotated_entities))
    return clean_annotated_entities

In [11]:
pref_annotated_entities = get_all_annotated_entities('datasets/pref-agri-ner-input.csv')

0.986
0.0064
0.0071


In [9]:
novel_annotated_entities = get_all_annotated_entities('datasets/novel-agri-ner-input.csv')

0.6229
0.0
0.268


In [10]:
syns_annotated_entities = get_all_annotated_entities('datasets/synonym-agri-ner-input.csv')

0.7328
0.0
0.2443


In [12]:
for annotation in novel_annotated_entities:
    if not is_novel_label(annotation)\
    and not is_pref_label(annotation)\
    and not is_syns_label(annotation):
        print(annotation)


outcomes
paradigms
probability
radiographs
sepsis
Shanghai
wings
leptin
surveillance
stakeholders
elderly
producers
tourism
redistribution
plasmodium
robustness
Tourism
cytoskeleton
teams
sequences
Edinburgh
signals
packages
Bioavailability
happiness
sequencing
prognosis
actions
radicals
security
proteome
mobility
paralysis
DON
poisoning
syntax
;
ethics
aerosols
routing
instrumentation
databases
targeting
smoking
CKD
emotions
Circadian rhythms
sustainability
specifications
astrocytes
democracy
poverty
clouds
together
costs
outputs
outsourcing
striatum
Elderly
insulin
markers
Memory
students
gut
fitness
localization
center
circadian clocks
cyanobacteria
endocytosis
software
diversity
biodiversity
tumors
analgesia
memory
integrity
actors
rhythms
nature
Antioxidants
devices


In [13]:
for annotation in syns_annotated_entities:
    if is_syns_label(annotation) and not is_pref_label(annotation):
        print(annotation)

behaviors
fibers
trials
puberty
instruments
Projections
power
motility
Estimation
pricing
oocytes
dissolution
color
Trials
Experiments
sperm
hemoglobin
behavior
synergy
estimations
streams
microbiota
rainfall
stimulation
etiology
tools
predictions
birth
prey
illumination
prediction
SOC
decomposition
Sperm
Birth
separation
stirring
labor
origins
Pulses
fetuses
gestation
eating
cultivars
taxation
Pigs
odor
pulses
researchers
pigs
flavor
Motility
experiments
spectrum
origin
births
estimation


In [14]:
for annotation in novel_annotated_entities:
    if is_novel_label(annotation) and not is_pref_label(annotation):
        print(annotation)

stations
decay
farming
Works
architecture
working
sonication
distribution
works
exploitation
QTL
Turkey
perception
organelles
exercise
GIS
Britain
generations
Corticosteroids
debris
insects
dispersion
embryos
ischemia
Compression
spawning
atherosclerosis
Food
farms
Distribution
cortisol
farming practices
microorganisms
ACTH
food
working memory
impacts
Insects
food aid
Drawings
substrates
eukaryotes
Impacts
farm
economy
conservation
food security
working hours
ticks
photoperiod
work organisation
biofilms
rights
microbes
chemotherapy
pathophysiology
BMI
distributions
people
transmission
food intake
foods
Cortisol
mapping
exercises
compression
cities
Anesthesia
food preferences
esophagus
cascades
emergence
boxes
propagation
GPS
contaminants
work
repair
SNPs
economies
