In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from architectures import BertModel, RobertaModel
from transformers import BertTokenizerFast, RobertaTokenizerFast
from evaluation_utils import get_sentence_predicted_annotation, evaluate, get_annotated_entities
from evaluation_utils import get_entities_positions, get_entity_annotation_metrics
from utils import align_word_ids
import pandas as pd
import numpy as np
from data_structures import DataSequence
from config import MAX_LENGTH, LABEL_ALL_TOKENS, BATCH_SIZE
from nltk import WordNetLemmatizer

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')

def get_lemma(token: str) -> str:
    if len(token) > 1:
        doc = nlp(token)
        return doc[0].lemma_
    else:
        return token

#### Constants

In [11]:
#'./best_models/0*allenai-scibert_scivocab_cased_sgd_0.001_0.35_0_True.pt'
#'./best_models/0*recobo-agriculture-bert-uncased_sgd_0.001_0.35_0_True.pt'
BEST_MODELS_PATH = './best_models/0*allenai-scibert_scivocab_cased_sgd_0.001_0.35_0_True.pt'
# 'recobo/agriculture-bert-uncased'
# 'allenai/scibert_scivocab_cased'
PRETRAINED_MODEL = 'allenai/scibert_scivocab_cased'
IDS_TO_LABELS = {0: 'B-Agr', 1: 'I-Agr', 2: 'O'}
UNIQUE_LABELS = {'I-Agr', 'O', 'B-Agr'}
SENTENCE_TO_ANNOTATE = 'In addition, it also does not seem to be justifiable to assume that the breed fights against fitness'

#### Loading Model and Tokenizer

In [12]:
model = torch.load(BEST_MODELS_PATH)
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)

In [13]:
syns_labels = []
with open("labels/synonym-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    syns_labels = [concept.strip() for concept in lines]
    
novel_labels = []
with open("labels/novel-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    novel_labels = [concept.strip() for concept in lines]

pref_labels = []
with open("labels/pref-labels-used.txt", "r") as concepts_file:
    lines = concepts_file.readlines()
    pref_labels = [concept.strip() for concept in lines]

#### Evaluating

In [None]:
def is_novel_label(annotation):
    for novel_label in novel_labels:
        if annotation.strip().casefold() == novel_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == novel_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(novel_label.casefold()):
            return True  
    return False

def is_syns_label(annotation):
    for syn_label in syns_labels:
        if annotation.strip().casefold() == syn_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == syn_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(syn_label.casefold()):
            return True  
    return False

def is_pref_label(annotation):
    for pref_label in pref_labels:
        if annotation.strip().casefold() == pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) == get_lemma(pref_label.casefold()):
            return True
    return False

def is_partial_pref_label(annotation):
    for pref_label in pref_labels:
        if annotation.strip().casefold() in pref_label.casefold():
            return True
        elif get_lemma(annotation.strip()) in pref_label.casefold():
            return True
    return False

In [14]:
def get_all_annotated_entities(annotations_path: str):
    df_test = pd.read_csv(annotations_path)
    total_real_entities = 0
    total_complete_entities_annotated = 0
    total_partial_entities_annotated = 0
    total_false_positives = 0
    all_annotated_entities = []
    clean_annotated_entities = []

    for sentence, annotation in zip(df_test['text'], df_test['labels']):
        predicted_annotations = get_sentence_predicted_annotation(model, tokenizer, sentence, IDS_TO_LABELS)
        real_annotations = annotation.split(" ")
        if len(predicted_annotations) == len(real_annotations):
            #print(sentence)
            #print(predicted_annotations)
            #print(real_annotations)
            annotated_entities = get_annotated_entities(sentence, predicted_annotations)
            if len(annotated_entities) > 0:
                all_annotated_entities+=annotated_entities
            real_entities = get_entities_positions(real_annotations)
            predicted_entities = get_entities_positions(predicted_annotations)
            num_complete_entities_annotated, num_partial_entities_annotated, num_false_positive_annotations = get_entity_annotation_metrics(real_entities, predicted_entities)
            #print(num_complete_entities_annotated, num_partial_entities_annotated)
            total_real_entities+=len(real_entities)
            total_complete_entities_annotated+=num_complete_entities_annotated
            total_partial_entities_annotated+=num_partial_entities_annotated
            total_false_positives+=num_false_positive_annotations
            #print()
    print(round(np.mean(total_complete_entities_annotated/total_real_entities), 4))
    print(round(np.mean(total_partial_entities_annotated/total_real_entities), 4))
    print(round(np.mean(total_false_positives/total_real_entities), 4))

    #print(all_annotated_entities)

    for annotated_entity in all_annotated_entities:
        if type(annotated_entity) != str:
            clean_annotated_entities.append(" ".join(annotated_entity))
        else:
            clean_annotated_entities.append(annotated_entity)
            
    clean_annotated_entities = list(set(clean_annotated_entities))
    return clean_annotated_entities

In [15]:
novel_annotated_entities = get_all_annotated_entities('datasets/novel-agri-ner-input.csv')

0.5746
0.0
0.2514


In [16]:
syns_annotated_entities = get_all_annotated_entities('datasets/synonym-agri-ner-input.csv')

0.7319
0.0
0.2029


In [None]:
for annotation in novel_annotated_entities:
    if not is_novel_label(annotation)\
    and not is_pref_label(annotation)\
    and not is_syns_label(annotation):
        print(annotation)




In [22]:
for annotation in novel_annotated_entities:
    if is_novel_label(annotation) and not is_pref_label(annotation):
        print(annotation)

KeyboardInterrupt: 

In [None]:
for annotation in syns_annotated_entities:
    if is_syns_label(annotation) and not is_pref_label(annotation):
        print(annotation)

In [15]:
for clean_annotation in novel_annotated_entities:
    if is_novel_label(clean_annotation) and not is_pref_label(clean_annotation):
        print(clean_annotation)

conservation
contaminants
waves
distribution
transmission
BMI
economy
substrates
Turkey
economies
cortisol
QTL
Drawings
exploitation
generations
emergence
leisure
mapping
work organisation
ACTH
Impacts
decay
dispersion
work
Britain
Compression
Distribution
permits
antagonists
stations
biofilms
distributions
insects
ticks
impacts
compression
SNPs
repair
chemotherapy
GIS
Corticosteroids
pathophysiology
people
embryos
Insects
boxes
propagation
architecture
Cortisol
eukaryotes
GPS
sonication
photoperiod
perception
works
cities


In [97]:
for clean_annotation in clean_annotated_entities:
    if is_alt_label(clean_annotation)\
     and not is_pref_label(clean_annotation)\
     and not is_partial_pref_label(clean_annotation):
        print(clean_annotation)

body fat
SNP
fat content
ACTH
Oil content
hemoglobin
endocrine control
Fat content
speech
Plots
dissolution
sonication
Thermal treatment
hyperglycemia
substrates
eukaryotes
rDNA
Exercise
allergy
CNS
organisation
filtering
Puberty
Estimation
molecular markers
organic farming
Anesthesia
infectivity
nephropathy
estimation
exercises
antibiotic resistance
puberty
Renal function
Molecular markers
Organic farming
genetic manipulation
food webs
food preferences
stirring
cortisol
Cortisol
precision farming
flavor
feces
contaminants
fibers
Aluminum
BMI
Filtering
hemorrhage
GPS
oil content
lifespan
ischemia
QTL


In [98]:
for clean_annotation in clean_annotated_entities:
    if not is_alt_label(clean_annotation) and is_pref_label(clean_annotation):
        print(clean_annotation)

RNA
FSH
questionnaire
ages
men
AGE
densities
sex
methodologies
language
sap
air
societies
age
chromosome
DNA
wind
gene
accuracies
ICE
structure
ABA
laboratory
technologies
abilities
virus
equilibria
attitude
sexes
dairy
limb
ATP
efficiencies
genome
Sun
additive
lipid


In [95]:
for clean_annotation in clean_annotated_entities:
    if not is_alt_label(clean_annotation) and not is_pref_label(clean_annotation):
        print(clean_annotation)


food web
inference
personality
meals
develop
understanding
eps
biomarkers
University
exclusion
mating behavior
punishment
surgery
MRI
endometriosis
Optics
recording
categories
schema
dissemination
sequencing
analgesia
Sequencing
choice
behavioral control
alignment
economic
transposition
optimization
mating types
genetic architecture
autism
reasoning
glycosylation
transcripts
SA perception
Transposition
places
Oil
genetic origin
SNPs
QTL mapping
sequence
matching
dissipation
luciferase
intraocular
genetic values
eating behavior
Slow
conservation practices
resistivity
flows
integrity
Reducing power
striatum
In vivo
schemes
Validation
tumors
discharges
nursing
cervix
SSR markers
fertilizer treatments
donors
cognition
registers
birth cohort
neurons
mobility
insight
articles
metformin
satisfaction
reperfusion
sensing
surveillance
conjugation
mechanisms
thermal
Community structure
pathways
sexual behavior
sepsis
Efficiencies
dilation
images
food system
flexibility
scoring
waist circumferenc