In [1]:
!export TOKENIZERS_PARALLELISM=false

In [2]:
# Load the `blackcellmagic` IPython extension.
# NOTE(review): presumably this only provides a code-formatting cell magic; no
# visible cell uses it, so it looks like a development convenience — confirm.
%load_ext blackcellmagic

In [1]:
import csv
import gc
import os
import time
import traceback
from typing import Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from tqdm import tqdm
from transformers import BertTokenizerFast, RobertaTokenizerFast, XLNetTokenizerFast

from config import DATASETS_FOLDER
from config import LABEL_ALL_TOKENS, MAX_LENGTH
from config import PATIENCE, NUM_EXPERIMENTS, BEST_MODELS_PATH

from data_structures.architectures import BertModel, RobertaModel, XLNetModel

from utils.preprocessing_utils import align_word_ids

from utils.training_utils import train_loop

from utils.evaluation_utils import evaluate
from utils.evaluation_utils import get_entities_positions
from utils.evaluation_utils import get_entity_annotation_metrics
from utils.evaluation_utils import get_entity_annotation_report

from utils.agrovoc_utils import load_agrovoc_altLabels
from utils.agrovoc_utils import load_agrovoc_prefLabels

from utils.oversampling_utils import get_lemma
from utils.oversampling_utils import get_oversampled_data_with_synonyms
from utils.oversampling_utils import get_annotation_ratio

from utils.transfer_learning_utils import freeeze_layers
from utils.transfer_learning_utils import freeeze_embeddings

# Base seed for the notebook; the main experimental loop also derives its
# per-experiment shuffling seed from this value.
RANDOM_STATE = 2022

# Seed NumPy's global generator.
np.random.seed(RANDOM_STATE)
# Seed PyTorch's default generator. Kept as the cell's last expression so the
# returned Generator object is still displayed as the cell output.
torch.manual_seed(RANDOM_STATE)

<torch._C.Generator at 0x7f26f10ea930>

#### Input Files

In [2]:
def _dataset_path(file_name):
    """Return the path of ``file_name`` inside ``DATASETS_FOLDER``."""
    return os.path.join(DATASETS_FOLDER, file_name)


# Complete training files.
INPUT_FILE = _dataset_path("pref-agri-ner-input.csv")
SYNS_INPUT_FILE = _dataset_path("synonym-agri-ner-input.csv")
NOVEL_INPUT_FILE = _dataset_path("novel-agri-ner-input.csv")

# Minified counterparts, meant for exploratory runs.
MIN_INPUT_FILE = _dataset_path("pref-agri-ner-input-min.csv")
MIN_NOVEL_INPUT_FILE = _dataset_path("novel-agri-ner-input-min.csv")

#### Displaying Input Files

In [3]:
# Load the minified input file. `df` is the frame that the main experimental
# loop shuffles and splits into train/validation/test sets, and the one the
# label vocabulary is built from.
# NOTE(review): the rendered output shows an "Unnamed: 0" column, which suggests
# the CSV was written together with its index — confirm whether downstream
# helpers rely on it before dropping it (e.g. via `index_col=0`).
# NOTE(review): this reads MIN_INPUT_FILE rather than INPUT_FILE — confirm the
# minified file is intended for the reported experiments.
df = pd.read_csv(MIN_INPUT_FILE)
df

Unnamed: 0,text,labels
0,On observing the pistils of the flowers which ...,O O O B-Agr O O B-Agr O O O B-Agr O O O O O B-...
1,The author concludes that from the obtained da...,O O O O O O O B-Agr O O O O O O O O O O O O O ...
2,In all the taxa studied most chromosomes of th...,O O O B-Agr O O B-Agr O O O B-Agr O O O O O
3,Nucleolar organizer chromosomes of different t...,O O B-Agr O O O O O O O O O O O O O B-Agr O O
4,Fourteen populations representative of all the...,O B-Agr O O O O B-Agr O O O O O O O O O O O O ...
...,...,...
1905,Indirect effects of fruit length and fruit wei...,O B-Agr O O B-Agr O O B-Agr O O O O O
1906,Fruit length and fruit weight however contribu...,O B-Agr O O B-Agr O O O O O O O O
1907,Application of selection index failed to show ...,B-Agr O B-Agr I-Agr O O O O O O O O B-Agr O O O
1908,Loading and handling operations are mostly car...,B-Agr O B-Agr O O O O O O B-Agr O O O O O O B-...


In [4]:
# Load the synonym dataset. It is not part of the train/val/test split; the main
# experimental loop only passes it to `get_entity_annotation_report` to measure
# entity-level performance of each trained model.
# NOTE(review): the rendered output shows an "Unnamed: 0" column (likely a saved
# index) — confirm whether it is needed.
df_syns = pd.read_csv(SYNS_INPUT_FILE)
df_syns

Unnamed: 0,text,labels
0,Many of them are of Dutch origin,O O O O O O B-Agr
1,Scenarios and projections into the future are ...,O O B-Agr O O O O O O O O O O O O
2,The frequency of abnormality varies greatly be...,O O O O O O O O B-Agr
3,Agricultural conditions determined by seasonal...,O O O O O B-Agr O O O O O O O O
4,The experiments were carried out both in the p...,O B-Agr O O O O O O O O O O O O O O O
...,...,...
1258,Food likes are not innate but can develop very...,O O O O O O O O O O O O O B-Agr O
1259,Preterm birth may be associated with eating di...,O B-Agr O O O O B-Agr O
1260,The estimation of microbial diversity has rece...,O B-Agr O O O O O O O O O O O O O O
1261,Member commitment falls down in large agricult...,O O B-Agr O O O O O


In [5]:
# Load the minified "novel" dataset. Like `df_syns`, it is only used by the main
# experimental loop as an extra evaluation set for `get_entity_annotation_report`.
# NOTE(review): this reads MIN_NOVEL_INPUT_FILE rather than NOVEL_INPUT_FILE —
# confirm the minified file is the intended evaluation set.
df_novel = pd.read_csv(MIN_NOVEL_INPUT_FILE)
df_novel

Unnamed: 0,text,labels
0,Parental F(1) F(2) and some backcross generati...,O O O O O O B-Agr O O
1,Ranking of the stations from the most similar ...,O O O B-Agr O O O O O O O O O O B-Agr O O O O ...
2,Available N was not highly correlated with tot...,O O O O O O O O O O O O B-Agr
3,Early emergence within certain limits may be a...,O B-Agr O O O B-Agr O O O O O
4,Generally food intake was pulsed with maxima e...,O B-Agr O O O O O O O
...,...,...
1258,However these devices should not be viewed as ...,O O O O O O O O O O B-Agr
1259,High-altitude resorts are confronted with a st...,O O O O O O O O O O O O B-Agr B-Agr
1260,Our work has two main objectives,O B-Agr O O O O
1261,These rebuilt neighbourhoods can make it possi...,O O O O O O O O O O O O O O O B-Agr O O O


#### Formatting Labels and Maps

In [6]:
# Every row of the "labels" column is a whitespace-separated string of tags;
# turn each one into a list of tags.
labels = [row_labels.split() for row_labels in df["labels"].values.tolist()]

# Collect the distinct tags found in the dataset. `set.update` ignores tags that
# are already present, so no explicit membership test is needed.
unique_labels = set()
for row_labels in labels:
    unique_labels.update(row_labels)

print(unique_labels)

# Map each tag to an integer id and back. Tags are sorted first so the ids do
# not depend on set iteration order; the reverse map is derived from the forward
# one so the two can never disagree.
labels_to_ids = {label: idx for idx, label in enumerate(sorted(unique_labels))}
ids_to_labels = {idx: label for label, idx in labels_to_ids.items()}
print(labels_to_ids)
print(ids_to_labels)

{'B-Agr', 'O', 'I-Agr'}
{'B-Agr': 0, 'I-Agr': 1, 'O': 2}
{0: 'B-Agr', 1: 'I-Agr', 2: 'O'}


#### Experimental Variables

In [7]:
# Checkpoint names handed to `from_pretrained` in the main experimental loop.
PRETRAINED_MODELS = [
    "recobo/agriculture-bert-uncased",
    "bert-base-cased",
    "xlnet-base-cased",
    "roberta-base",
    "allenai/scibert_scivocab_cased",
]

# Grid-search axes. Every combination of the lists below is trained once per
# experiment and per pretrained model.
LEARNING_RATES = [1e-3]
# Whether the embedding layers are frozen (True) or trained (False).
FROZEN_EMBEDDINGS_MODE = [True, False]
# Different dropout rates did not make much of a difference.
DROPOUT_RATES = [0.35]
# Number of layers passed to `freeeze_layers`; 0 means nothing is frozen.
NUM_FROZEN_LAYERS_EXTRACTION_MODE = [7, 0]
# Only SGD is explored: Adam does not optimize very well with the current
# hyper-parameters (the wider grid would be ["adam", "sgd"]).
OPTIMIZER_NAMES = ["sgd"]
# Synonym-based oversampling of the training split; disabled (future research?).
USE_RESAMPLING = False

#### Main Experimental Loop

In [None]:
def _build_model_and_tokenizer(pretrained_model, dropout_rate, label_set):
    """Create the tokenizer and model wrapper matching ``pretrained_model``.

    "roberta-base" and "xlnet-base-cased" use their dedicated tokenizer and
    wrapper classes; every other checkpoint name falls back to the BERT classes.
    The ``dropout`` attribute of the wrapper's ``roberta`` / ``xlnet`` / ``bert``
    member is replaced by an ``nn.Dropout`` using ``dropout_rate``.

    Args:
        pretrained_model: Checkpoint name passed to ``from_pretrained`` and to
            the wrapper constructor.
        dropout_rate: Probability used for the replacement dropout module.
        label_set: Collection of labels forwarded to the wrapper constructor.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    if pretrained_model == "roberta-base":
        tokenizer = RobertaTokenizerFast.from_pretrained(pretrained_model)
        model = RobertaModel(pretrained_model, label_set)
        model.roberta.dropout = nn.Dropout(p=dropout_rate, inplace=False)
    elif pretrained_model == "xlnet-base-cased":
        tokenizer = XLNetTokenizerFast.from_pretrained(pretrained_model)
        model = XLNetModel(pretrained_model, label_set)
        model.xlnet.dropout = nn.Dropout(p=dropout_rate, inplace=False)
    else:
        tokenizer = BertTokenizerFast.from_pretrained(pretrained_model)
        model = BertModel(pretrained_model, label_set)
        model.bert.dropout = nn.Dropout(p=dropout_rate, inplace=False)
    return model, tokenizer


# Make sure the checkpoint folder exists up front: otherwise `torch.save` fails,
# the failure is swallowed by the best-effort handler below, and the results row
# of an otherwise successful run is never written.
os.makedirs(BEST_MODELS_PATH, exist_ok=True)

# Boundaries of the 80% / 10% / 10% train / validation / test split.
train_end = int(0.8 * len(df))
val_end = int(0.9 * len(df))

# `newline=""` is what the csv module requires for files handed to csv.writer
# (prevents spurious blank lines on platforms that translate line endings).
with open("result_file.csv", "w", newline="") as result_file:
    writer = csv.writer(result_file)
    for num_experiment in range(NUM_EXPERIMENTS):
        # Each experiment reshuffles the data with its own derived seed.
        random_state = RANDOM_STATE // (num_experiment + 1)

        # Data Splitting -> Train, val, test. Plain positional slices give the
        # same three frames as np.split without relying on numpy splitting a
        # DataFrame (which goes through the deprecated DataFrame.swapaxes).
        shuffled_df = df.sample(frac=1.0, random_state=random_state)
        df_train = shuffled_df.iloc[:train_end]
        df_val = shuffled_df.iloc[train_end:val_end]
        df_test = shuffled_df.iloc[val_end:]

        if USE_RESAMPLING:  # Not used for this paper
            print("[INFO] Loading Exluded Terms")
            OVERSAMPLING_EXCLUDED_TERMS = list(set(load_agrovoc_altLabels())) + list(
                set(load_agrovoc_prefLabels())
            )
            print("[INFO] Stemming Exluded Terms")
            oversampling_excluded_terms = [
                get_lemma(excluded_term) for excluded_term in OVERSAMPLING_EXCLUDED_TERMS
            ]
            df_train = get_oversampled_data_with_synonyms(
                df_train,
                oversampling_excluded_terms,
                use_undersampling=False,
                verbose=False,
            )
        annotation_ratio = get_annotation_ratio(df_train)

        #### Grid Search
        for pretrained_model in PRETRAINED_MODELS:
            # Best test accuracy seen so far for this checkpoint in this experiment.
            # NOTE(review): checkpoints are selected on *test* accuracy — confirm
            # this is intended rather than selecting on the validation split.
            best_test_accuracy = 0.0
            for optimizer_name in OPTIMIZER_NAMES:
                for num_frozen_layers in NUM_FROZEN_LAYERS_EXTRACTION_MODE:
                    for frozen_embeddings_mode in FROZEN_EMBEDDINGS_MODE:
                        for learning_rate in LEARNING_RATES:
                            for dropout_rate in DROPOUT_RATES:
                                # Identifier encoding every hyper-parameter of this run;
                                # "/" is replaced because it also ends up in a file name.
                                pipeline_id = "_".join(
                                    [
                                        str(pretrained_model).replace("/", "-"),
                                        str(optimizer_name),
                                        str(learning_rate),
                                        str(dropout_rate),
                                        str(num_frozen_layers),
                                        str(frozen_embeddings_mode),
                                        str(USE_RESAMPLING),
                                        str(round(annotation_ratio, 3)),
                                    ]
                                )

                                # Bound before the try so the cleanup in `finally`
                                # is valid even if model creation itself fails.
                                model = None
                                # The handler wraps a single configuration so that one
                                # failing run does not skip the remaining combinations.
                                try:
                                    #### CREATING MODELS ####
                                    model, tokenizer = _build_model_and_tokenizer(
                                        pretrained_model, dropout_rate, unique_labels
                                    )

                                    #### FREEZING LAYERS? ####
                                    if num_frozen_layers > 0:
                                        freeeze_layers(
                                            model, num_frozen_layers, pretrained_model
                                        )
                                    if frozen_embeddings_mode:
                                        freeeze_embeddings(model, pretrained_model)

                                    init_time = time.time()
                                    #### TRAIN LOOP ####
                                    (
                                        train_loss,
                                        train_accuracy,
                                        val_loss,
                                        val_accuracy,
                                        best_epoch,
                                    ) = train_loop(
                                        model,
                                        df_train,
                                        df_val,
                                        tokenizer,
                                        labels_to_ids,
                                        optimizer_name,
                                        learning_rate,
                                    )
                                    end_time = time.time()

                                    #### COMPUTING PERFORMANCES PER LABEL ####
                                    (
                                        test_accuracy,
                                        test_f1,
                                        f1_per_class,
                                        test_precision,
                                        precision_per_class,
                                        test_recall,
                                        recall_per_class,
                                    ) = evaluate(model, df_test, tokenizer, unique_labels)

                                    # Persist the model whenever it beats the best test
                                    # accuracy seen so far for this checkpoint.
                                    if test_accuracy > best_test_accuracy:
                                        best_test_accuracy = test_accuracy
                                        model_name = "*".join(
                                            [str(num_experiment), pipeline_id]
                                        )
                                        torch.save(
                                            model,
                                            os.path.join(
                                                BEST_MODELS_PATH, model_name + ".pt"
                                            ),
                                        )

                                    #### COMPUTING PERFORMANCES PER ENTITY ####
                                    ## Test Performance ##
                                    (
                                        test_total_entities_ratio,
                                        test_partial_entities_ratio,
                                        test_false_positive_ratio,
                                    ) = get_entity_annotation_report(
                                        df_test, model, tokenizer, ids_to_labels
                                    )
                                    ## Syns Performance ##
                                    (
                                        syns_total_entities_ratio,
                                        syns_partial_entities_ratio,
                                        syns_false_positive_ratio,
                                    ) = get_entity_annotation_report(
                                        df_syns, model, tokenizer, ids_to_labels
                                    )
                                    ## Novel Performance ##
                                    (
                                        novel_total_entities_ratio,
                                        novel_partial_entities_ratio,
                                        novel_false_positive_ratio,
                                    ) = get_entity_annotation_report(
                                        df_novel, model, tokenizer, ids_to_labels
                                    )

                                    #### REPORTING PERFORMANCES ####
                                    # One row per run; the file has no header, so the
                                    # column order below is the schema.
                                    writer.writerow(
                                        [
                                            num_experiment,
                                            pipeline_id,
                                            best_epoch,
                                            round(train_loss, 4),
                                            round(train_accuracy.item(), 4),
                                            round(val_loss, 4),
                                            round(val_accuracy.item(), 4),
                                            round(test_accuracy.item(), 4),
                                            round(test_f1.item(), 4),
                                            f1_per_class,
                                            round(test_precision.item(), 4),
                                            precision_per_class,
                                            round(test_recall.item(), 4),
                                            recall_per_class,
                                            test_total_entities_ratio,
                                            test_partial_entities_ratio,
                                            test_false_positive_ratio,
                                            syns_total_entities_ratio,
                                            syns_partial_entities_ratio,
                                            syns_false_positive_ratio,
                                            novel_total_entities_ratio,
                                            novel_partial_entities_ratio,
                                            novel_false_positive_ratio,
                                            # Training time divided by best_epoch + PATIENCE.
                                            # NOTE(review): presumably the number of epochs
                                            # actually run — confirm against train_loop.
                                            (end_time - init_time) / (best_epoch + PATIENCE),
                                        ]
                                    )
                                    # Flush after every run so partial results survive
                                    # an interrupted notebook.
                                    result_file.flush()
                                except Exception as e:
                                    # Best effort: report the failure and move on to the
                                    # next configuration, but keep enough context (which
                                    # pipeline, full traceback) to diagnose it afterwards.
                                    print("Exception")
                                    print(e)
                                    print(
                                        f"[ERROR] experiment={num_experiment} "
                                        f"pipeline={pipeline_id}"
                                    )
                                    traceback.print_exc()
                                finally:
                                    # Drop the model reference on success *and* failure so
                                    # it can be collected before the next one is built.
                                    model = None
                                    gc.collect()

Some weights of the model checkpoint at recobo/agriculture-bert-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at recobo/agriculture-bert-u

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  self.pid = os.fork()
  self.pid = os.fork()
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1528/1528 [00:38<00:00, 39.24it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


[INFO] Updating weights: 0.8569644689559937 > 0
Epochs: 1 | Loss:  0.423 | Accuracy:  0.831 | Val_Loss:  0.354 | Accuracy:  0.857
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  self.pid = os.fork()
  self.pid = os.fork()
  self.pid = os.fork()


Test Accuracy:  0.847


Some weights of the model checkpoint at recobo/agriculture-bert-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at recobo/agriculture-bert-u

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 28%|█████████████████████████████████████████████▉                                                                                                                     | 431/1528 [00:13<00:34, 31.64it/s]