# Αναγνώριση ονοματισμένων οντοτήτων

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
from spacy.gold import GoldParse
from spacy.scorer import Scorer



def data_preparation_for_NER(data_filename, test_size=0.25):
    """
    Load the annotated NER data from a CSV file and split it
    into a train set and a test set.

    The CSV must provide a ``text`` column and an ``entities`` column.
    Every ``entities`` cell is parsed as a Python literal (presumably a
    list of ``(start, end, label)`` tuples -- confirm against the data file).

    Parameters
    ----------
    data_filename : str
        the path to the CSV file that contains the annotated
        data for NER

    test_size : float
        the test_size that is given as a parameter to the
        train_test_split function of scikit-learn

    Returns
    -------
    tuple
        ``(train_data, test_data)``; every observation is a pair
        ``(text, {'entities': entities})``

    Raises
    ------
    ValueError, SyntaxError
        if an ``entities`` cell is not a valid Python literal
    """
    # Function-scope import so the block does not depend on the import cell.
    import ast

    # SECURITY: the original used ``eval`` as the converter, which executes
    # arbitrary code embedded in the CSV. ``ast.literal_eval`` accepts only
    # literals (lists, tuples, strings, numbers, ...), which is all that an
    # annotation cell needs.
    df_texts = pd.read_csv(
        data_filename,
        index_col=None,
        converters={'entities': ast.literal_eval},
    )
    # Pair each text with its annotations in the dict format used for training.
    data = [
        (text, {'entities': entities})
        for text, entities in zip(df_texts.text, df_texts.entities)
    ]
    # Fixed random_state keeps the split reproducible between runs.
    train_data, test_data = train_test_split(data, test_size=test_size, random_state=10)
    return train_data, test_data



def train_ner(labels, train_set, model=None, new_model_name="", output_dir=None, n_iter=30):
    """
    Sets up the pipeline and named entity recognizer
    and trains the NER for new entities.

    The caller's ``train_set`` is left untouched: shuffling is done on an
    internal copy. The global ``random`` module is seeded with 0, so the
    shuffling order is the same on every call.

    Parameters
    ----------
    labels : list
        the labels (strings) of new entities

    train_set : list
        the observations of the train set, each one a pair
        ``(text, annotations)`` as passed to ``nlp.update``

    model : module
        existing SpaCy model package; its ``load()`` function is called
        to obtain the pipeline. If None, a blank Greek ("el") pipeline
        is created instead

    new_model_name : str
        the name of the new model (stored in ``nlp.meta["name"]``)

    output_dir: str
        the directory where the new model will be saved; it is created
        (including missing parents) when it does not exist. If None,
        the model is not saved

    n_iter : int
        the number of the iterations in the training process
    """
    random.seed(0)
    if model is not None:
        nlp = model.load()  # load existing spaCy model
    else:
        nlp = spacy.blank("el")  # create blank model in greek

    # add entity recognizer to model if it's not in the pipeline,
    # otherwise get it, so we can add labels to it
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    for label in labels:
        ner.add_label(label)  # add new entity labels to entity recognizer

    # a blank model needs freshly initialised weights; an existing one
    # continues from its current weights
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Shuffle a private copy so the caller's list is not reordered in place.
    # The copy is shuffled cumulatively, exactly as the original list was,
    # so the sequence of batches is unchanged.
    examples = list(train_set)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # one batch-size generator shared by all iterations, so the batch
        # size keeps compounding from 1.0 towards 4.0 across epochs
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(examples)
            batches = minibatch(examples, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses in iteration: " + str(itn + 1), losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        # parents=True: nested output paths work; exist_ok=True: re-runs work
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        
        

def evaluate_ner(output_dir, test_set, print_entities=False):
    """
    Tests the new ner model and scores its performance.

    Parameters
    ----------
    output_dir: str
        the directory of the output file where the new
        model was saved

    test_set : list
        the observations of the test set, each one a pair
        ``(text, annotations)`` where ``annotations`` is a dict
        holding the gold entities under the ``"entities"`` key

    print_entities : boolean
        if True the entities detected for each text
        in the test set will be printed

    Returns
    -------
    dict
        all the metrics and their scores that measure
        the performance of the new NLP model
    """
    ner_model = spacy.load(output_dir)
    scorer = Scorer()
    # test ner
    for input_, dict_entities in test_set:
        try:
            # Look the gold entities up by name so that annotation dicts
            # carrying additional keys do not break the evaluation.
            annot = dict_entities["entities"]
        except KeyError:
            # Backward compatible: a dict with exactly one value stored
            # under some other key is still accepted.
            [annot] = dict_entities.values()
        # The gold parse is built on a tokenised-only doc of the same text.
        doc_gold_text = ner_model.make_doc(input_)
        gold = GoldParse(doc_gold_text, entities=annot)
        pred_value = ner_model(input_)
        # precision, recall, f-score for ner etc.
        scorer.score(pred_value, gold)
        if print_entities:
            # only the first 70 characters of the text are shown
            print("Entities in '%s'" % input_[:70])
            for ent in pred_value.ents:
                print(ent.label_, ent.text)
    return scorer.scores

In [5]:
# Load the annotated corpus and hold out 25% of it for evaluation.
train_data, test_data = data_preparation_for_NER(
    '../data/votana_total_texts_annotated.csv',
    test_size=0.25,
)

In [7]:
# Train a NER model (no base model is passed) on the two domain labels
# and save it under ./ner_model/.
print('Training...')
train_ner(
    ['ORGAN', 'PLANT'],
    train_data,
    new_model_name='ethnopharmacology_ner',
    output_dir='./ner_model/',
    n_iter=30,
)

Training...
Losses in iteration: 1 {'ner': 28009.486630714855}
Losses in iteration: 2 {'ner': 10072.37744715274}
Losses in iteration: 3 {'ner': 6750.224350848334}
Losses in iteration: 4 {'ner': 5211.193739124763}
Losses in iteration: 5 {'ner': 3992.39955358218}
Losses in iteration: 6 {'ner': 3095.672196773336}
Losses in iteration: 7 {'ner': 2830.3548845383857}
Losses in iteration: 8 {'ner': 2343.4972213200845}
Losses in iteration: 9 {'ner': 2095.902628214111}
Losses in iteration: 10 {'ner': 1905.747405956969}
Losses in iteration: 11 {'ner': 1849.0109994482434}
Losses in iteration: 12 {'ner': 1549.1252833628105}
Losses in iteration: 13 {'ner': 1275.264053523643}
Losses in iteration: 14 {'ner': 1290.2932319212568}
Losses in iteration: 15 {'ner': 1109.5565862497112}
Losses in iteration: 16 {'ner': 1107.455720012933}
Losses in iteration: 17 {'ner': 1111.0178881850686}
Losses in iteration: 18 {'ner': 936.43680941124}
Losses in iteration: 19 {'ner': 953.3981488042859}
Losses in iteration: 20

In [6]:
# Score the saved model on the held-out test set without printing
# the detected entities, then show the metrics.
print('Testing...')
scores = evaluate_ner('./ner_model/', test_data, print_entities=False)
print(scores)

Testing...
{'uas': 0.0, 'las': 0.0, 'las_per_type': {'': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'ents_p': 98.15357188335668, 'ents_r': 95.30168150346192, 'ents_f': 96.70660560818017, 'ents_per_type': {'PLANT': {'p': 97.71313267193543, 'r': 94.13575247043578, 'f': 95.8910891089109}, 'ORGAN': {'p': 99.52780692549842, 'r': 99.06005221932115, 'f': 99.29337869667627}}, 'tags_acc': 0.0, 'token_acc': 100.0, 'textcat_score': 0.0, 'textcats_per_cat': {}}
