# Setup

In [1]:
%run ../flaubert_token_classification.py
%run ../tools/utils_ner.py

In [2]:
import datetime
import math
import os

import matplotlib.pyplot as plt
import itertools
import numpy as np
import tensorflow as tf
from seqeval import metrics
from transformers import (
    TF2_WEIGHTS_NAME,
    FlaubertConfig,
    FlaubertTokenizer,
    GradientAccumulator,
    create_optimizer
)

In [3]:
model_name="jplu/tf-flaubert-base-cased"
ROOT_FOLDER = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/"
MODEL_PATH = ROOT_FOLDER + "models/ner/"
DATASET_PATH = ROOT_FOLDER + "dataset/custom_dataset/"
LABEL_PATH = DATASET_PATH + "labels.txt"

labels = ['LOC', 'MISC', 'ORG', 'PER', 'O']

num_labels = 5
pad_token_label_id = -1

batch_size=64
max_seq_length = 64

In [4]:
strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0")

with strategy.scope():
    model = TFFlaubertForTokenClassification.from_pretrained(MODEL_PATH)
    tokenizer = FlaubertTokenizer.from_pretrained(model_name)

# Evaluation

In [5]:
def load_and_cache_examples(tokenizer, labels, pad_token_label_id):
    mode = "ner_test"
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        DATASET_PATH,
        "cached_{}_{}_{}.tf_record".format(
            mode, list(filter(None, model_name.split("/"))).pop(), str(max_seq_length)
        ),
    )
    if os.path.exists(cached_features_file):
        logging.info("Loading features from cached file %s", cached_features_file)
        dataset, size = load_cache(cached_features_file, max_seq_length)
    else:
        print("Creating features from dataset file at", DATASET_PATH)
        examples = read_examples_from_file(DATASET_PATH, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            max_seq_length,
            tokenizer,
            cls_token_at_end=False,
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False, # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=False, # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=0,
            pad_token_label_id=pad_token_label_id,
        )

        save_cache(features, cached_features_file)
        dataset, size = load_cache(cached_features_file, max_seq_length)

    if mode == "train":
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=8192, seed=seed)

    dataset = dataset.batch(batch_size, True)
    dataset = dataset.prefetch(buffer_size=batch_size)

    return dataset, size

In [6]:
eval_dataset, size = load_and_cache_examples(tokenizer, labels, -1)
eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)

Creating features from dataset file at /home/jupyter/bert_clustering/dataset/custom_dataset/


In [163]:
def evaluate(strategy, model, tokenizer, labels, pad_token_label_id, mode):
    preds = None

    ner_labels = ["B-LOC", "B-MISC", "B-ORG", "B-PER", "I-LOC", "I-MISC", "I-ORG", "I-PER", "O"]
    
    input_ids = []
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0
    idx = 0
    for idx, (eval_features, eval_labels) in enumerate(eval_dataset):
        if idx > 10:
            break
        inputs = {
            "attention_mask": eval_features["input_mask"],
            "token_type_ids": eval_features["segment_ids"],
            "training": False
        }

        with strategy.scope():
            ner_predictions = model(eval_features["input_ids"], **inputs)
            for i in range(0, batch_size):
                y_pred.append(ner_predictions[0][i])
                y_true.append(eval_labels[i])
                input_ids.append(eval_features["input_ids"])

    return y_true, y_pred, input_ids

In [164]:
y_true, y_pred, input_ids = evaluate(strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")

In [202]:
def post_token_classification(predictions, input_mask, tokens, labels, null_tokens):
    entities = []
    str_len = -1
    merge_prev = False

    #tokens = [""] + tokens
    for idx, prediction in enumerate(predictions):
        # Skip tokens added by tokenizer
        if idx >= len(tokens): # or input_mask[idx] != 1:
            continue

        # Merge word splitted
        if merge_prev:
            tokens[idx] = tokens[idx - 1] + tokens[idx]

        if not "</w>" in tokens[idx] and labels[prediction] not in null_tokens:
            merge_prev = True
            continue

        merge_prev = False
        token = tokens[idx].replace("</w>", "")
        start = str_len + 1
        end = len(token) + start
        str_len = end

        if labels[prediction] not in null_tokens:
            entities.append({
                "token": token,
                "label": labels[prediction],
                "start": start,
                "end": end
            })
    return entities

In [230]:
export = open("export.csv", "w")
export.write("sentence; true; pred\n")

colors = ["\x1b[31m", "\x1b[32m", "\x1b[33m", "\x1b[34m", "\x1b[30m"]

print(" ".join([color + " " + labels[idx] + " \x1b[0m"for idx, color in enumerate(colors)]))

for idx, (true, pred, ids) in enumerate(zip(y_true, y_pred, input_ids)):
    if idx > 100:
        break
    
    sentence = tokenizer.convert_ids_to_tokens(ids[idx % 64])
    sentence = [s for s in sentence[1:] if s != "<pad>" and s != "</s>"]

    new_idx = 0
    new_sentence = [""] * (len([s for s in sentence[1:] if "</w>" in s]) + 1)
    for word in sentence:
        new_sentence[new_idx] += word.replace("</w>", "")
        if "</w>" in word:
            new_idx += 1
    
    true_sentence = ""
    pred_sentence = ""
    sentence_pos = 0
    for sentence_pos, word in enumerate(new_sentence):
        if sentence_pos >= len(pred):
            break

        pred_sentence_idx = pred[sentence_pos] if pred[sentence_pos] < 4 else pred[sentence_pos] - 4
        true_sentence += colors[true[sentence_pos]] + word + " \x1b[0m"
        pred_sentence += colors[pred_sentence_idx] + word + " \x1b[0m"

    print("Actuel :", true_sentence)
    print("Predit :", pred_sentence, "\n")

    # export.write(sentence + "; " + true + "; " + pred + "\n")
export.close()

[31m LOC [0m [32m MISC [0m [33m ORG [0m [34m PER [0m [30m O [0m
Actuel : [30mFumigènes [0m[30m: [0m[30mSaint [0m[30m- [0m[30mEtienne [0m[30ms [0m[30men [0m[30msort [0m[30mavec [0m[30mun [0m[30mmatch [0m[30mferme [0m
Predit : [32mFumigènes [0m[30m: [0m[33mSaint [0m[33m- [0m[33mEtienne [0m[30ms [0m[30men [0m[30msort [0m[30mavec [0m[30mun [0m[30mmatch [0m[30mferme [0m 

Actuel : [30mLa [0m[30mchaîne [0m[33mHBO [0m[30mnégocie [0m[30ml [0m[30madaptation [0m[30men [0m[30mmini [0m[30m- [0m[30msérie [0m[30mdu [0m[30mfilm [0m[30m" [0m[32mParasite [0m[30m" [0m
Predit : [30mLa [0m[30mchaîne [0m[33mHBO [0m[30mnégocie [0m[30ml [0m[30madaptation [0m[30men [0m[30mmini [0m[30m- [0m[30msérie [0m[30mdu [0m[30mfilm [0m[30m" [0m[32mParasite [0m[30m" [0m 

Actuel : [30mLe [0m[30mgouverneur [0m[30mde [0m[31mCalifornie [0m[30mveut [0m[30mdébloquer [0m[30m1 [0m[30m, [0m[30m4 