In [1]:
%run ../flaubert_token_classification.py
%run ../tools/utils_ner.py

In [2]:
import datetime
import math
import os

import numpy as np
import tensorflow as tf
from seqeval import metrics
from fastprogress import master_bar, progress_bar
from transformers import (
    TF2_WEIGHTS_NAME,
    FlaubertConfig,
    FlaubertTokenizer,
    GradientAccumulator,
    create_optimizer
)

# Parameters

In [3]:
ROOT_FOLDER = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/"

## Model parameters

In [4]:
MODEL_PATH = ROOT_FOLDER + "models/dev/pos/"
LOG_PATH = ROOT_FOLDER + "logs/pos/"

model_name="jplu/tf-flaubert-base-cased"

max_seq_length=64
batch_size=32
epochs=3
learning_rate=5e-5
max_grad_norm=1.0
warmup_steps=0
pad_token_label_id = -1

## Dataset parameters

In [5]:
DATASET_PATH = ROOT_FOLDER + "dataset/pos_tag/UD_French-GSD/"
LABEL_PATH = DATASET_PATH + "labels.txt"

labels, num_labels = get_labels(LABEL_PATH)

## Random parameters

In [6]:
save_steps=1500
logging_steps=100
seed=42
no_cuda=False

In [7]:
def train(strategy, train_dataset, tokenizer, model, num_train_examples, labels, pad_token_label_id):
    num_train_steps = math.ceil(num_train_examples / batch_size) // epochs

    writer = tf.summary.create_file_writer("/tmp/mylogs")

    with strategy.scope():
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(learning_rate, num_train_steps, warmup_steps)
        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)
        gradient_accumulator = GradientAccumulator()

    model.summary()

    @tf.function
    def apply_gradients():
        grads_and_vars = []

        for gradient, variable in zip(gradient_accumulator.gradients, model.trainable_variables):
            if gradient is not None:
                grads_and_vars.append((gradient, variable))
            else:
                grads_and_vars.append((gradient, variable))

        optimizer.apply_gradients(grads_and_vars, max_grad_norm)
        gradient_accumulator.reset()

    @tf.function
    def train_step(train_features, train_labels):
        def step_fn(train_features, train_labels):
            inputs = {
                "attention_mask": train_features["input_mask"],
                "token_type_ids": train_features["segment_ids"],
                "training": True
            }

            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                active_loss = tf.reshape(train_labels, (-1,)) != pad_token_label_id
                active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss)
                active_labels = tf.boolean_mask(tf.reshape(train_labels, (-1,)), active_loss)
                cross_entropy = loss_fct(active_labels, active_logits)
                loss = tf.reduce_sum(cross_entropy) * (1.0 / batch_size)
                grads = tape.gradient(loss, model.trainable_variables)

                gradient_accumulator(grads)

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.MEAN, per_example_losses, axis=0)

        return mean_loss

    current_time = datetime.datetime.now()
    train_iterator = master_bar(range(epochs))
    global_step = 0
    logging_loss = 0.0

    for epoch in train_iterator:
        epoch_iterator = progress_bar(
            train_dataset, total=num_train_steps, parent=train_iterator, display=False
        )

        step = 1

        with strategy.scope():
            for train_features, train_labels in epoch_iterator:
                loss = train_step(train_features, train_labels)
                strategy.experimental_run_v2(apply_gradients)
                loss_metric(loss)
                global_step += 1

                # Log metrics
                if logging_steps > 0 and global_step % logging_steps == 0:
                    lr = optimizer.learning_rate
                    learning_rate_step = lr(step)

                    with writer.as_default():
                        tf.summary.scalar("lr", learning_rate_step, global_step)
                        tf.summary.scalar("loss", (loss_metric.result() - logging_loss) / logging_steps, global_step)

                    logging_loss = loss_metric.result()

                with writer.as_default():
                    tf.summary.scalar("loss", loss_metric.result(), step=step)

                # Save model checkpoint
                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_output_dir = os.path.join(MODEL_PATH, "checkpoint-{}".format(global_step))

                    if not os.path.exists(checkpoint_output_dir):
                        os.makedirs(checkpoint_output_dir)

                    model.save_pretrained(checkpoint_output_dir)
                    print("Saving model checkpoint to", checkpoint_output_dir)

                train_iterator.child.comment = f"loss : {loss_metric.result()}"
                step += 1

        train_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")
        loss_metric.reset_states()

    print("  Training took time = {}".format(datetime.datetime.now() - current_time))

In [8]:
def evaluate(strategy, model, tokenizer, labels, pad_token_label_id, mode):
    eval_dataset, size = load_and_cache_examples(tokenizer, labels, pad_token_label_id, mode=mode)
    eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)
    preds = None
    
    num_eval_steps = math.ceil(size / batch_size)
    loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
    loss = 0.0

    for eval_features, eval_labels in eval_dataset:
        inputs = {
            "attention_mask": eval_features["input_mask"],
            "token_type_ids": eval_features["segment_ids"],
            "training": False
        }

        with strategy.scope():
            logits = model(eval_features["input_ids"], **inputs)[0]
            active_loss = tf.reshape(eval_labels, (-1,)) != pad_token_label_id
            active_logits = tf.boolean_mask(tf.reshape(logits, (-1, len(labels))), active_loss)
            active_labels = tf.boolean_mask(tf.reshape(eval_labels, (-1,)), active_loss)
            cross_entropy = loss_fct(active_labels, active_logits)
            loss += tf.reduce_sum(cross_entropy) * (1.0 / batch_size)

        if preds is None:
            preds = logits.numpy()
            label_ids = eval_labels.numpy()
        else:
            preds = np.append(preds, logits.numpy(), axis=0)
            label_ids = np.append(label_ids, eval_labels.numpy(), axis=0)

    preds = np.argmax(preds, axis=2)
    y_pred = [[] for _ in range(label_ids.shape[0])]
    y_true = [[] for _ in range(label_ids.shape[0])]
    loss = loss / num_eval_steps

    for i in range(label_ids.shape[0]):
        for j in range(label_ids.shape[1]):
            if label_ids[i, j] != pad_token_label_id:
                y_pred[i].append(labels[preds[i, j] - 1])
                y_true[i].append(labels[label_ids[i, j] - 1])

    return y_true, y_pred, loss.numpy()

In [9]:
def load_and_cache_examples(tokenizer, labels, pad_token_label_id, mode):
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        DATASET_PATH,
        "cached_{}_{}_{}.tf_record".format(
            mode, list(filter(None, model_name.split("/"))).pop(), str(max_seq_length)
        ),
    )

    print("Creating features from dataset file at", DATASET_PATH)
    examples = read_examples_from_file(DATASET_PATH, mode)
    features = convert_examples_to_features(
        examples,
        labels,
        max_seq_length,
        tokenizer,
        cls_token_at_end=False,
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=False, # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=False, # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        pad_token_label_id=pad_token_label_id,
    )

    print("#", len(features) ," lines in dataset ")
    save_cache(features, cached_features_file)
    dataset, size = load_cache(cached_features_file, max_seq_length)

    if mode == "train":
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=8192, seed=seed)

    dataset = dataset.batch(batch_size, True)
    dataset = dataset.prefetch(buffer_size=batch_size)

    return dataset, size

# Import model

In [10]:
strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0" if no_cuda else "/gpu:0")

config = FlaubertConfig.from_pretrained(model_name, num_labels=num_labels)

with strategy.scope():
    model = TFFlaubertForTokenClassification.from_pretrained(model_name, config=config)
    tokenizer = FlaubertTokenizer.from_pretrained(model_name)

# Import dataset

In [11]:
train_dataset, num_train_examples = load_and_cache_examples(tokenizer, labels, pad_token_label_id, mode="train")
train_dataset = strategy.experimental_distribute_dataset(train_dataset)

Creating features from dataset file at /home/jupyter/bert_clustering/dataset/pos_tag/UD_French-GSD/
# 13361  lines in dataset 


# Run training

In [12]:
train(strategy, train_dataset, tokenizer, model, num_train_examples, labels, pad_token_label_id)

if not os.path.exists(MODEL_PATH):
    os.makedirs(MODEL_PATH)

print("Saving model to", MODEL_PATH)

model.save_pretrained(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

del train_dataset, num_train_examples

Model: "tf_flaubert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_26 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  14611     
_________________________________________________________________
transformer (TFFlaubertMainL multiple                  138233088 
Total params: 138,247,699
Trainable params: 138,247,699
Non-trainable params: 0
_________________________________________________________________


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


  Training took time = 0:03:39.105156
Saving model to /home/jupyter/bert_clustering/models/dev/pos/


# Run evaluation

In [13]:
y_true, y_pred, eval_loss = evaluate(strategy, model, tokenizer, labels, pad_token_label_id, mode="dev")
report = metrics.classification_report(y_true, y_pred, digits=4)

output_eval_file = os.path.join(MODEL_PATH, "eval_results.txt")

with tf.io.gfile.GFile(output_eval_file, "w") as writer:
    # Log loss
    print("final_loss = " + str(eval_loss))
    writer.write("final_loss = " + str(eval_loss))
    writer.write("\n")

    # Log metrics
    print("final_report")
    print("\n" + report)
    writer.write("final_report" + "\n")
    writer.write(report)
    writer.write("\n")

Creating features from dataset file at /home/jupyter/bert_clustering/dataset/pos_tag/UD_French-GSD/
# 1375  lines in dataset 
final_loss = 41.62265
final_report

           precision    recall  f1-score   support

    CCONJ     0.9957    0.9951    0.9954      5305
      ADJ     0.9934    0.9940    0.9937      5486
    PROPN     0.9959    0.9986    0.9972      2174
     INTJ     0.9700    0.9620    0.9660      6376
      SYM     0.9662    0.9745    0.9704      2670
      ADP     0.9290    0.9389    0.9339      1212
      AUX     0.9788    0.9835    0.9812       847
     PRON     0.8962    0.8998    0.8980      1957
     PART     0.9823    0.9554    0.9687      1278
      ADV     0.9741    0.9816    0.9778      1033
        X     0.9951    0.9951    0.9951      1013
     NOUN     0.9400    0.9503    0.9451       906
    SCONJ     0.8718    0.5862    0.7010        58
    PUNCT     0.9447    0.9328    0.9387       238
      DET     0.0000    0.0000    0.0000         5
     VERB     0.3000 

In [14]:
# tokenizer = FlaubertTokenizer.from_pretrained(MODEL_PATH)
# model = TFFlaubertForTokenClassification.from_pretrained(MODEL_PATH)

# # predict_dataset, _ = load_and_cache_examples(tokenizer, labels, pad_token_label_id, mode="test")

# y_true, y_pred, pred_loss = evaluate(strategy, model, tokenizer, labels, pad_token_label_id, mode="test")
# output_test_results_file = os.path.join(MODEL_PATH, "test_results.txt")
# output_test_predictions_file = os.path.join(MODEL_PATH, "test_predictions.txt")
# report = metrics.classification_report(y_true, y_pred, digits=4)

# with tf.io.gfile.GFile(output_test_results_file, "w") as writer:
#     report = metrics.classification_report(y_true, y_pred, digits=4)

#     logging.info("\n" + report)

#     writer.write(report)
#     writer.write("\n\nloss = " + str(pred_loss))

# with tf.io.gfile.GFile(output_test_predictions_file, "w") as writer:
#     with tf.io.gfile.GFile(os.path.join(DATASET_PATH, "test.txt"), "r") as f:
#         example_id = 0

#         for line in f:
#             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
#                 writer.write(line)

#                 if not y_pred[example_id]:
#                     example_id += 1
#             elif y_pred[example_id]:
#                 output_line = line.split()[0] + " " + y_pred[example_id].pop(0) + "\n"
#                 writer.write(output_line)
#             else:
#                 logging.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
