In [1]:
%run ../tools/parse_dawt.py

In [2]:
import os
import re
import csv
import json
import time
import random
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import class_weight
from transformers import FlaubertTokenizer, CamembertTokenizer

In [3]:
BATCH_SIZE = 8
SEQUENCE_LENGTH = 64
ROOT_FOLDER = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + "/"
MODEL_PATH = ROOT_FOLDER + "models/ner/"
DATASET_PATH = ROOT_FOLDER + "dataset/"
LOG_PATH = ROOT_FOLDER + "logs/ner/"

## Import dataset

In [4]:
# For reference : http://www.llf.cnrs.fr/Gens/Abeille/French-Treebank-fr.php
# One Hot encoder class label by alphabetical order
label_map = {
    'O': 0,
    'PERSON': 1,
    'ORGANIZATION': 2,
    'MISC': 3,
    'FILM': 4,
    'LOCATION': 5,
    'EVENT': 6,
    'BOOK': 7
}

In [5]:
sample, labels, label_count = parse_dawt(label_map, max_seq_length=SEQUENCE_LENGTH)
print(len(sample))
print(len(labels))
print(label_count)

85it [00:14,  6.04it/s]


2595
2595
{'O': 76391, 'PERSON': 799, 'ORGANIZATION': 1158, 'MISC': 8157, 'FILM': 55, 'LOCATION': 2448, 'EVENT': 473, 'BOOK': 20}


In [6]:
label_weights = {}
for idx, (key, val) in enumerate(label_count.items()):
    label_weights[idx] = max(label_count.values()) / val
print(label_weights)

class_weights = {}
for idx, (key, val) in enumerate(label_weights.items()):
    class_weights[idx] = val / max(label_weights.values()) * 10
print(class_weights)

{0: 1.0, 1: 95.60826032540676, 2: 65.96804835924007, 3: 9.365085202893221, 4: 1388.9272727272728, 5: 31.20547385620915, 6: 161.5031712473573, 7: 3819.55}
{0: 0.0026181094631566545, 1: 0.2503128911138924, 2: 0.17271157167530224, 3: 0.0245188181929631, 4: 3.6363636363636367, 5: 0.08169934640522875, 6: 0.42283298097251587, 7: 10.0}


In [7]:
enc = OneHotEncoder(sparse=False)

enc.fit([[i] for i in range(len(label_map))])
print(enc.categories_)

for idx, label in enumerate(labels):
    labels[idx] = enc.transform([[l] for l in label])

[array([0, 1, 2, 3, 4, 5, 6, 7])]


In [8]:
dataset = tf.data.Dataset.from_tensor_slices((sample, labels))

In [9]:
dataset.shuffle(1)
dataset.batch(32)

<BatchDataset shapes: ((None, 64), (None, 64, 8)), types: (tf.int32, tf.float64)>

## Import Flaubert model

In [10]:
from transformers.modeling_tf_xlm import TFXLMPreTrainedModel
from transformers.modeling_tf_flaubert import TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TFXLMMainLayer, TFFlaubertMainLayer
from transformers.configuration_flaubert import FlaubertConfig
from transformers.modeling_tf_utils import TFPreTrainedModel, get_initializer

class TFXLMForTokenClassification(TFXLMPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.num_labels = config.num_labels

        self.transformer = TFXLMMainLayer(config, name="transformer")
        self.dropout = tf.keras.layers.Dropout(config.dropout)
        self.classifier = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.init_std), name="classifier"
        )

    def call(self, inputs, **kwargs):
        transformer_outputs = self.transformer(inputs, **kwargs)
        sequence_output = transformer_outputs[0]

        sequence_output = self.dropout(sequence_output, training=kwargs.get("training", False))
        logits = self.classifier(sequence_output)

        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
        return outputs

class TFFlaubertForTokenClassification(TFXLMForTokenClassification):
    config_class = FlaubertConfig
    pretrained_model_archive_map = TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_MAP

    def __init__(self, config, *inputs, **kwargs):
        super(TFFlaubertForTokenClassification, self).__init__(config, *inputs, **kwargs)
        self.transformer = TFFlaubertMainLayer(config, name="transformer")

In [11]:
model = TFFlaubertForTokenClassification.from_pretrained(
    #ROOT_FOLDER + "models/last_model",
    "jplu/tf-flaubert-base-cased",
    num_labels=len(label_map),
    max_length=SEQUENCE_LENGTH,
    #force_download=True
)
tokenizer = FlaubertTokenizer.from_pretrained("jplu/tf-flaubert-base-cased")

## Test model

In [12]:
input_ids = tokenizer.encode("La NBA n'a rien a faire à New-York", return_tensors='tf')#, add_special_tokens=True, pad_to_max_length=, return_tensors='tf')
out = model(input_ids)

print(input_ids)
print(out)
#print(np.argmax(np.abs(out[0])))
#print(labels[np.argmax(np.abs(out[0]))])

tf.Tensor([[    1    60 15899    51    34   245    34    88    19 19593     1]], shape=(1, 11), dtype=int32)
(<tf.Tensor: shape=(1, 11, 8), dtype=float32, numpy=
array([[[ 0.4750543 , -1.0263537 , -0.50032675, -0.6697321 ,
         -1.7107885 ,  0.58109   , -0.10784608, -0.70143867],
        [ 0.6314838 , -1.2021198 , -0.5643149 , -0.69011015,
         -1.7713788 ,  0.5009858 , -0.15354154, -0.63084567],
        [ 0.6899013 , -1.2557735 , -0.6727637 , -0.83872217,
         -1.6549236 ,  0.20699358, -0.11832702, -0.49471083],
        [ 0.84097475, -0.8674935 , -1.1272043 , -0.33308977,
         -1.4902676 , -0.532801  , -1.1031072 , -0.5339469 ],
        [ 0.8760713 , -1.6276183 , -0.72812325,  0.9745205 ,
         -1.3111415 , -1.3101437 , -0.59587735, -0.7248032 ],
        [ 1.3977413 , -1.0193906 , -0.6986549 ,  0.75868237,
         -0.30403677, -1.3089384 , -0.6317096 , -0.13659461],
        [ 1.3309778 , -0.96223736, -0.60745156,  0.63840073,
         -0.27939892, -1.2566918 , -0.2

In [13]:
#for idx, (a, b) in enumerate(file_generator()):
#    if idx > 1:
#        break
#    print(b)

## Train model on new dataset

In [14]:
def recall_m(y_true, y_pred):
    TP = tf.math.count_nonzero(y_pred * y_true)
    FN = tf.math.count_nonzero((y_pred - 1) * y_true)
    return TP / (TP + FN)

def precision_m(y_true, y_pred):
    TP = tf.math.count_nonzero(y_pred * y_true)
    FP = tf.math.count_nonzero(y_pred * (y_true - 1))
    return TP / (TP + FP)

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2 * precision * recall / (precision + recall)

In [15]:
optimizer = tf.keras.optimizers.Adam()#learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.CategoricalAccuracy()
tensorboard = tf.keras.callbacks.TensorBoard(log_dir=LOG_PATH+"flaubert_cased_"+time.strftime("%d%m%y/%H:%M:%S"))
checkpoint = tf.keras.callbacks.ModelCheckpoint(MODEL_PATH+"checkpoints/")

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[f1_m, metric, recall_m, precision_m],
)
model.summary()

Model: "tf_flaubert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_26 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  6152      
_________________________________________________________________
transformer (TFFlaubertMainL multiple                  138233088 
Total params: 138,239,240
Trainable params: 138,239,240
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.fit(
    dataset,
    epochs=10,
    #max_queue_size=2,
    #class_weight=class_weights,
    #steps_per_epoch=200,
    #validation_data=file_generator(),
    #validation_split=0.1,
    #callbacks=[checkpoint] # tensorboard
)

Train for 2595 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10

KeyboardInterrupt: 

In [None]:
input_ids = tf.constant(
    [
        tokenizer.encode(
            "Hillary Clinton, présidente des Etats-unis !",
            add_special_tokens=True
            
        ),
    ], tf.int32
)
out = model(input_ids)

In [None]:
print(out[0].shape)
print(np.round(out[0]))
print(np.argmax(out[0], axis=2))
#print(np.argmax(np.round(out[0][0])))
#print(list(labels)[np.argmax(np.abs(out[0]))])

print(label_map)

In [None]:
for idx, (X_test, y_test) in enumerate(file_generator()):
    if idx > 2:
        break
    #Confution Matrix and Classification Report
    Y_pred = model(X_test)

    y_pred = [labels[int(np.argmax(y))] for y in Y_pred[0]]
    #for (a, b) in zip(X_test, y_pred):
    #    print(b, ":", tokenizer.decode(a, skip_special_tokens=True))
    y_test = enc.inverse_transform(y_test.numpy())

    #for x, y in zip(y_test, y_pred):
    #    print(x, "/", y)

    print('Confusion Matrix')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report')

    #print(classification_report(y_test, y_pred, target_names=labels))

In [None]:
#model.save_pretrained(MODEL_PATH+"last_model")