In [1]:
!pip install numpy pandas tqdm sklearn transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 11.7 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 464 kB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.7 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

In [2]:
!git clone https://github.com/DinhLuan14/CrossNER.git

Cloning into 'CrossNER'...
remote: Enumerating objects: 172, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 172 (delta 2), reused 2 (delta 2), pack-reused 164[K
Receiving objects: 100% (172/172), 2.30 MiB | 8.20 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [3]:
def prepare_dataset(PATH,convert2bio=False):
  sents = []
  chunks = open(PATH,'r').read().split('\n\n')
  for chunk in chunks:
    lines = chunk.split('\n')
    sent = []
    current_tag = None
    previous_tag = None
    for line in lines:
        if line != '':
            token = line.split('\t')
            previous_tag = current_tag 
            current_tag = token[1]
            if convert2bio:
                if previous_tag == current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                elif previous_tag != current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                else:
                    sent.append((token[0],token[1]))
            else:
                sent.append((token[0],token[1]))
    sents.append(sent)
  return sents

In [4]:
samples_train = prepare_dataset('/content/CrossNER/ner_data/literature/train.txt',convert2bio=True)
samples_valid = prepare_dataset('/content/CrossNER/ner_data/literature/dev.txt',convert2bio=True)
samples_test = prepare_dataset('/content/CrossNER/ner_data/literature/test.txt',convert2bio=True)

In [5]:
samples = samples_train + samples_test + samples_valid
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})

In [6]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import AutoConfig, TFXLMRobertaForTokenClassification

MODEL_NAME = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

In [7]:
def tokenize_sample(sample):
  seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
        ]
  return [(3, 'O')] + seq + [(4, 'O')]

def preprocess(samples,schema,max_len_sent=None):
    if max_len_sent != None:
      reduced_samples = []
      for sample in samples:
        if len(sample) < max_len_sent:
          reduced_samples.append(sample)
    else:
      reduced_samples = samples
    
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize_sample, reduced_samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocess(samples_train,schema,max_len_sent=100)
X_val, y_val = preprocess(samples_valid,schema,max_len_sent=100)
X_test, y_test = preprocess(samples_test,schema,max_len_sent=100)

101it [00:00, 245.34it/s]
401it [00:01, 256.20it/s]
417it [00:01, 237.41it/s]


In [8]:
NR_EPOCHS=20
BATCH_SIZE=2
model = TFXLMRobertaForTokenClassification.from_pretrained(MODEL_NAME,num_labels=len(schema))

Downloading:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForTokenClassification.

Some layers of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
  #model.load_weights(checkpoint_filepath)
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                      validation_data=(X_val,y_val), epochs=NR_EPOCHS, 
                      batch_size=2,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
def aggregate(sample,schema,predictions):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results
 
y_probs = model.predict(X_test)[0]
predictions = [aggregate(sample,schema,predictions) for sample, predictions in zip(samples_test, y_probs)]

In [22]:
y = []
y_hat = []
y_pre = []
for pred in predictions:
  for token in pred:
    y.append(token[1])
    y_hat.append(token[2])
    y_pre.append('O')
len(y_hat)

16157

In [23]:
from sklearn.metrics import f1_score
print('micro f1:',f1_score(y,y_hat,average='micro'))
print('macro f1:',f1_score(y,y_hat,average='macro'))

micro f1: 0.9019001052175528
macro f1: 0.6494307228233492


In [24]:
print('micro f1:',f1_score(y,y_pre,average='micro'))
print('macro f1:',f1_score(y,y_pre,average='macro'))

micro f1: 0.6774153617627034
macro f1: 0.03230757877647406


In [25]:
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

print(classification_report(y, y_hat,digits=4))

                 precision    recall  f1-score   support

        B-award     0.9071    0.9007    0.9039       141
         B-book     0.6440    0.9043    0.7522       418
      B-country     0.6148    0.8218    0.7034       101
        B-event     0.6842    0.2889    0.4062        45
B-literarygenre     0.5342    0.6443    0.5841       194
     B-location     0.5932    0.7071    0.6452        99
     B-magazine     0.8596    0.8596    0.8596        57
         B-misc     0.4791    0.4310    0.4537       239
 B-organisation     0.9077    0.5364    0.6743       110
       B-person     0.5676    0.4800    0.5201       175
         B-poem     0.5200    0.2167    0.3059       120
       B-writer     0.8407    0.8748    0.8574       567
        I-award     0.9067    0.9284    0.9174       335
         I-book     0.7160    0.9432    0.8140       898
      I-country     0.7037    0.6786    0.6909        56
        I-event     0.7097    0.7416    0.7253        89
I-literarygenre     0.6923    

In [26]:
print(classification_report(y, y_pre,digits=4))

                 precision    recall  f1-score   support

        B-award     0.0000    0.0000    0.0000       141
         B-book     0.0000    0.0000    0.0000       418
      B-country     0.0000    0.0000    0.0000       101
        B-event     0.0000    0.0000    0.0000        45
B-literarygenre     0.0000    0.0000    0.0000       194
     B-location     0.0000    0.0000    0.0000        99
     B-magazine     0.0000    0.0000    0.0000        57
         B-misc     0.0000    0.0000    0.0000       239
 B-organisation     0.0000    0.0000    0.0000       110
       B-person     0.0000    0.0000    0.0000       175
         B-poem     0.0000    0.0000    0.0000       120
       B-writer     0.0000    0.0000    0.0000       567
        I-award     0.0000    0.0000    0.0000       335
         I-book     0.0000    0.0000    0.0000       898
      I-country     0.0000    0.0000    0.0000        56
        I-event     0.0000    0.0000    0.0000        89
I-literarygenre     0.0000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
predictions[250]

[('Writing', 'O', 'O'),
 ('for', 'O', 'O'),
 ('The', 'B-magazine', 'B-book'),
 ('Spectator', 'I-magazine', 'I-book'),
 (',', 'O', 'O'),
 ('Graham', 'B-writer', 'B-writer'),
 ('Greene', 'I-writer', 'I-writer'),
 ('expressed', 'O', 'O'),
 ('similar', 'O', 'O'),
 ('views', 'O', 'O'),
 (',', 'O', 'O'),
 ('acerbically', 'O', 'O'),
 ('noting', 'O', 'O'),
 ('of', 'O', 'O'),
 ('the', 'O', 'O'),
 ('film', 'O', 'O'),
 ('that', 'O', 'O'),
 ('it', 'O', 'O'),
 ('goes', 'O', 'O'),
 ('on', 'O', 'O'),
 ('too', 'O', 'O'),
 ('long', 'O', 'O'),
 (',', 'O', 'O'),
 ('otherwise', 'O', 'O'),
 ('it', 'O', 'O'),
 ('might', 'O', 'O'),
 ('have', 'O', 'O'),
 ('been', 'O', 'O'),
 ('the', 'O', 'O'),
 ('funniest', 'O', 'O'),
 ('film', 'O', 'O'),
 ('since', 'O', 'O'),
 ('The', 'B-misc', 'B-book'),
 ('Crusades', 'I-misc', 'I-book'),
 ('.', 'O', 'O')]