In [1]:
!pip install numpy pandas tqdm sklearn transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 61.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 439 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [2]:
!git clone https://github.com/DinhLuan14/CrossNER.git

Cloning into 'CrossNER'...
remote: Enumerating objects: 172, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 172 (delta 2), reused 2 (delta 2), pack-reused 164[K
Receiving objects: 100% (172/172), 2.30 MiB | 16.00 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [3]:
def prepare_dataset(PATH,convert2bio=False):
  sents = []
  chunks = open(PATH,'r').read().split('\n\n')
  for chunk in chunks:
    lines = chunk.split('\n')
    sent = []
    current_tag = None
    previous_tag = None
    for line in lines:
        if line != '':
            token = line.split('\t')
            previous_tag = current_tag 
            current_tag = token[1]
            if convert2bio:
                if previous_tag == current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                elif previous_tag != current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                else:
                    sent.append((token[0],token[1]))
            else:
                sent.append((token[0],token[1]))
    sents.append(sent)
  return sents

In [4]:
samples_train = prepare_dataset('/content/CrossNER/ner_data/science/train.txt',convert2bio=True)
samples_valid = prepare_dataset('/content/CrossNER/ner_data/science/dev.txt',convert2bio=True)
samples_test = prepare_dataset('/content/CrossNER/ner_data/science/test.txt',convert2bio=True)

In [5]:
samples = samples_train + samples_test + samples_valid
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})

In [6]:
schema

['_',
 'B-academicjournal',
 'B-astronomicalobject',
 'B-award',
 'B-chemicalcompound',
 'B-chemicalelement',
 'B-country',
 'B-discipline',
 'B-enzyme',
 'B-event',
 'B-location',
 'B-misc',
 'B-organisation',
 'B-person',
 'B-protein',
 'B-scientist',
 'B-theory',
 'B-university',
 'I-academicjournal',
 'I-astronomicalobject',
 'I-award',
 'I-chemicalcompound',
 'I-chemicalelement',
 'I-country',
 'I-discipline',
 'I-enzyme',
 'I-event',
 'I-location',
 'I-misc',
 'I-organisation',
 'I-person',
 'I-protein',
 'I-scientist',
 'I-theory',
 'I-university',
 'O']

In [7]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import AutoConfig, TFXLMRobertaForTokenClassification

MODEL_NAME = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

In [8]:
def tokenize_sample(sample):
  seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
        ]
  return [(3, 'O')] + seq + [(4, 'O')]

def preprocess(samples,schema,max_len_sent=None):
    if max_len_sent != None:
      reduced_samples = []
      for sample in samples:
        if len(sample) < max_len_sent:
          reduced_samples.append(sample)
    else:
      reduced_samples = samples
    
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize_sample, reduced_samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocess(samples_train,schema,max_len_sent=100)
X_val, y_val = preprocess(samples_valid,schema,max_len_sent=100)
X_test, y_test = preprocess(samples_test,schema,max_len_sent=100)

201it [00:00, 240.76it/s]
451it [00:01, 315.97it/s]
544it [00:01, 340.87it/s]


In [9]:
NR_EPOCHS=20
BATCH_SIZE=2
model = TFXLMRobertaForTokenClassification.from_pretrained(MODEL_NAME,num_labels=len(schema))

Downloading:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForTokenClassification.

Some layers of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
  #model.load_weights(checkpoint_filepath)
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                      validation_data=(X_val,y_val), epochs=NR_EPOCHS, 
                      batch_size=BATCH_SIZE,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

KeyboardInterrupt: ignored

In [11]:
def aggregate(sample,schema,predictions):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results
 
y_probs = model.predict(X_test)[0]
predictions = [aggregate(sample,schema,predictions) for sample, predictions in zip(samples_test, y_probs)]

In [12]:
y = []
y_hat = []
y_pre = []
for pred in predictions:
  for token in pred:
    y.append(token[1])
    y_hat.append(token[2])
    y_pre.append('O')
len(y_hat)

19487

In [13]:
from sklearn.metrics import f1_score
print('micro f1:',f1_score(y,y_hat,average='micro'))
print('macro f1:',f1_score(y,y_hat,average='macro'))

micro f1: 0.8213167752860882
macro f1: 0.3426264306282121


In [14]:

print('model O: micro f1:',f1_score(y,y_pre,average='micro'))
print('model O: macro f1:',f1_score(y,y_pre,average='macro'))

model O: micro f1: 0.655462616103043
model O: macro f1: 0.022625099636878927


In [15]:
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

In [16]:
print(classification_report(y, y_hat,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

   B-academicjournal     0.4305    0.7222    0.5394       180
B-astronomicalobject     0.8433    0.3353    0.4798       337
             B-award     0.9211    0.2397    0.3804       146
  B-chemicalcompound     0.2687    0.6750    0.3843       160
   B-chemicalelement     0.0000    0.0000    0.0000        85
           B-country     0.0000    0.0000    0.0000        27
        B-discipline     0.6429    0.2466    0.3564        73
            B-enzyme     0.0000    0.0000    0.0000        80
             B-event     0.0000    0.0000    0.0000        31
          B-location     0.4565    0.1235    0.1944       170
              B-misc     0.2481    0.3827    0.3011       520
      B-organisation     0.5027    0.6813    0.5785       273
            B-person     0.8750    0.5783    0.6963       230
           B-protein     0.2857    0.0769    0.1212       156
         B-scientist     0.7730    0.9469    0.8511       471
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
print(classification_report(y, y_pre,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                      precision    recall  f1-score   support

   B-academicjournal     0.0000    0.0000    0.0000       180
B-astronomicalobject     0.0000    0.0000    0.0000       337
             B-award     0.0000    0.0000    0.0000       146
  B-chemicalcompound     0.0000    0.0000    0.0000       160
   B-chemicalelement     0.0000    0.0000    0.0000        85
           B-country     0.0000    0.0000    0.0000        27
        B-discipline     0.0000    0.0000    0.0000        73
            B-enzyme     0.0000    0.0000    0.0000        80
             B-event     0.0000    0.0000    0.0000        31
          B-location     0.0000    0.0000    0.0000       170
              B-misc     0.0000    0.0000    0.0000       520
      B-organisation     0.0000    0.0000    0.0000       273
            B-person     0.0000    0.0000    0.0000       230
           B-protein     0.0000    0.0000    0.0000       156
         B-scientist     0.0000    0.0000    0.0000       471
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
predictions[300]

[('In', 'O', 'O'),
 ('1348', 'O', 'O'),
 (',', 'O', 'O'),
 ('Giovanni', 'B-scientist', 'B-scientist'),
 ('Dondi', 'I-scientist', 'I-scientist'),
 ('built', 'O', 'O'),
 ('the', 'O', 'O'),
 ('first', 'O', 'O'),
 ('known', 'O', 'O'),
 ('clock', 'O', 'O'),
 ('driven', 'O', 'O'),
 ('mechanism', 'O', 'O'),
 ('which', 'O', 'O'),
 ('displays', 'O', 'O'),
 ('the', 'O', 'O'),
 ('ecliptical', 'O', 'O'),
 ('position', 'O', 'O'),
 ('of', 'O', 'O'),
 ('Moon', 'B-astronomicalobject', 'O'),
 (',', 'O', 'O'),
 ('Sun', 'B-astronomicalobject', 'O'),
 (',', 'O', 'O'),
 ('Mercury', 'B-astronomicalobject', 'O'),
 (',', 'O', 'O'),
 ('Venus', 'B-astronomicalobject', 'O'),
 (',', 'O', 'O'),
 ('Mars', 'B-astronomicalobject', 'O'),
 (',', 'O', 'O'),
 ('Jupiter', 'B-astronomicalobject', 'O'),
 ('and', 'O', 'O'),
 ('Saturn', 'B-astronomicalobject', 'O'),
 ('according', 'O', 'O'),
 ('to', 'O', 'O'),
 ('the', 'O', 'O'),
 ('complicated', 'O', 'O'),
 ('Ptolemaic', 'B-theory', 'B-misc'),
 ('planetary', 'I-theory', 'I-m