In [None]:
!pip install numpy pandas tqdm sklearn transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 64.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 449 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 76.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attem

In [None]:
!git clone https://github.com/DinhLuan14/CrossNER.git

Cloning into 'CrossNER'...
remote: Enumerating objects: 172, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 172 (delta 2), reused 2 (delta 2), pack-reused 164[K
Receiving objects: 100% (172/172), 2.30 MiB | 11.88 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [None]:
def prepare_dataset(PATH,convert2bio=False):
  sents = []
  chunks = open(PATH,'r').read().split('\n\n')
  for chunk in chunks:
    lines = chunk.split('\n')
    sent = []
    current_tag = None
    previous_tag = None
    for line in lines:
        if line != '':
            token = line.split('\t')
            previous_tag = current_tag 
            current_tag = token[1]
            if convert2bio:
                if previous_tag == current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                elif previous_tag != current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                else:
                    sent.append((token[0],token[1]))
            else:
                sent.append((token[0],token[1]))
    sents.append(sent)
  return sents

In [None]:
samples_train = prepare_dataset('/content/CrossNER/ner_data/ai/train.txt',convert2bio=True)
samples_valid = prepare_dataset('/content/CrossNER/ner_data/ai/dev.txt',convert2bio=True)
samples_test = prepare_dataset('/content/CrossNER/ner_data/ai/test.txt',convert2bio=True)

In [None]:
samples_train[1]

[('Advocates', 'O'),
 ('of', 'O'),
 ('procedural', 'O'),
 ('representations', 'O'),
 ('were', 'O'),
 ('mainly', 'O'),
 ('centered', 'O'),
 ('at', 'O'),
 ('MIT', 'B-university'),
 (',', 'O'),
 ('under', 'O'),
 ('the', 'O'),
 ('leadership', 'O'),
 ('of', 'O'),
 ('Marvin', 'B-researcher'),
 ('Minsky', 'I-researcher'),
 ('and', 'O'),
 ('Seymour', 'B-researcher'),
 ('Papert', 'I-researcher'),
 ('.', 'O')]

In [None]:
samples = samples_train + samples_test + samples_valid
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})

In [None]:
samples_test[2]

[('The', 'O'),
 ('task', 'O'),
 ('is', 'O'),
 ('usually', 'O'),
 ('to', 'O'),
 ('derive', 'O'),
 ('the', 'O'),
 ('maximum', 'B-algorithm'),
 ('likelihood', 'I-algorithm'),
 ('estimate', 'I-algorithm'),
 ('of', 'O'),
 ('the', 'O'),
 ('parameters', 'O'),
 ('of', 'O'),
 ('the', 'O'),
 ('HMM', 'B-algorithm'),
 ('given', 'O'),
 ('the', 'O'),
 ('of', 'O'),
 ('output', 'O'),
 ('sequences', 'O'),
 ('.', 'O')]

In [None]:
schema

['_',
 'B-algorithm',
 'B-conference',
 'B-country',
 'B-field',
 'B-location',
 'B-metrics',
 'B-misc',
 'B-organisation',
 'B-person',
 'B-product',
 'B-programlang',
 'B-researcher',
 'B-task',
 'B-university',
 'I-algorithm',
 'I-conference',
 'I-country',
 'I-field',
 'I-location',
 'I-metrics',
 'I-misc',
 'I-organisation',
 'I-person',
 'I-product',
 'I-programlang',
 'I-researcher',
 'I-task',
 'I-university',
 'O']

In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import AutoConfig, TFXLMRobertaForTokenClassification

MODEL_NAME = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

In [None]:
def tokenize_sample(sample):
  seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
        ]
  return [(3, 'O')] + seq + [(4, 'O')]

def preprocess(samples,schema,max_len_sent=None):
    if max_len_sent != None:
      reduced_samples = []
      for sample in samples:
        if len(sample) < max_len_sent:
          reduced_samples.append(sample)
    else:
      reduced_samples = samples
    
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize_sample, reduced_samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocess(samples_train,schema,max_len_sent=100)
X_val, y_val = preprocess(samples_valid,schema,max_len_sent=100)
X_test, y_test = preprocess(samples_test,schema,max_len_sent=100)

101it [00:00, 344.28it/s]
351it [00:00, 412.34it/s]
432it [00:00, 433.69it/s]


In [None]:
NR_EPOCHS=20
BATCH_SIZE=2
model = TFXLMRobertaForTokenClassification.from_pretrained(MODEL_NAME,num_labels=len(schema))

Downloading:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForTokenClassification.

Some layers of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
  #model.load_weights(checkpoint_filepath)
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                      validation_data=(X_val,y_val), epochs=NR_EPOCHS, 
                      batch_size=BATCH_SIZE,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
def aggregate(sample,schema,predictions):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results
 
y_probs = model.predict(X_test)[0]
predictions = [aggregate(sample,schema,predictions) for sample, predictions in zip(samples_test, y_probs)]

In [None]:
y = []
y_hat = []
y_pre = []
for pred in predictions:
  for token in pred:
    y.append(token[1])
    y_hat.append(token[2])
    y_pre.append('O')
len(y_hat)

12991

In [None]:
from sklearn.metrics import f1_score
print('model XLM-R: micro f1:',f1_score(y,y_hat,average='micro'))
print('model XLM-R: macro f1:',f1_score(y,y_hat,average='macro'))

model XLM-R: micro f1: 0.7925486875529213
model XLM-R: macro f1: 0.2677620156781576


In [None]:

print('model O: micro f1:',f1_score(y,y_pre,average='micro'))
print('model O: macro f1:',f1_score(y,y_pre,average='macro'))

model O: micro f1: 0.7100300207836193
model O: macro f1: 0.028635513438419212


In [None]:
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

print(classification_report(y, y_hat,digits=4))

                precision    recall  f1-score   support

   B-algorithm     0.2532    0.3390    0.2899       177
  B-conference     0.0000    0.0000    0.0000        93
     B-country     0.7059    0.2727    0.3934        44
       B-field     0.5974    0.4444    0.5097       207
    B-location     0.0000    0.0000    0.0000        39
     B-metrics     0.0000    0.0000    0.0000       191
        B-misc     0.1538    0.0110    0.0206       181
B-organisation     0.2667    0.5793    0.3652       145
      B-person     0.0000    0.0000    0.0000        67
     B-product     0.2885    0.5202    0.3712       198
 B-programlang     0.3333    0.0167    0.0317        60
  B-researcher     0.4712    0.6125    0.5326       160
        B-task     0.4055    0.5388    0.4627       219
  B-university     0.5714    0.1429    0.2286        28
   I-algorithm     0.2746    0.4646    0.3452       198
  I-conference     0.6903    0.3594    0.4727       217
     I-country     0.0000    0.0000    0.0000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(classification_report(y, y_pre,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                precision    recall  f1-score   support

   B-algorithm     0.0000    0.0000    0.0000       177
  B-conference     0.0000    0.0000    0.0000        93
     B-country     0.0000    0.0000    0.0000        44
       B-field     0.0000    0.0000    0.0000       207
    B-location     0.0000    0.0000    0.0000        39
     B-metrics     0.0000    0.0000    0.0000       191
        B-misc     0.0000    0.0000    0.0000       181
B-organisation     0.0000    0.0000    0.0000       145
      B-person     0.0000    0.0000    0.0000        67
     B-product     0.0000    0.0000    0.0000       198
 B-programlang     0.0000    0.0000    0.0000        60
  B-researcher     0.0000    0.0000    0.0000       160
        B-task     0.0000    0.0000    0.0000       219
  B-university     0.0000    0.0000    0.0000        28
   I-algorithm     0.0000    0.0000    0.0000       198
  I-conference     0.0000    0.0000    0.0000       217
     I-country     0.0000    0.0000    0.0000  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
predictions[300]

[('During', 'O', 'O'),
 ('2015', 'O', 'O'),
 (',', 'O', 'O'),
 ('many', 'O', 'O'),
 ('of', 'O', 'O'),
 ('SenseTime', 'B-organisation', 'B-organisation'),
 ("'s", 'O', 'O'),
 ('papers', 'O', 'O'),
 ('were', 'O', 'O'),
 ('accepted', 'O', 'O'),
 ('into', 'O', 'O'),
 ('the', 'O', 'O'),
 ('Conference', 'B-conference', 'I-conference'),
 ('on', 'I-conference', 'I-conference'),
 ('Computer', 'I-conference', 'I-conference'),
 ('Vision', 'I-conference', 'I-conference'),
 ('and', 'I-conference', 'I-conference'),
 ('Pattern', 'I-conference', 'I-conference'),
 ('Recognition', 'I-conference', 'I-organisation'),
 ('(', 'O', 'O'),
 ('CVPR', 'B-conference', 'B-organisation'),
 (')', 'O', 'O'),
 ('.', 'O', 'O')]