In [1]:
!pip install numpy pandas tqdm sklearn transformers
!pip install sentencepiece



In [2]:
!git clone https://github.com/DinhLuan14/CrossNER.git

fatal: destination path 'CrossNER' already exists and is not an empty directory.


In [3]:
def prepare_dataset(PATH,convert2bio=False):
  sents = []
  chunks = open(PATH,'r').read().split('\n\n')
  for chunk in chunks:
    lines = chunk.split('\n')
    sent = []
    current_tag = None
    previous_tag = None
    for line in lines:
        if line != '':
            token = line.split('\t')
            previous_tag = current_tag 
            current_tag = token[1]
            if convert2bio:
                if previous_tag == current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                elif previous_tag != current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                else:
                    sent.append((token[0],token[1]))
            else:
                sent.append((token[0],token[1]))
    sents.append(sent)
  return sents

In [4]:
samples_train = prepare_dataset('/content/CrossNER/ner_data/politics/train.txt',convert2bio=True)
samples_valid = prepare_dataset('/content/CrossNER/ner_data/politics/dev.txt',convert2bio=True)
samples_test = prepare_dataset('/content/CrossNER/ner_data/politics/test.txt',convert2bio=True)

In [5]:
samples_train[1]

[('Parties', 'O'),
 ('with', 'O'),
 ('mainly', 'O'),
 ('Eurosceptic', 'O'),
 ('views', 'O'),
 ('are', 'O'),
 ('Serbian', 'B-politicalparty'),
 ('Radical', 'I-politicalparty'),
 ('Party', 'I-politicalparty'),
 (',', 'O'),
 ('Democratic', 'B-politicalparty'),
 ('Party', 'I-politicalparty'),
 ('of', 'I-politicalparty'),
 ('Serbia', 'I-politicalparty'),
 (',', 'O'),
 ('Dveri', 'B-politicalparty'),
 (',', 'O'),
 ('DJB', 'B-politicalparty'),
 ('and', 'O'),
 ('the', 'O'),
 ('Serbian', 'B-politicalparty'),
 ('People', 'I-politicalparty'),
 ("'s", 'I-politicalparty'),
 ('Party', 'I-politicalparty'),
 ('of', 'O'),
 ('Nenad', 'B-politician'),
 ('Popović', 'I-politician'),
 ('.', 'O')]

In [6]:
samples = samples_train + samples_test + samples_valid
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})

In [7]:
samples_test[2]

[('Tamsin', 'B-person'),
 ('Greig', 'I-person'),
 ('narrated', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('the', 'O'),
 ('cast', 'O'),
 ('included', 'O'),
 ('Nicky', 'B-person'),
 ('Henson', 'I-person'),
 ('as', 'O'),
 ('Napoleon', 'O'),
 (',', 'O'),
 ('Toby', 'B-person'),
 ('Jones', 'I-person'),
 ('as', 'O'),
 ('the', 'O'),
 ('propagandist', 'O'),
 ('Squealer', 'O'),
 (',', 'O'),
 ('and', 'O'),
 ('Ralph', 'B-person'),
 ('Ineson', 'I-person'),
 ('as', 'O'),
 ('Boxer', 'O'),
 ('.', 'O')]

In [8]:
schema

['_',
 'B-country',
 'B-election',
 'B-event',
 'B-location',
 'B-misc',
 'B-organisation',
 'B-person',
 'B-politicalparty',
 'B-politician',
 'I-country',
 'I-election',
 'I-event',
 'I-location',
 'I-misc',
 'I-organisation',
 'I-person',
 'I-politicalparty',
 'I-politician',
 'O']

In [9]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import AutoConfig, TFXLMRobertaForTokenClassification

MODEL_NAME = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [10]:
def tokenize_sample(sample):
  seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
        ]
  return [(3, 'O')] + seq + [(4, 'O')]

def preprocess(samples,schema,max_len_sent=None):
    if max_len_sent != None:
      reduced_samples = []
      for sample in samples:
        if len(sample) < max_len_sent:
          reduced_samples.append(sample)
    else:
      reduced_samples = samples
    
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize_sample, reduced_samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocess(samples_train,schema,max_len_sent=100)
X_val, y_val = preprocess(samples_valid,schema,max_len_sent=100)
X_test, y_test = preprocess(samples_test,schema,max_len_sent=100)

201it [00:01, 112.47it/s]
542it [00:03, 180.00it/s]
652it [00:06, 93.56it/s]


In [11]:
NR_EPOCHS=20
BATCH_SIZE=2
model = TFXLMRobertaForTokenClassification.from_pretrained(MODEL_NAME,num_labels=len(schema))

Downloading:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForTokenClassification.

Some layers of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
  #model.load_weights(checkpoint_filepath)
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                      validation_data=(X_val,y_val), epochs=NR_EPOCHS, 
                      batch_size=BATCH_SIZE,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
def aggregate(sample,schema,predictions):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results
 
y_probs = model.predict(X_test)[0]
predictions = [aggregate(sample,schema,predictions) for sample, predictions in zip(samples_test, y_probs)]

In [14]:
y = []
y_hat = []
y_pre = []
for pred in predictions:
  for token in pred:
    y.append(token[1])
    y_hat.append(token[2])
    y_pre.append('O')
len(y_hat)

27585

In [15]:
from sklearn.metrics import f1_score
print('model XLM-R: micro f1:',f1_score(y,y_hat,average='micro'))
print('model XLM-R: macro f1:',f1_score(y,y_hat,average='macro'))

model XLM-R: micro f1: 0.9021569693674099
model XLM-R: macro f1: 0.6100852484820936


In [16]:

print('model O: micro f1:',f1_score(y,y_pre,average='micro'))
print('model O: macro f1:',f1_score(y,y_pre,average='macro'))

model O: micro f1: 0.6275512053652347
model O: macro f1: 0.040587369171324605


In [17]:
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

print(classification_report(y, y_hat,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

       B-country     0.5881    0.7584    0.6625       418
      B-election     0.9468    0.9839    0.9650       434
         B-event     0.8036    0.4615    0.5863       195
      B-location     0.8543    0.8614    0.8579       599
          B-misc     0.4140    0.2519    0.3133       258
  B-organisation     0.6807    0.7563    0.7165       513
        B-person     0.3750    0.0169    0.0324       354
B-politicalparty     0.8362    0.8573    0.8466       953
    B-politician     0.5463    0.9237    0.6866       485
       I-country     1.0000    0.0731    0.1362       219
      I-election     0.9531    0.9787    0.9657      1640
         I-event     0.7917    0.6142    0.6917       464
      I-location     0.6048    0.6720    0.6366       189
          I-misc     0.1748    0.1385    0.1545       130
  I-organisation     0.7159    0.8101    0.7601      1011
        I-person     0.0000    0.0000    0.0000       338
I-politicalpa

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
print(classification_report(y, y_pre,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

       B-country     0.0000    0.0000    0.0000       418
      B-election     0.0000    0.0000    0.0000       434
         B-event     0.0000    0.0000    0.0000       195
      B-location     0.0000    0.0000    0.0000       599
          B-misc     0.0000    0.0000    0.0000       258
  B-organisation     0.0000    0.0000    0.0000       513
        B-person     0.0000    0.0000    0.0000       354
B-politicalparty     0.0000    0.0000    0.0000       953
    B-politician     0.0000    0.0000    0.0000       485
       I-country     0.0000    0.0000    0.0000       219
      I-election     0.0000    0.0000    0.0000      1640
         I-event     0.0000    0.0000    0.0000       464
      I-location     0.0000    0.0000    0.0000       189
          I-misc     0.0000    0.0000    0.0000       130
  I-organisation     0.0000    0.0000    0.0000      1011
        I-person     0.0000    0.0000    0.0000       338
I-politicalpa

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
predictions[300]

[('Carstairs', 'B-politician', 'B-politician'),
 ('led', 'O', 'O'),
 ('the', 'O', 'O'),
 ('Liberal', 'B-politicalparty', 'B-politicalparty'),
 ('Party', 'I-politicalparty', 'I-politicalparty'),
 ('to', 'O', 'O'),
 ('a', 'O', 'O'),
 ('dramatic', 'O', 'O'),
 ('resurgence', 'O', 'O'),
 ('in', 'O', 'O'),
 ('the', 'O', 'O'),
 ('1988', 'B-election', 'B-election'),
 ('Manitoba', 'I-election', 'I-election'),
 ('general', 'I-election', 'I-election'),
 ('election', 'I-election', 'I-election'),
 (',', 'O', 'O'),
 ('which', 'O', 'O'),
 ('saw', 'O', 'O'),
 ('the', 'O', 'O'),
 ('election', 'O', 'O'),
 ('of', 'O', 'O'),
 ('a', 'O', 'O'),
 ('Progressive', 'B-politicalparty', 'B-politicalparty'),
 ('Conservative', 'I-politicalparty', 'I-politicalparty'),
 ('Party', 'I-politicalparty', 'I-politicalparty'),
 ('of', 'I-politicalparty', 'I-politicalparty'),
 ('Manitoba', 'I-politicalparty', 'I-politicalparty'),
 ('minority', 'O', 'O'),
 ('government', 'O', 'O'),
 ('under', 'O', 'O'),
 ('Gary', 'B-politicia