In [1]:
!pip install numpy pandas tqdm sklearn transformers
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 4.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 501 kB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 48.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 39.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [2]:
!git clone https://github.com/DinhLuan14/CrossNER.git

Cloning into 'CrossNER'...
remote: Enumerating objects: 172, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 172 (delta 2), reused 2 (delta 2), pack-reused 164[K
Receiving objects: 100% (172/172), 2.30 MiB | 8.14 MiB/s, done.
Resolving deltas: 100% (77/77), done.


In [3]:
def prepare_dataset(PATH,convert2bio=False):
  sents = []
  chunks = open(PATH,'r').read().split('\n\n')
  for chunk in chunks:
    lines = chunk.split('\n')
    sent = []
    current_tag = None
    previous_tag = None
    for line in lines:
        if line != '':
            token = line.split('\t')
            previous_tag = current_tag 
            current_tag = token[1]
            if convert2bio:
                if previous_tag == current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                elif previous_tag != current_tag and current_tag != 'O':
                    sent.append((token[0],token[1]))
                else:
                    sent.append((token[0],token[1]))
            else:
                sent.append((token[0],token[1]))
    sents.append(sent)
  return sents

In [4]:
samples_train = prepare_dataset('/content/CrossNER/ner_data/music/train.txt',convert2bio=True)
samples_valid = prepare_dataset('/content/CrossNER/ner_data/music/dev.txt',convert2bio=True)
samples_test = prepare_dataset('/content/CrossNER/ner_data/music/test.txt',convert2bio=True)

In [5]:
samples_train[1]

[('In', 'O'),
 ('addition', 'O'),
 ('to', 'O'),
 ('relentless', 'O'),
 ('touring', 'O'),
 ('in', 'O'),
 ('the', 'O'),
 ('U.S.', 'B-country'),
 ('and', 'O'),
 ('Canada', 'B-country'),
 (',', 'O'),
 ('PUSA', 'B-band'),
 ('made', 'O'),
 ('multiple', 'O'),
 ('tours', 'O'),
 ('of', 'O'),
 ('Europe', 'B-location'),
 (',', 'O'),
 ('Australia', 'B-country'),
 (',', 'O'),
 ('New', 'B-country'),
 ('Zealand', 'I-country'),
 ('and', 'O'),
 ('Japan', 'B-country'),
 ('.', 'O')]

In [6]:
samples = samples_train + samples_test + samples_valid
schema = ['_'] + sorted({tag for sentence in samples 
                             for _, tag in sentence})

In [7]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import json
import tensorflow as tf
from transformers import AutoTokenizer
from transformers import AutoConfig, TFXLMRobertaForTokenClassification

MODEL_NAME = 'jplu/tf-xlm-roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

In [8]:
def tokenize_sample(sample):
  seq = [
            (subtoken, tag)
            for token, tag in sample
            for subtoken in tokenizer(token)['input_ids'][1:-1]
        ]
  return [(3, 'O')] + seq + [(4, 'O')]

def preprocess(samples,schema,max_len_sent=None):
    if max_len_sent != None:
      reduced_samples = []
      for sample in samples:
        if len(sample) < max_len_sent:
          reduced_samples.append(sample)
    else:
      reduced_samples = samples
    
    tag_index = {tag: i for i, tag in enumerate(schema)}
    tokenized_samples = list(tqdm(map(tokenize_sample, reduced_samples)))
    max_len = max(map(len, tokenized_samples))
    X = np.zeros((len(samples), max_len), dtype=np.int32)
    y = np.zeros((len(samples), max_len), dtype=np.int32)
    for i, sentence in enumerate(tokenized_samples):
        for j, (subtoken_id, tag) in enumerate(sentence):
            X[i, j] = subtoken_id
            y[i,j] = tag_index[tag]
    return X, y

X_train, y_train = preprocess(samples_train,schema,max_len_sent=100)
X_val, y_val = preprocess(samples_valid,schema,max_len_sent=100)
X_test, y_test = preprocess(samples_test,schema,max_len_sent=100)

101it [00:00, 239.61it/s]
381it [00:01, 215.54it/s]
466it [00:02, 214.98it/s]


In [9]:
NR_EPOCHS=20
BATCH_SIZE=2
model = TFXLMRobertaForTokenClassification.from_pretrained(MODEL_NAME,num_labels=len(schema))

Downloading:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFXLMRobertaForTokenClassification.

Some layers of TFXLMRobertaForTokenClassification were not initialized from the model checkpoint at jplu/tf-xlm-roberta-large and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics='accuracy')
  #model.load_weights(checkpoint_filepath)
history = model.fit(tf.constant(X_train), tf.constant(y_train),
                      validation_data=(X_val,y_val), epochs=NR_EPOCHS, 
                      batch_size=BATCH_SIZE,verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [11]:
def aggregate(sample,schema,predictions):
    results = []
    i = 1
    for token, y_true in sample:
        nr_subtoken = len(tokenizer(token)['input_ids']) - 2
        pred = predictions[i:i+nr_subtoken]
        i += nr_subtoken
        y_pred = schema[np.argmax(np.sum(pred, axis=0))]
        results.append((token, y_true, y_pred))
    return results
 
y_probs = model.predict(X_test)[0]
predictions = [aggregate(sample,schema,predictions) for sample, predictions in zip(samples_test, y_probs)]

In [12]:
y = []
y_hat = []
y_pre = []
for pred in predictions:
  for token in pred:
    y.append(token[1])
    y_hat.append(token[2])
    y_pre.append('O')
len(y_hat)

19605

In [13]:
from sklearn.metrics import f1_score
print('micro f1:',f1_score(y,y_hat,average='micro'))
print('macro f1:',f1_score(y,y_hat,average='macro'))

micro f1: 0.8820709002805407
macro f1: 0.4968214696214293


In [14]:
print('micro f1:',f1_score(y,y_pre,average='micro'))
print('macro f1:',f1_score(y,y_pre,average='macro'))

micro f1: 0.6013771996939556
macro f1: 0.02781759303498434


In [15]:
import json
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

print(classification_report(y, y_hat,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

            B-album     0.5072    0.8524    0.6360       332
            B-award     0.8393    0.9038    0.8704       260
             B-band     0.7062    0.8117    0.7553       462
          B-country     0.6000    0.8438    0.7013       160
            B-event     0.0000    0.0000    0.0000        69
         B-location     0.7906    0.7376    0.7632       343
             B-misc     0.0000    0.0000    0.0000       171
    B-musicalartist     0.7811    0.9010    0.8368       515
B-musicalinstrument     0.0000    0.0000    0.0000        22
       B-musicgenre     0.7540    0.8686    0.8073       487
     B-organisation     0.7194    0.6779    0.6980       208
           B-person     0.0000    0.0000    0.0000        80
             B-song     0.8462    0.0485    0.0917       227
            I-album     0.5865    0.8385    0.6902       675
            I-award     0.9148    0.9367    0.9256       837
             I-band    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
print(classification_report(y, y_pre,digits=4))

  _warn_prf(average, modifier, msg_start, len(result))


                     precision    recall  f1-score   support

            B-album     0.0000    0.0000    0.0000       332
            B-award     0.0000    0.0000    0.0000       260
             B-band     0.0000    0.0000    0.0000       462
          B-country     0.0000    0.0000    0.0000       160
            B-event     0.0000    0.0000    0.0000        69
         B-location     0.0000    0.0000    0.0000       343
             B-misc     0.0000    0.0000    0.0000       171
    B-musicalartist     0.0000    0.0000    0.0000       515
B-musicalinstrument     0.0000    0.0000    0.0000        22
       B-musicgenre     0.0000    0.0000    0.0000       487
     B-organisation     0.0000    0.0000    0.0000       208
           B-person     0.0000    0.0000    0.0000        80
             B-song     0.0000    0.0000    0.0000       227
            I-album     0.0000    0.0000    0.0000       675
            I-award     0.0000    0.0000    0.0000       837
             I-band    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
predictions[300]

[('In', 'O', 'O'),
 ('the', 'O', 'O'),
 ('Southwestern', 'B-location', 'B-country'),
 ('United', 'I-location', 'B-country'),
 ('States', 'I-location', 'B-country'),
 (',', 'O', 'O'),
 ('it', 'O', 'O'),
 ('was', 'O', 'O'),
 ('the', 'O', 'O'),
 ('Rocky', 'B-location', 'B-musicgenre'),
 ('Mountains', 'I-location', 'I-location'),
 (',', 'O', 'O'),
 ('American', 'B-location', 'B-musicgenre'),
 ('frontier', 'I-location', 'I-location'),
 (',', 'O', 'O'),
 ('and', 'O', 'O'),
 ('Rio', 'B-location', 'B-musicgenre'),
 ('Grande', 'I-location', 'I-musicgenre'),
 ('that', 'O', 'O'),
 ('acted', 'O', 'O'),
 ('as', 'O', 'O'),
 ('a', 'O', 'O'),
 ('similar', 'O', 'O'),
 ('backdrop', 'O', 'O'),
 ('for', 'O', 'O'),
 ('Indigenous', 'B-musicgenre', 'B-musicgenre'),
 ('music', 'I-musicgenre', 'I-musicgenre'),
 ('of', 'I-musicgenre', 'O'),
 ('North', 'I-musicgenre', 'B-musicgenre'),
 ('America', 'I-musicgenre', 'B-country'),
 (',', 'O', 'O'),
 ('Mexican', 'B-misc', 'B-musicgenre'),
 (',', 'O', 'O'),
 ('and', '