# Environment Setup

In [1]:
!pip install datasets evaluate transformers[sentencepiece]
!pip install accelerate
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.6.1-py3-none-any.whl (441 kB)
[K     |████████████████████████████████| 441 kB 7.1 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.3.0-py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 1.9 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 70.6 MB/s 
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 75.0 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading mul

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForTokenClassification, LukeTokenizer, LukeForEntityPairClassification
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import evaluate
import spacy
import nltk
from tqdm import tqdm
nltk.download('punkt')
import nltk.data
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

metric = evaluate.load("seqeval")
# model_checkpoint = 'bert-base-cased'   # BERT model
model_checkpoint = 'bert-large-cased'   # BERT model
# model_checkpoint='studio-ousia/luke-large-finetuned-conll-2003'  # luke-large

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# define abbrev in nltk
punkt_param = PunktParameters()
sentence_splitter = PunktSentenceTokenizer(punkt_param)
punkt_param.abbrev_types = set(['i.e','al','e.g','etc','dr', 'vs', 'mr', 'mrs', 'prof'])
spacy_tokenizer = spacy.load("en_core_web_sm")

# create label name dict
label_names = ['O',
      'B-MethodName','I-MethodName','B-MetricName','I-MetricName',
      'B-MetricValue','I-MetricValue','B-HyperparameterName','I-HyperparameterName',
      'B-HyperparameterValue','I-HyperparameterValue','B-TaskName','I-TaskName',
      'B-DatasetName','I-DatasetName']
label_dict = {'O': 0,
      'B-MethodName': 1,
      'I-MethodName': 2,
      'B-MetricName': 3,
      'I-MetricName': 4,
      'B-MetricValue': 5,
      'I-MetricValue': 6,
      'B-HyperparameterName': 7,
      'I-HyperparameterName': 8,
      'B-HyperparameterValue': 9,
      'I-HyperparameterValue': 10,
      'B-TaskName': 11,
      'I-TaskName': 12,
      'B-DatasetName': 13,
      'I-DatasetName': 14}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]

# Load and split data

In [3]:
# split train and val data, split_ratio = 0.3 
# plus filter out invalid (token, )

all_data=[]
with open('training_data.conll','r',encoding='utf-8') as f:
  each_sentence = []
  for line in f:
    if line is not '\n':
      each_sentence.append(line)
    else:
      all_data.append(each_sentence)
      each_sentence = []
train,val = train_test_split(all_data, test_size=0.3)

with open('train.txt','w') as f:
  for each_sentence in train:
    for word in each_sentence:
      f.write(word)
    f.write('\n')

with open('val.txt','w') as f:
  for each_sentence in val:
    for word in each_sentence:
      f.write(word)
    f.write('\n')



In [4]:
def read_data(file, if_sorted = True):
    with open(file,"r",encoding="utf-8") as f:
        all_data = f.read().split("\n")

    all_text = []
    all_label = []

    text = []
    label = []
    for data in all_data:

        if data == "":
            all_text.append(text)
            all_label.append(label)
            text = []
            label = []
        else:
          try:
            t,l = data.split()
            text.append(t)
            label.append(l)
          except ValueError:continue
    if if_sorted:
      all_text = sorted(all_text, key=lambda x: len(x), reverse=False)
      all_label = sorted(all_label, key=lambda x: len(x), reverse=False)

    return all_text,all_label

train_tokens, train_labels = read_data('train.txt')    
validation_tokens, validation_labels = read_data('val.txt')    
test_tokens_local, test_labels_local = read_data('test.conll')    
test_tokens, test_labels = read_data('anlp-sciner-test-empty.conll',if_sorted=False)  


In [5]:
# function to convert labels to label_ids
def label_2_id(labels):
  id_labels=[]
  for l in labels:
    id_labels.append(label_dict[l])
  return id_labels

# convert all labels to id_labels
def labels_2_id(input):
  all_id_labels = []
  for lb in input:
    all_id_labels.append(label_2_id(lb))
  return all_id_labels

train_id_labels = labels_2_id(train_labels)
validation_id_labels = labels_2_id(validation_labels)
test_id_labels_local = labels_2_id(test_labels_local)
test_id_labels = labels_2_id(test_labels)

In [6]:
# create datasetdict for further use
d = {'train':Dataset.from_dict({'tokens':train_tokens, 'ner_tags':train_id_labels}),
     'validation':Dataset.from_dict({'tokens':validation_tokens, 'ner_tags':validation_id_labels}),
     'test_local':Dataset.from_dict({'tokens':test_tokens_local, 'ner_tags':test_id_labels_local}),
     'test':Dataset.from_dict({'tokens':test_tokens, 'ner_tags':test_id_labels})
     }
data = DatasetDict(d)
data

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3251
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 1394
    })
    test_local: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 268
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 923
    })
})

# Tokenize

In [7]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        #print('==============New round - word id: ',word_id)
        if word_id != current_word:
            # Start of a new word!
            #print('Start a new word ---- ')
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            #print('append new label: ',label)
            new_labels.append(label)
        elif word_id is None:
            # Special token
            #print('Special tokens -100')
            #print('append -100')
            new_labels.append(-100)
        else:
            # Same word as previous token
            
            label = labels[word_id]
            #print('same word: ',label)
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            #print('append new label: ',label)
            new_labels.append(label)

    return new_labels

In [8]:
def tokenize_and_align_labels(input):
    tokenized_inputs = tokenizer(
        input['tokens'], truncation=True, is_split_into_words=True
    )
    all_labels = input['ner_tags']
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [9]:
tokenized_datasets = data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=data["train"].column_names,
)
tokenized_datasets

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3251
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1394
    })
    test_local: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 268
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 923
    })
})

In [12]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)


In [13]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }

In [14]:
id2label = {str(i): label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

label2id

{'O': '0',
 'B-MethodName': '1',
 'I-MethodName': '2',
 'B-MetricName': '3',
 'I-MetricName': '4',
 'B-MetricValue': '5',
 'I-MetricValue': '6',
 'B-HyperparameterName': '7',
 'I-HyperparameterName': '8',
 'B-HyperparameterValue': '9',
 'I-HyperparameterValue': '10',
 'B-TaskName': '11',
 'I-TaskName': '12',
 'B-DatasetName': '13',
 'I-DatasetName': '14'}

# Modeling

In [13]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label = id2label,
    label2id = label2id
)

model.config.num_labels

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-large-c

15

In [15]:
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token


In [15]:
from transformers import TrainingArguments

args = TrainingArguments(
    "bert-large-finetuned-ner-30",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=30,
    weight_decay=0.01,
    push_to_hub=True,
)

In [23]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()

# optional: make sure we upload the most recent version of the modelb
trainer.push_to_hub(commit_message="Training complete")

/content/bert-large-finetuned-ner-30 is already a clone of https://huggingface.co/leo93/bert-large-finetuned-ner-30. Make sure you pull the latest changes with `repo.git_pull()`.


# Prediction

In [None]:
trainer.predict(tokenized_datasets['test_local'])

***** Running Prediction *****
  Num examples = 268
  Batch size = 8


  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[[ 1.05580988e+01, -9.01142657e-02,  5.31887040e-02, ...,
         -1.13874507e+00, -2.24225736e+00, -2.61731863e+00],
        [ 1.08075171e+01,  2.08768964e-01, -1.65040463e-01, ...,
         -8.46151829e-01, -2.11256313e+00, -3.07144499e+00],
        [ 1.07693739e+01, -9.34765220e-01, -1.10443167e-01, ...,
         -2.94840515e-01, -2.34363866e+00, -2.92770863e+00],
        ...,
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02]],

       [[ 1.21411467e+01, -7.94997394e-01, -5.63403964e-01, ...,
         -1.38457549e+00, -2.29075956e+00, -2.43779802e+00],
        [ 1.25369883e+01, -1.26215160e+00, -7.87446499e-01, ...,
         

# Re-load models for prediction

In [16]:
# load fine-tuned bert-base model with 30 epoches
model_name = 'leo93/bert-finetuned-ner-30'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Downloading:   0%|          | 0.00/347 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/669k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/431M [00:00<?, ?B/s]

In [17]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    "bert-large-finetuned-ner-30",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=30,
    weight_decay=0.01,
    push_to_hub=True,
)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

Cloning https://huggingface.co/leo93/bert-large-finetuned-ner-30 into local empty directory.


In [21]:
trainer.predict(tokenized_datasets['test_local'])

***** Running Prediction *****
  Num examples = 268
  Batch size = 8
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  _warn_prf(average, modifier, msg_start, len(result))


PredictionOutput(predictions=array([[[ 1.05580988e+01, -9.01142657e-02,  5.31887040e-02, ...,
         -1.13874507e+00, -2.24225736e+00, -2.61731863e+00],
        [ 1.08075171e+01,  2.08768964e-01, -1.65040463e-01, ...,
         -8.46151829e-01, -2.11256313e+00, -3.07144499e+00],
        [ 1.07693739e+01, -9.34765220e-01, -1.10443167e-01, ...,
         -2.94840515e-01, -2.34363866e+00, -2.92770863e+00],
        ...,
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02],
        [-1.00000000e+02, -1.00000000e+02, -1.00000000e+02, ...,
         -1.00000000e+02, -1.00000000e+02, -1.00000000e+02]],

       [[ 1.21411467e+01, -7.94997394e-01, -5.63403964e-01, ...,
         -1.38457549e+00, -2.29075956e+00, -2.43779802e+00],
        [ 1.25369883e+01, -1.26215160e+00, -7.87446499e-01, ...,
         

In [176]:
# define function to output test predictions in conll format
# Output: predictions.conll

def prediction_2_labels(model,input):
  sample = ' '.join(input)
  encoding = tokenizer(sample, return_tensors='pt')

  # encoding length within batch size   --------------------------
  if len(encoding.tokens())<=512: 
    predictions = model(**encoding)
    predicted_label_classes = predictions.logits.argmax(-1).numpy()[0]
    predicted_labels = [id2label[str(id)] for id in predicted_label_classes]

    with open('predictions.conll','a') as f:
      for i in range(len(input)):
        word = input[i]
        tag = predicted_labels[encoding.word_ids().index(i)]
        content = word+' '+tag
        f.write(content)
        f.write('\n')
      f.write('\n')


  # encoding length over batch size, we divide the input sentence into two parts  -------------
  else:   
    # part one
    input_a = input[:200]
    sample = ' '.join(input_a)
    encoding_a = tokenizer(sample, return_tensors='pt')
    predictions_a = model(**encoding_a)
    predicted_label_classes = predictions_a.logits.argmax(-1).numpy()[0]
    predicted_labels_a = [id2label[str(id)] for id in predicted_label_classes]


    # part two
    input_b = input[200:]
    sample = ' '.join(input_b)
    encoding_b = tokenizer(sample, return_tensors='pt')
    predictions_b = model(**encoding_b)
    predicted_label_classes = predictions_b.logits.argmax(-1).numpy()[0]
    predicted_labels_b = [id2label[str(id)] for id in predicted_label_classes]

    with open('predictions.conll','a') as f:
      for i in range(len(input_a)):
        word = input_a[i]
        tag = predicted_labels_a[encoding.word_ids().index(i)]
        content = word+' '+tag
        f.write(content)
        f.write('\n')

      for i in range(len(input_b)):
        word = input_b[i]
        tag = predicted_labels_b[encoding.word_ids().index(i)]
        content = word+' '+tag
        f.write(content)
        f.write('\n')
      f.write('\n')



In [177]:
# export result with predicted labels
for i in tqdm(range(len(test_tokens))):
  prediction_2_labels(model,test_tokens[i])

100%|██████████| 923/923 [03:11<00:00,  4.81it/s]
