This is going to be where the final training datasets are created

In [1]:
from datasets import load_dataset
coNLL = load_dataset("conllpp")
coNLL

Reusing dataset conllpp (/work/wzkariampuzha/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2)


DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [2]:
#NER_tag '5' is B-LOC, '6' is I-LOC
def read_dataset(dataset):
    token_docs = []
    tag_docs = []
    for sentence in dataset:
        #Only add sentences that actually have location tags (i.e. meaningfully annotated sentences)
        if (5 in sentence['ner_tags'] or 6 in sentence['ner_tags']):
            i=0
            tokens = []
            tags = []
            for tag in sentence['ner_tags']:
                label = 'O'
                if tag ==5:
                    label = 'B-LOC'
                if tag == 6:
                    label = 'I-LOC'
                #print(sentence['tokens'][i], label)
                tokens.append(sentence['tokens'][i])
                tags.append(label)
                i+=1
            token_docs.append(tokens)
            tag_docs.append(tags)
    return token_docs, tag_docs

In [3]:
train_texts, train_tags = read_dataset(coNLL["train"])
validation_texts, validation_tags = read_dataset(coNLL["validation"])
test_texts, train_tags = read_dataset(coNLL["test"])

In [4]:
for i in range(2):
    print(test_texts[i][:6], train_tags[i][:6],sep='\n')

['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN']
['O', 'O', 'B-LOC', 'O', 'O', 'O']
['AL-AIN', ',', 'United', 'Arab', 'Emirates', '1996-12-06']
['B-LOC', 'O', 'B-LOC', 'I-LOC', 'I-LOC', 'O']


In [5]:
unique_tags = set(tag for doc in train_tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

print(unique_tags)
print(tag2id)
print(id2tag)

{'O', 'I-LOC', 'B-LOC'}
{'O': 0, 'I-LOC': 1, 'B-LOC': 2}
{0: 'O', 1: 'I-LOC', 2: 'B-LOC'}


In [6]:
from transformers import BertTokenizer, BertTokenizerFast, PreTrainedTokenizerFast


#Figure out if need pretrained tokenizer fast, bc using it for return_offsets_mapping, but if dont need it, then use regular bert tokenizer
#from transformers import BertTokenizer
#tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-large-cased-v1.1')


#tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-large-cased-v1.1')
tokenizer = PreTrainedTokenizerFast.from_pretrained('bert-base-cased')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=192)
val_encodings = tokenizer(validation_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=192)
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True, max_length=192) 

In [7]:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [8]:
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(validation_tags, val_encodings)
test_labels = encode_tags(train_tags, test_encodings)

ValueError: NumPy boolean array indexing assignment cannot assign 12 input values to the 2 output values where the mask is true

In [None]:
import torch

class CoNLLDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
test_encodings.pop("offset_mapping")
train_dataset = CoNLLDataset(train_encodings, train_labels)
val_dataset = CoNLLDataset(val_encodings, val_labels)
test_dataset = CoNLLDataset(test_encodings, test_labels)

In [None]:
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-large-cased-v1.1', num_labels=len(unique_tags))

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir = True,
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    seed = 1
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    test_dataset=
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

In [None]:
from transformers import set_seed
#Seed is a helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if installed).
set_seed(training_args.seed)

In [None]:
trainer.save_model()
# For convenience, we also re-save the tokenizer to the same directory,
# so that you can share your model easily on huggingface.co/models =)
if trainer.is_world_process_zero():
    tokenizer.save_pretrained(training_args.output_dir)

In [None]:
from transformers import EvalPrediction
def align_predictions(predictions: np.ndarray, label_ids: np.ndarray): # -> Tuple[List[int], List[int]]
        preds = np.argmax(predictions, axis=2)

        batch_size, seq_len = preds.shape

        out_label_list = [[] for _ in range(batch_size)]
        preds_list = [[] for _ in range(batch_size)]
        
        for i in range(batch_size):
            for j in range(seq_len):
                if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                    out_label_list[i].append(label_map[label_ids[i][j]])
                    preds_list[i].append(label_map[preds[i][j]])

        return preds_list, out_label_list

def compute_metrics(p: EvalPrediction): #-> Dict
    preds_list, out_label_list = align_predictions(p.predictions, p.label_ids)
    return {
        "precision": precision_score(out_label_list, preds_list),
        "recall": recall_score(out_label_list, preds_list),
        "f1": f1_score(out_label_list, preds_list)}

In [None]:
# Evaluation
results = {}
result = trainer.evaluate()
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
if trainer.is_world_process_zero():
    with open(output_eval_file, "w") as writer:
        for key, value in result.items():
            writer.write("%s = %s\n" % (key, value))
            results.update(result)

In [None]:
#Predictions
predictions, label_ids, metrics = trainer.predict(test_dataset)
preds_list, _ = align_predictions(predictions, label_ids)

In [None]:
# Save predictions
output_test_results_file = os.path.join(training_args.output_dir, "test_results.txt")
if trainer.is_world_process_zero():
    with open(output_test_results_file, "w") as writer:
        for key, value in metrics.items():
            writer.write("%s = %s\n" % (key, value))

In [None]:
output_test_predictions_file = os.path.join(training_args.output_dir, "test_predictions.txt")
if trainer.is_world_process_zero():
    with open(output_test_predictions_file, "w") as writer:
        with open(os.path.join(data_args.data_dir, "test.txt"), "r") as f:
            example_id = 0
                for line in f:
                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                        writer.write(line)
                        if not preds_list[example_id]:
                            example_id += 1
                    elif preds_list[example_id]:
                        entity_label = preds_list[example_id].pop(0)
                        if entity_label == 'O':
                            output_line = line.split()[0] + " " + entity_label + "\n"
                        else:
                            output_line = line.split()[0] + " " + entity_label[0] + "\n"
                            # output_line = line.split()[0] + " " + preds_list[example_id].pop(0) + "\n"
                            writer.write(output_line)
                    else:
                        print("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])