# Read ewt Data

In [101]:
import re
import pandas as pd
import nltk
from transformers import AutoTokenizer
import torch

bert_model_name = "bert-base-multilingual-cased"

tokenizer = AutoTokenizer.from_pretrained(bert_model_name)


In [102]:
def read_iob2_with_metadata(file_path):
    documents = []
    with open(file_path, 'r') as f:
        document = []
        metadata = {}
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                if line.startswith('# text'):
                    metadata['text'] = line.split('=')[1].strip()  
                continue
            if not line:  
                if document: 
                    documents.append((metadata, document))
                    document = []  
                    metadata = {}  
            else:
                parts = line.split('\t')
                if len(parts) >= 4:
                    token = parts[1]
                    ner_tag = parts[2]
                    document.append((token, ner_tag))
        if document:  
            documents.append((metadata, document))
    return documents

# Example usage:

training_data = read_iob2_with_metadata('data/ewt_data/en_ewt-ud-train.iob2')
dev_data = read_iob2_with_metadata('data/ewt_data/en_ewt-ud-dev.iob2')

In [103]:
def convert_to_df(data):
    sentences = []
    tags = []

    for line in data:
        sentences.append(line[0]["text"])
        prev_tag = ''
        for i,word_tag in enumerate(line[1]):
            _, tag = word_tag
            if i == 0:
                prev_tag += tag
            else:
                prev_tag += "," + tag

        tags.append(prev_tag)


    return pd.DataFrame({"sentence": sentences, "tags": tags})

training_df = convert_to_df(training_data)
dev_df = convert_to_df(dev_data)

In [104]:

label2id = {k: v for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}
id2label = {v: k for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}


In [105]:


def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = ["[CLS]"]
    labels = ["O"]
    
    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    tokenized_sentence.append("[SEP]")
    labels.append("O")


    return tokenized_sentence, labels

def get_attn_mask(tokenized_sentence):
    return [1 if tok != "[PAD]" else 0 for tok in tokenized_sentence]


In [106]:
def get_proper_data_structure(df):
    df["tokenized_sentence"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[0],axis=1)
    df["tokenized_sentence_tags"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[1],axis=1)

    df["ids"] = df.apply(lambda x: tokenizer.convert_tokens_to_ids(x["tokenized_sentence"]), axis=1)
    df["attn_mask"] = df.apply(lambda x: get_attn_mask(x["tokenized_sentence"]), axis=1)
    df["targets"] = df.apply(lambda x: [label2id[label] for label in x["tokenized_sentence_tags"]],axis=1)
    return df

training_df = get_proper_data_structure(training_df)
dev_df = get_proper_data_structure(dev_df)

In [107]:
training_df[["ids", "attn_mask", "targets"]]

Unnamed: 0,ids,attn_mask,targets
0,"[101, 23525, 10106, 10105, 11356, 10124, 146, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0]"
1,"[101, 146, 20337, 13078, 23118, 102]","[1, 1, 1, 1, 1, 1]","[0, 1, 1, 1, 2, 0]"
2,"[101, 21660, 10454, 14289, 10114, 10347, 10464...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[101, 10117, 12672, 10108, 10105, 35017, 10124...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[101, 12613, 10105, 42230, 57667, 37158, 10376...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
12538,"[101, 28140, 169, 16745, 10635, 18571, 31391, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12539,"[101, 156, 119, 10111, 146, 10529, 10151, 2474...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12540,"[101, 10117, 10399, 10525, 10114, 32949, 10111...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12541,"[101, 21200, 11131, 117, 15127, 20104, 117, 22...","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [108]:
def get_torch_dataset(df):
    data_set = []

    for ids, attn, targets in df[["ids","attn_mask", "targets"]].iloc:
        data_set.append({"input_ids":ids,
                            "attention_mask":attn,
                            "labels":targets})
    return data_set

training_set = get_torch_dataset(training_df)
dev_set = get_torch_dataset(dev_df)

In [109]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 1
                }


training_loader = torch.utils.data.DataLoader(training_set, **train_params)

# Model

In [110]:
from transformers import BertForTokenClassification

device = torch.device("cpu")

model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

from transformers import AutoConfig

bert_config = AutoConfig.from_pretrained(bert_model_name, 
                                         num_labels=len(id2label),
                                         id2label=id2label, label2id=label2id)
def model_init():
    return (BertForTokenClassification
            .from_pretrained(bert_model_name, config=bert_config)
            .to(device))


In [111]:
from transformers import TrainingArguments

num_epochs = 0.1
batch_size = 8
logging_steps = len(training_set) // batch_size
model_name = f"{bert_model_name}-finetuned_ewt"

training_args = TrainingArguments(
    output_dir=model_name, log_level="error", num_train_epochs=num_epochs, 
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size, evaluation_strategy="epoch", 
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False, 
    logging_steps=logging_steps, push_to_hub=False)

In [112]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, padding=True)

In [113]:
from datasets import Dataset
import pyarrow as pa

training_set = pd.DataFrame(training_set)
dev_set = pd.DataFrame(dev_set)

training_set_pa = Dataset(pa.Table.from_pandas(training_set))
dev_set_pa = Dataset(pa.Table.from_pandas(dev_set))

In [114]:

from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args, 
                  data_collator=data_collator,
                  train_dataset=training_set_pa,
                  tokenizer=tokenizer)

In [115]:
trainer.train()



  0%|          | 0/157 [00:00<?, ?it/s]

ValueError: Trainer: evaluation requires an eval_dataset.

In [None]:
ids = training_set[2]["ids"].unsqueeze(0)
mask = training_set[2]["mask"].unsqueeze(0)
targets = training_set[2]["targets"].unsqueeze(0)

output = model(input_ids=ids, attention_mask = mask, labels= targets)


[id2label[pred] for pred in torch.argmax(output[1],dim=2).cpu().numpy()[0]]



['O',
 'B-ORG',
 'B-ORG',
 'B-ORG',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'B-ORG',
 'B-ORG',
 'B-LOC',
 'I-ORG',
 'I-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'B-PER',
 'B-PER',
 'B-ORG',
 'I-PER',
 'I-LOC',
 'I-LOC',
 'I-LOC',
 'I-PER',
 'B-ORG',
 'I-LOC',
 'B-LOC',
 'B-LOC',
 'B-LOC',
 'I-LOC',
 'I-LOC',
 'B-PER',
 'I-LOC',
 'I-LOC',
 'I-LOC']