# Read ewt Data

In [1]:
import re
import pandas as pd
import nltk
from transformers import AutoTokenizer
import torch

bert_model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
def read_iob2_with_metadata(file_path):
    documents = []
    with open(file_path, 'r') as f:
        document = []
        metadata = {}
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                if line.startswith('# text'):
                    metadata['text'] = line.split('=')[1].strip()  
                continue
            if not line:  
                if document: 
                    documents.append((metadata, document))
                    document = []  
                    metadata = {}  
            else:
                parts = line.split('\t')
                if len(parts) >= 4:
                    token = parts[1]
                    ner_tag = parts[2]
                    document.append((token, ner_tag))
        if document:  
            documents.append((metadata, document))
    return documents

# Example usage:

training_data = read_iob2_with_metadata('data/ewt_data/en_ewt-ud-train.iob2')
dev_data = read_iob2_with_metadata('data/ewt_data/en_ewt-ud-dev.iob2')

In [20]:
def convert_to_df(data):
    sentences = []
    tags = []

    for line in data:
        sentences.append(line[0]["text"])
        prev_tag = ''
        for i,word_tag in enumerate(line[1]):
            _, tag = word_tag
            if i == 0:
                prev_tag += tag
            else:
                prev_tag += "," + tag

        tags.append(prev_tag)


    return pd.DataFrame({"sentence": sentences, "tags": tags})

training_df = convert_to_df(training_data)
dev_df = convert_to_df(dev_data)

In [25]:

label2id = {k: v for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}
id2label = {v: k for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}


  label2id = {k: v for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}
  id2label = {v: k for v, k in enumerate(pd.unique(','.join(training_df["tags"].values).split(',')))}


In [5]:


def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = ["[CLS]"]
    labels = ["O"]
    
    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    tokenized_sentence.append("[SEP]")
    labels.append("O")


    return tokenized_sentence, labels

def get_attn_mask(tokenized_sentence):
    return [1 if tok != "[PAD]" else 0 for tok in tokenized_sentence]


In [27]:
def get_proper_data_structure(df):
    df["tokenized_sentence"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[0],axis=1)
    df["tokenized_sentence_tags"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[1],axis=1)

    df["ids"] = df.apply(lambda x: tokenizer.convert_tokens_to_ids(x["tokenized_sentence"]), axis=1)
    df["attn_mask"] = df.apply(lambda x: get_attn_mask(x["tokenized_sentence"]), axis=1)
    df["targets"] = df.apply(lambda x: [label2id[label] for label in x["tokenized_sentence_tags"]],axis=1)
    return df

training_df = get_proper_data_structure(training_df)
dev_df = get_proper_data_structure(dev_df)

In [28]:
training_df[["ids", "attn_mask", "targets"]]

Unnamed: 0,ids,attn_mask,targets
0,"[101, 2073, 1999, 1996, 2088, 2003, 1045, 1969...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0]"
1,"[101, 1045, 19696, 9759, 4212, 102]","[1, 1, 1, 1, 1, 1]","[0, 1, 1, 1, 2, 0]"
2,"[101, 4235, 2641, 2000, 2022, 2028, 1997, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[101, 1996, 2803, 1997, 1996, 4212, 2003, 2124...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[101, 2007, 1996, 3103, 12277, 5582, 2091, 200...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
12538,"[101, 2130, 1037, 2210, 2051, 2985, 4909, 2878...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12539,"[101, 1055, 1012, 1998, 1045, 2031, 2019, 1836...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12540,"[101, 1996, 9075, 2000, 4047, 1998, 8587, 2068...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12541,"[101, 2202, 2729, 1010, 2026, 2767, 1010, 8507...","[1, 1, 1, 1, 1, 1, 1, 1, 1]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [46]:
def get_torch_dataset(df):
    data_set = []

    for ids, attn, targets in df[["ids","attn_mask", "targets"]].iloc:
        data_set.append({"ids":torch.tensor(ids),
                            "mask":torch.tensor(attn),
                            "targets":torch.tensor(targets)})
    return data_set

training_set = get_torch_dataset(training_df)
dev_set = get_torch_dataset(dev_df)

In [47]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 1
                }


training_loader = torch.utils.data.DataLoader(training_set, **train_params)

# Model

In [31]:
from transformers import BertForTokenClassification

device = torch.device("cpu")

model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)

from transformers import AutoConfig

bert_config = AutoConfig.from_pretrained(bert_model_name, 
                                         num_labels=len(id2label),
                                         id2label=id2label, label2id=label2id)
def model_init():
    return (BertForTokenClassification
            .from_pretrained(bert_model_name, config=bert_config)
            .to(device))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
from transformers import TrainingArguments

num_epochs = 1
batch_size = 8
logging_steps = len(training_set) // batch_size
model_name = f"{bert_model_name}-finetuned_ewt"

training_args = TrainingArguments(
    output_dir=model_name, log_level="error", num_train_epochs=num_epochs, 
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size, evaluation_strategy="epoch", 
    save_steps=1e6, weight_decay=0.01, disable_tqdm=False, 
    logging_steps=logging_steps, push_to_hub=False)

In [33]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [49]:

from transformers import Trainer

trainer = Trainer(model_init=model_init, args=training_args, 
                  data_collator=data_collator,
                  train_dataset=training_loader,
                  tokenizer=tokenizer)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [50]:
trainer.train()



TypeError: 'DataLoader' object is not subscriptable

In [12]:
ids = training_set[2]["ids"].unsqueeze(0)
mask = training_set[2]["mask"].unsqueeze(0)
targets = training_set[2]["targets"].unsqueeze(0)

output = model(input_ids=ids, attention_mask = mask, labels= targets)


[id2label[pred] for pred in torch.argmax(output[1],dim=2).cpu().numpy()[0]]



['O',
 'B-ORG',
 'I-ORG',
 'I-LOC',
 'I-LOC',
 'O',
 'I-LOC',
 'I-LOC',
 'O',
 'B-ORG',
 'B-ORG',
 'I-LOC',
 'O',
 'O',
 'B-ORG',
 'O',
 'O',
 'I-ORG',
 'I-ORG',
 'B-ORG',
 'I-ORG',
 'I-ORG',
 'I-PER',
 'I-ORG',
 'O',
 'O',
 'B-ORG',
 'O',
 'I-ORG',
 'B-LOC',
 'I-ORG',
 'B-ORG',
 'I-LOC',
 'I-ORG',
 'B-LOC',
 'I-LOC',
 'I-PER',
 'I-PER',
 'I-PER']