# Read ewt Data

In [344]:
import re
import pandas as pd
import nltk
from transformers import AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
MAX_LEN = 20

In [112]:
def read_iob2_with_metadata(file_path):
    documents = []
    with open(file_path, 'r') as f:
        document = []
        metadata = {}
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                if line.startswith('# text'):
                    metadata['text'] = line.split('=')[1].strip()  
                continue
            if not line:  
                if document: 
                    documents.append((metadata, document))
                    document = []  
                    metadata = {}  
            else:
                parts = line.split('\t')
                if len(parts) >= 4:
                    token = parts[1]
                    ner_tag = parts[2]
                    document.append((token, ner_tag))
        if document:  
            documents.append((metadata, document))
    return documents

# Example usage:
iob2_file_path = 'data/ewt_data/en_ewt-ud-train.iob2'
data_with_metadata = read_iob2_with_metadata(iob2_file_path)
data_with_metadata

[({'text': 'Where in the world is Iguazu?'},
  [('Where', 'O'),
   ('in', 'O'),
   ('the', 'O'),
   ('world', 'O'),
   ('is', 'O'),
   ('Iguazu', 'B-LOC'),
   ('?', 'O')]),
 ({'text': 'Iguazu Falls'}, [('Iguazu', 'B-LOC'), ('Falls', 'I-LOC')]),
 ({'text': 'Widely considered to be one of the most spectacular waterfalls in the world, the Iguazu Falls on the border of Argentina and Brazil, are a certainly must see attraction in the area.'},
  [('Widely', 'O'),
   ('considered', 'O'),
   ('to', 'O'),
   ('be', 'O'),
   ('one', 'O'),
   ('of', 'O'),
   ('the', 'O'),
   ('most', 'O'),
   ('spectacular', 'O'),
   ('waterfalls', 'O'),
   ('in', 'O'),
   ('the', 'O'),
   ('world', 'O'),
   (',', 'O'),
   ('the', 'O'),
   ('Iguazu', 'B-LOC'),
   ('Falls', 'I-LOC'),
   ('on', 'O'),
   ('the', 'O'),
   ('border', 'O'),
   ('of', 'O'),
   ('Argentina', 'B-LOC'),
   ('and', 'O'),
   ('Brazil', 'B-LOC'),
   (',', 'O'),
   ('are', 'O'),
   ('a', 'O'),
   ('certainly', 'O'),
   ('must', 'O'),
   ('see'

In [355]:
sentences = []
tags = []

for line in data_with_metadata:
    sentences.append(line[0]["text"])
    prev_tag = ''
    for i,word_tag in enumerate(line[1]):
        _, tag = word_tag
        if i == 0:
            prev_tag += tag
        else:
            prev_tag += "," + tag

    tags.append(prev_tag)


df = pd.DataFrame({"sentence": sentences, "tags": tags})
df

Unnamed: 0,sentence,tags
0,Where in the world is Iguazu?,"O,O,O,O,O,B-LOC,O"
1,Iguazu Falls,"B-LOC,I-LOC"
2,Widely considered to be one of the most specta...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-LOC,I-LOC,O,O,..."
3,The centre of the falls is known locally as ‘G...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,"With the sun shinning down on the spray, creat...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...
12538,"Even a little time spent receiving wholesome, ...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
12539,S. and I have an acquaintance who has hosted s...,"B-PER,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
12540,The urge to protect and gather them all in is ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
12541,"Take care, my friend, Linda","O,O,O,O,O,O,B-PER"


In [356]:
label2id = {k: v for v, k in enumerate(pd.unique(','.join(tags).split(',')))}
id2label = {v: k for v, k in enumerate(pd.unique(','.join(tags).split(',')))}


  label2id = {k: v for v, k in enumerate(pd.unique(','.join(tags).split(',')))}
  id2label = {v: k for v, k in enumerate(pd.unique(','.join(tags).split(',')))}


In [357]:
def check_if_sentence_is_longer_than_max_len_and_truncate_if(sentence: list, padding):
    if len(sentence) > MAX_LEN:
        sentence = sentence[:MAX_LEN]
    else:
        sentence = sentence + [padding for _ in range(MAX_LEN - len(sentence))]
    return sentence

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = ["[CLS]"]
    labels = ["O"]
    
    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    tokenized_sentence.append("[SEP]")
    labels.append("O")

    tokenized_sentence = check_if_sentence_is_longer_than_max_len_and_truncate_if(tokenized_sentence, "[PAD]")
    labels = check_if_sentence_is_longer_than_max_len_and_truncate_if(labels, "O")

    return tokenized_sentence, labels

def get_attn_mask(tokenized_sentence):
    return [1 if tok != "[PAD]" else 0 for tok in tokenized_sentence]


In [358]:
df["tokenized_sentence"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[0],axis=1)
df["tokenized_sentence_tags"] = df.apply(lambda x: tokenize_and_preserve_labels(x["sentence"],x["tags"], tokenizer)[1],axis=1)

df["ids"] = df.apply(lambda x: tokenizer.convert_tokens_to_ids(x["tokenized_sentence"]), axis=1)
df["attn_mask"] = df.apply(lambda x: get_attn_mask(x["tokenized_sentence"]), axis=1)
df["targets"] = df.apply(lambda x: [label2id[label] for label in x["tokenized_sentence_tags"]],axis=1)

In [359]:
df[["ids", "attn_mask", "targets"]]

Unnamed: 0,ids,attn_mask,targets
0,"[101, 2777, 1107, 1103, 1362, 1110, 146, 13855...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, ..."
1,"[101, 146, 13855, 10337, 6230, 102, 0, 0, 0, 0...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[101, 15268, 1193, 1737, 1106, 1129, 1141, 110...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[101, 1109, 2642, 1104, 1103, 4887, 1110, 1227...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[101, 1556, 1103, 3336, 188, 8265, 3381, 1205,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
12538,"[101, 2431, 170, 1376, 1159, 2097, 4172, 2006,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12539,"[101, 156, 119, 1105, 146, 1138, 1126, 20125, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12540,"[101, 1109, 8869, 1106, 3244, 1105, 8422, 1172...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12541,"[101, 5055, 1920, 117, 1139, 1910, 117, 8138, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [373]:
training_set = []

for ids, attn, targets in df[["ids","attn_mask", "targets"]].iloc:
    training_set.append({"ids":torch.tensor(ids),
                         "mask":torch.tensor(attn),
                         "targets":torch.tensor(targets)})

In [381]:
train_params = {'batch_size': 8,
                'shuffle': True,
                'num_workers': 1
                }


training_loader = torch.utils.data.DataLoader(training_set, **train_params)


# Model

In [382]:
from transformers import BertForTokenClassification

model = BertForTokenClassification.from_pretrained('bert-base-uncased', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)




Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [405]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)

output = model(input_ids=ids, attention_mask = mask, labels= targets)


[id2label[pred] for pred in torch.argmax(output[1],dim=2).cpu().numpy()[0]]



['O',
 'O',
 'O',
 'O',
 'O',
 'I-LOC',
 'O',
 'O',
 'B-ORG',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']