In [1]:
import pandas as pd
import re
import torch
from torch.utils.data import Dataset
from itertools import chain 

In [2]:
class CoNLLDataset(Dataset):
    def __init__(self, conl_text):
        super().__init__()
        
        #Create a dictionary to store the data
        self.data={'id':[], 'labels':[], 'sentences':[]}
        data=[]
        with open(conl_text, 'r') as f:
            data=f.read()
            data=data.split('\n\n')
            
        self.lines=[]
        for lino in range(len(data)):
            lino=data[lino].split('\n')[1:]
            self.lines.append(lino)
            
                            
        for line in self.lines:
            texts=[]
            labels=[]
            for index, word in enumerate(line):
                if index==0:
                    part=word.split()
                    self.data['id'].append(part[2])
                
                else:
                    split_line=word.split('_')
                    split_line=[x.strip() for x in split_line]
                    split_line=list(filter(lambda x:x!="", split_line))
                    text, label = split_line
                    texts.append(text)
                    labels.append(label)
            
            self.data['labels'].append(labels)
            self.data['sentences'].append(texts)
            
    def __len__(self):
        return len(data.data['sentences'])
    
    
    
    def __getitem__(self, indx):
        data_point=data.data['sentences'][indx]
        data_label=data.data['labels'][indx]
        return data_point, data_label       
        

In [3]:
data=CoNLLDataset('train_dev/en-train.conll')

In [4]:
import spacy
import torchtext

In [5]:
nlp=spacy.load('en_core_web_sm')

In [6]:
with open("words.txt", "w") as f:
  # Loop through the elements in the array
  for element in data.data['sentences']:
    for i in element:
        f.write(i + "\n")  #add to seperate file in case we need -> i suspect not

In [7]:
with open("labels.txt", "w") as f:
  # Loop through the elements in the array
  for element in data.data['labels']:
    for i in element:
        f.write(i + "\n")

In [8]:
WORD=[]

for sentences in data.data['sentences']:
    sentence=""
    for index, word in enumerate(sentences):
        if index==(len(sentences)-1) or (index==len(sentences)-2):
            sentence+=word
        else:
            sentence+=word
            sentence += " "
    WORD.append(sentence)

    


In [9]:
unique_labels=[]

for ner_labels in data.data['labels']:
    for label in ner_labels:
        unique_labels.append(label)

In [10]:
unique_labels=set(unique_labels) #67 labels
len(unique_labels)
#map each label to its representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}

In [11]:
#Before using the BERT model to classify the entity of tokens we first need data processing
#tokenization and adjust label to match tokenization
#We use BERT tokenizer class from a pretrained model on hugging face


from transformers import BertTokenizerFast
tokenizer=BertTokenizerFast.from_pretrained('bert-base-cased')
text_tokenized = tokenizer(WORD[0], padding='max_length', max_length=68, truncation=True, return_tensors="pt") #max length 68 

#padding - pad sequence to maximum length we specify - with the BERT model this is 512
#max length - maximum length of a sequence

#truncation : this is a Boolean value. If we set the value to True, then tokens that exceed the maximum length will
#not be used

#tensor type we want returning - since we are using pytorch we use pt

print(text_tokenized)

{'input_ids': tensor([[  101, 11580,  3740,  1400,  2145, 17436,  1377,  3061,  8717,  2574,
          2981,  1105,  3249,  1104, 10224, 21704,  8427,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 

In [12]:
#Output we get from the tokenization process is a dictionary cotaining input ids - 101 reserved for special cls token
#102 reserved for special sep token. Attention mask identifies if a token is real or padding
#ADJUSTING LABEL AFTER TOKENIZATION

print(tokenizer.convert_ids_to_tokens(text_tokenized['input_ids'][0]))  #Extra sep and cls labels - some unique words split so we know their semantic meaning
word_ids=text_tokenized.word_ids()
print(tokenizer.convert_ids_to_tokens(text_tokenized['input_ids'][0]))
print(word_ids)

['[CLS]', 'robe', '##rt', 'got', '##ts', '##chal', '##k', '1939', 'academy', 'award', 'winner', 'and', 'founder', 'of', '##pan', '##avi', '##sion', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
['[CLS]', 'robe', '##rt', 'got', '##ts', '##chal', '##k', '1939', 'academy', 'award', 'winner', 'and', 'founder', 'of', '##pan', '##avi', '##sion', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[

In [13]:

#We only provide a label to the first sub-word of each splitted token. The continuation of the sub-word then will simply have ‘-100’ as a label. All tokens that don’t have word_ids will also be labeled with ‘-100’.
#We provide the same label among all of the sub-words that belong to the same token. All tokens that don’t have word_ids will be labeled with ‘-100’

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=68, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids


In [14]:
labels=[]
for lb in data.data['labels']:
    hi=[]
    for l in lb:
        hi.append(l)
    labels.append(l)

In [15]:
class DataSequence(torch.utils.data.Dataset):
    
    def __init__(self, sentence_use, label):
        self.sentence= [tokenizer(str(txt), padding='max_length', max_length=68, truncation=False, return_tensors='pt') for txt in sentence_use]
        self.label=[align_label(i,j) for i,j in zip(sentence_use, label)]
        
    def __len__(self):
        return len(self.label)
    
    def get_batch_data(self, indx):
        return self.sentence[indx]
    
    def get_batch_labels(self, indx):
        return torch.LongTensor(self.label[indx])
    
    def __getitem__(self, indx):
        batch_data = self.get_batch_data(indx)
        batch_labels = self.get_batch_labels(indx)

        return batch_data, batch_labels
    
    
        
        

In [16]:

from transformers import BertForTokenClassification

class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

In [17]:
def train_loop(model):

    train_dataset = DataSequence(WORD, labels)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=BATCH_SIZE, shuffle=True)

    use_mps = torch.backends.mps.is_available()
    device = torch.device("mps" if use_mps else "cpu")

    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][train_label[i] != -100]
              label_clean = train_label[i][train_label[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')

LEARNING_RATE = 5e-3
EPOCHS = 5
BATCH_SIZE = 2

model = BertModel()

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [18]:
from torch.utils.data import DataLoader
import torch.optim
from tqdm.notebook import tqdm

In [None]:
train_loop(model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/8390 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/miniconda3/envs/torch/lib/python3.10/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/opt/miniconda3/envs/torch/lib/python3.10/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'DataSequence' on <module '__main__' (built-in)>
