In [1]:
import numpy as np 
import pandas as pd 
import torch
from torch import nn
from torch.utils.data import Dataset,DataLoader
from torch.optim import SGD,AdamW
from transformers import BertTokenizerFast
from transformers import BertForTokenClassification
from matplotlib import pyplot as plt
from tqdm import tqdm
import ast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cpu'

In [68]:
df = pd.read_csv('train.csv')

In [92]:
def tokenize_and_align(df,add_sep_prompt=True):
    maxl = 0
    labels = []
    texts = []
    label_map = {'<pad>':3, 'O':0, 'I':1, 'B':2}
    attention_masks = []
    for i in range(len(df)):
        text = ast.literal_eval(df['tokens'][i])
        texts.append(text)
        spans = [(i,j) for i,j in zip(ast.literal_eval(df['span_start_index'][i]), ast.literal_eval(df['span_end_index'][i]))]
        label = ['O']*len(text)
        for span in spans:
            label[span[0]] = 'B'
            for i in range(span[0]+1, span[1]+1):
                label[i] = 'I'
        labels.append(label)

    tokens = []
    labels_final = []
    map_list = []

    for i in range(len(texts)):
        text = texts[i]
        tokens_list = [500]*PROMPT_LENGTH
        labels_list = [label_map['O']]*PROMPT_LENGTH
        cur_map = {}

        #adding seperation token
        if add_sep_prompt:
            tokens_list.append(tokenizer.convert_tokens_to_ids(tokenizer.sep_token))
            labels_list.append(label_map['O'])

        for j in range(len(text)):
            tokenized = tokenizer.tokenize(text[j])
            cur_map[j] = []
            for sub_token in tokenized:
                cur_map[j].append(len(tokens_list))
                tokens_list.append(tokenizer.convert_tokens_to_ids(sub_token))
        
        for k in range(len(text)):
            for j in cur_map[k]:
                labels_list.append(label_map[labels[i][k]])
        
        #Adding seperation token
        tokens_list.append(tokenizer.convert_tokens_to_ids(tokenizer.sep_token))
        labels_list.append(label_map['O'])
        attention_mask = [1]*len(tokens_list)

        tokens_list.extend([(tokenizer.pad_token_id)]*(MAX_LEN - len(tokens_list)))
        labels_list.extend([label_map['O']]*(MAX_LEN - len(labels_list)))
        attention_mask.extend([0]*(MAX_LEN - len(attention_mask)))

        labels_final.append(labels_list)
        tokens.append(tokens_list)
        attention_masks.append(attention_mask)
        map_list.append(cur_map)

    return tokens, labels_final, attention_masks, map_list
        

In [93]:
class DataForBert(Dataset):

    def __init__(self, df):
        super(DataForBert, self).__init__()

        self.df = df
        self.tokens, self.labels, self.attention_masks, self.map_list = tokenize_and_align(df)

    def __len__(self):
        return len(self.tokens)

    def __getitem__(self,idx):
        return torch.LongTensor(self.tokens[idx]), torch.LongTensor(self.labels[idx]), torch.LongTensor(self.attention_masks[idx])
        

In [95]:
class prompt(nn.Module):

    def __init__(self,
                 wte : nn.Embedding,
                 prompt_length : int = 20,
                 rand_range : float = 0.5,
                 initialize_from_vocab : bool = True):

        super(prompt, self).__init__()
        self.wte = wte
        self.prompt_length = prompt_length
        self.learned_embedding = nn.Parameter(self.initialize_embedding(wte,
                                                                        prompt_length,
                                                                        rand_range,
                                                                        initialize_from_vocab))

    
    def initialize_embedding(self,
                             wte : nn.Embedding,
                             prompt_length : int =10,
                             random_range : float = 0.5,
                             initialize_from_vocab : bool = True):

        if initialize_from_vocab:
            return self.wte.weight[:prompt_length].clone().detach()

        return torch.FloatTensor(wte.weight.size(1),prompt_length).uniform_(-random_range,random_range)

    def forward(self,tokens):
        input_embedding = self.wte(tokens[:,self.prompt_length:])
        learned_embedding = self.learned_embedding.repeat(input_embedding.size(0),1,1)
        return torch.cat((learned_embedding,input_embedding),dim=1)
                    
class BertModel(torch.nn.Module):

    def __init__(self,
                 prompt_length = 20,
                 initialize_from_vocab = True):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=3)
        if prompt_length>0:
            self.prompt_length = prompt_length
            self.prompt = prompt(self.bert.get_input_embeddings(),
                                prompt_length = self.prompt_length,
                                initialize_from_vocab = initialize_from_vocab)
            
            self.bert.set_input_embeddings(self.prompt)

    def forward(self, input_id, label, mask):
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [97]:
def train_loop(model,df_train, df_val,EPOCHS,LEARNING_RATE,BATCH_SIZE,optim = "SGD",):

    train_data = DataForBert(df_train)
    val_data = DataForBert(df_val)

    train_loader = DataLoader(train_data,  batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_data,  batch_size=BATCH_SIZE)

    if optim == "AdamW":
        optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    elif optim == "SGD":
        optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

    if DEVICE == 'cuda':
        model = model.cuda()

    epoch_data = []
    for epoch_num in range(EPOCHS):

        total_acc_train = 0
        total_loss_train = 0
        
        total_acc_train_o = 0
        print(model.prompt.learned_embedding)
        model.train()

        for train_in, train_out,train_mask in tqdm(train_loader):

            train_out = train_out.to(DEVICE)
            train_in = train_in.to(DEVICE)
            mask = train_mask.to(DEVICE)


            optimizer.zero_grad()
            loss, logits = model(train_in, train_out, mask)
            
            for i in range(logits.shape[0]):

              logits_clean= logits[i][train_out[i] != -100]
              label_clean = train_out[i][train_out[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_train += acc
              total_loss_train += loss.item()
                
              logits_clean2 = logits[i][train_out[i] != -100]
              label_clean2 = train_out[i][train_out[i] != -100]
              logits_clean2 = logits_clean2[label_clean2 != 1]
              label_clean2 = label_clean2[label_clean2 != 1]

              predictions2 = logits_clean2.argmax(dim=1)
              acc2 = (predictions2 == label_clean2).float().mean()
                
              if len(predictions2)>0 and len(label_clean2)>0:
                total_acc_train_o += acc2
              
            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_acc_val_o = 0
        total_loss_val = 0

        for val_in, val_out, val_mask in val_loader:
            

            val_out = val_out.to(DEVICE)
            val_in = val_in.to(DEVICE)
            mask = val_mask.to(DEVICE)

            loss, logits = model(val_in, val_out, mask)

            for i in range(logits.shape[0]):

              logits_clean = logits[i][val_out[i] != -100]
              label_clean = val_out[i][val_out[i] != -100]

              predictions = logits_clean.argmax(dim=1)
              acc = (predictions == label_clean).float().mean()
              total_acc_val += acc
              total_loss_val += loss.item()
                
              logits_clean2 = logits[i][val_out[i] != -100]
              label_clean2 = val_out[i][val_out[i] != -100]
              logits_clean2 = logits_clean2[label_clean2 != 1]
              label_clean2 = label_clean2[label_clean2 != 1]
              
              
              predictions2 = logits_clean2.argmax(dim=1)
              acc2 = (predictions2 == label_clean2).float().mean()
              if len(predictions2)>0 and len(label_clean2)>0:
                total_acc_val_o += acc2
                
        val_accuracy = (total_acc_val / len(df_val))
        val_loss = total_loss_val / len(df_val)
        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')
        print(f'Accuracy train w/o O {total_acc_train_o / len(df_train): .3f} | Accuracy val w/o O = {total_acc_val_o / len(df_val): .3f}')
        
        epoch_data.append((total_loss_train / len(df_train),total_acc_train / len(df_train),total_acc_train_o / len(df_train),total_loss_val / len(df_val),val_accuracy,total_acc_val_o / len(df_val)))
    return epoch_data,model

In [104]:
def test_pipeline(maxl=256,batch_size=2,epochs=5,learning_rate=1e-5,optim = "SGD"):
    
    df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.8 * len(df)), int(.9 * len(df))])
    
    model = BertModel()
    ep_data,model = train_loop(model,df_train,df_val,epochs,learning_rate,batch_size,optim= optim)
    
    train_loss,train_acc,train_acc_o,val_loss,val_acc,val_acc_o = [],[],[],[],[],[]
    
    for a,b,c,d,e,f in ep_data:
        try:
            train_loss.append(a.item())
        except:
            train_loss.append(a)
        try:
            train_acc.append(b.item())
        except:
            train_acc.append(b)
        try:
            train_acc_o.append(c.item())
        except:
            train_acc_o.append(c)
        try:
            val_loss.append(d.item())
        except:
            val_loss.append(d)
        try:
            val_acc.append(e.item())
        except:
            val_acc.append(e)
        try:
            val_acc_o.append(f.item())
        except:
            val_acc_o.append(f)
        
    print(train_loss,train_acc,train_acc_o,val_loss,val_acc,val_acc_o)
    loss_metrics = pd.DataFrame({'Train loss':train_loss,
               'Val loss':val_loss
               })
    
    acc_metrics = pd.DataFrame({'Train Accuracy':train_acc,
                                'Train Accuracy without O':train_acc_o,
                                'Val Accuracy':val_acc,
                                'Val Accuracy without O':val_acc_o})
    
    loss_metrics.plot()
    acc_metrics.plot()
    return df_test, model, (loss_metrics,acc_metrics)
        
    

In [106]:
#Defining constants
MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS = 10
LEARNING_RATE = 2e-5
PROMPT_LENGTH = 20
f_test2k_np, model2k_np, metrics2k_np = test_pipeline()
print()
print()


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

KeyError: 3