In [None]:
!pip install transformers

In [None]:
# imports 
import numpy as np
import pandas as pd
import re
import torch

from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from torch.optim import SGD

## **Constructing NER dataset**

In [None]:
df = pd.read_csv('../input/homeworkexercise/Homework Exercise.csv')

In [None]:
df

In [None]:
# Function to construct Named Entity Recognition labels for the store-number task
def construct_ner_labels(sentence, label):
    """Tag every space-separated word of ``sentence`` as store number or not.

    A word is tagged ``'B-sto'`` when the first run of digits inside it equals
    the first run of digits inside ``label``, ignoring leading zeros on both
    sides. Every other word is tagged ``'O'``.

    Args:
        sentence: Transaction descriptor; it is split on single spaces, so
            consecutive spaces yield empty "words" that are tagged ``'O'``.
        label: Store number. Converted with ``str`` first, so numeric values
            are accepted as well as strings.

    Returns:
        A list with one tag per element of ``sentence.split(" ")``.
    """
    words = str(sentence).split(" ")

    # The target does not depend on the word, so extract it once up front.
    target_runs = re.findall(r'\d+', str(label))
    if not target_runs:
        # No digits in the label: nothing in the sentence can match it.
        return ['O'] * len(words)
    target = target_runs[0].lstrip('0')

    ner_labels = []
    for word in words:
        digit_runs = re.findall(r'\d+', word)
        # Only the first digit run of the word is compared against the target.
        if digit_runs and digit_runs[0].lstrip('0') == target:
            ner_labels.append('B-sto')
        else:
            ner_labels.append('O')

    return ner_labels

In [None]:
des = df.iloc[295]['transaction_descriptor']
label = df.iloc[295]['store_number']

In [None]:
des, label

In [None]:
construct_ner_labels(des, label)  # We denote the store entity by B-sto tag 

In [None]:
# Build one space-joined tag string per row of the dataframe
label_list = [
    ' '.join(construct_ner_labels(descriptor, store_no))
    for descriptor, store_no in zip(df['transaction_descriptor'], df['store_number'])
]

In [None]:
label_list

In [None]:
# construct a label column 
df['labels'] = label_list

In [None]:
# Partition the rows according to the value stored in the 'dataset' column
split_column = df['dataset']
df_train = df.loc[split_column == 'train']
df_val = df.loc[split_column == 'validation']
df_test = df.loc[split_column == 'test']

In [None]:
# Resetting the indices so every split is numbered 0..n-1.
# drop=True discards the old index instead of inserting it as an extra
# 'index' column, which also keeps this cell safe to re-run.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
df_test.head()

## **Pre-processing the dataset for NER task**

In [None]:
# Turn every training label string back into its list of per-word tags
labels = [tag_string.split() for tag_string in df_train['labels'].tolist()]
labels[:5]

In [None]:
# Collect the distinct tags that occur in the training labels.
# `unique_labels` is read by later cells (label/id mappings, model
# construction), so it has to be defined here for a fresh-kernel run.
unique_labels = {tag for tags in labels for tag in tags}

# Print the unique labels
print(unique_labels)
# The labelling function only emits two tags: 'O' (not a store number) and 'B-sto' (store number).

In [None]:
# Give every label an integer id (ids follow the sorted label order) and keep the inverse lookup
ids_to_labels = dict(enumerate(sorted(unique_labels)))
labels_to_ids = {name: idx for idx, name in ids_to_labels.items()}

# print a two way mapping
print(labels_to_ids)
print(ids_to_labels)

### **Tokenizing the data**

In [None]:
# We use HuggingFace library for tokenizer and BERT model 
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [None]:
sample = df_train.iloc[0]['transaction_descriptor']
text_tokenized = tokenizer(sample, padding='max_length', max_length=512, truncation=True, return_tensors="pt")

In [None]:
#text_tokenized

In [None]:
# word ids are useful for identifying various pieces of the word 
# This is necessary as we need to adjust the ner labels according to pieces  

word_ids = text_tokenized.word_ids()
print(sample)
print(tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0]))
print(word_ids)

In [None]:
# This function encapsulates the process of adjusting word-level labels to sub-word tokens.
# We either give every piece of a word the word's label, or label only the first piece and fill the rest with -100.
# -100 is used because PyTorch does not calculate the loss for targets with this value.

def align_label(texts, labels, label_all_tokens = False):
    """Expand word-level labels to one label id per token of ``texts``.

    Relies on the notebook-level ``tokenizer`` and ``labels_to_ids`` objects.

    Args:
        texts: The text to tokenize (padded/truncated to 512 tokens).
        labels: Sequence of label names, indexed by the tokenizer's word ids.
        label_all_tokens: When True every sub-word token of a word receives
            the word's label; when False only the first sub-word does and
            the remaining pieces get -100.

    Returns:
        A list of label ids, one per token. Tokens without a word id
        (``word_ids()`` yields ``None`` for them), words with no entry in
        ``labels`` and label names missing from ``labels_to_ids`` get -100.
    """
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    # NOTE(review): word ids follow the tokenizer's own word segmentation;
    # confirm it lines up with the whitespace-split `labels` passed in.
    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:
        is_first_piece = word_idx != previous_word_idx

        if word_idx is None:
            label_ids.append(-100)
        elif (is_first_piece or label_all_tokens) and word_idx < len(labels):
            # Unknown label names fall back to -100 explicitly rather than
            # through a bare except that would also hide unrelated errors.
            label_ids.append(labels_to_ids.get(labels[word_idx], -100))
        else:
            label_ids.append(-100)

        previous_word_idx = word_idx

    return label_ids

In [None]:
label = labels[0]

print(f'label : {label}\n')

# The token view of the sample is identical for both modes, so decode it once
tokens = tokenizer.convert_ids_to_tokens(text_tokenized["input_ids"][0])

# Show the alignment first with every sub word labelled, then with only the first one
demo_modes = [
    (True, 'label is assigned to all sub words : '),
    (False, 'label is assigned to first sub word only : '),
]

for position, (label_all_tokens, banner) in enumerate(demo_modes):
    if position > 0:
        # blank separator line between the two demonstrations
        print()

    new_label = align_label(sample, label, label_all_tokens)

    print(banner)
    print(tokens)
    print(new_label)

## **Constructing a Pytorch Dataset**

In [None]:
class NERDataset(torch.utils.data.Dataset):
    """Dataset pairing each tokenized transaction descriptor with its aligned label ids.

    Relies on the notebook-level ``tokenizer`` and ``align_label``.
    """

    def __init__(self, df):
        """Tokenize the 'transaction_descriptor' column and align the 'labels' column.

        Args:
            df: DataFrame with a 'transaction_descriptor' column and a
                'labels' column holding space-separated tag strings.
        """
        tag_lists = [tags.split() for tags in df['labels'].values.tolist()]
        # Convert to str once so the tokenizer call below and align_label
        # always operate on exactly the same text.
        descriptors = [str(text) for text in df['transaction_descriptor'].values.tolist()]

        self.texts = [
            tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
            for text in descriptors
        ]
        self.labels = [align_label(text, tags) for text, tags in zip(descriptors, tag_lists)]

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

    def get_batch_data(self, idx):
        """Return the tokenizer output stored for example ``idx``."""
        return self.texts[idx]

    def get_batch_labels(self, idx):
        """Return the label ids of example ``idx`` as a LongTensor."""
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        """Return the ``(tokenized input, label tensor)`` pair for example ``idx``."""
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

## **Model Definition**

In [None]:
# BERT model for Named Entity Recognition : this is basically a model for token classification
class BertNER(torch.nn.Module):
    """Thin wrapper around ``BertForTokenClassification`` ('bert-base-cased')."""

    def __init__(self, num_labels):
        """Load the pretrained model with a classification head of ``num_labels`` classes."""
        super(BertNER, self).__init__()
        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=num_labels)

    def forward(self, input_id, mask, label=None):
        """Run the wrapped model and return its raw tuple output.

        Args:
            input_id: Token ids passed as ``input_ids``.
            mask: Attention mask passed as ``attention_mask``.
            label: Optional target label ids. Callers in this notebook pass
                them and unpack the result as ``(loss, logits)``. When
                omitted, no labels are forwarded to the wrapped model, so
                the returned tuple is not expected to contain a loss.

        Returns:
            The tuple produced by the wrapped model with ``return_dict=False``.
        """
        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        return output

In [None]:
model = BertNER(len(unique_labels))

## **Training**

In [None]:
# Hyperparams
lr = 1e-3
epochs = 10
use_cuda = torch.cuda.is_available()

In [None]:
# set device to 'cuda' if available
device = torch.device("cuda" if use_cuda else "cpu")

In [None]:
# Construct the dataset and dataloaders 
train_dataset = NERDataset(df_train)
val_dataset = NERDataset(df_val)

train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=1, shuffle=True)
val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=1)

In [None]:
#for i,j in val_dataloader:
  #print(i,j)

In [None]:
# We use SGD optimizer, we can use Adam or something else too 
optimizer = SGD(model.parameters(), lr=lr)

In [None]:
if use_cuda:
  model = model.cuda()

In [None]:
# Best validation metrics observed so far across epochs
best_acc = 0
best_loss = 1000

for epoch_num in range(epochs):

    #######################################################
    ######################Training#########################
    #######################################################

    total_acc_train = 0
    total_loss_train = 0

    model.train()

    for train_data, train_label in tqdm(train_dataloader):

        # [0] removes the batch dimension added by the DataLoader; the
        # loop processes one example per step.
        train_label = train_label[0].to(device)
        mask = train_data['attention_mask'][0].to(device)
        input_id = train_data['input_ids'][0].to(device)

        optimizer.zero_grad()
        loss, logits = model(input_id, mask, train_label)

        # Score only the positions that carry a real label (not -100)
        logits_clean = logits[0][train_label != -100]
        label_clean = train_label[train_label != -100]

        predictions = logits_clean.argmax(dim=1)

        acc = (predictions == label_clean).float().mean()
        # .item() stores plain Python floats instead of device tensors
        total_acc_train += acc.item()
        total_loss_train += loss.item()

        loss.backward()
        optimizer.step()

    #######################################################
    ######################Validation#######################
    #######################################################

    model.eval()

    total_acc_val = 0
    total_loss_val = 0

    # No gradients are needed for validation; skipping them saves memory and time
    with torch.no_grad():
        for val_data, val_label in val_dataloader:

            val_label = val_label[0].to(device)
            mask = val_data['attention_mask'][0].to(device)
            input_id = val_data['input_ids'][0].to(device)

            loss, logits = model(input_id, mask, val_label)

            logits_clean = logits[0][val_label != -100]
            label_clean = val_label[val_label != -100]

            predictions = logits_clean.argmax(dim=1)

            acc = (predictions == label_clean).float().mean()
            total_acc_val += acc.item()
            total_loss_val += loss.item()

    val_accuracy = total_acc_val / len(df_val)
    val_loss = total_loss_val / len(df_val)

    # Keep track of the best validation metrics seen so far
    best_acc = max(best_acc, val_accuracy)
    best_loss = min(best_loss, val_loss)

    print(
    f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {val_loss: .3f} | Accuracy: {val_accuracy: .3f}')

## **Evaluation on Test Dataset**

In [None]:
test_dataset = NERDataset(df_test)
test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

In [None]:
# predicting labels over the test dataset
model.eval()

# One list of predicted label names per test example
predicted_labels = []

# Inference only: no_grad avoids tracking gradients for every forward pass
with torch.no_grad():
    for test_data, test_label in test_dataloader:

        # [0] removes the batch dimension added by the DataLoader
        test_label = test_label[0].to(device)
        mask = test_data['attention_mask'][0].to(device)
        input_id = test_data['input_ids'][0].to(device)

        # The model returns (loss, logits) when labels are supplied; only the logits are needed here
        logits = model(input_id, mask, test_label)[1]

        # Keep only the positions that carry a real label (not -100)
        logits_clean = logits[0][test_label != -100]

        predictions = logits_clean.argmax(dim=1)

        # obtaining the label names from the predicted indices
        pred_ner = [ids_to_labels[label_id] for label_id in predictions.tolist()]

        predicted_labels.append(pred_ner)

In [None]:
true_labels = []
pred_labels = []

# Now we iterate through each row of the test dataset and obtain the true store no. and predicted store no.
# Rows are matched to predictions by position, so this does not depend on the dataframe index.
test_rows = zip(df_test['transaction_descriptor'], df_test['store_number'])

for position, (sen, label) in enumerate(test_rows):
  s = sen.split(" ")
  print(s)

  row_predictions = predicted_labels[position]

  # Position of the first word tagged as a store number; when no word is
  # tagged we fall back to the first word of the descriptor.
  i = row_predictions.index('B-sto') if 'B-sto' in row_predictions else 0

  if i < len(s):
    # Take the first alphanumeric run of the word and drop leading zeros;
    # if the word has no such run, use the word unchanged.
    word_runs = re.findall(r'\w+', s[i])
    store_no = word_runs[0].lstrip('0') if word_runs else s[i]
  else:
    # The predicted position lies beyond the whitespace-split words
    store_no = ''

  true_labels.append(label)
  pred_labels.append(store_no)
  print(f'True label : {label}, predicted label : {store_no}')

In [None]:
# Exact-match accuracy: number of rows whose predicted store number equals the true one
acc = sum(1 for truth, guess in zip(true_labels, pred_labels) if truth == guess)

if true_labels:
    print(f'acc : {(acc/len(true_labels))*100}')
else:
    # Avoid dividing by zero when the test split is empty
    print('acc : n/a (no test rows)')

In [None]:
# End of notebook 