In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaConfig, RobertaForTokenClassification

## Load the data

In [2]:
path_to_data= "../data/annotated/cleaned_data.csv"
df = pd.read_csv(path_to_data)
prod_start = 'B-PROD-STRT'
prod_end = 'B-PROD-END'
prod = 'B-PROD'

In [3]:
df.head()

Unnamed: 0,id,text,status,url,file_name,label,Comments
0,1,Page Not Found The page you are looking for do...,1,http://www.vawayside.net/store/products/tag/beds,data/crawled_texts/idx_4_2_s_1.json,[],[]
1,2,(07) 5556 0693 [email protected] 130 Siganto D...,1,https://hemisphereliving.com.au/products/,data/crawled_texts/idx_0_4_s_1.json,"[[187, 210, 'product'], [237, 267, 'product'],...",[]
2,3,404 We could not find the page you were lookin...,0,https://edenliving.online/collections/summerlo...,data/crawled_texts/idx_1_4_s_0.json,[],[]
3,4,404 Page not found Continue shopping,0,https://www.ourfurniturewarehouse.com.au/produ...,data/crawled_texts/idx_2_0_s_0.json,[],[]
4,5,View fullsize image Email us about this produc...,1,https://www.hudsonfurniture.com.au/products/st...,data/crawled_texts/idx_3_1_s_1.json,"[[98, 123, 'product'], [590, 617, 'product'], ...",[]


In [4]:
# getting the count of labels
anno_counts = df.label.apply(len)

In [5]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [6]:
def transform_labels(x):
    text, labels =x['text'], eval(x['label'])
    if len(labels) == 0:
        return text, ",".join(["O"]*len(text.split()))
    last_index = 0
    results = []
    for label in labels:
        start, end, l = label
        before_words = text[last_index: start].split(" ")
        before_words = [word for word in before_words if len(word) > 0]
        num_of_words_before = len(before_words)
        
        between_words = text[start: end].split(" ")
        between_words = [word for word in between_words if len(word) > 0]
        num_of_words_between = len(between_words)
        results.extend(["O"]*num_of_words_before)
#         product_label = [prod_start] + ['O'] * (num_of_words_between-2) 
#         if num_of_words_between > 1:
#             product_label += [prod_end]
        product_label = [prod]*num_of_words_between
        results.extend(product_label)
        last_index = end
    last_words = text[last_index:].split(" ")
    last_words = [word for word in last_words if len(word) > 0]
    results.extend(["O"] * len(last_words))
    
    return text, ",".join(results)
df = pd.read_csv(path_to_data)
df[['text', 'label']] = df.apply(lambda x: pd.Series([transform_labels(x)[0], transform_labels(x)[1]]), axis=1)
final_df = df[['text', 'label']] 

In [7]:
final_df.head()

Unnamed: 0,text,label
0,Page Not Found The page you are looking for do...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
1,(07) 5556 0693 [email protected] 130 Siganto D...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,404 We could not find the page you were lookin...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,404 Page not found Continue shopping,"O,O,O,O,O,O"
4,View fullsize image Email us about this produc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-PROD,B-PROD,..."


## check the class imbalance

In [8]:
counts = final_df.apply(lambda x:len([ label for label in x['label'].split(",") if not label == 'O']), axis=1)
final_df = final_df[counts>0]

In [9]:
len(counts[counts>0]), len(counts[counts==0])

(105, 149)

In [10]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

In [11]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """
    labels = text_labels.split(",")
    words = sentence.split()
    tokenized_sentence = []
    processed_labels = []
    for word, label in zip(words, labels):
        
        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        processed_labels.extend([label] * n_subwords)

    return tokenized_sentence, processed_labels

In [12]:
# tokenize_and_preserve_labels(final_df.iloc[1,0], final_df.iloc[1,1], tokenizer)

In [13]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.text[index]  
        word_labels = self.data.label[index]  
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)
        
        # step 2: add special tokens (and corresponding labels)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # add outside label for [CLS] token
        labels.insert(-1, "O") # add outside label for [SEP] token

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_sentence) > maxlen):
          # truncate
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # pad
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)

        label_ids = [label2id[label] for label in labels]
        # the following line is deprecated
        #label_ids = [label if label != 0 else -100 for label in label_ids]
        
        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'token_type_ids': torch.tensor(token_ids, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [14]:
MAX_LEN = 128
label2id = {'O':0, prod:1}#, prod_start:2, prod_end:3}
id2label = {0:'O', 1:prod}#, 2:prod_start, 3:prod_end}
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [15]:
train_size = 0.8
train_dataset = final_df.sample(frac=train_size,random_state=200)
test_dataset = final_df.drop(train_dataset.index)

## try to unbalance the data

In [16]:
counts = train_dataset.apply(lambda x:len([ label for label in x['label'].split(",") if not label == 'O']), axis=1)
train_dataset = pd.concat([train_dataset]+[train_dataset[counts>3]]*2)

counts = test_dataset.apply(lambda x:len([ label for label in x['label'].split(",") if not label == 'O']), axis=1)
test_dataset = pd.concat([test_dataset] +[test_dataset[counts>3]]*2)

## prepare training data

In [17]:
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [19]:
model = RobertaForTokenClassification.from_pretrained('roberta-base', 
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [20]:
ids = training_set[0]["ids"].unsqueeze(0)
mask = training_set[0]["mask"].unsqueeze(0)
targets = training_set[0]["targets"].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss

tensor(0.5416, device='cuda:0', grad_fn=<NllLossBackward0>)

In [21]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 2])

## training

In [22]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [23]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()    
    for idx, batch in enumerate(training_loader):        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
        active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")
    

def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # now, use mask to determine where we should compare predictions with targets (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # active accuracy is also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy
    
    # print(eval_labels)
    # print(eval_preds)

    labels = [id2label[id.item()] for id in eval_labels]
    predictions = [id2label[id.item()] for id in eval_preds]

    # print(labels)
    # print(predictions)
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [24]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)
    labels, predictions = valid(model, testing_loader)

Training epoch: 1
Training loss per 100 training steps: 0.5795525908470154
Training loss epoch: 0.3515933001482928
Training accuracy epoch: 0.8794285847675769
Validation loss per 100 evaluation steps: 0.2232712209224701
Validation Loss: 0.3062053466072449
Validation Accuracy: 0.8853665865384616
Training epoch: 2
Training loss per 100 training steps: 0.3445429801940918
Training loss epoch: 0.20640906222440578
Training accuracy epoch: 0.8975257858908766
Validation loss per 100 evaluation steps: 0.1316119134426117
Validation Loss: 0.3617195969877335
Validation Accuracy: 0.9107572115384616
Training epoch: 3
Training loss per 100 training steps: 0.16416431963443756
Training loss epoch: 0.1190920157565011
Training accuracy epoch: 0.9497668079438932
Validation loss per 100 evaluation steps: 0.3135789930820465
Validation Loss: 0.3456073824602824
Validation Accuracy: 0.9030949519230769
Training epoch: 4
Training loss per 100 training steps: 0.0625772550702095
Training loss epoch: 0.072650006592

KeyboardInterrupt: 

In [25]:
train_dataset = final_df.sample(frac=train_size,random_state=200)
test_dataset = final_df.drop(train_dataset.index)

test_dataset = test_dataset.reset_index(drop=True)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
testing_loader = DataLoader(testing_set, **test_params)

In [26]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.2304617017507553
Validation Loss: 0.28299977257847786
Validation Accuracy: 0.9367897727272727


In [27]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

        PROD       0.77      0.48      0.59       266

   micro avg       0.77      0.48      0.59       266
   macro avg       0.77      0.48      0.59       266
weighted avg       0.77      0.48      0.59       266



In [28]:
# print(classification_report([labels], [predictions]))
# labels

In [None]:
path = "../data/saved_models/roberta_trained.pth"
torch.save(model, path)

In [None]:
# idx= np.random.randint(100)
# # print the first 30 tokens and corresponding labels
# for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[idx]["ids"][:100]), training_set[idx]["targets"][:100]):
#   print('{0:10}  {1}'.format(token, id2label[label.item()]))