## Tweet Sentiment Extraction
### 2. Question-Answering (Q&A) approach

Given a question and a context, train a transformer model to find the answer in the text column (the context).

- Question: sentiment column 
- Context:  text column
- Answer: selected_text column

In [None]:
import pandas as pd
import torch
import numpy as np
from torch.utils.data import DataLoader
from helper import(
jaccard, 
find_all,
create_qa_train_dataset,
read_squad,
add_end_idx,
add_original_answer,
add_token_positions)
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import RobertaForQuestionAnswering, AutoTokenizer, AdamW, DistilBertForQuestionAnswering
SEED = 0

In [2]:
train = pd.read_csv('tweet-sentiment-extraction/train.csv')
test = pd.read_csv('tweet-sentiment-extraction/test.csv')

In [3]:
# 1 missing value from the train set can be dropped:
train.dropna(axis = 0, how ='any',inplace=True)

Split the dataset into train and validation sets (80%/20%):

In [4]:
train_df, validation_df = train_test_split(train, test_size=0.2, random_state=SEED)

print('Train set size: %s' % len(train_df))
print('Validation set size: %s' % len(validation_df))

Train set size: 21984
Validation set size: 5496


**Step 1**: Preparing the Tweeter dataset in the Q&A format (that matches the SQUAD dataset that the transformer model was pre-trained on): 

In [5]:
train = np.array(train_df)
test = np.array(validation_df)

In [None]:
qa_train = create_qa_train_dataset(train)
qa_test = create_qa_train_dataset(test)

with open('dataset_qa_format/train.json', 'w') as outfile:
    json.dump(qa_train, outfile)
    
with open('dataset_qa_format/test.json', 'w') as outfile:
    json.dump(qa_test, outfile)

In [6]:
# read the json files in the Q&A format and prepare the training and validation sets
train_contexts, train_questions, train_answers = read_squad('dataset_qa_format/train.json')
val_contexts, val_questions, val_answers = read_squad('dataset_qa_format/test.json')

In [7]:
train_contexts[3]

' Ben and Jerry...yummmmy!!!'

In [8]:
train_questions[3]

'positive'

In [9]:
train_answers[3]

{'answer_start': 16, 'text': '.yummmmy!'}

In [10]:
# adding an 'answer_end' value to the dicts
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [11]:
train_answers[3:6]

[{'answer_start': 16, 'text': '.yummmmy!', 'answer_end': 25},
 {'answer_start': 1,
  'text': 'wow.. purple leopard skin.  fieeerrceee..',
  'answer_end': 42},
 {'answer_start': 13, 'text': 'fun', 'answer_end': 16}]

**Step 2**: Tokenise the dataset to train a transformer model

In [12]:
# initialize the tokeniser 
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [13]:
# tokenize train and validation datasets 
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

In [14]:
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
print(train_encodings["input_ids"][3])

[0, 1664, 8, 6509, 734, 219, 783, 5471, 4783, 16506, 2, 2, 22173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [100]:
print(train_encodings["input_ids"][3])

[101, 3841, 1998, 6128, 1012, 1012, 1012, 9805, 7382, 18879, 999, 999, 999, 102, 3893, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
tokenizer.decode(train_encodings["input_ids"][3])

'<s> Ben and Jerry...yummmmy!!!</s></s>positive</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [None]:
tokenizer.decode(train_encodings["input_ids"][3])

'[CLS] Ben and Jerry... yummmmy!!! [SEP] positive [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [None]:
print(train_encodings["attention_mask"][3])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
print(train_encodings["attention_mask"][3])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [101]:
# add ground truth answers to the dict, tokenize and pad them to the same length
original_answer_train = add_original_answer(train_encodings, train_answers, tokenizer)
original_answer_val = add_original_answer(val_encodings, val_answers, tokenizer)

In [102]:
# add start_positions and end_positions to the Encoding objects
add_token_positions(train_encodings, train_answers, original_answer_train, tokenizer)
add_token_positions(val_encodings, val_answers, original_answer_val, tokenizer)

In [None]:
val_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions', 'orig_answer'])

**Step 3**: Create a PyTorch dataset object

In [103]:
class TweetDataset(torch.utils.data.Dataset):
    """
    Create a PyTorch dataset object
    """
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training and validation sets
train_dataset = TweetDataset(train_encodings)
val_dataset = TweetDataset(val_encodings)

**Step 4**: Fine-tuning the RoBERTa model on the Twitter dataset

In [None]:
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
# model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')
#model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

In [74]:
# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = AdamW(model.parameters(), lr=5e-5)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(3):
    # set model to train mode
    model.train()
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        orig_answer = batch['orig_answer'].to(device)
        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        # extract loss
        loss = outputs[0]
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optim.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 1380/1380 [07:07<00:00,  3.23it/s, loss=0.741]
Epoch 1: 100%|██████████| 1380/1380 [07:23<00:00,  3.11it/s, loss=0.546]
Epoch 2: 100%|██████████| 1380/1380 [07:23<00:00,  3.11it/s, loss=0.953]


In [None]:
model_save_name = 'distilbert_squad_twitter.pt'
path = F"/models/{model_save_name}" 
torch.save(model.state_dict(), path)

In [None]:
model_save_name = 'distilbert_squad_twitter.pt'
path = F"/models/{model_save_name}" 
model.load_state_dict(torch.load(path))

In [None]:
def eval_model(model_type):
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    # move model over to detected device
    model.to(device)
    # switch model out of training mode
    model.eval()

    val_loader = DataLoader(val_dataset, batch_size=16)

    true_answers = []
    pred_answers = []
    jaccard_scores = []
    
    if model_type == "bert":
        split_token = 1
    if model_type == "roberta":
        split_token = 0

    # initialize loop for progress bar
    loop = tqdm(val_loader)
    # loop through batches
    for batch in loop:
        # we don't need to calculate gradients as we're not training
        with torch.no_grad():
            # pull batched items from loader
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_true = batch['start_positions'].to(device) 
            end_true = batch['end_positions'].to(device)
            orig_answer = batch['orig_answer'].to(device)
            for answer in orig_answer:
                counter = 0
                for token in answer:
                    counter += 1
                    if token == split_token:
                        answer = answer[1:counter-2]
                        counter = 0
                        break
                true_answers.append(tokenizer.decode(answer))
            # make predictions
            outputs = model(input_ids, attention_mask=attention_mask)
            # pull preds out
            # Get the highest probability from the model output for the start and end positions
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)

            for input_id, s_pred, e_pred in zip(input_ids, start_pred, end_pred):
                predict_answer_tokens = input_id[s_pred:e_pred+1]
                pred_answers.append(tokenizer.decode(predict_answer_tokens))

            for true_answer, pred_answer in zip(true_answers, pred_answers):
                jaccard_scores.append(jaccard(true_answer, pred_answer))

In [77]:
# roberta-base-squad2
score = sum(jaccard_scores)/len(jaccard_scores)
print(score)  

0.7023163663566468


In [91]:
# distilbert-base-cased-distilled-squad
score = sum(jaccard_scores)/len(jaccard_scores)
print(score) 

0.6856482923838751


In [107]:
# distilbert-base-uncased
score = sum(jaccard_scores)/len(jaccard_scores)
print(score) 

0.6918300386014856
