### loading and looking at the dataset

In [None]:
import torch
import numpy as np
import pandas as pd
import json
import pytorch_lightning as pl
import time
import logging
import re
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics import Metric
from torch.utils.data import Dataset, DataLoader
from termcolor import colored
from itertools import chain
from string import punctuation
from sklearn.model_selection import train_test_split
from transformers import pipeline
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
!pip install pytorch-lightning

 ### creating the dataset

In [None]:
# extract questions and answers (incl. empty)
def extract_qa(json_file, start=220, stop=230):
    """Flatten a SQuAD-style JSON file into one row per (question, answer).

    Args:
        json_file: Path to a JSON file whose top-level ``"data"`` list holds
            documents, each with ``"paragraphs"`` entries that carry a
            ``"context"`` string and a ``"qas"`` list.
        start: Index of the first document in ``data["data"]`` to read.
        stop: Index one past the last document to read; ``None`` reads
            through the end of the list. Indices beyond the available
            documents are ignored (slice semantics).

    Returns:
        A ``pandas.DataFrame`` with the columns ``question``, ``context``,
        ``answer_text``, ``answer_start`` and ``answer_end``, where
        ``answer_end`` is the exclusive end offset
        (``answer_start + len(answer_text)``). A question without answers
        contributes one row with an empty ``answer_text`` and offsets of 0.
    """
    columns = ["question", "context", "answer_text", "answer_start", "answer_end"]
    # Placeholder used for questions that have no answer in the document.
    no_answer = [{"text": "", "answer_start": 0}]

    with open(json_file, encoding="utf-8") as f:
        data = json.load(f)

    data_rows = []
    for document in data["data"][start:stop]:
        for paragraph in document["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                for answer in qa["answers"] or no_answer:
                    answer_text = answer["text"]
                    answer_start = answer["answer_start"]
                    data_rows.append({
                        "question": question,
                        "context": context,
                        "answer_text": answer_text,
                        "answer_start": answer_start,
                        "answer_end": answer_start + len(answer_text),
                    })

    # Passing the columns explicitly keeps the schema stable even when no
    # rows were extracted.
    return pd.DataFrame(data_rows, columns=columns)

In [None]:
df = extract_qa('data/CUADv1.json')

In [None]:
df.head()

In [None]:
df.shape

### Looking at samples and coloring

In [None]:
# look at one example question
sample_question = df.iloc[2]
sample_question

In [None]:
# colour the answer
def color_answer(question):
    """Return a row's context with the answer span coloured blue.

    Args:
        question: Mapping-like row (e.g. one DataFrame row) providing
            ``context``, ``answer_start`` and ``answer_end``. ``answer_end``
            is treated as the exclusive end offset of the answer, matching
            how ``extract_qa`` computes it (``answer_start + len(text)``).

    Returns:
        The context as a single string in which the answer is wrapped in
        blue colour codes and the surrounding text in white.
    """
    answer_start, answer_end = question['answer_start'], question['answer_end']
    context = question["context"]

    # Slice up to answer_end itself: using answer_end + 1 would colour one
    # character past the answer (and the first character of the context for
    # empty answers).
    before = context[:answer_start]
    answer = context[answer_start:answer_end]
    after = context[answer_end:]

    return colored(before, "white") + colored(answer, "blue") + colored(after, "white")

In [None]:
print(color_answer(sample_question))

### Tokenization

In [None]:
tokenizer = AutoTokenizer.from_pretrained("allenai/macaw-3b")

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
class CUAD(Dataset):
    """Torch dataset that tokenizes question/context/answer rows on access.

    Args:
        data: DataFrame with ``question``, ``context`` and ``answer_text``
            columns; rows are addressed by position.
        tokenizer: Callable tokenizer (Hugging Face style) that returns
            ``input_ids`` and ``attention_mask`` tensors and exposes
            ``pad_token_id``.
        source_max_token_len: Length the question/context pair is padded or
            truncated to.
        target_max_token_len: Length the answer text is padded or truncated to.
    """

    def __init__(self, data, tokenizer, source_max_token_len, target_max_token_len):
        self.tokenizer = tokenizer
        self.data = data
        # Honour the caller's limits instead of a fixed 512.
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            A dict with the raw ``question``, ``context`` and ``answer_text``
            plus flattened ``input_ids``, ``attention_mask``, ``labels`` and
            ``decoder_attention_mask`` tensors.
        """
        data_row = self.data.iloc[index]

        # Use the tokenizer handed to this dataset, not a module-level global.
        source_encoding = self.tokenizer(
            data_row["question"],
            data_row["context"],
            max_length=self.source_max_token_len,
            padding="max_length",
            # Only the context may be shortened; the question is kept intact.
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        target_encoding = self.tokenizer(
            data_row["answer_text"],
            max_length=self.target_max_token_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        labels = target_encoding["input_ids"]
        # Replace padding positions with -100 so they are ignored by the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            question=data_row["question"],
            context=data_row["context"],
            answer_text=data_row["answer_text"],
            input_ids=source_encoding["input_ids"].flatten(),
            attention_mask=source_encoding["attention_mask"].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=target_encoding["attention_mask"].flatten(),
        )

In [None]:
#sample_dataset = CUAD(df, tokenizer, 396, 32)

### Balanced dataset

In [None]:
def get_dataset_pos_mask(df):
    """
    Build a NumPy array, pos_mask, flagging the positive rows of ``df``.

    pos_mask[i] is True if the ith row has a non-empty ``answer_text``
    (i.e. it contains some text that should be highlighted) and False otherwise.
    """
    has_answer = df["answer_text"].ne('')
    return np.array(has_answer.to_list())

In [None]:
def get_balanced_dataset(dataset, df, neg_keep_frac=None):
    """
    Return a subset of ``dataset`` in which positive and negative examples
    are approximately balanced.

    Every positive example (non-empty ``answer_text`` in ``df``) is kept; each
    negative example is kept independently with probability ``neg_keep_frac``.

    Args:
        dataset: Indexable dataset whose ith item corresponds to the ith row
            of ``df``.
        df: DataFrame with an ``answer_text`` column used to tell positives
            from negatives.
        neg_keep_frac: Probability of keeping each negative example. When
            ``None`` (default) it is ``npos / nneg`` capped at 1, so that in
            expectation as many negatives as positives are kept. If either
            class is absent there is nothing to balance and every example
            is kept.

    Returns:
        A ``torch.utils.data.Subset`` over the kept positions.
    """
    pos_mask = np.asarray(get_dataset_pos_mask(df), dtype=bool)
    neg_mask = ~pos_mask
    npos, nneg = int(pos_mask.sum()), int(neg_mask.sum())

    if neg_keep_frac is None:
        if npos == 0 or nneg == 0:
            # Avoids dividing by zero and avoids returning an empty subset.
            neg_keep_frac = 1.0
        else:
            # So that in expectation there will be npos negative examples (--> balanced)
            neg_keep_frac = min(1.0, npos / nneg)

    # Keep all positive examples and a random subset of the negative ones.
    neg_keep_mask = neg_mask & (np.random.random(len(pos_mask)) < neg_keep_frac)
    keep_indices = np.flatnonzero(pos_mask | neg_keep_mask).tolist()

    return torch.utils.data.Subset(dataset, keep_indices)

In [None]:
#get_balanced_dataset(sample_dataset)

### Train/test-split

In [None]:
train_df, val_df = train_test_split(df, test_size=0.2)

In [None]:
train_df.shape, val_df.shape

In [None]:
class CUADDataModule(pl.LightningDataModule):
    """Lightning data module serving balanced CUAD datasets.

    The training frame feeds ``train_dataloader``; the second frame
    (``test_df``) feeds both ``val_dataloader`` and ``test_dataloader``.
    Both frames are wrapped in ``CUAD`` and subsampled with
    ``get_balanced_dataset`` in ``setup``.
    """

    def __init__(self, train_df, test_df, tokenizer, batch_size=4,
                 source_max_token_len=512, target_max_token_len=512):
        super().__init__()
        self.train_df = train_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def _balanced(self, frame):
        """Wrap ``frame`` in a ``CUAD`` dataset and balance it."""
        tokenized = CUAD(
            frame,
            self.tokenizer,
            self.source_max_token_len,
            self.target_max_token_len,
        )
        return get_balanced_dataset(tokenized, frame)

    def _eval_loader(self):
        """Loader over the held-out dataset, one example per batch."""
        return DataLoader(self.test_dataset, batch_size=1, num_workers=4)

    def setup(self, stage=None):
        """Create the balanced train and held-out datasets."""
        self.train_dataset = self._balanced(self.train_df)
        self.test_dataset = self._balanced(self.test_df)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=4,
        )

    def val_dataloader(self):
        """Validation loader (same dataset as the test loader)."""
        return self._eval_loader()

    def test_dataloader(self):
        """Test loader (same dataset as the validation loader)."""
        return self._eval_loader()

In [None]:
# Training hyperparameters: examples per training batch and number of epochs
# (EPOCHS is consumed by the Trainer's max_epochs).
BATCH_SIZE = 2
EPOCHS = 2

# Build the data module from the train/validation split and create its
# balanced datasets right away.
data_module = CUADDataModule(train_df, val_df, tokenizer, batch_size=BATCH_SIZE)
data_module.setup()

### Model

In [None]:
class CUADModel(pl.LightningModule):
    """Lightning wrapper around the pretrained ``allenai/macaw-3b`` seq2seq model."""

    def __init__(self):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained("allenai/macaw-3b")

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        """Run the wrapped model and return its ``(loss, logits)``."""
        result = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )
        return result.loss, result.logits

    def _loss_for(self, batch, metric_name):
        """Compute the loss for ``batch`` and log it under ``metric_name``."""
        loss, _ = self(
            batch["input_ids"],
            batch["attention_mask"],
            batch["decoder_attention_mask"],
            batch["labels"],
        )
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss, logged as ``train_loss``."""
        return self._loss_for(batch, "train_loss")

    def validation_step(self, batch, batch_idx):
        """Return the validation loss, logged as ``val_loss``."""
        return self._loss_for(batch, "val_loss")

    def test_step(self, batch, batch_idx):
        """Return the test loss, logged as ``test_loss``."""
        return self._loss_for(batch, "test_loss")

    def configure_optimizers(self):
        """Optimise all parameters with AdamW at a learning rate of 3e-5."""
        # NOTE(review): this AdamW is the one imported from `transformers`;
        # newer transformers releases deprecate it in favour of
        # torch.optim.AdamW — confirm against the installed version.
        return AdamW(self.parameters(), lr=3e-5)

#### Load the model

In [None]:
model = CUADModel()

In [None]:
# Keep only the single best checkpoint, judged by the lowest "val_loss"
# (the metric logged in CUADModel.validation_step), under the cpoints/ folder.
checkpoint_callback = ModelCheckpoint(
    dirpath = "cpoints",
    filename = "best-checkpoint",
    save_top_k = 1,
    verbose = True,
    monitor = "val_loss",
    mode = "min")

In [None]:
# Trainer that runs for EPOCHS epochs on a single GPU, saves checkpoints via
# checkpoint_callback and logs metrics every 40 steps.
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs = EPOCHS,
    accelerator = "gpu",
    devices = 1,
    log_every_n_steps=40)

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir . --bind_all

In [None]:
import gc

# Collect unreachable Python objects first ...
gc.collect()

# ... then release the CUDA memory cached by PyTorch before training starts.
torch.cuda.empty_cache()

#### Train

In [None]:
# Train the model created in the "Load the model" cell. `checkpoint_model`
# only exists once the "Train from checkpoint" cell further down has been run,
# so referencing it here fails on a top-to-bottom run; to resume from a
# checkpoint, load it first and pass it to trainer.fit instead.
trainer.fit(model, data_module)

#### Train from checkpoint

In [None]:
checkpoint_model = CUADModel.load_from_checkpoint("/notebooks/cpoints/best-checkpoint-v4.ckpt")

### Predictions

In [None]:
trained_model = CUADModel.load_from_checkpoint("/notebooks/cpoints/best-checkpoint-v5.ckpt")
trained_model.freeze()

In [None]:
def generate_answer(question):
    """Generate the trained model's answer for one question/context row.

    The question and context are tokenized as a pair, padded to 512 tokens
    (only the context is truncated), decoded without beam search up to 256
    tokens, and the generated sequences are decoded with special tokens
    removed and concatenated into a single string.
    """
    encoded = tokenizer(
        question["question"],
        question["context"],
        max_length=512,
        padding="max_length",
        truncation="only_second",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt',
    )

    generation_options = dict(
        num_beams=1,
        max_length=256,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
        use_cache=True,
    )
    output_ids = trained_model.model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_options,
    )

    decoded = []
    for sequence in output_ids:
        decoded.append(
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return "".join(decoded)

In [None]:
def generate_answers(df, batch_size=None):
    """Generate the trained model's answers for every row of ``df``.

    Args:
        df: DataFrame with ``question`` and ``context`` columns.
        batch_size: Number of rows tokenized and generated at once. ``None``
            (default) processes all rows in a single batch, as before; a
            smaller value bounds memory use on large frames.

    Returns:
        A list with one decoded answer string per row, in row order. An empty
        frame yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is given and smaller than 1.
    """
    if batch_size is not None and batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    questions = df["question"].to_list()
    contexts = df["context"].to_list()
    if not questions:
        return []

    step = len(questions) if batch_size is None else batch_size
    answers = []
    for begin in range(0, len(questions), step):
        source_encoding = tokenizer(
            questions[begin:begin + step],
            contexts[begin:begin + step],
            max_length=512,
            padding="max_length",
            truncation="only_second",
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt',
        )
        generated_ids = trained_model.model.generate(
            input_ids=source_encoding["input_ids"],
            attention_mask=source_encoding["attention_mask"],
            num_beams=1,
            max_length=512,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            use_cache=True,
        )

        # batch_decode expects the whole batch of sequences. Calling it on one
        # sequence at a time decodes every token id separately, and joining
        # those pieces with spaces splits subwords and leaves blanks where
        # special tokens were skipped.
        answers.extend(
            tokenizer.batch_decode(
                generated_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        )
    return answers

In [None]:
answer_list = val_df["answer_text"].to_list()

In [None]:
pred_list = generate_answers(val_df)

### Metrics

#### Precision/recall

In [None]:
IOU_THRESH = 0.5

In [None]:
def compute_precision_recall_v2(preds, answers):
    """Compute and print precision and recall of ``preds`` against ``answers``.

    Matching rules:
      * An empty answer counts as matched only when the prediction at the
        same position is also empty.
      * A non-empty answer counts as matched when any prediction has a cosine
        similarity of at least ``IOU_THRESH`` with it or contains it.
      * A prediction is a false positive when no answer is similar enough to
        it and it is not a substring of any answer.

    Args:
        preds: List of predicted answer strings.
        answers: List of reference answer strings ('' meaning "no answer").

    Returns:
        ``(precision, recall)``; either value is ``nan`` when its denominator
        is zero. The counts and both metrics are also printed.
    """
    tp, fp, fn = 0, 0, 0
    # first check if answers is empty
    if len(answers) == 0:
        fp += len(preds)  # false positive for each one
    else:
        for idx, ans in enumerate(answers):
            if ans == '':
                # Compare with the prediction at the same position;
                # answers.index(ans) would always return the first empty answer.
                match_found = idx < len(preds) and preds[idx] == ans
            else:
                match_found = any(
                    cosine_similarity(ans, pred) >= IOU_THRESH or ans in pred
                    for pred in preds
                )

            if match_found:
                tp += 1
            else:
                fn += 1

        # True positives are counted per answer above so that several
        # predictions matching the same answer are not double counted; here
        # only predictions that match no answer at all are added as fps.
        for pred in preds:
            match_found = any(
                cosine_similarity(ans, pred) >= IOU_THRESH or pred in ans
                for ans in answers
            )
            if not match_found:
                fp += 1

    precision = tp / (tp + fp) if tp + fp > 0 else np.nan
    recall = tp / (tp + fn) if tp + fn > 0 else np.nan
    print(f"tp: {tp}, fp:{fp}, fn: {fn}, sum: {tp+fp+fn}")
    print(f"precision: {precision:.2f}, recall: {recall:.2f}")
    return precision, recall

### Precision @80% Recall

##### Cosine similarity

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def cosine_similarity(sent1, sent2):
    """Cosine similarity between the stopword-free token sets of two strings.

    Each string is tokenized with ``word_tokenize``, English stopwords are
    dropped, and the remaining tokens are treated as binary bag-of-words
    vectors, so the result is ``|X ∩ Y| / sqrt(|X| * |Y|)``.

    Args:
        sent1: First string.
        sent2: Second string.

    Returns:
        A float in ``[0, 1]``. Returns 0.0 when either string has no tokens
        left after stopword removal.
    """
    # An empty string is replaced by '.' so it still yields a token to compare
    # on (two empty strings then compare as identical).
    if sent1 == '':
        sent1 = '.'
    if sent2 == '':
        sent2 = '.'

    # A set makes each stopword lookup constant time.
    sw = set(stopwords.words('english'))

    # remove stop words from the strings
    X_set = {w for w in word_tokenize(sent1) if w not in sw}
    Y_set = {w for w in word_tokenize(sent2) if w not in sw}

    # A sentence made up only of stopwords has a zero-length vector; report
    # no similarity instead of dividing by zero.
    if not X_set or not Y_set:
        return 0.0

    # For binary vectors the dot product is the number of shared tokens and
    # each norm is the square root of the set size.
    shared = len(X_set & Y_set)
    return shared / float((len(X_set) * len(Y_set)) ** 0.5)

### Other helper functions

In [None]:
def remove_space(sentence):
    """Collapse each run of whitespace to one space and trim both ends."""
    words = sentence.split()
    return " ".join(words)