<a href="https://colab.research.google.com/github/kevin-rn/Grounding-LM/blob/main/retrieval_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Google Colab setup


In [None]:
import os
from google.colab import drive

drive.mount("/content/drive")

outdir = "./out"
if not os.path.exists(outdir):
    os.mkdir(outdir)

Change below command to point to the correct data folder


In [None]:
%cp -R ./drive/MyDrive/Grounding_LM/data ./

# Packages & Imports


In [None]:
%pip install -q datasets pytorch-lightning transformers sentence-transformers openai tiktoken annoy
%pip install -U -q pip setuptools wheel
%pip install -U -q spacy
!python -m spacy download en_core_web_sm

In [None]:
from datasets import load_dataset, load_from_disk
import pandas as pd
import tiktoken
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from tqdm.auto import tqdm
import openai
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import random
import spacy
from spacy.lang.en import English

nlp = spacy.load("en_core_web_sm")

# Data


## Dataset


In [None]:
# Download Dataset
def download_data(dataname):
    match dataname:
        case "xsum":
            data = load_dataset("xsum")
        case "cnn_dailymail":
            data = load_dataset("cnn_dailymail", "3.0.0")
        case _:
            data = load_dataset("webis/tldr-17")

    # Split Dataset
    df_train = pd.DataFrame(data=data["train"])
    df_val = pd.DataFrame(data=data["validation"])
    df_test = pd.DataFrame(data=data["test"])

    # Rename columns for later usage
    df_train.columns = ["text", "summary", "id"]
    df_val.columns = ["text", "summary", "id"]
    df_test.columns = ["text", "summary", "id"]

    # Save to data folder (inside Grounding_LM folder)
    df_train.to_csv(f"data/{dataname}/train.csv", index=False)
    df_val.to_csv(f"data/{dataname}/validation.csv", index=False)
    df_test.to_csv(f"data/{dataname}/test.csv", index=False)


# Read Dataset from folder
def read_data(dataname):
    df_train = pd.read_csv(f"data/{dataname}/train.csv")
    df_val = pd.read_csv(f"data/{dataname}/validation.csv")
    df_test = pd.read_csv(f"data/{dataname}/test.csv")
    df_test[["text", "summary"]] = df_test[["text", "summary"]].astype(str)
    return df_train, df_val, df_test

In [None]:
dataset = "xsum"
# download_data(dataset)
df_train, df_val, df_test = read_data(dataname=dataset)
df_train.head()

Unnamed: 0,text,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


## Custom Datamodule


In [None]:
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, text_max_len=512, summary_max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        text = self.data.iloc[idx]["text"]
        summary = self.data.iloc[idx]["summary"]
        id = self.data.iloc[idx]["id"]

        text_encoding = self.tokenizer(
            text,
            max_length=self.text_max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        summary_encoding = self.tokenizer(
            summary,
            max_length=self.summary_max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        labels = summary_encoding["input_ids"]
        labels[labels == 0] = -100

        return {
            "text": text,
            "summary": summary,
            "id": id,
            "input_ids": text_encoding["input_ids"],
            "attention_mask": text_encoding["attention_mask"],
            "labels": labels.flatten(),
            "labels_attention_mask": summary_encoding["attention_mask"],
        }

In [None]:
class CustomDataModule(pl.LightningDataModule):
    def __init__(
        self,
        df_train,
        df_val,
        df_test,
        tokenizer,
        batch=8,
        text_max_len=512,
        summary_max_len=128,
    ):
        super().__init__()
        self.df_train = df_train
        self.df_val = df_val
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.batch = batch
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def setup(self, stage=None):
        self.train_dataset = CustomDataset(
            self.df_train, self.tokenizer, self.text_max_len, self.summary_max_len
        )
        self.val_dataset = CustomDataset(
            self.df_val, self.tokenizer, self.text_max_len, self.summary_max_len
        )
        self.test_dataset = CustomDataset(
            self.df_test, self.tokenizer, self.text_max_len, self.summary_max_len
        )

    def collate_fn(self, batch):
        texts = [item["text"] for item in batch]
        summaries = [item["summary"] for item in batch]
        ids = [item["id"] for item in batch]
        text_input_ids = pad_sequence(
            [item["input_ids"].flatten() for item in batch], batch_first=True
        )
        text_attention_masks = pad_sequence(
            [item["attention_mask"].flatten() for item in batch], batch_first=True
        )
        labels = pad_sequence([item["labels"] for item in batch], batch_first=True)
        labels_attention_masks = pad_sequence(
            [item["labels_attention_mask"].flatten() for item in batch],
            batch_first=True,
        )

        return {
            "text": texts,
            "summary": summaries,
            "id": ids,
            "input_ids": text_input_ids,
            "attention_mask": text_attention_masks,
            "labels": labels,
            "labels_attention_mask": labels_attention_masks,
        }

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch,
            shuffle=True,
            num_workers=2,
            collate_fn=self.collate_fn,
        )

    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch,
            shuffle=True,
            num_workers=2,
            collate_fn=self.collate_fn,
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch,
            shuffle=True,
            num_workers=2,
            collate_fn=self.collate_fn,
        )

## Model


In [None]:
class SummaryModel(pl.LightningModule):
    def __init__(self, model):
        super().__init__()
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model, return_dict=True)

    def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )
        return output

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_attention_mask = batch["labels_attention_mask"]
        labels = batch["labels"]

        output = self.forward(input_ids, attention_mask, decoder_attention_mask, labels)
        loss = output.loss
        self.log(
            "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_attention_mask = batch["labels_attention_mask"]
        labels = batch["labels"]

        output = self.forward(input_ids, attention_mask, decoder_attention_mask, labels)
        loss = output.loss
        self.log(
            "val_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        decoder_attention_mask = batch["labels_attention_mask"]
        labels = batch["labels"]

        output = self.forward(input_ids, attention_mask, decoder_attention_mask, labels)
        loss = output.loss
        self.log(
            "test_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return loss

    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=0.0001)

# Fine-Tuned models


### Pretrained T5 model


In [None]:
# N_EPOCHS = 5
# BATCH_SIZE = 32

# tokenizer = AutoTokenizer.from_pretrained('t5-base')
# data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=BATCH_SIZE)
# model = SummaryModel('t5-base')

# checkpoint_callback = ModelCheckpoint(dirpath="checkpoints",
#                                       filename="best_checkpoints",
#                                       save_top_k=1,
#                                       verbose=True,
#                                       monitor="val_loss",
#                                       mode="min")

# trainer = pl.Trainer(callbacks=checkpoint_callback, max_epochs=N_EPOCHS, accelerator="gpu", enable_progress_bar=True)

In [None]:
# trainer.fit(model, data_module)

# trained_model = SummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# trained_model.freeze()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sysresearch101/t5-large-finetuned-xsum-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "sysresearch101/t5-large-finetuned-xsum-cnn"
)

In [None]:
data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=8)
data_module.setup()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

summaries, ids = [], []

with torch.no_grad():
    for batch in tqdm(data_module.test_dataloader()):
        summary_ids = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            max_length=150,
            num_beams=2,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
        )

        summary_text = [
            tokenizer.decode(
                sum_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for sum_id in summary_ids
        ]
        summaries.extend(summary_text)
        ids.extend(batch["id"])

In [None]:
data = {"id": ids, "generated": summaries}
df_new = pd.DataFrame(data)
df_new.to_csv(f"out/t5_large_{dataset}.csv", index=False)

### BART


In [None]:
# N_EPOCHS = 5
# BATCH_SIZE = 32

# tokenizer = AutoTokenizer.from_pretrained('bart-base')
# data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=BATCH_SIZE)
# model = SummaryModel('bart-base')

# checkpoint_callback = ModelCheckpoint(dirpath="checkpoints",
#                                       filename="best_checkpoints",
#                                       save_top_k=1,
#                                       verbose=True,
#                                       monitor="val_loss",
#                                       mode="min")

# trainer = pl.Trainer(callbacks=checkpoint_callback, max_epochs=N_EPOCHS, accelerator="gpu", enable_progress_bar=True)

In [None]:
# trainer.fit(model, data_module)

# trained_model = SummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# trained_model.freeze()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-xsum")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-xsum")

In [None]:
data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=8)
data_module.setup()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

summaries, ids = [], []

dataloader = data_module.test_dataloader()

with torch.no_grad():
    for batch in tqdm(dataloader):
        summary_ids = model.generate(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            num_beams=6,
            length_penalty=2.0,
            no_repeat_ngram_size=4,
            min_length=10,
            max_length=60,
            early_stopping=True,
        )

        summary_text = [
            tokenizer.decode(
                sum_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for sum_id in summary_ids
        ]
        summaries.extend(summary_text)
        ids.extend(batch["id"])

In [None]:
data = {"id": ids, "generated": summaries}
df_new = pd.DataFrame(data)
df_new.to_csv(f"results/bart_large_{dataset}.csv", index=False)

# OPEN-AI Models


In [None]:
openai.api_key = ""

### text-davinci-003


In [None]:
def format_prompt(example_text, input_text):
    prompt = (
        """System: You are an extractive summarizer that follows the output pattern.
            User: The following examples are successful extractive summarization instances: """
        + example_text
        + """.
            Please summarize the following document. Document: + """
        + str(input_text)
        + """,output: """
    )
    return prompt


def generate_summary_davinci003(input_list, example_text):
    sum_dict = {"text": [], "summary": []}

    for i, input_text in enumerate(tqdm(input_list)):
        # if i == 2:
        #   break

        sum_dict["text"].append(input_text)
        prompt = format_prompt(example_text, input_text)

        response = openai.Completion.create(
            model="text-davinci-003",
            prompt=prompt,
            temperature=0.5,
            max_tokens=128,
            top_p=1.0,
            frequency_penalty=0.0,
            presence_penalty=0.0,
        )
        sum_dict["summary"].append(response["choices"][0]["text"])

    sum_df = pd.DataFrame(sum_dict)
    sum_df.to_csv(f"out/davinci003_{dataset}.csv", index=False)

In [None]:
text_inputs = df_test["text"].tolist()

random_rows = df_train.sample(n=2)
examples_prompt = ""
for _, row in random_rows.iterrows():
    examples_prompt += (
        "input: " + row["text"] + ",output: " + row["summary"].replace("\n", "") + "\n"
    )

# print(examples_prompt + " \n\n Total wordcount examples: " + str(len(examples_prompt)))

In [None]:
# Ensure max token length of prompt is below 4097 token boundary (Note: Take into account max_tokens in 'openai.Completion.create' above)
encoding = tiktoken.encoding_for_model("text-davinci-003")
token_len = []
for text in text_inputs:
    input = format_prompt(examples_prompt, text)
    tokens = encoding.encode(input)
    token_len.append(len(tokens))

max(token_len)

4424

In [None]:
generate_summary_davinci003(text_inputs, examples_prompt)

### get-3.5-turbo


In [None]:
text_inputs = df_test["text"].tolist()

tiktoken_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_len = []
for text in tqdm(text_inputs):
    tokens = tiktoken_enc.encode(str(text))
    token_len.append(len(tokens))

print(f"max token length: {max(token_len)}, total tokens: {sum(token_len)}")

100%|██████████| 11334/11334 [00:05<00:00, 2035.46it/s]

max token length: 15255, total tokens: 5395732





In [None]:
tiktoken_enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


def summmaries_to_chunks(summary_text):
    chunks = [[]]
    chunk_token_count = 0

    # split sentences
    sentences = nlp(summary_text)

    for sentence in sentences.sents:
        # calculate token size using tiktoken library
        token_count = len(tiktoken_enc.encode(sentence.text))
        chunk_token_count += token_count

        # Check if token count is above the 4097 limit.
        # Else add new sublist and reset count for new chunk.
        if chunk_token_count > 3900:
            chunks.append([])
            chunk_token_count = token_count

        # Add text to last chunk
        chunks[len(chunks) - 1].append(sentence.text)

    return chunks

In [None]:
def generate_summary_gpt35(input_list, chunking=False):
    sum_dict = {"text": [], "summary": []}
    for i, input_text in enumerate(tqdm(input_list)):
        # if i == 2:
        #   break

        sum_dict["text"].append(input_text)

        if chunking:
            chunks = summmaries_to_chunks(input_text)
        else:
            chunks = [input_text]
        chunk_summaries = []

        for chunk_text in chunks:
            prompt_text = " ".join(chunk_text)
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo",
                messages=[
                    {
                        "role": "system",
                        "content": "You are assistant who replies with a clear and concise summary for every text.",
                    },
                    {
                        "role": "user",
                        "content": "Summarize the following text in max 5 sentences: "
                        + prompt_text,
                    },
                ],
                temperature=0.5,
                max_tokens=128,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0,
            )
            chunk_summaries.append(response["choices"][0]["message"]["content"])

        summary = " ".join(chunk_summaries)
        sum_dict["summary"].append(summary)
    return pd.DataFrame(sum_dict)

In [None]:
gpt35_df = generate_summary_gpt35(text_inputs, True)
gpt35_df.to_csv(f"results/davinci003_{dataset}.csv", index=False)
gpt35_df.head()

# Task

Extract sentences from cnn dailymail articles and index them. Use claim detection or evidence sentence selection models to achieve this. For each summary generated from model consider it to be a claim and retrieve closed sentences from index. Use an out of box stance detection model to verify the summary against retrieved evidences.


In [None]:
from annoy import AnnoyIndex
import ast
from collections import Counter
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
import torch
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import random
import time
import os
from google.colab import drive

drive.mount('/content/drive')
nltk.download('punkt')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tqdm.pandas()

%cd drive/MyDrive/Grounding_LM/

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


/content/drive/MyDrive/Grounding_LM


### Load data


In [None]:
df_t5_cnn = pd.read_csv(
    "out/generated summaries/t5_large_cnn_dailymail.csv", index_col=0
)
df_t5_cnn["sentences"] = df_t5_cnn["text"].apply(sent_tokenize)
df_t5_cnn.head()

Unnamed: 0,text,summary,id,generated
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,f001ec5c4704938247d27a44948eebb37ae98d01,The Palestinians have become a member of the I...
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...",230c522854991d053fe98a718b1defa077a8efef,A dog that was apparently buried alive after b...
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,4495ba8f3a340d97a9df1476f8a35502bcce1f69,It's been a busy week for Iran.
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,a38e72fed88684ec8d60dd5856282e999dc8c0ca,Five Americans who were being treated for Ebol...
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,c27cf1b136cc270023de959e7ab24638021bc43f,A student at Duke University has admitted hang...


In [None]:
# HaluEval dataset containing right and hallucinated summaries for a given document
df_halueval = pd.read_csv("data/halueval/summarization_data.csv")
total_docs = 500
sampled_df = df_halueval.sample(n=total_docs, random_state=42)
sampled_df["index"] = sampled_df.index
sampled_df["sentences"] = sampled_df["document"].apply(sent_tokenize)
sampled_df.head()

Unnamed: 0,document,right_summary,hallucinated_summary,index
6252,Driving around in their mother's consular BMW ...,"Marc Wabafiyebazu, 15, bragged to officials th...",Brothers Marc and Jean Wabafiyebazu were arres...,6252
4684,Lance Armstrong has said the World Anti-Doping...,WADA director general David Howman said he was...,Lance Armstrong has apologized to the World An...,4684
1731,Andy King thinks his 50th goal for Leicester C...,Andy King scored his 50th goal to earn Leicest...,Leicester City secured a crucial win against W...,1731
4742,West Ham have announced a new five-year multi-...,West Ham have signed a new kit deal with Umbro...,West Ham have announced a partnership with Umb...,4742
4521,"At half-time, everything pointed to another hu...",George Ford scythed through the Leinster defen...,Bath's George Ford scored a hat-trick of tries...,4521


### Claim detection

Load pre-trained claim detection model and extract claimworthy sentences for each document.


In [None]:
claim_tokenizer = AutoTokenizer.from_pretrained("Nithiwat/bert-base_claimbuster")
claim_model = AutoModelForSequenceClassification.from_pretrained(
    "Nithiwat/bert-base_claimbuster"
).to(device)

In [None]:
def extract_claimworthy(sentences):
    tokenized_inputs = claim_tokenizer(
        sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        logits = claim_model(**tokenized_inputs).logits
        logits = logits.cpu()
    label_indices = torch.nonzero(logits.argmax(dim=1) == 0).squeeze().cpu()
    # Prevent looping over 0d-tensor error.
    if label_indices.dim() == 0:
        label_indices = label_indices.unsqueeze(0)

    claimworthy = [sentences[idx] for idx in label_indices]
    return claimworthy

In [None]:
# df_test['claims'] = df_test['sentences'].progress_apply(extract_claims)
# df_test.to_csv('claims.csv', index=False)

In [None]:
sentences = extract_claimworthy(df_test["sentences"][0])

print(f"evidence: {' '.join(sentences)} \nclaim: {df_test['generated'][0]}")

evidence: The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. These are substantive commitments, which cannot be taken lightly," she said. Prosecutor Fatou Bensouda said her office would "conduct its analysis in full independence and impartiality." 
claim: The Palestinians have become a member of the International Criminal Court (ICC).


In [None]:
df_test["text"][0]

'(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC\'s founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed "in the occupied Palestinian territory, including East Jerusalem, since June 13, 2014." Later that month, the ICC opened a preliminary examination into the situation in Palestinian territories, paving the way for possible war crimes investigations against Israelis. As members of the court, Palestinians may be subject to counter-charges as well. Israel and the United States, neither of which is an ICC member, opposed the Palestinians\' efforts to join the body. But Palestinian Foreign Minister Riad al-Malki, speaking at Wednesday\'s ceremony

### Construct Index

Calculate embeddings for each (claimworthy) sentence and Store embeddings using ANNOY library for index and retrieval.


In [None]:
model = SentenceTransformer(
    "sentence-transformers/paraphrase-MiniLM-L6-v2"
)  # 384 dimensional dense vector space

In [None]:
def index_annoy(df_input, df_name, embedding_dim=384, number_of_trees=100):
    for doc_id, row in df_input.iterrows():
        embeddings = [model.encode(txt) for txt in row["sentences"]]
        ann = AnnoyIndex(embedding_dim, metric="angular")
        for index, embed in enumerate(embeddings):
            ann.add_item(index, embed)
        ann.build(number_of_trees)
        ann.save(f"data/{df_name}/annoy/{doc_id}_{df_name}.annoy")

In [None]:
index_annoy(sampled_df, "halueval")
sampled_df.to_csv(f"data/halueval/sample_df_{total_docs}.csv", index=False)

# Inference

1. Retrieve top-k source document claimworthy sentence embeddings from ANNOY for a given claim (generated summary).
2. Calculate cosine similarity between the given claim and the retrieved sentences and keep the ones above certain cosine similarity.
3. Load pre-trained fact-checking model and infer whether evidence supports or refutes the given claim.


In [None]:
class KnnSearch:
    def __init__(self, emb_dim=384):
        """
        Initialize the KnnSearch class.
        Parameters: emb_dim - The dimension of the embeddings used for similarity calculation. Default is 384 (sentence transformer dim).
        """
        self.annoy = AnnoyIndex(384, metric="angular")
        self.model = SentenceTransformer(
            "sentence-transformers/paraphrase-MiniLM-L6-v2"
        )
        self.emb_dim = emb_dim

    def get_embeddings_for_data(self, data_ls):
        """
        Parameters: data_ls - A list of sentences or phrases.
        Returns: An array of sentence embeddings.
        """
        embeddings = self.model.encode(data_ls)
        return embeddings

    def standardize_normalize_cosine_similarities(self, cosine_similarities):
        """
        Parameters: cosine_similarities - An array of cosine similarity scores.
        Returns: An array of standardized and normalized cosine similarity scores.
        """
        cosine_sims_norm = (cosine_similarities - np.min(cosine_similarities)) / (
            np.max(cosine_similarities) - np.min(cosine_similarities)
        )
        cosine_sims_norm = 0.5 + (
            cosine_sims_norm - np.mean(cosine_sims_norm)
        ) / np.std(cosine_sims_norm)
        return cosine_sims_norm

    def max_normalize_cosine_similarities(self, cosine_similarities):
        """
        Parameters: cosine_similarities - An array of cosine similarity scores.
        Returns: An array of max-normalized cosine similarity scores.
        """
        return 1 / np.max(cosine_similarities) * cosine_similarities.squeeze(axis=1)

    def max_normalize_cosine_similarities_pairwise(self, cosine_similarities):
        """
        Parameters: cosine_similarities - An array of pairwise cosine similarity scores.
        Returns: An array of max-normalized pairwise cosine similarity scores.
        """
        cosine_sims_norm = np.copy(cosine_similarities)
        np.fill_diagonal(cosine_sims_norm, np.NaN)
        cosine_sims_norm = (
            cosine_similarities - np.nanmin(cosine_similarities, axis=0)
        ) / (
            np.nanmax(cosine_similarities, axis=0)
            - np.nanmin(cosine_similarities, axis=0)
        )
        cosine_sims_norm = 0.5 + (
            cosine_sims_norm - np.nanmean(cosine_sims_norm, axis=0)
        ) / np.nanstd(cosine_sims_norm, axis=0)
        return cosine_sims_norm

    def get_top_nn_neighbours(self, df_name, df_input, df_index, claim, k, beta):
        """
        Retrieve the top N nearest neighbors for a given claim from a dataset.

        Parameters:
            df_name - The name of the dataset.
            df_input - The input dataframe containing sentence data.
            df_index - The index of the claim in the input dataframe.
            claim - The claim for which to find nearest neighbors.
            k - The number of nearest neighbors to retrieve.
            beta - The weight parameter to balance text similarity and diversity (MMR).
        Returns: A list of top-K nearest neighbor sentences.
        """
        annoy_index = df_input["index"][df_index]
        self.annoy.load(f"data/{df_name}/annoy/{annoy_index}_{df_name}.annoy")

        new_emb = self.model.encode(claim)
        top_matches = self.annoy.get_nns_by_vector(new_emb, k)
        evidence_sentences = [df_input["sentences"][df_index][i] for i in top_matches]
        evidence_embeddings = self.get_embeddings_for_data(evidence_sentences)
        # top_sent = [evidence_sentences[idx] for idx, similarity in sorted(enumerate(text_sims[0]), key=lambda x: x[1], reverse=True) if similarity > beta]

        text_sims = cosine_similarity(evidence_embeddings, [new_emb]).tolist()
        candidate_sims = cosine_similarity(evidence_embeddings)
        text_sims_norm = self.standardize_normalize_cosine_similarities(text_sims)
        phrase_sims_norm = self.max_normalize_cosine_similarities_pairwise(
            candidate_sims
        )

        selected_data_indices = []
        data_len = len(evidence_sentences)
        unselected_data_indices = list(range(data_len))

        best_idx = np.argmax(text_sims)
        selected_data_indices.append(best_idx)
        unselected_data_indices.remove(best_idx)

        for _ in range(min(data_len, k) - 1):
            unselected_data_distances_to_text = text_sims_norm[
                unselected_data_indices, :
            ]
            unselected_data_distances_pairwise = phrase_sims_norm[
                unselected_data_indices
            ][:, selected_data_indices]
            # if dimension of data distances is 1 we add additional axis to the end
            if unselected_data_distances_pairwise.ndim == 1:
                unselected_data_distances_pairwise = np.expand_dims(
                    unselected_data_distances_pairwise, axis=1
                )

            idx = int(
                np.argmax(
                    beta * unselected_data_distances_to_text
                    - (1 - beta)
                    * np.max(unselected_data_distances_pairwise, axis=1).reshape(-1, 1)
                )
            )
            best_idx = unselected_data_indices[idx]

            # select new best phrase and update selected/unselected phrase indices list
            selected_data_indices.append(best_idx)
            unselected_data_indices.remove(best_idx)
            top_sent = [evidence_sentences[i] for i in selected_data_indices]

        return top_sent

In [None]:
knn = KnnSearch()
checkpoint = "Dzeniks/roberta-fact-check"
factcheck_model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(
    device
)
factcheck_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
label_mapping = ["support", "refute", "neutral"]


def fact_check_split_sent(claim_sent, evidence_ls):
    factcheck_model.eval()
    labels = []
    for evidence in evidence_ls:
        features = factcheck_tokenizer.encode_plus(
            claim_sent, evidence, truncation=True, return_tensors="pt", max_length=512
        ).to(device)
        with torch.no_grad():
            prediction = factcheck_model(**features).logits
            logits = prediction.cpu().numpy()
            result = label_mapping[logits.argmax().item()]
            labels.append(result)

    # Majority vote
    vote_counts = Counter(labels)
    label = vote_counts.most_common(1)[0][0]
    return label


def fact_check_join_sent(claim_sent, evidence_ls):
    factcheck_model.eval()
    features = factcheck_tokenizer.encode_plus(
        claim_sent,
        " ".join(evidence_ls),
        truncation=True,
        return_tensors="pt",
        max_length=512,
    ).to(device)
    with torch.no_grad():
        prediction = factcheck_model(**features).logits
        logits = prediction.cpu().numpy()
    label = label_mapping[logits.argmax().item()]
    return label


def fact_check_split_claim(claim_ls, evidence_ls):
    factcheck_model.eval()
    results = []
    for c_idx, claim in enumerate(claim_ls):
        labels = []
        for evidence in evidence_ls[c_idx]:
            features = factcheck_tokenizer.encode_plus(
                claim, evidence, truncation=True, return_tensors="pt", max_length=512
            ).to(device)
            with torch.no_grad():
                prediction = factcheck_model(**features).logits
                logits = prediction.cpu().numpy()
                result = label_mapping[logits.argmax().item()]
                labels.append(result)

        # Majority vote
        vote_counts = Counter(labels)
        majority_vote = vote_counts.most_common(1)[0][0]
        results.append(majority_vote)
    return results


def fact_check_join_claim(claim_ls, evidence_ls):
    factcheck_model.eval()
    results = []
    for c_idx, claim in enumerate(claim_ls):
        features = factcheck_tokenizer.encode_plus(
            claim,
            " ".join(evidence_ls[c_idx]),
            truncation=True,
            return_tensors="pt",
            max_length=512,
        ).to(device)
        with torch.no_grad():
            prediction = factcheck_model(**features).logits
            logits = prediction.cpu().numpy()
        label = label_mapping[logits.argmax().item()]
        results.append(label)
    return results


def multi_fact_check(df_name, df_input, colname, k, beta, fact_type):
    stances, times, top_k = [], [], []
    for idx in df_input.index:
        start_time = time.time()

        if "sent" in fact_type:
            claim = df_input[colname][idx]
            top_sent = knn.get_top_nn_neighbours(
                df_name=df_name,
                df_input=df_input,
                df_index=idx,
                claim=claim,
                k=k,
                beta=beta,
            )
            top_k.append(top_sent)

            label = (
                fact_check_split_sent(claim, top_sent)
                if fact_type == "split_sent"
                else fact_check_join_sent(claim, top_sent)
            )
            stances.append(label)

        else:
            claims = df_input[f"{colname}_sent"][idx]
            top_sents = [
                knn.get_top_nn_neighbours(
                    df_name=df_name,
                    df_input=df_input,
                    df_index=idx,
                    claim=claim,
                    k=k,
                    beta=beta,
                )
                for claim in claims
            ]
            top_k.append(top_sents)

            labels = (
                fact_check_split_claim(claims, top_sents)
                if fact_type == "split_claim"
                else fact_check_join_claim(claims, top_sents)
            )
            stances.append("refute" if "refute" in labels else "support")

        times.append(time.time() - start_time)
    return stances, times, top_k

### Analysis stances


### Attribution Score

"Given an attribution report, A, and a textual statement y consisting of sentences, y = {s1, s2, . . .}, we use a NLI model to measure the likely that for each sentence s of y, and for each evidence snippet e in A, let NLI(e, s) be the model probability of e entailing s."


In [None]:
nli_model = CrossEncoder("cross-encoder/nli-deberta-v3-base")
label_mapping = ["contradiction", "entailment", "neutral"]


def attributionScore(evidence, claim):
    attribution_scores = []

    # Loop through all claim sentences.
    for claim_sent in claim:
        nli_scores = []

        # Calculate for each evidence sentence the entailment logits for a given claim.
        scores = [
            nli_model.predict([(evidence_sent, claim)]) for evidence_sent in evidence
        ]
        attributions = [
            1 if label_mapping[score_max] == "entailment" else 0
            for score_max in scores.argmax(axis=1)
        ]
        attribution_scores.append(np.max(attributions))

    # Average the attribution score over all the claim sentences
    average_attribution_score = sum(attribution_scores) / len(claim)
    return average_attribution_score

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
evidences = df_t5_cnn["text_sent"].to_numpy()
claims = df_t5_cnn["generated_sent"].to_numpy()

results = []

for evidence, claim in list(zip(evidences, claims))[:5]:
    score = attributionScore(evidence, claim)
    results.append(score)

# print attribution score for each claim sentence
print(results)

[4.610058, -3.4430819, 4.4542346, -0.84390503, -3.1257496, -2.3221798, -3.8293803, 3.3900251, -3.557179, 2.7546098, -3.3666167, -3.1471353, -4.2763104, 4.5471964, -3.177677, -3.2147415, -3.744464, -3.305184, -4.236372, -3.7386951, 1.0857098, -4.1586113, -3.6753638, -3.6549397, -3.659264, -4.0385637, -3.1192727]
[-3.9692008, 4.4903736, -2.9544744, -2.8308911, -2.6367564, -2.1710672, -2.272644, -3.762628, -3.4680176, -3.1006231, -0.3906805, -2.7799911, -3.0923533, -2.1286106, 0.18282771, -2.5904522, -2.470103, -2.8686101, -2.6457815]
[-2.6214466, -3.0218263, -1.9485716, -2.2346535, -2.5833519, -2.6667042, -2.311144, -2.3788195, -3.6011777, -3.3477194, -2.7722626, -4.635331, -4.626549, -4.7943993, -3.4071517, -3.336154, -4.8379974, -3.1885579, -2.6299386, -3.0556386, -2.488968, -2.4536462, -4.481355, -3.1709976, -3.7901912, -4.3972054, -3.6297266, -3.8985898, -3.904263, -3.073994, -3.8846707, -3.613608, -2.502956, -3.1016655, -2.8352327, -2.6519861, -2.9373336, -1.9294426, -2.5284245]
[4.

In [None]:
k_vals = [3, 5, 10, 15]
beta_vals = [0.7, 0.9, 1.0]
fact_types = ["split_sent", "join_sent", "split_claim", "join_claim"]
total_docs = 500

df_basis = pd.read_csv(f"data/halueval/sample_df_{total_docs}.csv")
df_basis["right_summary_sent"] = df_basis["right_summary"].apply(sent_tokenize)
df_basis["hallucinated_summary_sent"] = df_basis["hallucinated_summary"].apply(
    sent_tokenize
)
df_basis["sentences"] = df_basis["sentences"].apply(ast.literal_eval)

for fact_type in tqdm(fact_types, desc="Main Loop"):
    for kv in tqdm(k_vals, desc="K-Values Loop", leave=False):
        for bv in tqdm(beta_vals, desc="Beta-Values Loop", leave=False):
            sampled_df = df_basis.copy(deep=True)

            stances, times, top_k = multi_fact_check(
                "halueval", sampled_df, "right_summary", kv, bv, fact_type
            )
            sampled_df["right_stance"] = stances
            sampled_df["right_inference_time"] = times
            sampled_df["right_top_k"] = top_k

            stances, times, top_k = multi_fact_check(
                "halueval", sampled_df, "hallucinated_summary", kv, bv, fact_type
            )
            sampled_df["hallucinated_stance"] = stances
            sampled_df["hallucinated_inference_time"] = times
            sampled_df["hallucinated_top_k"] = top_k

            sampled_df.drop(columns=["sentences"], inplace=True)
            sampled_df.to_csv(
                f"out/halueval/sampled_{total_docs}/sampled_k{kv}_b{bv}_{fact_type}.csv",
                index=False,
            )

In [None]:
df_stances = pd.DataFrame(columns=["Name", "FP", "FN", "Precision", "Recall", "F1"])
fact_types = ["split_sent", "join_sent", "split_claim", "join_claim"]
for kv in k_vals:
    for bv in beta_vals:
        for fact_type in fact_types:
            df_temp = pd.read_csv(
                f"out/halueval/sampled_{total_docs}/sampled_k{kv}_b{bv}_{fact_type}.csv"
            )
            fp = df_temp["right_stance"].value_counts().get("refute", 0)
            fn = df_temp["hallucinated_stance"].value_counts().get("support", 0)
            tp = total_docs - fp
            tn = total_docs - fn

            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1_score = (
                2 * (precision * recall) / (precision + recall)
                if (precision + recall) > 0
                else 0
            )

            df_stances = pd.concat(
                [
                    df_stances,
                    pd.DataFrame(
                        {
                            "Name": [f"{fact_type}_k{kv}_b{bv}"],
                            "FP": [fp],
                            "FN": [fn],
                            "Precision": [precision],
                            "Recall": [recall],
                            "F1": [f1_score],
                        }
                    ),
                ],
                ignore_index=True,
            )

df_stances.to_csv(
    f"results/halueval/sampled_{total_docs}/stance_analysis.csv", index=False
)
df_stances

Unnamed: 0,Name,FP,FN,Precision,Recall,F1
0,split_sent_k3_b0.7,165,365,0.67,0.478571,0.558333
1,join_sent_k3_b0.7,69,409,0.862,0.513095,0.643284
2,split_claim_k3_b0.7,368,202,0.264,0.39521,0.316547
3,join_claim_k3_b0.7,171,348,0.658,0.485968,0.559048
4,split_sent_k3_b0.9,165,365,0.67,0.478571,0.558333
5,join_sent_k3_b0.9,69,409,0.862,0.513095,0.643284
6,split_claim_k3_b0.9,368,202,0.264,0.39521,0.316547
7,join_claim_k3_b0.9,171,347,0.658,0.486686,0.559524
8,split_sent_k3_b1.0,165,365,0.67,0.478571,0.558333
9,join_sent_k3_b1.0,69,410,0.862,0.512485,0.642804


# Calculate metric scores


In [None]:
import evaluate
import nltk
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
import pandas as pd
from tqdm import tqdm

nltk.download("punkt")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def read_dataset(dataname, modelname):
    df_actual = pd.read_csv(f"data/{dataname}/test.csv")
    df_generated = pd.read_csv(f"results/{modelname}_large_{dataname}.csv")
    df_merge = pd.merge(df_actual, df_generated, on="id")
    return df_merge

In [None]:
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("sacrebleu")
bertscore_metric = evaluate.load("bertscore")


def rougeScore(preds, refs):
    pred_tokens = ["\n".join(nltk.sent_tokenize(str(item).strip())) for item in preds]
    refs_tokens = ["\n".join(nltk.sent_tokenize(item.strip())) for item in refs]
    rogue_scores = rouge_metric.compute(
        predictions=pred_tokens, references=refs_tokens, use_stemmer=True
    )
    rouge_dict = {key: value * 100 for key, value in rogue_scores.items()}
    return pd.DataFrame(rouge_dict, index=[0])


def bleuScore(preds, refs):
    bleu_dic = {}
    bleu_dic["bleu-1-grams"] = corpus_bleu(refs, preds, weights=(1.0, 0, 0, 0))
    bleu_dic["bleu-1-2-grams"] = corpus_bleu(refs, preds, weights=(0.5, 0.5, 0, 0))
    bleu_dic["bleu-1-3-grams"] = corpus_bleu(refs, preds, weights=(0.3, 0.3, 0.3, 0))
    bleu_dic["bleu-1-4-grams"] = corpus_bleu(
        refs, preds, weights=(0.25, 0.25, 0.25, 0.25)
    )
    sacrebleu = bleu_metric.compute(predictions=preds, references=refs)
    bleu_dic["sacrebleu"] = sacrebleu["score"]
    return pd.DataFrame(bleu_dic, index=[0])


def bertScore(preds, refs):
    bert_scores = bertscore_metric.compute(
        predictions=preds,
        references=refs,
        lang="en",
        model_type="distilbert-base-uncased",
    )
    bert_dict = {
        "bert_precision": np.mean(bert_scores["precision"]),
        "bert_recall": np.mean(bert_scores["recall"]),
        "bert_F1": np.mean(bert_scores["f1"]),
    }
    return pd.DataFrame(bert_dict, index=[0])


def calculateMetrics(df_data):
    generated_summaries = df_data["generated"].values
    actual_summaries = df_data["summary"].values

    df_rouge = rougeScore(generated_summaries, actual_summaries)
    df_blue = bleuScore(generated_summaries, actual_summaries)
    df_bert = bertScore(generated_summaries, actual_summaries)

    df_results = pd.concat([df_rouge, df_blue, df_bert], axis=1)
    return df_results

In [None]:
datasets = ["xsum", "cnn_dailymail"]
models = ["t5", "bart"]

results = []
for dataset in tqdm(datasets, desc="dataset loop", position=0):
    for model in tqdm(models, desc="model loop", position=1):
        df_data = read_dataset(dataname=dataset, modelname=model)
        df_metrics = calculateMetrics(df_data)
        results.append(df_metrics)

In [None]:
df_merge = pd.concat(results, axis=0)
df_merge.index = [
    "T5-large-XSum",
    "BART-large-XSum",
    "T5-large-CNN/Dailymail",
    "BART-large-CNN/Dailymail",
]
df_merge.to_csv("summary_metrics.csv")
df_merge

# pd.read_csv("summary_metrics.csv", index_col=0)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum,bleu-1-grams,bleu-1-2-grams,bleu-1-3-grams,bleu-1-4-grams,sacrebleu,bert_precision,bert_recall,bert_F1
T5-large-XSum,40.009396,16.606227,32.047688,32.035912,0.231279,7.1736579999999995e-155,1.650622e-185,1.263405e-231,10.075937,0.846289,0.822568,0.833896
BART-large-XSum,44.94845,21.598009,36.50578,36.507496,0.205585,6.763446e-155,1.5933239999999998e-185,1.22675e-231,15.327531,0.850004,0.845715,0.847557
T5-large-CNN/Dailymail,23.960417,6.842925,16.562357,20.852092,0.229411,7.144618e-155,1.64661e-185,1.260845e-231,1.257227,0.80856,0.737391,0.770937
BART-large-CNN/Dailymail,26.193286,7.862815,17.641984,22.477502,0.199242,6.658284e-155,1.578413e-185,1.217176e-231,2.055372,0.801691,0.747194,0.773092
