# Dataset prep for Gemini - FAVA

In [59]:
# Attach to every FAVA sentence-level record the passage it belongs to, rename
# its 'claim' field to 'sentence', and write the merged records to a .jsonl
# file for the decontextualisation step.
import json  # imported here so this cell runs even before the later import cell


def _read_jsonl(path):
    """Return the JSON objects stored one per line in *path*.

    Blank lines (e.g. a trailing empty line) are skipped instead of raising a
    JSONDecodeError. The file is read as UTF-8 regardless of platform default.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# load by sentence
fava_bs_all = _read_jsonl('./data/rarr-input_fava_bs_all.jsonl')

# load by passage
# NOTE(review): absolute, machine-specific path, unlike the relative ./data
# paths used elsewhere in this cell -- confirm before running on another machine.
fava_bp_all = _read_jsonl('/Users/jjr/PycharmProjects/RARR/data/rarr-input_fava-gold.jsonl')

# iterate over sentences, add passage to sentence dictionary
sentences_for_decontext = []
for s in fava_bs_all:
    passage_id = s['input_info']['cid']
    # 'cid' is used directly as a list index into the passage records;
    # assumes it is the 0-based line position in the passage file -- TODO confirm
    passage = fava_bp_all[passage_id]['input_info']['claim']
    s['input_info']['passage'] = passage
    # rename 'claim' to 'sentence' (the passage text now lives under 'passage')
    s['input_info']['sentence'] = s['input_info'].pop('claim')
    sentences_for_decontext.append(s)

# write output: one JSON object per line
with open('./data/rarr_sentences_for_decon.jsonl', 'w', encoding='utf-8') as f:
    for entry in sentences_for_decontext:
        f.write(json.dumps(entry) + '\n')


# Dataset prep for Gemini - WikiBib

## -> Helper classes / Imports

In [144]:
from datasets import load_dataset
from sympy.physics.units import current
from torch.utils.data import Subset
# import dataloader
from torch.utils.data import Dataset, DataLoader
import json, jsonlines
import tqdm
import os
import random
import argparse
import nltk


class WikibibDataset(Dataset):
    """Dataset over a JSON-lines file of LLM replies.

    Every non-blank line of ``json_file`` is parsed into one sample dict and
    kept in ``self.data``. The accessors below read the keys ``"input"``,
    ``"output"``, ``"topic"`` and ``"annotations"`` from those dicts.

    Attributes:
        data: list of parsed sample dicts, in file order.
        artificial_facts: per-line ``item["result"]["question_gen"]["questions"]``
            values read from ``artificial_fact_path``, or ``None`` when no such
            path was given.
        artificial_evidence: per-line ``item["result"]["search"]["used_evidence"]``
            values read from ``artificial_fact_path``, or ``None`` when no such
            path was given.
    """

    def __init__(self, json_file="./data/wikibib_halluc/labeled/ChatGPT.jsonl",
                 artificial_fact_path=None):
        """Load the samples and, optionally, the artificial facts/evidence.

        Args:
            json_file: path to the JSON-lines file holding the samples.
            artificial_fact_path: optional path to a JSON-lines file whose
                records carry ``result.question_gen.questions`` and
                ``result.search.used_evidence``.

        Raises:
            OSError: if a file cannot be opened.
            json.JSONDecodeError: if a non-blank line is not valid JSON.
            KeyError: if a record of ``artificial_fact_path`` lacks one of the
                keys listed above.
        """
        if artificial_fact_path is not None:
            self.artificial_facts = []
            self.artificial_evidence = []
            for item in self._read_jsonl(artificial_fact_path):
                question_list = item["result"]["question_gen"]['questions']
                self.artificial_facts.append(question_list)
                self.artificial_evidence.append(item["result"]["search"]['used_evidence'])
        else:
            self.artificial_facts = None
            self.artificial_evidence = None

        self.data = self._read_jsonl(json_file)

    @staticmethod
    def _read_jsonl(path):
        """Return the JSON objects stored one per line in *path*.

        Blank lines are skipped so that a trailing empty line does not abort
        loading; the file is always decoded as UTF-8.
        """
        with open(path, encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return a fresh dict with the question, LLM reply and topic of sample ``idx``."""
        current_sample = self.data[idx]
        # The "annotations" entry of the sample is deliberately not exposed
        # here; use get_gt_facts() to read the annotated atomic facts.
        return {
            "question": current_sample["input"],
            "llm_reply": current_sample["output"],
            "topic": current_sample["topic"],
        }

    def load_rarr_results(self, output_file):
        """Read a JSON-lines results file into a ``{llm_reply: result}`` dict.

        Args:
            output_file: path to a JSON-lines file whose records have the keys
                ``"llm_reply"`` and ``"result"``.

        Returns:
            dict mapping each record's ``"llm_reply"`` to its ``"result"``;
            when a reply occurs more than once, the last record wins.
        """
        # Use the reader as a context manager so the file handle is closed
        # even if a record is malformed.
        with jsonlines.open(output_file) as reader:
            return {record["llm_reply"]: record["result"] for record in reader}

    def get_claim(self, idx):
        """Return the raw ``"output"`` text (the LLM reply) of sample ``idx``."""
        return self.data[idx]["output"]

    def get_gt_facts(self, idx):
        """
        Get the annotated atomic facts for a given index

        Args:
            idx: position of the sample in ``self.data``

        Returns:
            evidence_list: List with the "text" of every atomic fact
            label_list: List of integer labels, aligned with evidence_list
                ("S" -> 1, "NS" -> 0, "IR" -> -1)
            topic: The sample's "topic" value

        Raises:
            KeyError: if a fact carries a label other than "S", "NS" or "IR"
        """
        item = self.data[idx]

        label_mapping = {"S": 1, "NS": 0, "IR": -1}
        evidence_list, label_list = [], []

        # Missing or null annotations, and sentences without atomic facts,
        # contribute nothing: the lists simply stay empty.
        for sentence in item.get("annotations") or []:
            for fact in sentence.get("human-atomic-facts") or []:
                evidence_list.append(fact["text"])
                label_list.append(label_mapping[fact["label"]])
        return evidence_list, label_list, item["topic"]

## -> Prepping for decontextualization

In [145]:
# Build the WikiBib dataset from the labeled ChatGPT replies
dset = WikibibDataset(
    json_file='/Users/jjr/Documents/data/rarr-rep/wikiBib/ChatGPT.jsonl',
    artificial_fact_path=None,
)

# One record per sample: question / llm_reply / topic plus the reply's sentences
dset_with_sentences = []

for sample_idx in tqdm.tqdm(range(len(dset))):
    # dset[...] returns a fresh dict, so extending it leaves dset.data untouched
    record = dset[sample_idx]
    # sentence-split the raw LLM reply and store the pieces next to it
    record['sentences'] = nltk.sent_tokenize(dset.get_claim(sample_idx))
    dset_with_sentences.append(record)
    
    

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 183/183 [00:00<00:00, 5988.96it/s]


## -> Save as jsonl

In [149]:
# Persist the sentence-augmented samples, one JSON object per line
with open('./data/wikibib_with_sentences.jsonl', 'w') as out_file:
    out_file.writelines(json.dumps(sample) + '\n' for sample in dset_with_sentences)