In [1]:
import json

predictions_file_path = "command_r_attributions_natural_questions.jsonl"

# JSON Lines input: one JSON object per line. Decode as UTF-8 explicitly rather
# than relying on the platform default encoding, and skip blank lines (e.g. a
# trailing empty line), which json.loads would otherwise reject.
with open(predictions_file_path, "r", encoding="utf-8") as f:
    predictions = [json.loads(line) for line in f if line.strip()]

# Show the field names of the first prediction record.
print(predictions[0].keys())

dict_keys(['id', 'question', 'answer', 'retrieved_docs', 'selected_docs'])


In [3]:
# Display the raw 'answer' string of one example prediction (index 2).
predictions[2]['answer']

'Relevant Documents: 0,1,2,3,4\nCited Documents: 0,1,2,3,4\nAnswer: Antlers are bony structures that deer shed and replace each year. The process of gathering shed antlers is called "shed hunting" or "bone picking".\nGrounded answer: Antlers are <co: 0>bony structures</co: 0> that deer <co: 0>shed and replace each year.</co: 0> The process of gathering shed antlers is called <co: 1>"shed hunting"</co: 1> or <co: 1>"bone picking"</co: 1>.'

In [4]:
# Isolate the citation-annotated part of the example answer: the segment that
# follows the "Grounded answer: " marker, in which cited spans are wrapped in
# <co: ids>...</co: ids> tags.

grounded_answer = (
    predictions[2]['answer']
    .split('Grounded answer: ')[1]
)
grounded_answer

'Antlers are <co: 0>bony structures</co: 0> that deer <co: 0>shed and replace each year.</co: 0> The process of gathering shed antlers is called <co: 1>"shed hunting"</co: 1> or <co: 1>"bone picking"</co: 1>.'

In [5]:
import regex as re

def extract_citations(grounded_answer):
    """Collect every cited span from a grounded answer.

    A cited span has the form ``<co: IDS>text</co: IDS>`` where ``IDS`` is a
    run of digits and commas. The closing tag must repeat exactly the same
    ``IDS`` as the opening tag (enforced by the back-reference), and because
    no flags are passed, the span text cannot contain a newline.

    Args:
        grounded_answer: Answer string containing the ``<co: ...>`` markup.

    Returns:
        A list with one dict per matched span, in order of appearance:
        ``{'claim': <span text>, 'documents': <list of id strings>}``.
        The ids are kept as strings, obtained by splitting ``IDS`` on commas.
    """
    tag_pattern = r'<co: ([\d,]+)>(.*?)<\/co: \1>'

    # findall yields (ids, span_text) tuples because the pattern has two groups.
    return [
        {'claim': span_text, 'documents': doc_ids.split(',')}
        for doc_ids, span_text in re.findall(tag_pattern, grounded_answer)
    ]

# Parse the example grounded answer; the bare name on the last line displays the result.
citations = extract_citations(grounded_answer)
citations

[{'claim': 'bony structures', 'documents': ['0']},
 {'claim': 'shed and replace each year.', 'documents': ['0']},
 {'claim': '"shed hunting"', 'documents': ['1']},
 {'claim': '"bone picking"', 'documents': ['1']}]

In [6]:
import nltk
from typing import List

# Fetch the 'punkt' resource for NLTK.
# NOTE(review): presumably this is what nltk.sent_tokenize relies on; newer NLTK
# releases reportedly look up 'punkt_tab' instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/wallat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def split_into_sentences(text) -> List[str]:
    """Remove ``<co: ...>`` citation markup from ``text`` and split it into sentences.

    Only the opening and closing tags are dropped; the text they enclose is
    kept. Unlike a back-referenced pattern, the closing tag here may carry any
    digit/comma id list, not necessarily the one from the opening tag.

    Args:
        text: Answer string, possibly containing ``<co: ids>...</co: ids>`` tags.

    Returns:
        The sentences produced by ``nltk.sent_tokenize`` on the tag-free text.
    """
    # Swap each tagged span for its inner text so the tokenizer sees plain prose.
    untagged = re.sub(
        r'<co: [\d,]+>(.*?)<\/co: [\d,]+>',
        lambda tagged_span: tagged_span.group(1),
        text,
    )
    return nltk.sent_tokenize(untagged)

# Demo on the example answer: split it into tag-free sentences and print them with 1-based numbering.
sentences = split_into_sentences(grounded_answer)

for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")

Sentence 1: Antlers are bony structures that deer shed and replace each year.
Sentence 2: The process of gathering shed antlers is called "shed hunting" or "bone picking".


In [10]:
def add_source_sentence_to_citations(grounded_answer, citations):
    """Annotate each citation with the sentence of the answer that contains its claim.

    The answer is split into sentences once. Citations are then processed in
    the order given, and each claim is searched for (plain substring test) in
    the sentences that are still pending. Sentences that precede a match are
    discarded, so later citations can only match the same sentence or a later
    one.

    Args:
        grounded_answer: Answer string with ``<co: ...>`` markup.
        citations: List of dicts, each with at least a ``'claim'`` key.

    Returns:
        The same ``citations`` list; every dict is updated in place with a
        ``'sentence'`` key.

    Raises:
        Exception: If a claim is not contained in any pending sentence.
            Citations handled before the failing one keep their
            ``'sentence'`` entry.
    """
    pending = split_into_sentences(grounded_answer)

    for citation in citations:
        claim = citation['claim']

        # Index of the first pending sentence containing the claim, if any.
        hit = next(
            (idx for idx, candidate in enumerate(pending) if claim in candidate),
            None,
        )
        if hit is None:
            raise Exception(f"Claim '{citation['claim']}' not found in any of the sentences")

        citation['sentence'] = pending[hit]

        # Drop everything before the matched sentence; the match itself stays
        # at the front because the next claim may belong to it as well.
        if hit > 0:
            del pending[:hit]

    return citations

# Attach the containing sentence to each example citation (the dicts are updated in place and the same list is returned).
citations = add_source_sentence_to_citations(grounded_answer, citations)

In [12]:
# For every QA pair, print the grounded answer and then, per cited claim, the
# sentence it occurs in and the full text of each document it cites.
for prediction in predictions:
    print('\n\n\nQuestion:', prediction['question'])

    try:
        # A missing 'Grounded answer: ' marker makes the [1] lookup raise
        # IndexError, which the handler below reports.
        grounded_answer = prediction['answer'].split('Grounded answer: ')[1]
        print('Grounded answer:', grounded_answer, end="\n\n\n")

        citations = extract_citations(grounded_answer)
        citations = add_source_sentence_to_citations(grounded_answer, citations)

        for citation in citations:
            print(
                f"Source sentence: {citation['sentence']}",
                f"Claim: '{citation['claim']}'",
                f"Cited documents: {citation['documents']}",
                '---',
                sep='\n',
            )
            # Document ids are strings; they index into this prediction's 'selected_docs'.
            for attribution in citation['documents']:
                print(
                    f"Cited document {attribution}: \n{prediction['selected_docs'][int(attribution)]}",
                    '---',
                    sep='\n',
                )
    except Exception as e:
        # Best effort: report the failure and show the raw answer, then move on
        # to the next QA pair.
        print("\n\n\nThere was an error processing this QA pair", e, "\n\n", sep='\n')
        print(prediction['answer'])





Question: what is the definition of bcc in email
Grounded answer: BCC stands for <co: 0,1,2,3,4>blind carbon copy.</co: 0,1,2,3,4> It allows the <co: 0>sender of an email</co: 0> to <co: 0>conceal the person entered in the BCC field from the other recipients.</co: 0> This means that the <co: 1>primary and secondary recipients cannot see the tertiary recipients.</co: 1>


Source sentence: BCC stands for blind carbon copy.
Claim: 'blind carbon copy.'
Cited documents: ['0', '1', '2', '3', '4']
---
Cited document 0: 
Blind carbon copy
Blind carbon copy
Blind carbon copy (abbreviated Bcc:) allows the sender of a message to conceal the person entered in the Bcc: field from the other recipients. This concept originally applied to paper correspondence and now also applies to email.
In some circumstances, the typist creating a paper correspondence must ensure that multiple recipients of such a document do not see the names of other recipients. To achieve this, the typist can: 
BULLET::::-
--