### Module 0: Importing Libraries

In [6]:
import pandas as pd
import numpy as np
import pprint
import os 
from time import time 
from dotenv import load_dotenv
import json
import argparse
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from openai import OpenAI
import certifi

# Override bad SSL_CERT_FILE if set
os.environ["SSL_CERT_FILE"] = certifi.where()

from sentence_transformers import SentenceTransformer
import torch
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
## Importing VE libraries
from utils import *
from dataset_utils import read_wikiqa_data
from prompt_helper import get_joint_prompt_helper, normalize_prediction
from dataset_utils import read_wikiqa_data, wiki_evaluation


In [8]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [9]:
# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# NOTE(review): OpenAI() is constructed with no explicit credentials, so it presumably
# picks up its API key from the environment populated above — confirm the .env contents.
client = OpenAI()

In [10]:
from prompt_self_ask import self_ask_prompt
from consistency import _parse_args as consistency_parse_args, in_context_manual_prediction  as consistency_in_context_manual_prediction, post_process_consistency, evaluate_manual_predictions as consistency_evaluate_manual_predictions
from verifying_questions import extract_follow_up_questions, _parse_args as verifying_questions_args
from relevant_context import _parse_args as relevant_context_args


from verifying_answers import in_context_manual_prediction as verifying_answers_in_context_manual_prediction, _parse_args as verifying_answers_args
from answer_again import in_context_manual_prediction as answer_again_in_context_manual_prediction

### Filtering Records with Inaccurate Evaluation

In [71]:
path_100_250 = "misc/iterative_predictions_self_ask_final_iter_100_250.jsonl"

In [72]:
# Read the JSONL predictions file (one JSON object per line) into a DataFrame.
with open(path_100_250, 'r') as f:
    data = list(map(json.loads, f))
df_100_250 = pd.DataFrame(data)

In [73]:
df_100_250.shape

(149, 20)

In [74]:
# Anomalous records: flagged as low consistency, yet no verifying questions were extracted.
df_100_250_anomaly = df_100_250.loc[
    df_100_250["high_consistency"].eq(False)
    & df_100_250["verifying_questions"].map(lambda questions: questions == [])
]

In [75]:
df_100_250_anomaly.shape

(9, 20)

In [76]:
# Filtering 0 to 100 samples 
path_0_100 = "misc/iterative_predictions_self_ask_final.jsonl"

In [77]:
# Read the JSONL predictions file (one JSON object per line) into a DataFrame.
with open(path_0_100, 'r') as f:
    data = list(map(json.loads, f))
df_0_100 = pd.DataFrame(data)

FileNotFoundError: [Errno 2] No such file or directory: 'misc/iterative_predictions_self_ask_final.jsonl'

In [68]:
# df_0_100.head()

In [59]:
# Anomalous records: flagged as low consistency, yet no verifying questions were extracted.
df_0_100_anomaly = df_0_100.loc[
    df_0_100["high_consistency"].eq(False)
    & df_0_100["verifying_questions"].map(lambda questions: questions == [])
]
df_0_100_anomaly.shape

(8, 20)

In [69]:
# df_0_100_anomaly.head()

In [61]:
# Stack the anomalous records of both prediction batches row-wise into one frame.
df_anomaly_all = pd.concat((df_0_100_anomaly, df_100_250_anomaly))
df_anomaly_all.shape

(17, 20)

In [63]:
# Replace the per-batch row labels left over from the concat with a fresh 0..n-1 index.
df_anomaly_all = df_anomaly_all.reset_index(drop=True)

In [11]:
df_anomaly_all.head()

NameError: name 'df_anomaly_all' is not defined

In [66]:
# Collect the ids of every anomalous record; these are the records to rerun.
self_ask_ids_rerun = df_anomaly_all["id"].tolist()

### Hard-coded IDs of Records to Rerun

In [12]:
# Literal list of the 17 record ids to rerun through the pipeline.
# Defining it here lets the rerun cells work without first executing the
# anomaly-filtering cells, which need the prediction JSONL files under misc/.
# NOTE(review): presumably a pasted copy of list(df_anomaly_all.id) — verify it is
# still in sync if the anomaly filter or its input files change.
self_ask_ids_rerun = ['28942dc20bdd11eba7f7acde48001122',
 '608c01f00bdd11eba7f7acde48001122',
 '8f6592fc0bdd11eba7f7acde48001122',
 '25b06fd408c311ebbd8dac1f6bf848b6',
 'e04a5b1a0bda11eba7f7acde48001122',
 '05c81d7a0bde11eba7f7acde48001122',
 '205e0f460bde11eba7f7acde48001122',
 '80483b220bdc11eba7f7acde48001122',
 '27efedf20bb011ebab90acde48001122',
 '43a4db600bde11eba7f7acde48001122',
 '4a29f0e40bdc11eba7f7acde48001122',
 'ce1bdcc80baf11ebab90acde48001122',
 '27ee96820bb011ebab90acde48001122',
 'c20c5dda0bdd11eba7f7acde48001122',
 'a9e322b20bdc11eba7f7acde48001122',
 'ca8695b20baf11ebab90acde48001122',
 '30001f5c0bde11eba7f7acde48001122']

In [13]:
dev_set = read_wikiqa_data("data/dev_sampled.json")
len(dev_set)

7 not found


1000

In [14]:
dev_set[:1]

[{'id': '1ee47f380bde11eba7f7acde48001122',
  'question': 'Where was the performer of song Get A Life – Get Alive born?',
  'answer': 'Vöcklabruck',
  'all_pars': ['Bernard Bonvoisin, known as Bernie Bonvoisin( born 9 July 1956 in Nanterre, Hauts- de- Seine), is a French hard rock singer and film director.',
   'He is best known for having been the singer of Trust.',
   'He was one of the best friends of Bon Scott the singer of AC/ DC and together they recorded the song" Ride On" which was one of the last songs by Bon Scott.',
   '"Get a Life – Get Alive" is the Austrian entry for the Eurovision Song Contest 2007 and also that year\'s official Life Ball song.',
   'The song is sung by Eric Papilaya, and was written by Greg Usek (music) and Austin Howard (lyrics).',
   'As Austria did not compete in the 2006 contest, the song was performed in the semi-final.',
   'Here, it performed 27th, the first time in the history of the competition that an entry had been performed in this position.

In [15]:
# Keep only the dev examples whose id is scheduled for a rerun.
dev_set_rerun_data = [example for example in dev_set if example["id"] in self_ask_ids_rerun]

In [16]:
len(dev_set_rerun_data)

17

### Module 1: Building the Consolidated Function

In [17]:
# Embedding helper used when ranking retrieved paragraphs for relevant context.
def model_embeddings(sentence, model):
    """Embed a single sentence with ``model``.

    The sentence is wrapped in a one-element batch, passed to ``model.encode``,
    and the first (only) row of the result is returned.

    Args:
        sentence: Text to embed.
        model: Any object exposing ``encode(list_of_texts)`` that returns an
            indexable batch of embeddings.

    Returns:
        The embedding of ``sentence``. The original author expected an array of
        shape 384; the actual shape depends on the model passed in.
    """
    return model.encode([sentence])[0]

In [18]:
def run_iterative_prediction(output_file="misc/iterative_predictions_self_ask_final_rerun.jsonl",
                             rerun_ids=None,
                             consistency_threshold=0.5):
    """Run the self-ask pipeline record by record, appending each result to a JSONL file.

    For every dev example whose id is in ``rerun_ids`` and not already present in
    ``output_file``:

    1. Predict with the self-ask prompt and post-process for consistency.
    2. If the consistency score is above ``consistency_threshold``, store the
       prediction as-is (``high_consistency = True``).
    3. Otherwise extract the follow-up questions from the rationale, retrieve
       Wikipedia text for each question, keep the ``topk`` paragraphs closest to
       the question embedding, answer each question from those paragraphs,
       build a new rationale and answer the original question again.

    Each finished record is appended to ``output_file`` immediately, so an
    interrupted run can be resumed without repeating completed records. A
    failure on one record is printed and that record is skipped.

    Args:
        output_file: JSONL file that results are appended to; also read at
            start-up to find ids that are already done.
        rerun_ids: Iterable of record ids to process. Defaults to the
            notebook-level ``self_ask_ids_rerun``.
        consistency_threshold: Scores strictly above this value skip the
            verification steps.

    Returns:
        None. Results are written to ``output_file`` and a summary is printed.
    """
    # Arguments for the consistency step. Kept under their own name: the original
    # code overwrote a shared `args` with the relevant-context arguments inside
    # the loop, so every later record ran the consistency step with the wrong
    # argument namespace.
    consistency_args = consistency_parse_args()

    # Created on first use (inside the per-record try block) so that a run with
    # only high-consistency records never parses these or loads the model, and
    # so that the model is loaded once instead of once per record.
    context_args = None
    embedding_model = None

    if rerun_ids is None:
        rerun_ids = self_ask_ids_rerun
    rerun_ids = set(rerun_ids)

    # Load existing IDs from output file
    existing_ids = set()
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            for line in f:
                try:
                    existing_ids.add(json.loads(line.strip())["id"])
                except (ValueError, KeyError, TypeError):
                    # Blank, malformed, or id-less line: ignore it for resume purposes.
                    continue

    dev_set = read_wikiqa_data("data/dev_sampled.json")
    # Only filtering the data that needs to be rerun
    dev_set = [ex for ex in dev_set if ex["id"] in rerun_ids]

    results = []
    total_start = time()

    for idx, ex in enumerate(tqdm(dev_set, desc="Processing one-by-one")):
        if ex['id'] in existing_ids:
            continue

        start = time()
        try:
            # Step 1: Predict
            raw_pred = consistency_in_context_manual_prediction(
                ex, self_ask_prompt,
                engine=consistency_args.engine,
                prompt_helper=consistency_args.helper,
                length_test_only=False,
                n=consistency_args.num_shot,
            )

            # Step 2: Consistency post-processing
            con_score, final_pred = post_process_consistency(ex, raw_pred, consistency_args)
            final_pred["consistency"] = con_score

            # Step 3: check if the consistency is higher than the threshold
            if con_score > consistency_threshold:
                # Step 4: high consistency, the prediction is stored unchanged
                final_pred['high_consistency'] = True
            else:
                start2 = time()
                final_pred["consistency_time"] = round(start2 - start, 2)
                final_pred['high_consistency'] = False

                # Step 5: Generating verifying questions.
                # The follow-up questions are extracted directly from the
                # self-ask rationale, so no LLM call is needed.
                vq = extract_follow_up_questions(final_pred['rationale'])
                final_pred["verifying_questions"] = vq

                # Step 6: Pulling Relevant Context
                start3 = time()
                final_pred["time_verifying_question"] = round(start3 - start2, 2)
                if context_args is None:
                    context_args = relevant_context_args()
                if embedding_model is None:
                    embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

                contexts = []
                va = []
                # Iterate over the questions themselves. The original looped over
                # the rationale sentences while indexing vq[j], which raised
                # IndexError (and dropped the record) whenever the two lengths differed.
                for question in vq:
                    pars_text = get_texts_to_rationale_wikipedia(question, False)
                    pars_text = list(dict.fromkeys(pars_text))  # remove potential duplicates

                    # Reset per question so an empty retrieval never reuses the
                    # previous question's paragraphs (or hits an undefined name).
                    pars = []
                    if pars_text:
                        # Rank paragraphs by distance to the question embedding.
                        sen_embeds = [model_embeddings(question, embedding_model)]
                        par_embeds = [model_embeddings(par, embedding_model) for par in pars_text]

                        distances = sklearn.metrics.pairwise.pairwise_distances(sen_embeds, par_embeds)
                        nearest = distances.argsort(axis=1)[0][:context_args.topk]

                        pars = [pars_text[i] for i in nearest]
                        # As before, only non-empty retrievals are recorded in the context list.
                        contexts.append(pars)

                    # Step 7: Creating Verified Answers
                    answer = verifying_answers_in_context_manual_prediction(
                        context_args.engine, context_args.helper, context_args.model, pars, question
                    )
                    va.append(answer['text'].lstrip())

                start4 = time()
                final_pred["time_context_and_verifying_answer"] = round(start4 - start3, 2)
                final_pred["context"] = contexts
                final_pred["verifying_answers"] = va

                # Step 8: Answering Again
                new_rationale = answer_again_rationale(vq, va)

                final_pred["new_rationale"] = new_rationale
                new_p = answer_again_in_context_manual_prediction(
                    ex, self_ask_prompt,
                    context_args.engine, context_args.helper, context_args.model,
                    new_rationale,
                )

                start5 = time()
                final_pred["time_answer_again"] = round(start5 - start4, 2)
                final_pred["new_answer"] = new_p["text"]

            # Save this prediction to file immediately (both branches).
            final_pred["time_taken_sec"] = round(time() - start, 2)
            with open(output_file, "a") as fout:
                fout.write(json.dumps(final_pred) + "\n")

            existing_ids.add(ex['id'])
            results.append(final_pred)

        except Exception as e:
            # Best effort: report the failing record and move on to the next one.
            print(f"Error in record {idx} ({ex['id']}): {e}")
            continue

    total_duration = round(time() - total_start, 2)
    avg_time = total_duration / len(results) if results else 0
    print(f"\n✅ Finished {len(results)} records in {total_duration:.2f} seconds (avg {avg_time:.2f} sec/record)")
    print(f"📄 Results saved to: {output_file}")

    return None

In [19]:
run_iterative_prediction()

7 not found


Processing one-by-one: 100%|██████████| 17/17 [00:31<00:00,  1.86s/it]


✅ Finished 2 records in 31.55 seconds (avg 15.78 sec/record)
📄 Results saved to: misc/iterative_predictions_self_ask_final_rerun.jsonl





In [1]:
from evaluation import evaluate_model_performance

In [2]:
path = "misc/iterative_predictions_self_ask_final_rerun.jsonl"

In [3]:
df, results =evaluate_model_performance(path)

In [4]:
results

{'exact_match_percentage': 0.0,
 'verify_percentage': 50.0,
 'average_time_per_record_sec': 11.43}

In [5]:
df

Unnamed: 0,response,id,question,consistency,rationale,answer,right_answer,original_answers,original_rationales,high_consistency,...,consistency_time,verifying_questions,time_verifying_question,time_context_and_verifying_answer,context,verifying_answers,new_rationale,time_answer_again,new_answer,match_flag
0,Are follow up questions needed here: Yes.\nFol...,28942dc20bdd11eba7f7acde48001122,Where did the director of film Temptation (195...,0.6,Are follow up questions needed here: Yes.\nFol...,"Los Angeles, California, USA",Nice,"[Los Angeles, California, USA, The director of...",[Are follow up questions needed here: Yes.\nFo...,True,...,,,,,,,,,,0
1,Are follow up questions needed here: Yes.\nFol...,608c01f00bdd11eba7f7acde48001122,What is the place of birth of the director of ...,0.6,Are follow up questions needed here: Yes.\nFol...,"London, England",London,"[London, England, The director of the film ""Th...",[Are follow up questions needed here: Yes.\nFo...,True,...,,,,,,,,,,0
2,Are follow up questions needed here: Yes.\nFol...,8f6592fc0bdd11eba7f7acde48001122,What is the cause of death of director of film...,0.6,Are follow up questions needed here: Yes.\nFol...,Heart attack,Parkinson,"[Heart attack, Heart attack, Heart failure, Na...",[Are follow up questions needed here: Yes.\nFo...,True,...,,,,,,,,,,0
3,Are follow up questions needed here: Yes.\nFol...,25b06fd408c311ebbd8dac1f6bf848b6,"Which film whose director was born first, Apar...",0.4,Are follow up questions needed here: Yes.\nFol...,The House By The Cemetery,The House By The Cemetery,"[The House By The Cemetery, The House By The C...",[Are follow up questions needed here: Yes.\nFo...,False,...,2.84,"[Who directed the film Apartment Zero?, Who di...",0.0,31.69,"[[Apartment Zero, also known as Conviviendo co...",[The film Apartment Zero was directed by Marti...,Are follow up questions needed here: Yes.\nFol...,0.7,"The director of The House By The Cemetery, Luc...",0
4,Are follow up questions needed here: Yes.\nFol...,e04a5b1a0bda11eba7f7acde48001122,Which country the director of film Chain (Film...,0.4,Are follow up questions needed here: Yes.\nFol...,United States,American,"[United States, NOT ENOUGH INFO, United States...",[Are follow up questions needed here: Yes.\nFo...,False,...,1.23,"[Who directed the film Chain (Film)?, What is ...",0.0,15.3,[[Chain is a 2004 docufiction film written and...,"[The film Chain was directed by Jem Cohen., Je...",Are follow up questions needed here: Yes.\nFol...,0.62,United States.,0
5,Are follow up questions needed here: Yes.\nFol...,05c81d7a0bde11eba7f7acde48001122,What is the place of birth of the director of ...,0.4,Are follow up questions needed here: Yes.\nFol...,"Los Angeles, California, USA",Danvers,"[Los Angeles, California, USA, Los Angeles, Ca...",[Are follow up questions needed here: Yes.\nFo...,False,...,1.73,[Who directed the film Claustrophobia (2003 Fi...,0.0,14.89,[[Claustrophobia (retitled Serial Slayer for h...,[The film Claustrophobia (2003 Film) was direc...,Are follow up questions needed here: Yes.\nFol...,0.56,"Danvers, Massachusetts.",0
6,Are follow up questions needed here: No. \nAns...,205e0f460bde11eba7f7acde48001122,What nationality is the director of film Let'S...,0.2,none,NOT ENOUGH INFO,Nepal,"[NOT ENOUGH INFO, The director of the film Let...","[none, Are follow up questions needed here: No...",False,...,1.11,[],0.0,2.6,[],[],Are follow up questions needed here: Yes.\nSo ...,0.5,French.,0
7,Are follow up questions needed here: Yes.\nFol...,80483b220bdc11eba7f7acde48001122,What is the place of birth of the director of ...,0.4,Are follow up questions needed here: Yes.\nFol...,Various countries,Santa Monica,"[Various countries, The director of the film N...",[Are follow up questions needed here: Yes.\nFo...,False,...,2.45,"[Who directed the film Nightmare Cinema?, What...",0.0,42.63,[[Nightmare Cinema is a 2018 American horror a...,[The film Nightmare Cinema was directed by Ale...,Are follow up questions needed here: Yes.\nFol...,1.13,"Alejandro Brugués was born in Argentina, Joe D...",0
8,Are follow up questions needed here: Yes.\nFol...,27efedf20bb011ebab90acde48001122,"Who is Walter Butler, 11Th Earl Of Ormond's pa...",0.4,Are follow up questions needed here: Yes.\nFol...,"James Butler, 9th Earl of Ormond",James Butler,"[James Butler, 9th Earl of Ormond, James Butle...",[Are follow up questions needed here: Yes.\nFo...,False,...,1.65,"[Who is Walter Butler, 11th Earl of Ormond's f...",0.0,20.66,"[[1600–1634), Irish soldier of fortune and ass...","[Walter Butler, 11th Earl of Ormond's father w...",Are follow up questions needed here: Yes.\nFol...,0.63,"Thomas Butler, 7th Earl of Ormond.",0
9,Are follow up questions needed here: Yes.\nFol...,43a4db600bde11eba7f7acde48001122,"Where did Thomas Watson, 3Rd Earl Of Rockingha...",0.6,Are follow up questions needed here: Yes.\nFol...,"Christ Church, Oxford",Merton College,"[Trinity College, Cambridge, Thomas Watson, 3r...",[Are follow up questions needed here: Yes.\nFo...,True,...,,,,,,,,,,0


In [None]:
df.to_csv("misc/ve_predictions_self_ask_final.csv", index= False)

In [20]:
dev_set = read_wikiqa_data("data/dev_sampled.json")
len(dev_set)

7 not found


1000

In [23]:
dev_set[788]

{'id': 'c39a9f5108c611ebbd91ac1f6bf848b6',
 'question': 'Did Sam Barsky and Linda Vaughn share the same nationality?',
 'answer': 'yes',
 'all_pars': ['" House of Love" is a song by Australian duo Vika and Linda.',
  'It was released as the second single from her debut studio album" Vika and Linda"( 1994).',
  'It peaked at number 98 in Australia and 32 in New Zealand.',
  'Sam Barsky is an American artist and internet celebrity.',
  'He is known for knitting sweaters of iconic landmarks, then taking selfies in front of those landmarks while wearing them.',
  'He also knits sweaters of Jewish and other holidays.',
  'His sweaters are knitted without a pattern freehanding them as he goes along.',
  'His sweaters have been recognized as art, having been displayed in numerous galleries and at the American Visionary Arts Museum',
  'Though he gets lots of requests, Barsky does not sell his sweaters because he says it is impossible to be a" human sweater mill."',
  'Several people share the