### Module 0: Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pprint
import os 
from time import time 
from dotenv import load_dotenv
import json
import argparse
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from openai import OpenAI
import certifi

# Override bad SSL_CERT_FILE if set
os.environ["SSL_CERT_FILE"] = certifi.where()

from sentence_transformers import SentenceTransformer
import torch
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Importing VE libraries
from utils import *
from dataset_utils import read_wikiqa_data
from prompt_helper import get_joint_prompt_helper, normalize_prediction
from dataset_utils import read_wikiqa_data, wiki_evaluation



In [3]:
# Pick the compute device once for the whole notebook: GPU when CUDA is
# available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
# Load variables from a local .env file into the process environment, then
# create the OpenAI client.
# NOTE(review): presumably the OpenAI API key is supplied through .env — confirm.
# NOTE(review): `client` is not referenced by any other cell visible here; the
# imported helper modules may construct their own client — verify before removing.
load_dotenv()
client = OpenAI()

In [5]:
from prompt_self_ask import self_ask_prompt
from consistency import _parse_args as consistency_parse_args, in_context_manual_prediction  as consistency_in_context_manual_prediction, post_process_consistency, evaluate_manual_predictions as consistency_evaluate_manual_predictions
from verifying_questions import extract_follow_up_questions, _parse_args as verifying_questions_args
from relevant_context import _parse_args as relevant_context_args


from verifying_answers import in_context_manual_prediction as verifying_answers_in_context_manual_prediction, _parse_args as verifying_answers_args
from answer_again import in_context_manual_prediction as answer_again_in_context_manual_prediction
from evaluation import evaluate_model_performance

### Module 1: Building the Consolidated Function

In [6]:
# model embedding function required for relevant context
def model_embeddings(sentence, model):
    """Encode one sentence and return its embedding.

    Args:
        sentence: The text to embed.
        model: Any object exposing ``encode(list_of_sentences)`` that returns
            an indexable batch of embeddings (e.g. a ``SentenceTransformer``).

    Returns:
        The first (and only) item of the encoded batch. Its dimensionality
        depends on the model; the original author expected a length-384 array.
    """
    # The encoder works on batches, so wrap the sentence in a one-item batch
    # and unwrap the single result.
    batch_of_one = [sentence]
    encoded_batch = model.encode(batch_of_one)
    return encoded_batch[0]

In [7]:
def _load_existing_ids(output_file):
    """Return the set of record ids already stored in a JSONL predictions file.

    A missing file yields an empty set. Lines that are blank, truncated, not
    valid JSON, or that carry no usable "id" are skipped so that a partially
    written file from an interrupted run can still be resumed.
    """
    existing_ids = set()
    if not os.path.exists(output_file):
        return existing_ids

    with open(output_file, "r", encoding="utf-8") as f:
        for line in f:
            try:
                existing_ids.add(json.loads(line.strip())["id"])
            except (ValueError, KeyError, TypeError):
                # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
                # well-formed JSON that is not a record with a hashable "id".
                continue
    return existing_ids


def _select_top_paragraphs(question, paragraphs, embedding_model, topk):
    """Return the `topk` paragraphs closest to `question` in embedding space."""
    # Explicit submodule import: `import sklearn` alone does not guarantee that
    # `sklearn.metrics` has been loaded.
    from sklearn.metrics.pairwise import pairwise_distances

    question_embeds = [model_embeddings(question, embedding_model)]
    paragraph_embeds = [model_embeddings(p, embedding_model) for p in paragraphs]

    # One row of distances (question vs. every paragraph); smaller means closer.
    distances = pairwise_distances(question_embeds, paragraph_embeds)
    nearest = distances.argsort(axis=1)[0][:topk]
    return [paragraphs[i] for i in nearest]


def run_iterative_prediction(
    output_file="misc/iterative_predictions_self_ask_final_iter_new_0_250.jsonl",
    dev_path="data/dev_sampled.json",
    consistency_threshold=0.5,
):
    """Run the self-ask verify-and-edit pipeline record by record.

    For every dev record not yet present in `output_file`:
      1. sample predictions and compute a consistency score;
      2. if the score is above `consistency_threshold`, keep the prediction;
      3. otherwise extract the follow-up questions from the rationale, fetch
         Wikipedia paragraphs for each question, keep the closest ones, answer
         each question against that context, and answer the original question
         again with the rebuilt rationale.
    Each finished record is appended to `output_file` immediately, so an
    interrupted run resumes where it stopped. A failure on one record is
    printed and that record is skipped (it will be retried on the next run).

    Args:
        output_file: JSONL file predictions are appended to and resumed from.
        dev_path: Path of the dev set handed to `read_wikiqa_data`.
        consistency_threshold: Records scoring strictly above this value skip
            the verification stage.

    Returns:
        None. Results are written to `output_file`; a summary is printed.
    """
    # Arguments for the consistency stage. They are kept under their own name so
    # the relevant-context arguments parsed later cannot overwrite them: the
    # previous version reassigned a shared `args`, so every record after the
    # first low-consistency one ran the consistency stage with the wrong namespace.
    consistency_args = consistency_parse_args()

    existing_ids = _load_existing_ids(output_file)

    dev_set = read_wikiqa_data(dev_path)
    dev_set = dev_set[consistency_args.dev_slice:(consistency_args.dev_slice + consistency_args.num_dev)]

    results = []
    total_start = time()

    # Created lazily on the first low-consistency record and then reused; they do
    # not depend on the record, so rebuilding them per record was wasted work.
    context_args = None
    embedding_model = None

    for idx, ex in enumerate(tqdm(dev_set, desc="Processing one-by-one")):
        if ex['id'] in existing_ids:
            continue

        start = time()
        try:
            # Step 1: Predict
            raw_pred = consistency_in_context_manual_prediction(
                ex,
                self_ask_prompt,
                engine=consistency_args.engine,
                prompt_helper=consistency_args.helper,
                length_test_only=False,
                n=consistency_args.num_shot,
            )

            # Step 2: Consistency post-processing
            con_score, final_pred = post_process_consistency(ex, raw_pred, consistency_args)
            final_pred["consistency"] = con_score

            # Step 3: Only low-consistency predictions go through verification
            if con_score > consistency_threshold:
                final_pred['high_consistency'] = True
            else:
                start2 = time()
                final_pred["consistency_time"] = round(start2 - start, 2)
                final_pred['high_consistency'] = False

                # Step 4: Verifying questions come straight from the self-ask
                # rationale, so no extra LLM call is needed.
                vq = extract_follow_up_questions(final_pred['rationale'])
                final_pred["verifying_questions"] = vq

                start3 = time()
                final_pred["time_verifying_question"] = round(start3 - start2, 2)

                if context_args is None:
                    context_args = relevant_context_args()
                if embedding_model is None:
                    embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

                # Step 5 + 6: Retrieve context and answer each verifying question.
                # Iterate over the questions themselves: the previous loop walked
                # the tokenized rationale sentences while indexing `vq`, which
                # breaks whenever the two lists differ in length.
                contexts = []
                va = []
                for question in vq:
                    pars_text = get_texts_to_rationale_wikipedia(question, False)
                    pars_text = list(dict.fromkeys(pars_text))  # drop duplicates, keep order

                    # Reset per question so a failed retrieval can never reuse the
                    # previous question's paragraphs (or hit an unbound name).
                    # NOTE(review): the answering helper is then called with an
                    # empty context — confirm it handles that as intended.
                    pars = []
                    if pars_text:
                        pars = _select_top_paragraphs(question, pars_text, embedding_model, context_args.topk)
                        contexts.append(pars)

                    answer = verifying_answers_in_context_manual_prediction(
                        context_args.engine, context_args.helper, context_args.model, pars, question
                    )
                    va.append(answer['text'].lstrip())

                start4 = time()
                final_pred["time_context_and_verifying_answer"] = round(start4 - start3, 2)
                final_pred["context"] = contexts
                final_pred["verifying_answers"] = va

                # Step 7: Answering again with the rationale rebuilt from the
                # verifying questions and their answers.
                new_rationale = answer_again_rationale(vq, va)
                final_pred["new_rationale"] = new_rationale
                new_p = answer_again_in_context_manual_prediction(
                    ex, self_ask_prompt, context_args.engine, context_args.helper, context_args.model,
                    new_rationale,
                )

                start5 = time()
                final_pred["time_answer_again"] = round(start5 - start4, 2)
                final_pred["new_answer"] = new_p["text"]

            # Step 8: Persist this record immediately so progress survives a crash.
            final_pred["time_taken_sec"] = round(time() - start, 2)
            with open(output_file, "a", encoding="utf-8") as fout:
                fout.write(json.dumps(final_pred) + "\n")

            results.append(final_pred)

        except Exception as e:
            # Best effort: report the failure (network timeouts are common) and
            # move on; the record is not written, so a re-run will retry it.
            print(f"Error in record {idx} ({ex['id']}): {e}")
            continue

    total_duration = round(time() - total_start, 2)
    avg_time = total_duration / len(results) if results else 0
    print(f"\n✅ Finished {len(results)} records in {total_duration:.2f} seconds (avg {avg_time:.2f} sec/record)")
    print(f"📄 Results saved to: {output_file}")

    return None

In [8]:
# Run the full pipeline. Each finished record is appended to the JSONL output file
# and ids already in that file are skipped, so re-running this cell resumes an
# interrupted run (records that errored are not written and will be retried).
run_iterative_prediction()

7 not found


Processing one-by-one:  46%|████▌     | 114/250 [08:48<1:09:27, 30.64s/it]

Error in record 113 (1faf51ba0baf11ebab90acde48001122): HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)


Processing one-by-one:  52%|█████▏    | 131/250 [12:18<1:03:13, 31.88s/it]

Error in record 130 (3451827a088d11ebbd70ac1f6bf848b6): ('Connection aborted.', OSError(65, 'No route to host'))


Processing one-by-one:  54%|█████▎    | 134/250 [12:48<37:33, 19.42s/it]  

Error in record 133 (026b56060bde11eba7f7acde48001122): HTTPSConnectionPool(host='en.wikipedia.org', port=443): Read timed out. (read timeout=10.0)


Processing one-by-one: 100%|██████████| 250/250 [28:11<00:00,  6.76s/it]


✅ Finished 198 records in 1691.23 seconds (avg 8.54 sec/record)
📄 Results saved to: misc/iterative_predictions_self_ask_final_iter_new_0_250.jsonl





### Evaluating Model Performance

In [9]:
# Same JSONL file that run_iterative_prediction() writes its predictions to by default.
path = "misc/iterative_predictions_self_ask_final_iter_new_0_250.jsonl"

In [10]:
# Score the saved predictions. Judging by the outputs displayed below, `results` is a
# dict of summary metrics and `df` is a per-record table.
df, results =evaluate_model_performance(path)

In [11]:
results

{'exact_match_percentage': 29.96,
 'verify_percentage': 29.55,
 'average_time_per_record_sec': 7.75}

In [12]:
df.shape

(247, 21)

In [13]:
# Show every column when displaying DataFrames instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [14]:
df.head()

Unnamed: 0,response,id,question,consistency,rationale,answer,right_answer,original_answers,original_rationales,consistency_time,high_consistency,verifying_questions,time_verifying_question,time_context_and_verifying_answer,context,verifying_answers,new_rationale,time_answer_again,new_answer,time_taken_sec,match_flag
0,Are follow up questions needed here: Yes.\nFol...,1ee47f380bde11eba7f7acde48001122,Where was the performer of song Get A Life – G...,0.4,Are follow up questions needed here: Yes.\nFol...,London,Vöcklabruck,"[American, London, England, London, Russia]",[Are follow up questions needed here: Yes.\nFo...,2.76,False,"[Who performed the song ""Get A Life - Get Aliv...",0.0,19.21,[[Get a life or Get a Life may refer to:\n\nGe...,"[Eric Papilaya performed the song ""Get A Life ...",Are follow up questions needed here: Yes.\nFol...,0.58,Austria.,22.56,0
1,Are follow up questions needed here: Yes.\nFol...,49eb87280bdc11eba7f7acde48001122,Where did the director of film Don Juan (1922 ...,1.0,Are follow up questions needed here: Yes.\nFol...,Hollywood,Westerland,"[Hollywood, Hollywood, Hollywood, Hollywood, H...",[Are follow up questions needed here: Yes.\nFo...,,True,,,,,,,,,2.04,0
2,Are follow up questions needed here: Yes.\nFol...,2e17069c0bde11eba7f7acde48001122,What is the place of birth of the composer of ...,0.6,Are follow up questions needed here: Yes.\nFol...,Indian,Tamil,"[Indian, India, Indian, India, Indian]",[Are follow up questions needed here: Yes.\nFo...,,True,,,,,,,,,2.95,0
3,Are follow up questions needed here: Yes.\nFol...,28942dc20bdd11eba7f7acde48001122,Where did the director of film Temptation (195...,0.8,Are follow up questions needed here: Yes.\nFol...,Los Angeles,Nice,"[Los Angeles, Los Angeles, California, Los Ang...",[Are follow up questions needed here: Yes.\nFo...,,True,,,,,,,,,4.41,0
4,Are follow up questions needed here: Yes.\nFol...,6a8a71280bb011ebab90acde48001122,"Who is the uncle of John Kennedy, 2Nd Lord Ken...",0.2,Are follow up questions needed here: Yes.\nFol...,Gilbert Kennedy,James Kennedy,"[Gilbert Kennedy, John Kennedy, John Kennedy o...",[Are follow up questions needed here: Yes.\nFo...,2.64,False,"[Who is the parent of John Kennedy, 2nd Lord K...",0.0,37.67,"[[A member of the Kennedy family, he is a son ...","[The parent of John Kennedy, 2nd Lord Kennedy ...",Are follow up questions needed here: Yes.\nFol...,0.77,John F. Kennedy.,41.08,0


In [15]:
# Persist the evaluated per-record table as CSV, without writing the index column.
df.to_csv("misc/ve_predictions_self_ask_iter_latest_0_250.csv", index= False)

In [22]:
# NOTE(review): 63 and 149 are unexplained literals — presumably a match count over a
# record count; confirm their source and derive them from `df` instead of hard-coding.
63/149

0.4228187919463087

In [None]:
# VE self-ask, records 0-100: match percentage
# 47%