### Module 0: Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pprint
import os 
from time import time 
from dotenv import load_dotenv
import json
import argparse
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from openai import OpenAI
import certifi

# Override bad SSL_CERT_FILE if set
os.environ["SSL_CERT_FILE"] = certifi.where()

from sentence_transformers import SentenceTransformer
import torch
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
## Importing VE libraries
from utils import *
from dataset_utils import read_wikiqa_data
from prompt_helper import get_joint_prompt_helper, normalize_prediction
from dataset_utils import read_wikiqa_data, wiki_evaluation


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
load_dotenv()
client = OpenAI()

In [5]:
from consistency import _parse_args as consistency_parse_args, in_context_manual_prediction  as consistency_in_context_manual_prediction, post_process_consistency, evaluate_manual_predictions as consistency_evaluate_manual_predictions
from verifying_questions import _parse_args as verifying_questions_args, in_context_manual_prediction as vq_in_context_manual_prediction
from relevant_context import _parse_args as relevant_context_args
from verifying_answers import in_context_manual_prediction as verifying_answers_in_context_manual_prediction, _parse_args as verifying_answers_args
from answer_again import rationale as answer_again_rationale, in_context_manual_prediction as answer_again_in_context_manual_prediction

### Module 1: Building Consolidated function

In [6]:
# model embedding function required for relevant context
def model_embeddings(sentence, model):
    embedding = model.encode([sentence])
    return embedding[0] #should return an array of shape 384

In [7]:
def run_iterative_prediction():

    # Consistency Arguments
    args = consistency_parse_args()

    output_file = f"misc/iterative_predictions_cot_final_100_250.jsonl"

    # Load existing IDs from output file
    existing_ids = set()
    if os.path.exists(output_file):
        with open(output_file, "r") as f:
            for line in f:
                try:
                    data = json.loads(line.strip())
                    existing_ids.add(data["id"])
                except:
                    continue

    train_set = read_wikiqa_data("data/train_subset.json", manual_annotation_style=args.style)
    train_set = train_set[args.train_slice:(args.train_slice + args.num_shot)]

    dev_set = read_wikiqa_data("data/dev_sampled.json")
    dev_set = dev_set[args.dev_slice:(args.dev_slice + args.num_dev)]

    results = []
    total_start = time()

    for idx, ex in enumerate(tqdm(dev_set, desc="Processing one-by-one")):
        if ex['id'] in existing_ids:
            continue

        start = time()
        try:
            # Step 1: Predict
            raw_pred = consistency_in_context_manual_prediction(ex, train_set, engine=args.engine, prompt_helper=args.helper, length_test_only=False, n=args.num_shot)
            
            # Step 2: Consistency post-processing
            con_score, final_pred = post_process_consistency(ex, raw_pred, args)
            final_pred["consistency"]= con_score

            # Step  3: check if the consistency is higher than the threshold
            args =verifying_questions_args()

            if con_score > args.consistency_threshold:
                # Step 4: Storing the result if consistency is higher than the threshold 
                final_pred['high_consistency'] = True
                final_pred["time_taken_sec"] = round(time() - start, 2)
                with open(output_file, "a") as fout:
                    fout.write(json.dumps(final_pred) + "\n")

            else:
                start2 = time()
                final_pred["consistency_time"] = round(start2 - start, 2)
                final_pred['high_consistency'] = False
                # Step 5: Generating verifying questions
                vq = []
                sentences = rationale_tokenize(final_pred['rationale'])
                for q, s in enumerate(sentences):
                    question = vq_in_context_manual_prediction(final_pred['question'], s, args.engine, args.model, args.helper)
                    if question != None:
                        vq.append(question['text'])
                
                final_pred["verifying_questions"] = vq

                # Step 5: Pulling Relevant Context
                start3 = time()
                final_pred["time_verifying_answer"] = round(start3 - start2, 2)
                args = relevant_context_args()

                contexts = []
                embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2').to(device)

                sentences = rationale_tokenize(final_pred['rationale'])
                all_pars_text = []
                all_pars = []
                va = []
                for j, s in enumerate(sentences):
                    pars_text = get_texts_to_rationale_wikipedia(vq[j], False)
                
                    pars_text = list(dict.fromkeys(pars_text)) #remove potential duplicates
                    all_pars_text += pars_text

                    if pars_text != []:
                            sen_embeds = [model_embeddings(s, embedding_model)]
                            par_embeds = [model_embeddings(s, embedding_model) for s in pars_text]

                            pars = sklearn.metrics.pairwise.pairwise_distances(sen_embeds, par_embeds)
                            pars = pars.argsort(axis = 1)[0][:args.topk]

                            pars = [pars_text[i] for i in pars]
                            contexts.append(pars)
                            all_pars += pars
                    
                    # Step 6: Creating Verified Answers
                    answer = verifying_answers_in_context_manual_prediction(train_set, args.engine, args.helper, args.model, pars, vq[j])
                    va.append(answer['text'].lstrip())
                
                start4 = time()
                final_pred["time_context_and_verifying_answer"] = round(start4- start3,2)
                final_pred["context"] = contexts
                final_pred["verifying_answers"] = va

                # Step 7: Answering Again
                new_rationale = answer_again_rationale(va)
                new_p = answer_again_in_context_manual_prediction(ex, train_set, args.engine, args.helper, args.model,
                    new_rationale )

                start5 =time()
                final_pred["time_answer_again"] = round(start5 - start4, 2)
                final_pred["new_answer"] = new_p["text"]
                
                final_pred["time_taken_sec"] = round(time() - start, 2)
                with open(output_file, "a") as fout:
                    fout.write(json.dumps(final_pred) + "\n")
            
            results.append(final_pred)

            # # Step 4: Save this prediction to file immediately
            # with open(output_file, "a") as fout:
            #     fout.write(json.dumps(final_pred) + "\n")

        except Exception as e: 
            print(f"Error in record {idx} ({ex['id']}): {e}")
            continue

    total_duration = round(time() - total_start, 2)
    avg_time = total_duration / len(results) if results else 0
    print(f"\n✅ Finished {len(results)} records in {total_duration:.2f} seconds (avg {avg_time:.2f} sec/record)")
    print(f"📄 Results saved to: {output_file}")

    return None

In [9]:
run_iterative_prediction()

0 not found
7 not found


Processing one-by-one:   0%|          | 0/150 [00:00<?, ?it/s]

Processing one-by-one:  15%|█▌        | 23/150 [00:18<02:16,  1.07s/it]

Error in record 22 (8622d5700bdc11eba7f7acde48001122): ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Processing one-by-one: 100%|██████████| 150/150 [12:40<00:00,  5.07s/it]


✅ Finished 129 records in 760.80 seconds (avg 5.90 sec/record)
📄 Results saved to: misc/iterative_predictions_cot_final_100_250.jsonl





In [2]:
resp = {"response": "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA. The answer is Los Angeles, California, USA.", "id": "28942dc20bdd11eba7f7acde48001122", "question": "Where did the director of film Temptation (1959 Film) die?", "consistency": 0.4, "rationale": "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA.", "answer": "Los Angeles, California, USA", "original_answers": ["Hollywood, California, USA", "Los Angeles, California, USA", "Los Angeles, California, USA", "Los Angeles, California, United States", "Los Angeles, California, United States"], "original_rationales": ["First, the director of film Temptation (1959 Film) is Irving Pichel. Second, Irving Pichel died in Hollywood, California, USA.", "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA.", "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA.", "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, United States.", "First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, United States."], "consistency_time": 1.2, "high_consistency": False, "verifying_questions": ["What is the location of Irving Rapper's death?", "Where did Irving Rapper, the director of film Temptation (1959 Film), die?"], "time_verifying_answer": 1.52, "time_context_and_verifying_answer": 18.53, "context": [["Irving Rapper (16 January 1898 \u2013 20 December 1999) was a British-born American film director.", "Another Man's Poison is a 1951 British drama film directed by Irving Rapper and starring Bette Davis, Gary Merrill and Emlyn Williams.", "The Miracle is a 1959 American historical fiction film directed by Irving Rapper and starring Carroll Baker and Roger Moore."], ["Irving Rapper (16 January 1898 \u2013 20 December 1999) was a British-born American film director.", "Bad for Each Other is a 1953 American drama film noir directed by Irving Rapper and starring Charlton Heston, Lizabeth Scott and Dianne Foster.", "is the fourth studio album by the English singer-songwriter Elvis Costello, and his third with the Attractions \u2014 keyboardist Steve Nieve, bassist Bruce Thomas and drummer Pete Thomas (no relation)."]], "verifying_answers": ["Irving Rapper's death location is not provided in the text.", "Irving Rapper, the director of film Temptation (1959 Film), died in the United States."], "time_answer_again": 0.66, "new_answer": " The answer is the United States.", "time_taken_sec": 21.91}

In [3]:
resp


{'response': 'First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA. The answer is Los Angeles, California, USA.',
 'id': '28942dc20bdd11eba7f7acde48001122',
 'question': 'Where did the director of film Temptation (1959 Film) die?',
 'consistency': 0.4,
 'rationale': 'First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, California, USA.',
 'answer': 'Los Angeles, California, USA',
 'original_answers': ['Hollywood, California, USA',
  'Los Angeles, California, USA',
  'Los Angeles, California, USA',
  'Los Angeles, California, United States',
  'Los Angeles, California, United States'],
 'original_rationales': ['First, the director of film Temptation (1959 Film) is Irving Pichel. Second, Irving Pichel died in Hollywood, California, USA.',
  'First, the director of film Temptation (1959 Film) is Irving Rapper. Second, Irving Rapper died in Los Angeles, Cali

In [9]:
    args = consistency_parse_args()

In [18]:
args.dev_slice, args.num_dev

(0, 100)

In [10]:
dev_set = read_wikiqa_data(f"data/dev_sampled.json")
dev_set = dev_set[args.dev_slice:(args.num_dev)]

7 not found


In [13]:

    # Step 2: Load right_answers_list (list of dicts with 'id' and 'right_answer')
    df_right = pd.DataFrame(dev_set)
    df_right.head()

Unnamed: 0,id,question,answer,all_pars,supp_pars
0,1ee47f380bde11eba7f7acde48001122,Where was the performer of song Get A Life – G...,Vöcklabruck,"[Bernard Bonvoisin, known as Bernie Bonvoisin(...","[The song is sung by Eric Papilaya, and was wr..."
1,49eb87280bdc11eba7f7acde48001122,Where did the director of film Don Juan (1922 ...,Westerland,"[Peter Levin is an American director of film, ...",[Don Juan is a 1922 German silent film directe...
2,2e17069c0bde11eba7f7acde48001122,What is the place of birth of the composer of ...,Tamil,[Henri Verdun( 1895–1977) was a French compose...,"[It stars Vijayashanti, Rekha and Prabhu Deva ..."
3,28942dc20bdd11eba7f7acde48001122,Where did the director of film Temptation (195...,Nice,"[Jesse Edward Hobson( May 2, 1911 – November 5...",[Temptation (also known as Temptation Island) ...
4,6a8a71280bb011ebab90acde48001122,"Who is the uncle of John Kennedy, 2Nd Lord Ken...",James Kennedy,[Bonifazio Bevilacqua Aldobrandini (1571 – Apr...,"[John Kennedy, PC, 2nd Lord Kennedy (12 Octobe..."


In [16]:
def evaluate_model_performance(jsonl_path, right_answers_list):
    # Step 1: Load main jsonl file
    with open(jsonl_path, 'r') as f:
        main_data = [json.loads(line) for line in f]
    df_main = pd.DataFrame(main_data)

    # Step 2: Load right_answers_list (list of dicts with 'id' and 'right_answer')
    df_right = pd.DataFrame(right_answers_list)
    df_right.rename(columns={"answer":"right_answer"}, inplace =True)

    # Step 3: Merge on 'id'
    df = df_main.merge(df_right[['id', 'right_answer']], on='id', how='left')

    # Step 4: Create match_flag
    def get_match(row):
        if row['high_consistency']:
            return int(row['answer'].strip() == row['right_answer'].strip())
        else:
            return int(row.get('new_answer', '').strip() == row['right_answer'].strip())
    
    df['match_flag'] = df.apply(get_match, axis=1)

    # Step 5: Metrics
    exact_match_pct = 100 * df['match_flag'].mean()
    verify_pct = 100 * (~df['high_consistency']).mean()
    avg_time = df['time_taken_sec'].mean()

    return df, {
        'exact_match_percentage': round(exact_match_pct, 2),
        'verify_percentage': round(verify_pct, 2),
        'average_time_per_record_sec': round(avg_time, 2)
    }

In [17]:
path = "misc/iterative_predictions_tr0-5_dv0-100_temp0.7.jsonl"

In [19]:
dev_set = read_wikiqa_data(f"data/dev_sampled.json")
dev_set = dev_set[args.dev_slice:(args.num_dev)]

7 not found


In [20]:
df, results =  evaluate_model_performance(path, dev_set)

In [21]:
df.head()

Unnamed: 0,response,id,question,consistency,rationale,answer,original_answers,original_rationales,consistency_time,high_consistency,verifying_questions,time_verifying_answer,time_context_and_verifying_answer,context,verifying_answers,time_answer_again,new_answer,time_taken_sec,right_answer,match_flag
0,"First, the performer of the song Get A Life - ...",1ee47f380bde11eba7f7acde48001122,Where was the performer of song Get A Life – G...,0.4,"First, the performer of the song Get A Life - ...","Kingston, Ontario, Canada","[London, England, the United States, Kingston,...","[First, the performer of the song Get A Life -...",1.26,False,"[What is Bryan Adams' birthplace?, Where was B...",1.41,19.76,"[[Bryan Guy Adams (born November 5, 1959) is ...","[Bryan Adams was born in Kingston, Ontario, Ca...",0.72,"The answer is Kingston, Ontario, Canada.",23.16,Vöcklabruck,0
1,"First, the director of film Don Juan (1922 Fil...",49eb87280bdc11eba7f7acde48001122,Where did the director of film Don Juan (1922 ...,0.6,"First, the director of film Don Juan (1922 Fil...","Hollywood, California, United States","[Hollywood, California, USA, Hollywood, Califo...","[First, the director of film Don Juan (1922 Fi...",,True,,,,,,,,1.08,Westerland,0
2,"First, the composer of film Naayudamma is Chel...",2e17069c0bde11eba7f7acde48001122,What is the place of birth of the composer of ...,0.4,"First, the composer of film Naayudamma is Chel...","Andhra Pradesh, India","[Krishnancoil, Tamil Nadu, India, Andhra Prade...","[First, the composer of film Naayudamma is K. ...",2.57,False,"[Where was Chellapilla Satyam born?, Where was...",1.45,11.51,[[Chellapilla Satyanarayana Sastry (1933 – 12 ...,[Chellapilla Satyam was born in Gunanupuram vi...,1.19,"The answer is Gunanupuram village, Komarada ma...",16.72,Tamil,0
3,"First, the director of film Temptation (1959 F...",28942dc20bdd11eba7f7acde48001122,Where did the director of film Temptation (195...,0.4,"First, the director of film Temptation (1959 F...","Los Angeles, California, USA","[Hollywood, California, USA, Los Angeles, Cali...","[First, the director of film Temptation (1959 ...",1.2,False,[What is the location of Irving Rapper's death...,1.52,18.53,[[Irving Rapper (16 January 1898 – 20 December...,[Irving Rapper's death location is not provide...,0.66,The answer is the United States.,21.91,Nice,0
4,"First, John Kennedy, 2nd Lord Kennedy's uncle ...",6a8a71280bb011ebab90acde48001122,"Who is the uncle of John Kennedy, 2Nd Lord Ken...",0.2,"First, John Kennedy, 2nd Lord Kennedy's uncle ...",James Kennedy,"[James Kennedy, David Kennedy, 1st Earl of Cas...","[First, John Kennedy, 2nd Lord Kennedy's uncle...",1.18,False,"[Who is James Kennedy, Bishop of Dunkeld relat...",0.77,17.91,"[[James, Jim, or Jimmy Kennedy may refer to:\n...","[James Kennedy, Bishop of Dunkeld is not relat...",1.2,"Second, John Kennedy, 2nd Lord Kennedy's uncl...",21.06,James Kennedy,0


In [22]:
results

{'exact_match_percentage': 21.0,
 'verify_percentage': 31.0,
 'average_time_per_record_sec': 6.37}

In [23]:
df.to_csv("ve_predictions_cot_iter_0_100.csv", index =False)

In [6]:
from evaluation import evaluate_model_performance_v2

In [9]:
path = "misc/iterative_predictions_cot_final_100_250.jsonl"

In [10]:
df, result = evaluate_model_performance_v2(path)

In [12]:
result

{'exact_match_percentage': 18.79,
 'verify_percentage': 32.21,
 'average_time_per_record_sec': 138.14}

In [11]:
df.head()

Unnamed: 0,response,id,question,right_answer,consistency,rationale,answer,original_answers,original_rationales,consistency_time,high_consistency,verifying_questions,time_verifying_answer,time_context_and_verifying_answer,context,verifying_answers,time_answer_again,new_answer,time_taken_sec,match_flag
0,"First, the director of Kai Po Che! is Abhishek...",aa94f24e084911ebbd55ac1f6bf848b6,"Which film has the director who is older, Kai ...",Waiting Room To The Beyond,0.2,"First, the director of Kai Po Che! is Abhishek...",Kai Po Che!,"[Kai Po Che!, Abhishek Kapoor, the director of...","[First, the director of Kai Po Che! is Abhishe...",2.17,False,"[Who is the director of Kai Po Che!?, Who is t...",1.4,13.54,[[He is best known for his performances in fil...,[The director of Kai Po Che! is Abhishek Kapoo...,0.71,The director of Kai Po Che! (Abhishek Kapoor) ...,17.81,0
1,"First, the director of film India Speaks is Ol...",6a3bb1220bde11eba7f7acde48001122,Where did the director of film India Speaks die?,Pacific,0.2,"First, the director of film India Speaks is Ol...","Los Angeles, California, USA","[Los Angeles, California, USA, Rome, Italy, Ne...","[First, the director of film India Speaks is O...",1.16,False,"[Who is the director of film India Speaks?, Wh...",1.11,17.18,[[India Speaks is a 1933 Pre-Code adventure fi...,[The director of film India Speaks is Richard ...,0.74,"The answer is Laurens County, South Carolina.",20.2,0
2,"First, Mount Leinster is located in Ireland. S...",4bf3295e09b311ebbdb0ac1f6bf848b6,Are Mount Leinster and Silsean both located in...,yes,0.8,"First, Mount Leinster is located in Ireland. S...","yes, both Mount Leinster and Silsean are locat...","[yes, both Mount Leinster and Silsean are loca...","[First, Mount Leinster is located in Ireland. ...",,True,,,,,,,,1.56,0
3,"First, the director of Agitated Women is Yves ...",14f5dbb208b811ebbd88ac1f6bf848b6,"Which film has the director who died earlier, ...",Agitated Women,0.6,"First, the director of Agitated Women is Yves ...",An Even Break,"[Frank Tuttle, director of An Even Break, who ...","[First, the director of Agitated Women is Tõni...",,True,,,,,,,,1.59,0
4,"First, Arthur Stannard Vernay was born on 6 Ju...",20c7b59608db11ebbd9cac1f6bf848b6,"Who is younger, Arthur Stannard Vernay or Janu...",Janusz Bielański,0.4,"First, Arthur Stannard Vernay was born on 6 Ju...",Janusz Bielański,"[Janusz Bielański, NOT ENOUGH INFO, NOT ENOUGH...","[First, Arthur Stannard Vernay was born on 6 J...",1.28,False,"[When was Arthur Stannard Vernay born?, When w...",1.36,14.07,"[[Early life\nBorn in Weymouth, England, Verna...","[Arthur Stannard Vernay was born on May 11, 18...",0.87,"The answer is Janusz Bielański, as he was born...",17.59,0


In [13]:
df.to_csv("misc/ve_predictions_cot_iter_100_250.csv", index =False)

In [14]:
59/250

0.236