# Sentence-Query Pairs

## -> Create S-Q Pairs Dictionary

In [None]:
# Set to True to (re)build the sentence-query pair dictionary from the raw
# experiment outputs.
# WARNING: rebuilding overwrites the pickle at sq_dict_path with a dictionary
# in which every pair is unlabelled (None).
create_new_dictionary = False


def collect_sq_pairs(record, ds_name, pairs):
    """Register the (sentence, text) pairs of one output record in ``pairs``.

    Args:
        record: one decoded JSONL row; must contain ``"decon_sentence"`` and
            may contain ``"queries"`` and ``"facts"`` lists.
        ds_name: dataset the record came from; for ``'wiki'`` the record's
            facts are paired with the sentence in addition to its queries.
        pairs: dict keyed by ``(sentence, text)``; new pairs are inserted with
            the value ``None``, existing entries are left untouched.

    Raises:
        KeyError: if the record has no ``"decon_sentence"`` field.
    """
    sentence = record["decon_sentence"]
    candidates = list(record.get('queries', []))
    if ds_name == 'wiki':
        # the original loop iterated `queries` a second time here, so the
        # facts were never added
        candidates.extend(record.get('facts', []))
    for text in candidates:
        # setdefault keeps whatever value an already-known pair has
        pairs.setdefault((sentence, text), None)


if create_new_dictionary:
    import pickle

    from utilities import load_jsonl

    # identify datasets to use
    dataset_names = ['fava', 'wiki']

    # identify query outputs to look for
    query_experiments = ['q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7']

    # final result will be a dictionary
    result = {"sentence_query_relevance": {}, "included_queries": {"fava": [], "wiki": []}}

    # load fava and wiki outputs
    for ds_name in dataset_names:
        for qe in query_experiments:
            file_path = f"/Users/jjr/output/rarr-rep/output/{ds_name}/{ds_name}_{qe}_r1_a1.jsonl"
            try:
                data = load_jsonl(file_path)
            except FileNotFoundError:
                print(f"no experiment at {file_path}")
                # do not add this dataset's s-q pairs
                continue
            # only experiments whose output file exists are recorded
            result["included_queries"][ds_name].append(qe)

            # iterate over sentences
            for record in data:
                collect_sq_pairs(record, ds_name, result["sentence_query_relevance"])

    sq_dict_path = "/Users/jjr/output/rarr-rep/gemini_labels/sentence-query/sentence_query_labels_dict.pkl"
    with open(sq_dict_path, "wb") as f:
        pickle.dump(result, f)

    print(f"s-q pair dictionary saved to {sq_dict_path}")
            

In [None]:
# Show, per dataset, the query experiments recorded under "included_queries"
# (filled in when the s-q dictionary is built from the files found on disk).
result["included_queries"]

In [None]:
# Number of distinct (sentence, query) pairs currently held in the dictionary.
len(result["sentence_query_relevance"])

## -> create s-q pairs for testing Gemini prompt

In [None]:
# Load the q1 outputs of both datasets.
wiki_data = load_jsonl("/Users/jjr/output/rarr-rep/output/wiki/wiki_q1_r1_a1.jsonl")
fava_data = load_jsonl("/Users/jjr/output/rarr-rep/output/fava/fava_q1_r1_a1.jsonl")


# use wiki indices 0, 1, 4

# import pickle
# # load main results dictionary
# with open("/Users/jjr/output/rarr-rep/gemini_labels/query_evidence_labels_dict.pkl", "rb") as f:
#     query_evidence_labels_dict = pickle.load(f)

# list(q for (q,e) in query_evidence_labels_dict if query_evidence_labels_dict[(q,e)]['O'] == 2)[200:300]

# Sentences of at most 25 characters; one entry per (sentence, query) key, so
# a sentence that has several queries appears several times.
bad_sentences = [s for (s, q) in result["sentence_query_relevance"] if len(s) <= 25]

In [None]:
# Number of entries in bad_sentences (as built by the cell above this is one
# entry per (sentence, query) key, so repeated sentences are counted each time).
len(bad_sentences)

In [None]:
# Distinct sentences in bad_sentences — presumably inspected to choose the
# hard-coded list in the next cell (TODO confirm).
set(bad_sentences)

In [None]:
# Hand-picked sentences treated as "bad" when looking up their queries.
# One entry per line with trailing commas: the original list was missing a
# comma, which silently fused "I'm happy to help!" and "Yes, that's correct!"
# into a single string (the former was also listed twice).
bad_sentences = [
    "I'm happy to help!",
    "Certainly!",
    "Absolutely!",
    "Sure!",
    "I hope that helps!",
    "I hope this helps!",
    "Yes, that's correct!",
    "Hello!",
]

# every query that was generated for one of the bad sentences
bad_queries = [
    q for (s, q) in result["sentence_query_relevance"] if s in bad_sentences
]

# number of distinct queries attached to bad sentences
len(set(bad_queries))

In [None]:
# fava outputs of query experiments q1 and q2 (named 8b / 70b in this notebook)
fava_8b = load_jsonl("/Users/jjr/output/rarr-rep/output/fava/fava_q1_r1_a1.jsonl")
fava_70b = load_jsonl("/Users/jjr/output/rarr-rep/output/fava/fava_q2_r1_a1.jsonl")

# For each file, collect the query list of every record whose decontextualised
# sentence is one of the bad sentences.
queries_8b = [
    record['queries']
    for record in fava_8b
    if record["decon_sentence"] in bad_sentences
]

# Each record is tested against its OWN sentence: the original loop read the
# sentence from fava_8b while iterating fava_70b, which filtered on the wrong
# file and raised IndexError whenever fava_70b was the longer of the two.
queries_70b = [
    record['queries']
    for record in fava_70b
    if record["decon_sentence"] in bad_sentences
]



## -> Run Labeller

In [None]:
# Gate for the sentence-query labelling cell: the labelling loop (and its
# Gemini API calls) only runs when this flag is True.
run_sq_pairs = True

In [None]:
# Prompt template for judging one (sentence, query) pair. It is filled with
# str.format using the keys `sentence`, `query` and `example_json`; the model
# is asked to answer with a JSON object holding "sentence_score" and
# "query_score", each 0 or 1.
prompt = '''
You are a quality judge, evaluating both the sentence and the query.

Instructions:

1. You are given a single sentence and a single query.

2. Evaluate the **sentence** first:
  - A “good” sentence (score 1) contains enough factual or contextual detail to allow for meaningful queries.
  - A “poor” sentence (score 0) has little or no factual content (e.g., “Bye!”, “Thank you.”) or is too vague to support a useful query.

3. Next, evaluate the **query** with respect to the sentence:
  - A “good” query (score 1) directly addresses or clarifies a fact, detail, or relevant aspect of the sentence in a way that would be useful for a Google search.
  - A “poor” query (score 0) does not address any meaningful detail in the sentence or is clearly unrelated or unhelpful.

4. **Output**:
  - You must output a **valid JSON object** with two integer keys: `"sentence_score"` and `"query_score"`, each set to `0` or `1`.
  - Example: {example_json}

5. Output **only** the JSON and no extra text or explanation.

Sentence: "{sentence}"
Query: "{query}"
Output:'''

In [None]:
import json
import random
import re

import json

import json

def parse_json_scores(response_text: str) -> dict:
    """
    Parse a model reply that should consist of one JSON object of the form:
      {
        "sentence_score": 0 or 1,
        "query_score": 0 or 1
      }

    Surrounding whitespace is ignored, and a reply wrapped in a Markdown code
    fence (``` or ```json) is unwrapped before parsing.

    Returns the parsed dictionary, for example:
      {
        "sentence_score": 1,
        "query_score": 0
      }

    Raises ValueError if the text is not valid JSON, is not a JSON object,
    lacks one of the two keys, has a score other than 0 or 1, or contains
    additional keys.
    """
    response_text = response_text.strip()

    # Unwrap a fenced reply so an otherwise valid answer is not rejected
    fenced = re.fullmatch(
        r"```(?:json)?\s*(.*?)\s*```",
        response_text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced:
        response_text = fenced.group(1)

    # Attempt to parse the entire text as JSON
    try:
        data = json.loads(response_text)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse JSON: {e}") from e

    # Scalars such as `1` or `null` are valid JSON but not objects; without
    # this check the key test below would raise TypeError instead of ValueError
    if not isinstance(data, dict):
        raise ValueError("JSON must be an object with 'sentence_score' and 'query_score' keys.")

    # Check for required keys
    expected_keys = {"sentence_score", "query_score"}
    if not expected_keys.issubset(data):
        raise ValueError("JSON must contain 'sentence_score' and 'query_score' keys.")

    # Validate that both scores are either 0 or 1
    if data["sentence_score"] not in [0, 1] or data["query_score"] not in [0, 1]:
        raise ValueError("'sentence_score' and 'query_score' must each be 0 or 1.")

    # Ensure there are no extra keys
    if len(data) != 2:
        raise ValueError("Unexpected additional keys in JSON.")

    return data


def exponential_backoff(attempt, base_delay=3, max_delay=60.0):
    """Return the wait before the next retry: a capped exponential delay plus jitter.

    The delay is ``base_delay * 2 ** attempt`` limited to ``max_delay``; a
    random extra of up to 15% of that delay is added on top.
    """
    capped = min(base_delay * (2 ** attempt), max_delay)
    return capped + random.uniform(0, capped * 0.15)

In [None]:
# Label every still-unlabelled (sentence, query) pair with Gemini.
# Successful labels go into sentence_query_labels_dict; pairs whose API call or
# response parsing failed are recorded in failed_api_calls_dict and skipped on
# later runs.
if run_sq_pairs:
    import datetime
    import os
    import pickle
    import time
    # import vertexai
    # from vertexai.generative_models import GenerativeModel
    from tqdm import tqdm
    from google import genai

    # NOTE(review): this rebinds the notebook-level `result` to an empty
    # structure and is not used anywhere in this cell — confirm it is needed.
    result = {"sentence_query_relevance": {}, "included_queries": {"fava": [], "wiki": []}}

    # file paths for storing results
    root_path = "/Users/jjr/output/rarr-rep/gemini_labels/sentence-query/"
    sentence_query_labels_dict_path = os.path.join(root_path, "sentence_query_labels_dict.pkl")
    failed_api_calls_dict_path = os.path.join(root_path, "failed_api_calls_dict.pkl")

    # load main results dictionary
    with open(sentence_query_labels_dict_path, "rb") as f:
        sentence_query_labels_dict = pickle.load(f)

    # attempt to load any previously failed API calls; if not, start fresh.
    if os.path.exists(failed_api_calls_dict_path):
        with open(failed_api_calls_dict_path, "rb") as f:
            failed_api_calls_dict = pickle.load(f)
    else:
        failed_api_calls_dict = {}

    # create list of (sentence, query) tuples that have None as value
    no_eval_list = [
        (sentence, query)
        for (sentence, query), val in sentence_query_labels_dict["sentence_query_relevance"].items()
        if val is None and (sentence, query) not in failed_api_calls_dict
    ]

    ###### obtain api key #####
    api_key = ""
    # instantiate model
    client = genai.Client(api_key=api_key)
    max_retries = 5

    # i counts successfully labelled pairs (failures do not advance it)
    i = 0
    try:
        # iterate over tuples in no_eval_list
        for (sentence, query) in tqdm(no_eval_list, desc="Processing pairs"):

            # Populate the prompt with the required data
            prompt_data = { "sentence": sentence, "query": query, "example_json": "{\"sentence_score\": 1, \"query_score\": 0}"}

            # create llm messages
            messages = prompt.format(**prompt_data)

            # send to gemini, retrying with exponential backoff on any error
            success = False
            last_exception = None
            for attempt in range(max_retries):
                try:
                    response = client.models.generate_content(model="gemini-1.5-pro-002", contents=messages)
                    response_text = response.text
                except Exception as e:
                    last_exception = e
                    # same backoff for every error; only the message differs
                    delay = exponential_backoff(attempt)
                    status_code = getattr(getattr(e, 'response', None), 'status_code', None)
                    if status_code == 429:
                        print(f"HTTP 429 received. Waiting {delay:.2f} seconds before retrying.")
                    else:
                        print(f"API call error: {e}. Retrying in {delay:.2f} seconds.")
                    time.sleep(delay)
                else:
                    success = True
                    # fixed pause after every successful call
                    time.sleep(7)
                    break

            # if the API call was not successful, flag and continue (do not update the main dictionary)
            if not success:
                error_msg = f"API call failed after {max_retries} attempts: {last_exception}"
                failed_api_calls_dict[(sentence, query)] = error_msg
                continue

            try:
                parsed_dict = parse_json_scores(response_text)
            except Exception as e:
                print(f"JSON parsing error for pair ({sentence}, {query}): {e}")
                failed_api_calls_dict[(sentence, query)] = f"JSON parsing error: {e}"
                continue

            # save label in dictionary
            sentence_query_labels_dict["sentence_query_relevance"][sentence, query] = parsed_dict
            i += 1

            # checkpoint every 100 successful labels to avoid losing good API calls
            if i % 100 == 0:
                print(f"saving iteration {i}")
                # save timestamped version
                timestamp = datetime.datetime.now().strftime('%d_%m_%y_%H%M')
                with open(os.path.join(root_path, f"sentence_query_labels_dict_{timestamp}.pkl"), "wb") as f:
                    pickle.dump(sentence_query_labels_dict, f)
                with open(os.path.join(root_path, f"failed_api_calls_dict_{timestamp}.pkl"), "wb") as f:
                    pickle.dump(failed_api_calls_dict, f)
                # update the fixed file names so future runs will load the latest errors
                with open(sentence_query_labels_dict_path, "wb") as f:
                    pickle.dump(sentence_query_labels_dict, f)
                with open(failed_api_calls_dict_path, "wb") as f:
                    pickle.dump(failed_api_calls_dict, f)
    finally:
        # Always persist to the fixed file paths — also when the run is
        # interrupted or an unexpected error escapes — so labels gathered
        # since the last checkpoint are not lost.
        with open(sentence_query_labels_dict_path, "wb") as f:
            pickle.dump(sentence_query_labels_dict, f)
        with open(failed_api_calls_dict_path, "wb") as f:
            pickle.dump(failed_api_calls_dict, f)

# Query-Evidence Pairs

## -> create q,e dictionary

In [None]:
# from utilities import load_jsonl
import json
from typing import List, Dict

def load_jsonl(path:str)->List[Dict]:
    """Load a JSON Lines file and return its decoded objects in file order.

    The file is read as UTF-8 and blank (whitespace-only) lines are skipped,
    so a trailing empty line does not abort the load.

    Args:
        path: location of the ``.jsonl`` file.

    Returns:
        One decoded JSON value per non-blank line.

    Raises:
        FileNotFoundError: if ``path`` does not exist (a message is printed
            first, then the exception is re-raised).
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    output = []
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                output.append(json.loads(line))
    except FileNotFoundError:
        print(f"Error: The file at {path} does not exist.")
        raise
    return output

# Build a lookup of every retrieved evidence chunk:
#   chunked_id -> "<title>: <text>"

# names of the result files to scan for evidence
query_experiments = ["q" + str(n) for n in range(1, 8)]

# fava: the r1_a1 file of every query experiment except q3 and q4, plus two extra q1 files
fava_filenames = [
    f"fava_{qe}_r1_a1.jsonl"
    for qe in query_experiments
    if qe not in ["q3", "q4"]
]
fava_filenames += ['fava_q1_r2.jsonl', 'fava_q1_r3.jsonl']

# wiki: the r1_a1 file of every query experiment, plus two extra q1 files
wiki_filenames = [f"wiki_{qe}_r1_a1.jsonl" for qe in query_experiments]
wiki_filenames += ['wiki_q1_r2.jsonl', 'wiki_q1_r3.jsonl']

evidence_dict = {}

for file_name in fava_filenames + wiki_filenames:
    # the file-name prefix decides which dataset folder the file lives in
    if file_name.startswith("fava"):
        data = load_jsonl(f"/Users/jjr/output/rarr-rep/output/fava/{file_name}")
    elif file_name.startswith("wiki"):
        data = load_jsonl(f"/Users/jjr/output/rarr-rep/output/wiki/{file_name}")

    for record in data:
        # "retrieved_evidence" is a list of evidence lists; visit every item
        for evidence_list in record.get("retrieved_evidence", []):
            for obj in evidence_list:
                title = obj['title']
                text = obj['text']
                chunk_id = obj["chunked_id"]

                # the first occurrence of a chunk id wins
                if chunk_id not in evidence_dict:
                    evidence_dict[chunk_id] = title + ": " + text



## -> create empty (query, chunk_id): label dictionary

In [None]:
# Every distinct (query, chunked_id) pair found in the result files starts out
# unlabelled (value None).
query_evidence_label_dict = {}

# second pass over the same files, this time pairing queries with evidence
for file_name in fava_filenames + wiki_filenames:
    if file_name.startswith("fava"):
        data = load_jsonl(f"/Users/jjr/output/rarr-rep/output/fava/{file_name}")
    elif file_name.startswith("wiki"):
        data = load_jsonl(f"/Users/jjr/output/rarr-rep/output/wiki/{file_name}")

    for record in data:
        queries = record.get('queries', [])
        retrieved_evidence_list = record.get("retrieved_evidence", [])

        # the i-th evidence list must belong to the i-th query
        if len(queries) != len(retrieved_evidence_list):
            raise ValueError("query evidence mismatch!")

        for query, evidence_list in zip(queries, retrieved_evidence_list):
            for evidence in evidence_list:
                # insert only pairs that are not known yet
                query_evidence_label_dict.setdefault((query, evidence['chunked_id']), None)

# persist both dictionaries as pickle files

import pickle
from datetime import datetime

# NOTE(review): timestamp is computed but not used by this cell
timestamp = datetime.now().strftime('%d_%m_%y_%H%M')

root_path = "/Users/jjr/output/rarr-rep/gemini_labels/query-evidence/"

for pkl_name, payload in (
    ("query_evidence_labels_dict_empty.pkl", query_evidence_label_dict),
    ("evidence_dict.pkl", evidence_dict),
):
    with open(root_path + pkl_name, "wb") as f:
        pickle.dump(payload, f)

## -> Run Labeller

In [None]:
# Gate for the query-evidence labelling cell: the labelling loop (and its
# Gemini API calls) only runs when this flag is True.
run_qe_pairs = False

In [None]:
# Prompt template for rating how relevant an evidence passage is to a query.
# It is filled with str.format using the keys `query` and `evidence`; the
# doubled braces ({{ and }}) come out as literal braces after formatting.
# NOTE: this reuses the name `prompt`, so it replaces the sentence-query
# template defined earlier in the notebook — re-run that cell before running
# the sentence-query labeller again.
prompt = '''
You are a search quality rater evaluating the relevance of web pages.  Given a query and a web page, you must provide a score on an integer scale of 0 to 2 with the following meanings:
2 = highly relevant, very helpful for this query
1 = relevant, may be partly helpful but might contain other irrelevant content
0 = not relevant, should never be shown for this query

Assume that you are writing a report on the subject of the topic. If you would use any of the information contained in the web page in such a report, mark it 1. If the web page is primarily about the topic, or contains vital information about the topic, mark it 2. Otherwise, mark it 0.

Query
A person has typed [{query}] into a search engine.

Result
Consider the following web page.
—BEGIN WEB PAGE CONTENT—
{evidence}
—END WEB PAGE CONTENT—

Instructions
Split this problem into steps:

Consider the underlying intent of the search.
Measure how well the content matches a likely intent of the query (M).
Measure how trustworthy the web page is (T).
Consider the aspects above and the relative importance of each, and decide on a final score (O).

Produce a JSON array of scores without providing any reasoning. Example: {{"M": 2, "T": 1, "O": 1}}

Results [{{
'''


In [None]:
import re
import json

def extract_json(text_response):
    """Extract every JSON object embedded in ``text_response``.

    The text is scanned left to right; at each ``{`` a JSON decoder tries to
    read one complete object. On success the object is collected and scanning
    resumes after it, so nested objects are returned as part of their
    outermost object and braces inside string values are handled correctly.
    On failure scanning moves on to the next ``{``.

    Args:
        text_response: raw model output; may contain arbitrary text around
            the JSON. ``None`` or an empty string yields ``None``.

    Returns:
        A list of decoded objects (dicts) in order of appearance, or ``None``
        when no valid JSON object is found.
    """
    if not text_response:
        return None

    decoder = json.JSONDecoder()
    json_objects = []
    pos = text_response.find('{')
    while pos != -1:
        try:
            # raw_decode parses one JSON document at the start of the string
            # and reports how many characters it consumed
            json_obj, consumed = decoder.raw_decode(text_response[pos:])
        except json.JSONDecodeError:
            # not a valid object here; try the next opening brace
            pos = text_response.find('{', pos + 1)
            continue
        json_objects.append(json_obj)
        pos = text_response.find('{', pos + consumed)

    if json_objects:
        return json_objects
    else:
        return None  # Or handle this case as you prefer

def _json_extend_search(text, span):
    """Return the brace-balanced substring of ``text`` that begins at ``span[0]``.

    Walks forward from the span start tracking brace depth and returns the
    text up to and including the first ``}`` that brings the depth back to
    zero. If no such brace exists, the original span's text is returned.
    """
    begin, fallback_end = span
    depth = 0
    for pos in range(begin, len(text)):
        char = text[pos]
        if char == '{':
            depth += 1
            continue
        if char != '}':
            continue
        depth -= 1
        # balance is only checked right after a closing brace
        if depth == 0:
            return text[begin:pos + 1]
    return text[begin:fallback_end]

In [None]:
import random

def exponential_backoff(attempt, base_delay=3, max_delay=60.0):
    """Return the wait before the next retry: a capped exponential delay plus jitter.

    The delay is ``base_delay * 2 ** attempt`` limited to ``max_delay``; a
    random extra of up to 15% of that delay is added on top.
    """
    capped = min(base_delay * (2 ** attempt), max_delay)
    return capped + random.uniform(0, capped * 0.15)

In [None]:
# Label every still-unlabelled (query, chunk_id) pair with Gemini.
# Successful labels go into query_evidence_labels_dict; pairs whose API call or
# response parsing failed are recorded in failed_api_calls_dict and skipped on
# later runs.
if run_qe_pairs:
    import datetime
    import os
    import pickle
    import time
    # vertexai is not used by this cell (the google-genai client is), so its
    # imports are disabled, as in the sentence-query labeller:
    # import vertexai
    # from vertexai.generative_models import GenerativeModel
    from tqdm import tqdm
    from google import genai

    # file paths for loading inputs and storing results
    root_path = "/Users/jjr/output/rarr-rep/gemini_labels/query-evidence/"
    query_evidence_labels_dict_path = os.path.join(root_path, "query_evidence_labels_dict.pkl")
    query_evidence_labels_dict_empty_path = os.path.join(root_path, "query_evidence_labels_dict_empty.pkl")
    evidence_dict_path = os.path.join(root_path, "evidence_dict.pkl")
    failed_api_calls_dict_path = os.path.join(root_path, "failed_api_calls_dict.pkl")

    # load empty results dictionary
    with open(query_evidence_labels_dict_empty_path, "rb") as f:
        query_evidence_labels_dict_empty = pickle.load(f)

    # load main results dictionary; on the very first run it does not exist
    # yet, so start empty and let the merge below add every pair
    if os.path.exists(query_evidence_labels_dict_path):
        with open(query_evidence_labels_dict_path, "rb") as f:
            query_evidence_labels_dict = pickle.load(f)
    else:
        print(f"no labelled dictionary at {query_evidence_labels_dict_path}; starting from scratch")
        query_evidence_labels_dict = {}

    # update main results with any q,e pairs present in the empty dictionary
    # but missing from the labelled one
    qe_add = set(query_evidence_labels_dict_empty) - set(query_evidence_labels_dict)
    print(f"adding {len(qe_add)} new q,e pairs")
    for (q, e) in qe_add:
        query_evidence_labels_dict[q, e] = None

    # load evidence data file
    with open(evidence_dict_path, "rb") as f:
        evidence_dict = pickle.load(f)

    # attempt to load any previously failed API calls; if not, start fresh.
    if os.path.exists(failed_api_calls_dict_path):
        with open(failed_api_calls_dict_path, "rb") as f:
            failed_api_calls_dict = pickle.load(f)
    else:
        failed_api_calls_dict = {}

    # create list of (query, chunked_id) tuples that have None as value
    no_eval_list = [
        (query, chunk_id)
        for (query, chunk_id), val in query_evidence_labels_dict.items()
        if val is None and (query, chunk_id) not in failed_api_calls_dict
    ]

    # instantiate model
    api_key = ""
    client = genai.Client(api_key=api_key)

    max_retries = 5

    # i counts successfully labelled pairs (failures do not advance it)
    i = 0
    try:
        # iterate over tuples in no_eval_list
        for (query, chunk_id) in tqdm(no_eval_list, desc="Processing pairs"):

            # extract evidence text using chunk_id
            evidence_text = evidence_dict[chunk_id]

            # Populate the prompt with the required data
            prompt_data = { "query": query, "evidence": evidence_text}

            # create llm messages
            messages = prompt.format(**prompt_data)

            # send to gemini, retrying with exponential backoff on any error
            success = False
            last_exception = None
            for attempt in range(max_retries):
                try:
                    response = client.models.generate_content(model="gemini-1.5-pro-002", contents=messages)
                    response_text = response.text
                except Exception as e:
                    last_exception = e
                    # same backoff for every error; only the message differs
                    delay = exponential_backoff(attempt)
                    status_code = getattr(getattr(e, 'response', None), 'status_code', None)
                    if status_code == 429:
                        print(f"HTTP 429 received. Waiting {delay:.2f} seconds before retrying.")
                    else:
                        print(f"API call error: {e}. Retrying in {delay:.2f} seconds.")
                    time.sleep(delay)
                else:
                    success = True
                    # short pause after every successful call
                    time.sleep(0.5)
                    break

            # if the API call was not successful, flag and continue (do not update the main dictionary)
            if not success:
                error_msg = f"API call failed after {max_retries} attempts: {last_exception}"
                failed_api_calls_dict[(query, chunk_id)] = error_msg
                continue

            try:
                json_results = extract_json(response_text)
                if not json_results:
                    raise ValueError("No valid JSON found in response")
                # only the first JSON object of the response is used
                response_json = json_results[0]
            except Exception as e:
                print(f"JSON parsing error for pair ({query}, {chunk_id}): {e}")
                failed_api_calls_dict[(query, chunk_id)] = f"JSON parsing error: {e}"
                continue

            # save label in dictionary
            query_evidence_labels_dict[query, chunk_id] = response_json
            i += 1

            # checkpoint every 250 successful labels to avoid losing good API calls
            if i % 250 == 0:
                print(f"saving iteration {i}")
                # save timestamped version
                timestamp = datetime.datetime.now().strftime('%d_%m_%y_%H%M')
                with open(os.path.join(root_path, f"query_evidence_labels_dict_{timestamp}.pkl"), "wb") as f:
                    pickle.dump(query_evidence_labels_dict, f)
                with open(os.path.join(root_path, f"failed_api_calls_dict_{timestamp}.pkl"), "wb") as f:
                    pickle.dump(failed_api_calls_dict, f)
                # update the fixed file names so future runs will load the latest errors
                with open(query_evidence_labels_dict_path, "wb") as f:
                    pickle.dump(query_evidence_labels_dict, f)
                with open(failed_api_calls_dict_path, "wb") as f:
                    pickle.dump(failed_api_calls_dict, f)
    finally:
        # Always persist to the fixed file paths — also when the run is
        # interrupted or an unexpected error escapes — so labels gathered
        # since the last checkpoint are not lost.
        with open(query_evidence_labels_dict_path, "wb") as f:
            pickle.dump(query_evidence_labels_dict, f)
        with open(failed_api_calls_dict_path, "wb") as f:
            pickle.dump(failed_api_calls_dict, f)