# Imports

In [1]:
import pandas as pd
import json

# Data Loading & Preparation

In [10]:
data_directory = "../Experiment_data/"
data_type = ["NQ" , "BIOASK"]

available_models = {"qwen_3B" : "Qwen/Qwen2.5-3B-Instruct", 
                    "llama_3B" : "meta-llama/Llama-3.2-3B-Instruct", 
                    "llama_8B" : "meta-llama/Llama-3.1-8B-Instruct", 
                    "mistral_7B" : "mistralai/Mistral-7B-Instruct-v0.3"}

In [None]:
# NQ
df_nq = pd.read_csv(data_directory + f"NQ/{available_models["qwen_3B"].split('/')[1]}/results_SHUFFLE.csv")
df_nq_ref = pd.read_csv("../data/NQ.csv")
df_nq["actual_answer"] = df_nq_ref[["answer"]]

# BIOASK

df_bio = pd.read_csv(data_directory + f"BIOASK/{available_models["qwen_3B"].split('/')[1]}/results_SHUFFLE.csv")


In [3]:
df_syn = pd.read_csv('../data/synthetic_data/20_synergy_hard_negatives.csv', sep = ';')

# LLM Evaluation

In [57]:
system_prompt = """You are an expert judge evaluating whether two sentences are equivalent in meaning, 
both are answers to the same query. One is a generated answer and the other is the ground truth.

Evaluation Criteria:
1. Focus on semantic equivalence, not exact wording
2. Minor grammatical differences don't affect equivalence
3. The generated answer must capture all key information from the ground truth
4. Additional relevant information in the generated answer is acceptable

Output Format (strictly follow this JSON format):
{"evaluation": "yes"/"no", "explanation": "short explanation about the provided evaluation"}

Examples:
Query: "What is photosynthesis?"
Ground Truth: "Photosynthesis is how plants make food using sunlight."
Generated Answer: "The process by which plants convert sunlight into food is called photosynthesis."
Output: {"evaluation": "yes", "explanation": "Both sentences describe the same process with equivalent meaning, though worded differently."}

Query: "Who wrote Romeo and Juliet?"
Ground Truth: "William Shakespeare wrote Romeo and Juliet."
Generated Answer: "Romeo and Juliet was a play by Shakespeare."
Output: {"evaluation": "yes", "explanation": "Both identify Shakespeare as the author, despite slight wording differences."}

Query: "What causes seasons?"
Ground Truth: "Earth's axial tilt causes seasons."
Generated Answer: "The changing distance from the sun causes seasons."
Output: {"evaluation": "no", "explanation": "The answers provide different scientific explanations for seasons."}"""


In [120]:
from google.oauth2 import service_account
import vertexai
from vertexai.generative_models import GenerativeModel, Tool, FunctionDeclaration, ToolConfig

from vertexai.generative_models import Part

vertexai.init(
    project="oag-ai",
    credentials=service_account.Credentials.from_service_account_file("google-credentials.json"),
)



# 1. Define your schema as a FunctionDeclaration
equivalence_function = FunctionDeclaration(
    name="evaluate_equivalence",
    description="Determine if two answers are semantically equivalent",
    parameters={
        "type": "object",
        "properties": {
            "evaluation": {
                "type": "string",
                "enum": ["yes", "no"],
                "description": "Whether the answers are equivalent"
            },
            "explanation": {
                "type": "string",
                "description": "Brief rationale for the evaluation"
            }
        },
        "required": ["evaluation", "explanation"]
    }
)

# 2. Create the Tool
equivalence_tool = Tool(function_declarations=[equivalence_function])

tool_config = ToolConfig(
    function_calling_config=ToolConfig.FunctionCallingConfig(
        mode=ToolConfig.FunctionCallingConfig.Mode.ANY    )
)


judge_model = GenerativeModel(
    model_name="gemini-2.0-flash",  # or your preferred model
    system_instruction= system_prompt, 
    
    # tools=[equivalence_tool]
)#model = "publishers/google/models/gemini-2.0-flash-thinking-exp-01-21"
 
def prompt_just_text(prompt: str,temperature=0.0) -> str:
    return judge_model.generate_content(
        generation_config={
            "temperature": temperature, 
            "response_mime_type": "application/json",
        },
        contents=[
            prompt
        ], 
        # tool_config=tool_config  # Force schema use
    ).text


In [121]:
def evaluate(query: str, ground_truth: str, generated_answer: str):
    template_prompt = f"""Evaluate if the following answers to the query are equivalent:
                        Query: {query}
                        Ground Truth: {ground_truth}
                        Generated Answer: {generated_answer}
                        Provide your evaluation in the specified JSON format."""
    
    response = prompt_just_text(template_prompt)
    return json.loads(response)

In [122]:
evaluate(query = df_nq.loc[1].query, ground_truth= df_nq.loc[1].actual_answer, generated_answer=df_nq.loc[1].provided_answer )

{'evaluation': 'yes',
 'explanation': 'The generated answer contains the ground truth information (seven episodes) and provides additional context, making it equivalent.'}

In [145]:
# BIOASK - NQ

models = ["qwen_3B", "llama_8B", "mistral_7B"]
available_models = {"qwen_3B" : "Qwen/Qwen2.5-3B-Instruct", 
                    "llama_3B" : "meta-llama/Llama-3.2-3B-Instruct", 
                    "llama_8B" : "meta-llama/Llama-3.1-8B-Instruct", 
                    "mistral_7B" : "mistralai/Mistral-7B-Instruct-v0.3"}
dataset = "BIOASK"

res = []
eval_results = {}

for model_name in models : 
    df = pd.read_csv(data_directory + f"{dataset}/{available_models[model_name].split('/')[1]}/results_SHUFFLE.csv")
    df_ref = pd.read_csv(f"../data/{dataset}.csv")
    df["actual_answer"] = df_ref[["answer"]]

    for i in df.index :
        print(i)
        # print("Query: ", df.loc[i].query )
        # print("Provided: ", df.loc[i].provided_answer )
        response = evaluate(query = df.loc[i].query, ground_truth= df.loc[i].actual_answer, generated_answer=df.loc[i].provided_answer)
        res.append(response["evaluation"])
    eval_results[f"{dataset}_{model_name}"] = res
    res = []




0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [142]:
eval_results['NQ_qwen_3B'].count('yes')
eval_results['NQ_mistral_7B'].count('yes')
eval_results['NQ_llama_8B'].count('yes')

72

In [148]:
eval_results['BIOASK_qwen_3B'].count('yes')
eval_results['BIOASK_mistral_7B'].count('yes')
eval_results['BIOASK_llama_8B'].count('yes')

75

In [None]:
# SYNTHETIC DATA

datasets = ['20_complementary', '20_synergy', '20_duplicate']
models = ["qwen_3B", "llama_8B", "mistral_7B"]
available_models = {"qwen_3B" : "Qwen/Qwen2.5-3B-Instruct", 
                    "llama_3B" : "meta-llama/Llama-3.2-3B-Instruct", 
                    "llama_8B" : "meta-llama/Llama-3.1-8B-Instruct", 
                    "mistral_7B" : "mistralai/Mistral-7B-Instruct-v0.3"}
res = []
eval_results = {}
for dataset in datasets: 
    df_ref = pd.read_csv(f'../data/synthetic_data/{dataset}.csv')
    for model_name in models: 
        df = pd.read_csv(f"../Experiment_data/{dataset}/{available_models[model_name].split('/')[1]}/results_VANILLA.csv")
        df["actual_answer"] = df_ref.answer
        for i in df.index: 
            print(i)
            response = evaluate(query = df.loc[i].query, ground_truth= df.loc[i].actual_answer, generated_answer=df.loc[i].provided_answer)
            res.append(response["evaluation"])
        eval_results[f"{dataset}_{model_name}"] = res
        res = []


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
0
1
2
3
4
5
6
7
8
9
10
11
12


In [None]:
eval_results.keys()

dict_keys(['20_complementary_mistral_7B'])

In [None]:
for key in eval_results.keys() : 
    print(key, " : ",  eval_results[key].count('yes') )

KeyError: '20_complementary_qwen_3B'

# Paraphrase with LLM

In [5]:
system_prompt = """You are a helpful assistant that paraphrases user-provided sentences. Your job is to rewrite the sentence while keeping the original meaning, but using different wording and structure. Return only the paraphrased version in a JSON format, like this:

{"paraphrased": "your paraphrased sentence here"}

Example 1: 
Sentence to paraphrase: "I can't attend the meeting tomorrow because of a prior commitment."
Expected Output : {"paraphrased": "I'm unable to join the meeting tomorrow due to an existing obligation."}

Example 2: 
Sentence to paraphrase: "Learning a new language can be challenging but rewarding."
Expected Output: {"paraphrased": "Picking up a new language is tough, yet fulfilling."}

Example 3: 
Sentence to paraphrase: "She decided to walk instead of taking the bus."
Expected Output: {"paraphrased": "She chose to go on foot rather than ride the bus."}

Do not include any explanations or extra text."""


In [None]:
from google.oauth2 import service_account
import vertexai
from vertexai.generative_models import GenerativeModel

vertexai.init(
    project="oag-ai",
    credentials=service_account.Credentials.from_service_account_file("google-credentials.json"),
)

paraphase_model = GenerativeModel(
    model_name="gemini-2.0-flash",
    system_instruction= system_prompt, 
)#model = "publishers/google/models/gemini-2.0-flash-thinking-exp-01-21"
 
def prompt_just_text(prompt: str,temperature=0.0) -> str:
    return paraphase_model.generate_content(
        generation_config={
            "temperature": temperature, 
            "response_mime_type": "application/json",
        },
        contents=[
            prompt
        ], 
    ).text


In [17]:
eval(df_syn.context.loc[0])[2:4]

["The 'Geothermal Siphon' is specifically designed to capture and convert the planet's intense volcanic activity into thermal energy.",
 "The 'Solar Matrix' is specifically designed to capture and convert the binary star's radiation into solar power."]

In [22]:
results = {}
def paraphrase(sentence: str):
    template_prompt = f"""Paraphrase this sentence: 
    {sentence}"""
    
    response = prompt_just_text(template_prompt)
    return json.loads(response)

for i in df_syn.index:
    res_1 = paraphrase(eval(df_syn.context.loc[i])[2:4][0])['paraphrased']
    res_2 = paraphrase(eval(df_syn.context.loc[i])[2:4][1])['paraphrased']
    results[i] = [res_1, res_2]

In [31]:
df_syn.context = df_syn.context.apply(lambda x: eval(x))
for i in df_syn.index: 
    df_syn.loc[i].context[2:4] = results[i]

In [33]:
df_syn.to_csv('../data/synthetic_data/20_synergy_hard_negatives.csv', index= False)

In [35]:
pd.read_csv('../data/synthetic_data/20_synergy_hard_negatives.csv', sep = ';')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 21, saw 2
