In [5]:
import pandas as pd
import json
from google import genai
import numpy as np
from rouge_score import rouge_scorer

# Interactive Application Default Credentials (ADC) login via the gcloud CLI.
# This opens a browser window and stores the resulting credentials locally so
# that Google client libraries can pick them up.
!gcloud auth application-default login

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=764086051850-6qr4p6gpi6hn506pt8ejuq83di341hur.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fsqlservice.login&state=zgGtPDytFQ3hsQwhyM2GxtpaHZ3hX6&access_type=offline&code_challenge=T9kxvfIAWik3G9Y3dkAP8cLs2ux5tP7nmg2E64jAR58&code_challenge_method=S256


Credentials saved to file: [/Users/lexha/.config/gcloud/application_default_credentials.json]

These credentials will be used by any library that requests Application Default Credentials (ADC).

Quota project "my-argolis-prj" was added to ADC which can be used by Google client libraries for billing and quota. Note that some services may still bill the project owning the resource.


Updates are available for some Google Cloud CLI co

In [None]:
# --- Vertex AI connection settings -----------------------------------------
# PROJECT_ID is a placeholder and must be replaced before running.
PROJECT_ID = "$YOUR_PROJECT_ID"
LOCATION = 'global'

# --- Generation settings ----------------------------------------------------
# NOTE(review): max_output_tokens is defined here but is not passed to any
# generate_content call in the cells visible in this notebook.
model = 'gemini-2.5-flash'
max_output_tokens = 3000

# Gen AI SDK client routed through Vertex AI.
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)

In [7]:
def refine_string(text_data):
    """Parse a model response into a JSON object, tolerating Markdown fences.

    The text is stripped, an optional leading code fence (``` optionally
    followed by a case-insensitive ``json`` tag) and an optional trailing
    ``` fence are removed, and the remainder is parsed with ``json.loads``
    provided it is delimited by ``{`` and ``}``.

    Args:
        text_data: Raw response text. Non-string values (for example ``None``)
            are treated as unparseable instead of raising.

    Returns:
        tuple: ``(data_dict, reason)``. On success ``data_dict`` is the parsed
        JSON object and ``reason`` is ``None``. On any failure ``data_dict``
        is an empty dict and ``reason`` is ``'Wrong JSON format'``.
    """
    if not isinstance(text_data, str):
        return {}, 'Wrong JSON format'

    cleaned_text = text_data.strip()

    # Remove the opening fence; the language tag after it is optional.
    if cleaned_text.startswith("```"):
        cleaned_text = cleaned_text[len("```"):]
        if cleaned_text[:len("json")].lower() == "json":
            cleaned_text = cleaned_text[len("json"):]
        cleaned_text = cleaned_text.strip()

    # Remove the closing fence.
    if cleaned_text.endswith("```"):
        cleaned_text = cleaned_text[:-len("```")].strip()

    # Only a top-level JSON object is accepted.
    if not (cleaned_text.startswith('{') and cleaned_text.endswith('}')):
        return {}, 'Wrong JSON format'

    try:
        return json.loads(cleaned_text), None
    except json.JSONDecodeError:
        return {}, 'Wrong JSON format'

### Initial Evaluation

In [8]:
# Load the validation set from a JSON Lines file (one JSON record per line).
valid = pd.read_json('./sample_data/valid.jsonl', lines=True)

In [9]:
# Baseline instruction prompt. The wording (including its typos) is kept
# exactly as evaluated, because the reported baseline metrics depend on it.
initial_prompt = '''
You are a smart multiple-choice answering assistant.
You're role is to choose right answer of given "question", among given "choices", which are possible choices.

Please return a JSON object with three properties: "answer", "reason" and "likelihood".
The "answer" property is a integer, indicating the right choice number.
The "reason" property is a string that explains which of the comparisons are the best matches
for the input, and the reason should also explain any factors that influence the likelihood score.
The "likelihood" is a probability that in reality, the acronym is used for that meaning, and must be between 0~1.
'''

# Per-row input section appended to the instruction prompt; filled in with
# str.format(question=..., choices=...).
input_template = """
  # **Inputs**

  - `question` : {question}
  - `choices`: {choices}
  """

In [4]:
# Preview the first two validation rows to check the loaded columns.
valid.head(2)

Unnamed: 0,question,choices,answer,likelihood,reason,target
0,In the context of environmental monitoring and...,"[1] Application Programming Interface, [2] Atm...",3,0.05,While API is most commonly known as 'Applicati...,"```json\n{\n ""answer"":3,\n ""reason"":""While A..."
1,In the historical context of computer hardware...,"[1] Integrated Development Environment, [2] In...",3,0.15,While 'Integrated Development Environment' is ...,"```json\n{\n ""answer"":3,\n ""reason"":""While '..."


In [9]:
# Query the model once per validation row with the baseline prompt and store
# the raw response text; parsing happens in a later cell.
valid['initial_result'] = None

for idx in range(len(valid)):
    # Baseline instructions followed by this row's question and choices.
    now_prompt = initial_prompt + input_template.format(
        question=valid['question'][idx],
        choices=valid['choices'][idx],
    )
    response = client.models.generate_content(
        model=model,
        contents=[now_prompt],
    )
    valid.loc[idx, 'initial_result'] = response.text

    # Lightweight progress indicator: echo every tenth row index.
    if idx % 10 == 0:
        print(idx)

0


In [12]:
# Parse the reference targets and the baseline responses. Each cell becomes
# the (data_dict, reason) tuple returned by refine_string.
valid['target_json'] = valid['target'].apply(refine_string)
valid['initial_result_json'] = valid['initial_result'].apply(refine_string)

In [33]:
# Split the parsed baseline responses into one column per JSON property.
# Element 0 of each stored tuple is the parsed dict; a missing key raises
# KeyError, exactly as a direct subscript would.
for field in ('answer', 'reason', 'likelihood'):
    valid[f'initial_result_{field}'] = valid['initial_result_json'].apply(
        lambda parsed, key=field: parsed[0][key]
    )

In [64]:
# ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

# Score each baseline explanation against the reference explanation and keep
# the F-measure (the third field of the returned score tuple).
valid['initial_rougeL'] = valid[['reason', 'initial_result_reason']].apply(
    lambda row: scorer.score(row['reason'], row['initial_result_reason'])['rougeL'].fmeasure,
    axis=1,
)

In [70]:
# Baseline metrics: exact-match accuracy on the answer, absolute likelihood
# error, and ROUGE-L F-measure of the explanation.
initial_hits = valid[valid['answer'] == valid['initial_result_answer']]
initial_likelihood_gap = (valid['likelihood'] - valid['initial_result_likelihood']).abs()

print(f"Accuracy : {len(initial_hits) / len(valid)}")
print(f"Sum of Score diff : {initial_likelihood_gap.sum()}")
print(f"Mean of Score diff : {initial_likelihood_gap.mean()}")
print(f"Mean of RougeL Fmeasure : {valid['initial_rougeL'].mean()}")
print(f"Sum of RougeL Fmeasure : {valid['initial_rougeL'].sum()}")


Accuracy : 0.9
Sum of Score diff : 8.07
Mean of Score diff : 0.807
Mean of RougeL Fmeasure : 0.22048642728432394
Sum of RougeL Fmeasure : 2.2048642728432393


### Optimized Prompt Evaluation

In [None]:
# Optimized instruction prompt. Compared with the baseline prompt it:
#   * tells the model to prefer the meaning that fits the domain named in the question,
#   * defines "likelihood" as the global prevalence of the chosen meaning rather
#     than the model's confidence in its answer,
#   * embeds three few-shot examples (MPLS, ERP, SNOM) with fenced-JSON targets.
# The literal is kept on very long lines with explicit \n escapes; do not reflow
# it, because any whitespace change alters the prompt sent to the model.
optimized_prompt = '''You are a smart multiple-choice answering assistant. Your primary role is to choose the single most accurate and precise answer for the given \"question\" from the provided \"choices\", which are possible options. When the \"question\" specifies a particular field, domain, or context, your selection must prioritize the meaning that is most specific and correct within that exact specialized context. The input may include illustrative examples of correct \"question\", \"choices\", and \"target\" outputs; these are for your reference to understand the task and expected output format, but you should only provide an answer for the final \"question\" presented in the query.\n\nPlease return a JSON object with three properties: \"answer\", \"reason\" and \"likelihood\".\nThe \"answer\" property is an integer, indicating the right choice number.\nThe \"reason\" property is a string that explains in detail why the chosen answer is the single best, most accurate, and most contextually appropriate match for the input question. If a specialized domain or context is provided in the question, your explanation must clearly articulate why the chosen answer is the most precise meaning within that specific field, differentiating it from other plausible but less exact options. When the question asks for an \"infrequent\" or \"niche\" meaning, the reason must clearly justify why the chosen option represents a genuinely rare but existing usage of the acronym, rather than a fabricated or irrelevant one. 
It must also clearly articulate all factors that influence the assigned 'likelihood' score, specifically addressing why the meaning's **global prevalence** is assessed as such, and explicitly distinguishing its global commonness from its potential commonness within the specific domain mentioned in the query, especially if the meaning is niche, infrequent, or has common alternative meanings.\nThe \"likelihood\" is a probability (between 0 and 1) representing the **absolute global prevalence and commonness** of the chosen meaning for the acronym in reality, **across all contexts and general knowledge, strictly disregarding its commonness within the specific domain mentioned in the query**. A high likelihood (e.g., 0.9-1.0) means it's a widely recognized and common meaning globally. A low likelihood (e.g., 0.01-0.25) **must be assigned** if the meaning is very niche, infrequent, or less common in general knowledge, **even if it is the correct answer for the specific question and even if it is highly common or fundamental within a specialized field**. This score is **NOT** a measure of your confidence in the correctness of the answer for the specific query; it is a measure of the meaning's general recognition in the broader world. 
When assessing this score, always default to a broad, general knowledge perspective, rather than a specialized one.\n\nSome examples of correct \"question\" & \"choices\" & \"target\" are:\nquestion: In the field of networking and telecommunications, what does the acronym MPLS most commonly refer to, outside of its primary 'Multiprotocol Label Switching' usage?\nchoices : [1] Multiprotocol Label Switching, [2] Minimum Packet Loss System, [3] Multi-Platform Load Sharing, [4] Mobile Positioning and Location Services\ntarget : ```json\n{\n  \"answer\":4,\n  \"reason\":\"While MPLS is overwhelmingly known as 'Multiprotocol Label Switching' in core networking, the acronym also less commonly stands for 'Mobile Positioning and Location Services' in certain contexts within mobile telecommunications, particularly in older 2G\\/3G systems or related research.\",\n  \"likelihood\":0.05\n}\n```\n==\n\n\n\nSome examples of correct \"question\" & \"choices\" & \"target\" are:\nquestion: While commonly known in business as 'Enterprise Resource Planning,' the acronym ERP has a distinct, less widely known meaning in another scientific field. What does ERP refer to in the context of cognitive neuroscience?\nchoices : [1] Enterprise Resource Planning, [2] Event Related Potentials, [3] Electronic Research Protocols, [4] Experimental Reaction Pathways\ntarget : ```json\n{\n  \"answer\":2,\n  \"reason\":\"ERP is overwhelmingly recognized as 'Enterprise Resource Planning' in the business world. However, in cognitive neuroscience, ERP stands for 'Event Related Potentials,' which are measured brain responses directly related to a specific event. 
This meaning is highly specialized and not widely known outside of neuroscience, making it an 'unpopular' or less common general understanding of the acronym.\",\n  \"likelihood\":0.07\n}\n```\n==\n\n\n\nSome examples of correct \"question\" & \"choices\" & \"target\" are:\nquestion: In the field of advanced microscopy and nanoscience, what does the acronym SNOM stand for?\nchoices : [1] Standard Nanoscale Optical Module, [2] Scanning Near-field Optical Microscopy, [3] Super-resolution Nano-Observation Machine, [4] Spatial Nanospectroscopy of Organic Materials\ntarget : ```json\n{\n  \"answer\":2,\n  \"reason\":\"SNOM stands for Scanning Near-field Optical Microscopy, a technique used to achieve spatial resolution beyond the diffraction limit of light by probing the sample with an evanescent field.\",\n  \"likelihood\":0.05\n}\n```'''

# Per-row input section appended to the instruction prompt. This repeats the
# template defined for the baseline run, so this cell does not depend on it.
input_template = """
  # **Inputs**

  - `question` : {question}
  - `choices`: {choices}
  """

In [None]:
# Print rather than display the string so the \n escapes render as line breaks.
print(optimized_prompt)

You are a smart multiple-choice answering assistant. Your primary role is to choose the single most accurate and precise answer for the given "question" from the provided "choices", which are possible options. When the "question" specifies a particular field, domain, or context, your selection must prioritize the meaning that is most specific and correct within that exact specialized context. The input may include illustrative examples of correct "question", "choices", and "target" outputs; these are for your reference to understand the task and expected output format, but you should only provide an answer for the final "question" presented in the query.

Please return a JSON object with three properties: "answer", "reason" and "likelihood".
The "answer" property is an integer, indicating the right choice number.
The "reason" property is a string that explains in detail why the chosen answer is the single best, most accurate, and most contextually appropriate match for the input questi

In [11]:
# Same per-row inference as the baseline run, but with the optimized prompt.
# Raw response text is stored and parsed in a later cell.
valid['optimized_result'] = None

for idx in range(len(valid)):
    now_inputs = input_template.format(
        question=valid['question'][idx],
        choices=valid['choices'][idx],
    )
    now_prompt = optimized_prompt + now_inputs

    response = client.models.generate_content(model=model, contents=[now_prompt])
    valid.loc[idx, 'optimized_result'] = response.text

    # Progress indicator: echo every tenth row index.
    if idx % 10 == 0:
        print(idx)

0


In [12]:
# Parse the optimized-prompt responses; each cell becomes the
# (data_dict, reason) tuple returned by refine_string.
valid['optimized_result_json'] = valid['optimized_result'].apply(refine_string)

# Split the parsed dict (tuple element 0) into one column per JSON property.
# A missing key raises KeyError, exactly as a direct subscript would.
for field in ('answer', 'reason', 'likelihood'):
    valid[f'optimized_result_{field}'] = valid['optimized_result_json'].apply(
        lambda parsed, key=field: parsed[0][key]
    )

In [15]:
# ROUGE-L F-measure of each optimized-prompt explanation against the reference
# explanation. Uses the `scorer` created in the baseline evaluation section.
valid['optimized_rougeL'] = valid[['reason', 'optimized_result_reason']].apply(
    lambda row: scorer.score(row['reason'], row['optimized_result_reason'])['rougeL'].fmeasure,
    axis=1,
)

In [None]:
# Optimized-prompt metrics, in the same order as the baseline report so the
# two outputs can be compared line by line.
optimized_hits = valid[valid['answer'] == valid['optimized_result_answer']]
optimized_likelihood_gap = (valid['likelihood'] - valid['optimized_result_likelihood']).abs()

print(f"Accuracy : {len(optimized_hits) / len(valid)}")
print(f"Sum of Score diff : {optimized_likelihood_gap.sum()}")
print(f"Mean of Score diff : {optimized_likelihood_gap.mean()}")
print(f"Mean of RougeL Fmeasure : {valid['optimized_rougeL'].mean()}")
print(f"Sum of RougeL Fmeasure : {valid['optimized_rougeL'].sum()}")


Accuracy : 0.9
Sum of Score diff : 2.1500000000000004
Mean of Score diff : 0.21500000000000002
Mean of RougeL Fmeasure : 0.30202621634655347
Sum of RougeL Fmeasure : 3.0202621634655347


In [None]:
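# Baseline (initial prompt) metrics, repeated here for comparison with the
# optimized-prompt results above:
# Accuracy : 0.9
# Sum of Score diff : 8.07
# Mean of Score diff : 0.807
# Mean of RougeL Fmeasure : 0.22048642728432394
# Sum of RougeL Fmeasure : 2.2048642728432393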

In [None]:
# valid[valid['answer'] != valid['optimized_result_answer']]

Unnamed: 0,question,choices,answer,likelihood,reason,target,initial_result,target_json,initial_result_json,initial_result_answer,initial_result_reason,initial_result_likelihood,initial_rougeL,optimized_result,optimized_result_json,optimized_result_answer,optimized_result_reason,optimized_result_likelihood,optimized_rougeL
9,What does the acronym ANOVA stand for?,"[1] Analysis of Variance, [2] Advanced Network...",3,0.01,While ANOVA is most widely recognized in stati...,"```json\n{\n ""answer"":3,\n ""reason"":""While A...","```json\n{\n ""answer"": 1,\n ""reason"": ""The a...","({'answer': 3, 'reason': 'While ANOVA is most ...","({'answer': 1, 'reason': 'The acronym ANOVA is...",1,The acronym ANOVA is standard terminology in s...,1.0,0.208696,"```json\n{\n ""answer"": 1,\n ""reason"": ""The a...","({'answer': 1, 'reason': 'The acronym ANOVA is...",1,The acronym ANOVA is a well-established term i...,0.95,0.195489
