In [None]:
import os 
import pandas as pd 
import re


In [None]:
!pip install --upgrade transformers

# Load data

In [None]:
# Location of the PIQA validation split
val_path = "/kaggle/input/piqa-eval/piqa_validation.json"

val_df = pd.read_json(val_path)

# Fail fast if the columns consumed by the prompt and the accuracy check are absent
required_columns = {"goal", "sol1", "sol2", "label"}
missing_columns = required_columns - set(val_df.columns)
assert not missing_columns, f"validation data is missing columns: {sorted(missing_columns)}"




In [None]:
# Preview the first rows of the validation frame
val_df.head()

# Prompt

In [None]:
# Prompt template; the {goal}, {sol1} and {sol2} placeholders are filled per example
# with str.format. The worked example demonstrates the expected response format, so
# its answer must itself be correct: the jar (Solution 2, i.e. <answer>1</answer>).
prompt = """You are an intelligent assistant that helps evaluate practical solutions to everyday tasks and problems. Your role is to analyze solutions in a given context and determine which one is more effective and practical.

Given a context and two possible solutions, choose the better solution that makes more sense for that context.

Here are some important rules for the task:
- Think carefully and logically about which solution is more appropriate for the given context
- Consider practicality, safety, and effectiveness when evaluating the solutions
- You must explain your reasoning for why your chosen solution is better
- You must answer with <answer>0</answer> if Solution 1 is better, or <answer>1</answer> if Solution 2 is better
- Put your final answer in <answer></answer> tags after your explanation

Here are some examples:
<example>

<question>
Based on the context, which solution is better?
Context: When boiling butter, when it's ready, you can
Solution 1: Pour it onto a plate
Solution 2: Pour it into a jar
</question>

<response>
Solution 2 is better because a jar safely contains the hot, liquid butter and lets you store it, whereas it would spill off a plate.
Final answer: <answer>1</answer>
</response>

</example>

Now, consider the following context and solutions, choose the better solution that makes more sense for that context.
Context: {goal}

Solution 1: {sol1}
Solution 2: {sol2}"""

# Qwen2.5-3B-Instruct

In [None]:
from huggingface_hub import login

# Log in to Hugging Face.
# NOTE(review): login() is called without a token, so it presumably prompts
# interactively and would block an unattended "Run All" - confirm whether
# authentication is actually needed for the model used in this notebook.
login()

In [None]:
# Load the tokenizer and model directly from the Hugging Face Hub
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Run on a single device. The model is deliberately NOT wrapped in
# torch.nn.DataParallel: the wrapper only parallelises forward() and does not
# expose generate(), so model.generate(...) would raise AttributeError. It also
# required exactly two GPUs to be present.
# `device` stays a plain string, which get_llama_response accepts as-is.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def post_process(response):
    """Extract the final answer from the ``<answer>...</answer>`` tags of a response.

    The last tag pair is used because the prompt asks for the final answer
    after the explanation; earlier tags may be ones quoted in the reasoning
    (or echoed instructions) rather than the model's verdict.

    Args:
        response: Text produced by the model. ``None`` or an empty string is
            treated as "no answer".

    Returns:
        The text between the last pair of answer tags with surrounding
        whitespace removed, or ``None`` when no tag pair is present.
    """
    if not response:
        return None

    # DOTALL lets an answer that is wrapped onto its own line still match.
    matches = re.findall(r'<answer>(.*?)</answer>', response, flags=re.DOTALL)
    if not matches:
        return None
    return matches[-1].strip()


def get_llama_response(row):
    """Generate the model's response for a single example.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the
            ``goal``, ``sol1`` and ``sol2`` fields used to fill the prompt.

    Returns:
        The decoded text generated by the model, excluding the prompt.
    """
    # Fill the prompt template with the current example
    formatted_prompt = prompt.format(
        goal=row['goal'],
        sol1=row['sol1'],
        sol2=row['sol2']
    )

    # `device` may be a device string or a list of GPU ids; in the latter case
    # the inputs go to the first listed GPU.
    target_device = device if isinstance(device, str) else f'cuda:{device[0]}'
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(target_device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # For a causal LM, generate() returns the prompt tokens followed by the new
    # tokens. Decode only the continuation: the prompt itself contains
    # "<answer>0</answer>", which would otherwise be picked up as the answer.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True)


# Evaluate

In [None]:
from tqdm import tqdm

# Number of validation rows to score; set to None to evaluate the whole frame.
MAX_EXAMPLES = 5

eval_rows = val_df if MAX_EXAMPLES is None else val_df.head(MAX_EXAMPLES)

res = []
with torch.no_grad():
    for _, row in tqdm(eval_rows.iterrows(), total=len(eval_rows)):
        response = get_llama_response(row)
        res.append(post_process(response))

# Assign through an index-aligned Series: a plain list shorter than the frame
# raises a length-mismatch ValueError. Rows that were not scored stay missing.
val_df['llama_answer'] = pd.Series(res, index=eval_rows.index, dtype=object)




In [None]:
# Show the extracted answers collected by the evaluation loop
res

In [None]:
# Persist the validation frame (including the model answers) without the index column
val_df.to_csv(path_or_buf="llama_eval.csv", index=False)


In [None]:
# Score only the rows that were actually sent to the model: `res` holds one
# entry per evaluated row, taken from the top of the frame.
scored = val_df.iloc[:len(res)]

# The extracted answers are strings ("0"/"1"); convert both sides to numbers
# so they can match the labels. Responses without a parsable answer become
# NaN and therefore count as wrong.
predicted = pd.to_numeric(scored['llama_answer'], errors='coerce')
expected = pd.to_numeric(scored['label'], errors='coerce')

accuracy = (predicted == expected).mean()
print(f"Accuracy: {accuracy:.2f}")

