In [15]:
import os 
import pandas as pd 
import re
from tqdm import tqdm
import torch


In [16]:
!pip install --upgrade transformers
!pip install python-dotenv




# Load data

In [4]:
torch.cuda.empty_cache()


In [5]:
val_path = "/kaggle/input/cqa-dataset/dev_rand_split.jsonl"

# Load the JSONL file as a DataFrame
val_df = pd.read_json(val_path, lines=True)


In [6]:
val_df.head()

Unnamed: 0,answerKey,id,question
0,A,1afa02df02c908a558b4036e80242fac,"{'question_concept': 'revolving door', 'choice..."
1,A,a7ab086045575bb497933726e4e6ad28,"{'question_concept': 'people', 'choices': [{'l..."
2,B,b8c0a4703079cf661d7261a60a1bcbff,"{'question_concept': 'magazines', 'choices': [..."
3,A,e68fb2448fd74e402aae9982aa76e527,"{'question_concept': 'hamburger', 'choices': [..."
4,A,2435de612dd69f2012b9e40d6af4ce38,"{'question_concept': 'farmland', 'choices': [{..."


In [7]:
val_df['question'][1]

{'question_concept': 'people',
 'choices': [{'label': 'A', 'text': 'complete job'},
  {'label': 'B', 'text': 'learn from each other'},
  {'label': 'C', 'text': 'kill animals'},
  {'label': 'D', 'text': 'wear hats'},
  {'label': 'E', 'text': 'talk to each other'}],
 'stem': 'What do people aim to do at work?'}

# Prompt

In [8]:
prompt = """
You are a highly intelligent assistant specializing in solving multiple-choice commonsense reasoning questions. Your task is select the most logical and contextually accurate answer.

### Guidelines for the Task:
**Format the Final Answer**:  Format your final answer as `<answer>D</answer>` where "D" is the correct option letter.

### Task Input:
**Question ID**: {id}  
**Concept**: {question_concept}  
**Stem**: {stem}  
**Choices**:  
{formatted_choices}

### Task Output:
**Final Answer**: Format your final answer as `<answer>D</answer>` where "D" is the correct option letter.

Proceed to evaluate the following question:
"""


# Qwen2.5 3B

In [9]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("key_token")


In [10]:
from huggingface_hub import login

# Log in to Hugging Face

login(secret_value_0)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-3B-Instruct")

# Configure the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model = model.to(device)

# Helper functions
def post_process(response):
    import re
    pattern = r'<answer>(.*?)</answer>'
    matches = re.findall(pattern, response)
    if matches:
        return matches[-1]  # Lấy phần tử cuối
        
    return None

def get_qwen_response(row):
    # Define the prompt format (ensure this is set correctly)
    formatted_prompt = prompt.format(
        id=row['id'],
        question_concept=row['question']['question_concept'],
        stem=row['question']['stem'],
        formatted_choices="\n".join(
            [f"{choice['label']}. {choice['text']}" for choice in row['question']['choices']]
        )
    )
    
    # Tokenize and generate response
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=300,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode and return response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Evaluate

In [11]:
val_df['question'].iloc[1]

{'question_concept': 'people',
 'choices': [{'label': 'A', 'text': 'complete job'},
  {'label': 'B', 'text': 'learn from each other'},
  {'label': 'C', 'text': 'kill animals'},
  {'label': 'D', 'text': 'wear hats'},
  {'label': 'E', 'text': 'talk to each other'}],
 'stem': 'What do people aim to do at work?'}

In [12]:
import time  # Đảm bảo module time được import

res = []
for idx, row in val_df.iterrows():
    response = get_qwen_response(row)
    #print(response)
    final_answer = post_process(response)
    #print(idx,final_answer)
    res.append(final_answer) 
    time.sleep(10)
val_df['qwen_answer'] = res


In [13]:
val_df.to_csv("qwen_eval.csv", index=False)


In [14]:
accuracy = (val_df['qwen_answer'] == val_df['answerKey']).mean()
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.58


# Llama

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")

# Configure the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move model to device
model = model.to(device)

# Helper functions
def post_process(response):
    import re
    pattern = r'<answer>(.*?)</answer>'
    match = re.search(pattern, response)
    if match:
        return match.group(1)
    return None

def get_llama_response(row):
    # Define the prompt format (ensure this is set correctly)
    formatted_prompt = prompt.format(
        id=row['id'],
        question_concept=row['question']['question_concept'],
        stem=row['question']['stem'],
        formatted_choices="\n".join(
            [f"{choice['label']}. {choice['text']}" for choice in row['question']['choices']]
        )
    )
    
    # Tokenize and generate response
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode and return response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response


In [None]:

res = []
for idx, row in val_df.iterrows():
    response = get_llama_response(row)
    #print(response)
    final_answer = post_process(response)
    #print(idx,final_answer)
    res.append(final_answer) 

val_df['llma_answer'] = res
accuracy = (val_df['llma_answer'] == val_df['answerKey']).mean()
print(f"Accuracy: {accuracy:.2f}")



# Gemini

In [9]:
import google.generativeai as genai
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Gemini_key")
genai.configure(api_key=secret_value_0)
model = genai.GenerativeModel('gemini-1.5-pro')

In [None]:
import google.generativeai as genai
import pandas as pd
import re
from kaggle_secrets import UserSecretsClient
import time
import logging
from tenacity import retry, stop_after_attempt, wait_exponential, RetryError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configure the Gemini API
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("Gemini_key")
genai.configure(api_key=secret_value_0)
model = genai.GenerativeModel('gemini-1.5-pro') # Or 'gemini-1.5-pro-vision' if you want vision

# Post-process the response
def post_process(response):
    if not response:
        return None
    pattern = r"<answer>(.*?)</answer>"
    match = re.search(pattern, response)
    return match.group(1) if match else None

# Generate response from Gemini with retry and exception handling
@retry(stop=stop_after_attempt(5),
       wait=wait_exponential(multiplier=1, min=1, max=15),
       reraise=True) # reraise to let the outer loop handle failure after all retries
def get_gemini_response(row):
    formatted_prompt = prompt.format(
        id=row['id'],
        question_concept=row['question']['question_concept'],
        stem=row['question']['stem'],
        formatted_choices="\n".join(
            [f"{choice['label']}. {choice['text']}" for choice in row['question']['choices']]
        )
    )
    
    response = model.generate_content(formatted_prompt)
    return response.text
    

import time
# Process rows and collect responses
res = []
for idx in range(3):
    time.sleep(20)
    response = get_gemini_response(val_df.iloc[idx])
    print(response)
    final_answer = post_process(response)
    print(final_answer)
    res.append(final_answer)
    




In [None]:
# Add answers to DataFrame and save results
val_df['gemini_answer'] = res
val_df.to_csv("gemini_eval.csv", index=False)

# Normalize answers for accuracy calculation
val_df['gemini_answer'] = val_df['gemini_answer'].str.strip().str.lower()
val_df['answerKey'] = val_df['answerKey'].str.strip().str.lower()

# Calculate accuracy
accuracy = (val_df['gemini_answer'] == val_df['answerKey']).mean()
print(f"Accuracy: {accuracy:.2f}")