### 初始化全局变量，导入包

In [25]:
import os
import sys
from model import call_huoshan,call_openai
import pandas as pd
# When run as a script (``__file__`` is defined), make the script's own folder
# the working directory so the relative data paths below resolve against it.
if "__file__" in globals():
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Dataset locations, all relative to the working directory.
raw_data_path = "raw_data"
scienceQA_path, sciKnowEval_path = (
    os.path.join(raw_data_path, dataset_dir)
    for dataset_dir in ("ScienceQA", "SciKnowEval")
)

### 查看并读取sciQA数据

In [26]:
# filepath: /u01/mengpengyu/dataProcess/sciknowevalProcess.ipynb

# The three ScienceQA parquet splits (test / train / validation).
sciQA_path = [
    os.path.join(scienceQA_path, parquet_name)
    for parquet_name in (
        "test-00000-of-00001-f0e719df791966ff.parquet",
        "train-00000-of-00001-1028f23e353fbe3e.parquet",
        "validation-00000-of-00001-6c7328ff6c84284c.parquet",
    )
]

# Load every split and stack them into one frame with a fresh 0..n-1 index.
all_dfs = [pd.read_parquet(split_path) for split_path in sciQA_path]
sciQA_data = pd.concat(all_dfs, ignore_index=True)


In [27]:

# Directory that receives the image files written by save_image.
image_output_dir = os.path.join(raw_data_path, "ScienceQA", "images")
os.makedirs(image_output_dir, exist_ok=True) # create the folder if it does not exist yet

def save_image(row, id):
    """Write the image bytes embedded in ``row`` to ``image_output_dir``.

    Nothing happens unless ``row["image"]`` is a dict with a non-empty
    ``"bytes"`` entry. The file is named ``image_<id>.png``; the ``.png``
    extension is assumed, not detected from the data. Write failures are
    reported with ``print`` and otherwise ignored (best effort).

    Args:
        row: Mapping-like record with an ``"image"`` entry.
        id: Identifier used to build the output file name.
    """
    image = row["image"]
    # Guard: only dict payloads carrying a "bytes" key can be saved.
    if not isinstance(image, dict) or "bytes" not in image:
        return

    payload = image["bytes"]
    if not payload:  # skip empty / missing byte data
        return

    image_filepath = os.path.join(image_output_dir, f"image_{id}.png")
    try:
        # "wb": the payload is raw bytes, so write in binary mode.
        with open(image_filepath, "wb") as sink:
            sink.write(payload)
    except Exception as e:
        print(f"Error saving image {image_filepath}: {e}")

#### 根据一个问题，以及不同的文件类型，构建传给模型的最终prompt

In [28]:


def sciQA_build_prompt(row):
    """Build the multiple-choice prompt sent to the model for one row.

    The prompt asks the model to answer with only the 0-based index of the
    chosen option.

    Args:
        row: Mapping-like record providing ``"question"`` and ``"choices"``.
            Both are interpolated with their default string formatting.

    Returns:
        The prompt text.
    """
    question = row["question"]
    choices = row["choices"]
    # Fix: the placeholders used to be written as ``f{question}`` and
    # ``f{choices}``, which put a stray literal "f" in front of the question
    # and the options in every prompt.
    prompt=f"""
Review the question and the list of options below.
Select the option that best answers the question.
Respond with ONLY the 0-based index of your chosen option.
Your entire response must be a single integer (e.g., 0 for the first option, 1 for the second, etc.). Do not include any other text or explanations.

Question:
{question}

Options:
{choices}
0-based index:
    """
    return prompt

In [29]:
import hashlib

def generate_md5(input_string):
    """Return the hexadecimal MD5 digest of ``input_string``.

    The string is encoded as UTF-8 first because hashlib operates on bytes.
    """
    return hashlib.md5(input_string.encode("utf-8")).hexdigest()


#### 都是选择题，直接使用规则比对

In [30]:
def sciQA_rule_verifier(question, groundtruth, model_content):
    """Rule-based check: is the model answer exactly equal to the ground truth?

    Args:
        question: Accepted for signature compatibility; not used in the check.
        groundtruth: Expected answer.
        model_content: Answer produced by the model.

    Returns:
        True when the two values compare equal, False otherwise.
    """
    return bool(groundtruth == model_content)

#### 处理sciQA数据：生成generation，调用verifier，整合成符合要求的最终dict格式

In [31]:
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import numpy as np
# Rebinds the name `Lock` from the imported class to one shared lock instance;
# sciKnowEval_process_row acquires it around each append to res_list.
Lock = Lock()


def sciKnowEval_process_row(row, num_generations=1):
    """Convert one dataset row into a result record and append it to ``res_list``.

    Steps:
      1. Build the prompt and derive the record id as the MD5 of the prompt.
      2. If the row carries image bytes, save them to disk and do not call
         the model. If the row has no image, query the model
         ``num_generations`` times and grade each answer with the rule
         verifier.
      3. Assemble the output dict (metadata, ground truth, generations,
         solve rate) and append it to the module-level ``res_list`` while
         holding the shared lock.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with at least the
            ``"answer"``, ``"image"``, ``"question"`` and ``"choices"``
            entries. The row is not modified.
        num_generations: Number of model calls for image-free rows.
            Defaults to 1.

    Returns:
        None. The result is delivered through ``res_list``.
    """
    groundtruth = str(row["answer"])

    prompt = sciQA_build_prompt(row)
    # The MD5 of the prompt serves both as record id and as image file stem.
    record_id = generate_md5(prompt)

    image = row["image"]
    has_image_bytes = isinstance(image, dict) and "bytes" in image

    generations = []
    if has_image_bytes:
        save_image(row, record_id)
    elif image is None:
        # No image: let the model answer and grade each answer by rule.
        for _ in range(num_generations):
            reasoning_content, answer_content = call_huoshan(prompt, "r1")
            answer_content = answer_content.strip()
            correctness = sciQA_rule_verifier(prompt, groundtruth, answer_content)
            generations.append({
                "model": "DeepSeek-R1",
                "reasoning_content": reasoning_content,
                "answer_content": answer_content,
                "evaluation": {
                    "correctness": correctness,
                    "By": "mengpengyu",
                    "Method": "Rule",
                    "extra_tags": [],
                },
            })

    task_type = "multiple_choice_single"

    # Any non-None image value is represented by its file name in the output,
    # so raw image bytes never end up in the JSON metadata.
    image_filename = f"image_{record_id}.png" if image is not None else None

    # Convert NumPy containers/scalars to plain Python so json can serialize them.
    cleaned_metadata = {}
    for key, value in row.items():
        if isinstance(value, np.ndarray):
            cleaned_metadata[key] = value.tolist()
        elif isinstance(value, np.generic):  # NumPy scalars such as np.int64
            cleaned_metadata[key] = value.item()
        else:
            cleaned_metadata[key] = value
    if image_filename is not None:
        # Overwrite in place (key position is kept) instead of mutating ``row``.
        cleaned_metadata["image"] = image_filename

    res_dict = {
        "id": record_id,
        "metadata": cleaned_metadata,
        # NOTE(review): this label says SciKnowEval, but the image path below
        # points at ScienceQA — confirm which source dataset name is intended.
        "source_dataset": "hicai-zju/SciKnowEval",
        "task_type": task_type,
        "languages": "en",
    }
    if image_filename is not None:
        res_dict["multimedia"] = [{
            "type": "image",
            "content": f"ScienceQA/images/{image_filename}"
        }]
    res_dict["question"] = prompt
    res_dict["ground_truth"] = {
        "final_answer": groundtruth,
        "unit": None,
        "solution": None,
        "extra_tags": []
    }
    res_dict["generations"] = generations
    if generations:
        solved = sum(1 for gen in generations if gen["evaluation"]["correctness"])
        res_dict["solve_rate"] = solved / len(generations)
    else:
        # No model call was made, so there is no rate to report.
        res_dict["solve_rate"] = None
    res_dict["prompted_for_correct_answer"] = False

    # Appends are serialized through the shared lock.
    with Lock:
        res_list.append(res_dict)
    


In [33]:
import traceback
# Shared accumulator filled by sciKnowEval_process_row.
res_list = []
with ThreadPoolExecutor(max_workers=100) as executor:
    # Map every submitted future back to its row index for error reporting.
    futures = {}
    for index, row in sciQA_data.iloc[:50].iterrows():
        futures[executor.submit(sciKnowEval_process_row, row)] = index

    counter = 0
    for future in as_completed(futures):
        index = futures[future]
        try:
            future.result()  # re-raises any exception raised inside the worker
            counter += 1
            if counter % 10 == 0:
                print(f"Processed {counter} rows.")
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            traceback.print_exc()

# Write the collected results to a JSON file.
output_file = os.path.join(raw_data_path, "ScienceQA_processed.json")
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(res_list, f, ensure_ascii=False, indent=4)


Processed 10 rows.
Processed 20 rows.
Processed 30 rows.
Processed 40 rows.
Processed 50 rows.
