1. Pre-process your QA dataset `qa_data.jsonl`, in which each item roughly consists of a question and an answer. Convert it into a new dataset `qa_data_ready.jsonl`, in which each item has only two properties: `"question"` and `"right_answer"`. You can refer to the following implementation (this example also prepends each item's `knowledge` field to its question).


In [None]:
import json
from tqdm import tqdm
# Convert the raw QA file into the two-field format used by the later steps.
# The input is opened first so that a missing input file does not leave an
# empty (truncated) output file behind.
with open('/byllm/qa_data.jsonl','r',encoding='utf-8') as f, \
        open('/byllm/qa_data_ready.jsonl','w',encoding='utf-8') as f2:
    # First pass only counts lines so that tqdm can show a real total.
    total_lines = sum(1 for _ in f)
    f.seek(0)
    for line in tqdm(f, total=total_lines, desc="Processing lines"):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not valid JSON; skip them.
            continue
        item = json.loads(line)
        # The background knowledge is prepended to the question text.
        q = item['knowledge'] + ' so ' + item['question']
        my_dict = {"question": q, "right_answer": item['answer']}
        f2.write(json.dumps(my_dict) + '\n')

2. Load the model you want to evaluate. Here I load `Qwen1.5-0.5B-Chat`, and then define `batch_inference`.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint (hub id or local directory) of the model under evaluation.
modelpath = "Qwen1.5-0.5B-Chat"

# Left padding keeps every prompt flush against the tokens generated after it
# when several prompts of different lengths are batched together.
tokenizer = AutoTokenizer.from_pretrained(
    modelpath,
    padding_side="left",
    trust_remote_code=True,
)

# dtype and device placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    torch_dtype="auto",
)
# Evaluation only: switch the module to inference mode.
model = model.eval()
def batch_inference(prompts: list[str]) -> list[str]:
    """Generate one chat reply per prompt in a single batched forward pass.

    Each prompt is wrapped in the model's chat template (system + user turn),
    the batch is padded and tokenized, and up to 512 new tokens are generated
    per prompt.

    Args:
        prompts: user messages to answer.

    Returns:
        The decoded replies, in the same order as ``prompts``. An empty input
        list yields an empty list.
    """
    if not prompts:
        # The tokenizer cannot build a batch out of nothing.
        return []

    texts = []
    for prompt in prompts:
        messages = [
            # Some models allow the system prompt to be omitted.
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        texts.append(text)

    # Send the inputs to the device the model actually lives on instead of a
    # hard-coded 'cuda': the model is loaded with device_map="auto", so it may
    # sit on CPU or on a specific GPU.
    model_inputs = tokenizer(texts, return_tensors="pt", padding=True).to(model.device)

    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=512,
    )
    # generate() returns prompt + continuation; drop the prompt part of each row.
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return responses


3. Now let your model answer the questions in `qa_data_ready.jsonl` in batches. The output is `qa_data_answer.jsonl`, which has the properties `"question"`, `"your_answer"` (generated by your model) and `"right_answer"`. Adjust `batch_size` to fit your GPU resources: the larger it is, the faster the run.


In [None]:
from torch.utils.data import DataLoader
import json
from tqdm import tqdm
from datasets import load_dataset

# Adjust batch_size to the GPU resources you have; the larger, the faster.
batch_size = 50

eval_dataset = load_dataset("/byllm", data_files="/byllm/qa_data_ready.jsonl", split="train")
# Evaluation gains nothing from shuffling; keeping the file order makes the
# output reproducible and easy to compare line by line with the input.
data_loader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False, num_workers=8)
with open('/byllm/qa_data_answer.jsonl', 'w', encoding='utf-8') as f2:
    for batch in tqdm(data_loader, total=len(data_loader)):
        # One generation call per batch of questions.
        ans = batch_inference(batch['question'])
        for q, a, ra in zip(batch['question'], ans, batch['right_answer']):
            my_dict = {"question": q, "your_answer": a, "right_answer": ra}
            f2.write(json.dumps(my_dict) + '\n')





4. Now call an online LLM to judge your model's answers in `qa_data_answer.jsonl` against the right answers. If the online LLM considers your answer right, `"label"` is set to 1, otherwise 0. The output is `qa_data_answer_judge.jsonl`, which has the properties `"question"`, `"your_answer"`, `"right_answer"`, `"label"` and `"response"`. The online LLM used here is [DeepSeek (深度求索)](https://platform.deepseek.com/api_keys). Click the link to apply for your API key, and replace the `api_key` below.

In [None]:
from openai import OpenAI
from tqdm import tqdm
import json

import os

# Supply your API key through the DEEPSEEK_API_KEY environment variable so the
# secret does not have to be stored in the source. If the variable is not set,
# the placeholder below is used; replace it with your own key in that case.
api_key = os.environ.get("DEEPSEEK_API_KEY", 'sk-af2903a7da03f06dddbnwaubda')

# DeepSeek is reached through the OpenAI client pointed at its base URL.
client = OpenAI(
    api_key=api_key, base_url='https://api.deepseek.com'
)


def func(s):
    """Send the string ``s`` as a user message to the online LLM and return its reply text."""
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": s},
    ]
    # Single, non-streamed completion capped at 100 tokens.
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        max_tokens=100,
        temperature=0.7,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


def llm_judge(answer_file, judge_func):
    """Grade every answer in ``answer_file`` with an LLM judge and save the verdicts.

    For each input line a prompt containing the question, the model's answer
    and the reference answer is passed to ``judge_func``. The reply counts as
    correct when 'yes' appears within its first five characters (leading
    whitespace ignored, case-insensitive). Every judged item is written to
    ``<answer_file without .jsonl>_judge.jsonl`` with the extra fields
    "label" (1 or 0) and "response".

    Args:
        answer_file: path to a JSONL file; each line has "question",
            "your_answer" and "right_answer".
        judge_func: callable that takes the prompt string and returns the
            judge's reply string.

    Returns:
        A tuple ``(right_a, all_q, accuracy)``: number of answers judged
        correct, number of items actually judged, and their ratio. The ratio
        is 0.0 when no item could be judged.
    """
    # "<name>.jsonl" -> "<name>_judge.jsonl"; a path without the ".jsonl"
    # suffix simply gets "_judge.jsonl" appended instead of losing characters.
    judge_file = answer_file.removesuffix(".jsonl") + "_judge.jsonl"
    all_q = 0
    right_a = 0
    # NOTE: the backslash continuations are inside the string literal, so the
    # indentation of the continued lines is part of the prompt text.
    template = "Now I give you one question and two answers to it. One of the answers is student's answer, another is the right answer. Please based on the given right answer, judge if the student's answer get\
        it right. If the student get it right, please respond with a 'yes' and reasons, otherwise with a 'no' and reasons.\n Here is the question:{question}.\n \
            Student's answer: {your_answer}. \n Right answer: {right_answer}. "

    # The input is opened first so a missing answer file does not leave an
    # empty judge file behind.
    with open(answer_file, "r", encoding="utf-8") as f, open(judge_file, "w", encoding="utf-8") as f2:
        # First pass only counts lines so that tqdm can show a real total.
        total_lines = sum(1 for _ in f)
        f.seek(0)
        for line in tqdm(f, total=total_lines, desc="Processing lines"):
            line = line.strip()
            if not line:
                # Blank lines are not valid JSON; skip them.
                continue
            item = json.loads(line)
            pro = template.format(
                question=item["question"],
                your_answer=item["your_answer"],
                right_answer=item["right_answer"],
            )
            try:
                response = judge_func(pro)
            except Exception as exc:
                # Best effort: abandon this item if the judge rejects or fails
                # the request, but report it so the drop is visible.
                tqdm.write(f"Skipping one item, judge call failed: {exc!r}")
                continue

            # A judge may return None; treat that as an empty (non-'yes') reply.
            verdict = (response or "").lstrip().lower()
            # Count as a yes only if the first 5 chars include 'yes'.
            label = 1 if "yes" in verdict[:5] else 0
            right_a += label

            result = {
                "question": item["question"],
                "your_answer": item["your_answer"],
                "right_answer": item["right_answer"],
                "label": label,
                "response": response,
            }
            f2.write(json.dumps(result) + "\n")
            all_q += 1

    # Avoid ZeroDivisionError when the file is empty or every item was skipped.
    accuracy = right_a / all_q if all_q else 0.0
    return right_a, all_q, accuracy


# Read the answers from the same absolute path the answering step wrote to
# ('/byllm/...'); the relative 'byllm/...' only resolves when the working
# directory happens to be '/'.
right_a, all_q, accuracy = llm_judge(
    "/byllm/qa_data_answer.jsonl", func
)


print(f"right answers ={right_a}, all = {all_q}, accuracy ={accuracy} ")


A simple evaluation of Qwen1.5-0.5B-Chat is done!