In [1]:
from tqdm import tqdm
from utils import *
import json
from openai import OpenAI
import numpy as np
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Load variables from a local .env file into the process environment.
load_dotenv()
# Client is built with no explicit arguments; presumably it picks up its API key
# from the environment populated above — TODO confirm.
client = OpenAI()
# delete_cache is not defined in this notebook; presumably it comes from
# `from utils import *`. Its captured output reports removing __pycache__ directories.
delete_cache()

Deleting: __pycache__
All __pycache__ directories have been deleted.


In [2]:
# Read the training examples: one JSON object per line of train.jsonl.
items = []
with open("train.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line); json.loads would raise
        # on them. This mirrors how the val.jsonl readers in this notebook behave.
        if not line:
            continue
        items.append(json.loads(line))

In [3]:
# Embed every training example. `embeddings[i]` is the vector for `records[i]`.
embeddings = []
records = []  # keep the original rows so a hit can be traced back to its source

for item in tqdm(items):
    # Flatten line breaks so each example is embedded as one line of text.
    flat_question = " ".join(item["question"].split("\n"))
    flat_think = " ".join(item["think"].split("\n"))

    sample_text = f"問題：{flat_question} 解析：{flat_think} 答案：{item['answer']}"

    # One request per example; the input list holds a single text, so the
    # only returned entry (data[0]) is its embedding.
    reply = client.embeddings.create(model="text-embedding-3-small", input=[sample_text])

    embeddings.append(reply.data[0].embedding)
    records.append(item)

# Stack the per-example vectors into one array for the dot-product scoring in search().
embeddings = np.array(embeddings)


100%|██████████| 99/99 [00:43<00:00,  2.25it/s]


In [None]:
model_id = "Qwen/Qwen3-4B"  # must be downloaded / access granted beforehand
# Number of retrieved training examples per question; passed as top_k to search()
# by the prediction cell.
query_num = 6

In [5]:
# Tokenizer for `model_id`: fast tokenizer disabled, remote code allowed.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, trust_remote_code=True)

# Causal LM weights requested in bfloat16; device_map="auto" leaves device
# placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
def search(query, embeddings, chunks, top_k=5):
    """Return the `top_k` chunks whose embeddings score highest against `query`.

    The query is embedded with the same model used for the corpus
    ("text-embedding-3-small") and each row of `embeddings` is scored by its
    dot product with the query vector. (A dot product ranks like cosine
    similarity only when the vectors are unit-length — verify for the model
    in use.)

    Args:
        query: Query text; newlines are replaced by spaces before embedding.
        embeddings: 2-D array-like, one row per entry of `chunks`.
        chunks: Sequence aligned with `embeddings`; its elements are returned.
        top_k: Maximum number of results. Values <= 0 yield an empty list.

    Returns:
        A list of at most `top_k` elements of `chunks`, best match first.
    """
    # Guard before the API call: with top_k == 0 the slice `[-0:]` below would
    # select EVERY row (and a negative top_k would drop the best ones), and an
    # empty corpus cannot be scored at all.
    if top_k <= 0 or len(chunks) == 0:
        return []

    query = query.replace("\n", " ")
    query_vec = client.embeddings.create(
        model="text-embedding-3-small",
        input=[query]
    ).data[0].embedding
    query_vec = np.array(query_vec)

    scores = np.dot(embeddings, query_vec)
    # argsort is ascending: take the last top_k indices, then reverse so the
    # highest score comes first.
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [chunks[i] for i in top_indices]

In [9]:
# Run the model on every non-blank line of val.jsonl.
# pred_list receives exactly one entry per non-blank line, in file order:
# the final character of the stripped response, or None if the response is empty.
pred_list = []

with open("./val.jsonl", "r", encoding="utf-8") as f:
    for idx, line in enumerate(f):
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)
        question = data["question"]
        # Retrieve the query_num most similar training examples as references.
        results = search(question, embeddings, records, query_num)

        # Flatten newlines BEFORE building the f-strings: a backslash inside an
        # f-string expression is a SyntaxError on Python versions before 3.12.
        context_parts = []
        for r in results:
            ref_question = r["question"].replace("\n", " ")
            ref_think = r["think"].replace("\n", " ")
            ref_answer = r["answer"].replace("\n", " ")
            context_parts.append(
                f"問題：{ref_question} \n"
                f"解析：{ref_think} \n"
                f"答案：{ref_answer} \n"
            )
        context = "\n".join(context_parts)

        prompt = f"""
你是一个香港保险经纪人，你在考试，你需要正确回答考试题目。
你可以参考的信息：
{context}
回答格式举例（请注意，你的回答要由选项字母结尾，不要有多余的话）。
理由：...（简单说一下理由即可）
答案：A/B/C/D（所有题目都是单选题）
对于有一些题目的选项，例如“以上皆正确”或“以上皆不正确”，这种选项较难，但说不定是对的，请深思熟虑后做出选择。
"""
        print(f"prompt如下：{prompt}\n\n")
        messages = [
            {"role": "system", "content": prompt},
            {"role": "user", "content": question}
        ]

        response = predict(messages, model, tokenizer)

        # Strip first so trailing whitespace or a final newline is not recorded
        # as the answer; an empty or whitespace-only response becomes None.
        answer_text = response.strip() if response else ""
        last_char = answer_text[-1] if answer_text else None
        pred_list.append(last_char)
        print(response)
        print(f"{idx+1}: {last_char}")

NameError: name 'query_num' is not defined

In [None]:
# Display the collected predictions: one entry per non-blank line of val.jsonl
# (None where the model response was empty).
pred_list

In [None]:
# Score pred_list against the gold answers in val.jsonl.
correct = 0
total = 0  # number of non-blank lines scored so far

with open("./val.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        data = json.loads(line)
        gold = data["answer"]  # confirmed to be one of A / B / C / D

        # pred_list holds one entry per NON-blank line, so index it by the count
        # of non-blank lines seen so far. The raw line number would drift out of
        # alignment (or run past the end) as soon as the file has a blank line.
        pred = pred_list[total]
        pred = pred.upper() if pred is not None else None

        total += 1
        if pred == gold:
            correct += 1

accuracy = correct / total if total > 0 else 0.0

print(f"Accuracy: {accuracy:.2f} ({correct}/{total})")