# In [6]:
from transformers import AutoTokenizer
from vllm import SamplingParams
import string
from vllm import LLM
from datasets import load_dataset
from tqdm import tqdm
import random
from datetime import datetime
import json
import pandas as pd

# Seed the module-level RNG so any later use of `random` is reproducible.
random.seed(42)

# Hugging Face model identifier and the number of records sent per batch.
model_name = "hatakeyama-llm-team/tanuki_inst_0515test"
n_batch = 10000

# Compact local timestamp (YYYYMMDDHHMMSS) used to give each run its own
# output file name.
current_time_no_symbols = datetime.now().strftime("%Y%m%d%H%M%S")
out_path = "data/0520orpo_model/model_" + current_time_no_symbols + ".jsonl"

# Job partitioning: total number of jobs and the index of this one.
n_jobs = 8
job_id = 0

# In [2]:
# Read the source parquet file and turn it into a list of per-row dicts
# (one dict per DataFrame row, keyed by column name).
parquet_path = "data/0520orpo/code_all_10000000000.parquet"
df = pd.read_parquet(parquet_path)
records = df.to_dict(orient="records")

# In [None]:
# Generate a model answer for every record assigned to this job and append the
# results to `out_path` as JSON Lines (one record per line, without "prompt").
from pathlib import Path

print("init models...")
llm = LLM(model=model_name, trust_remote_code=True)

# %%
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Shuffle the records (the global `random` seed makes the order reproducible).
random.shuffle(records)

# Split the records across jobs. Every job gets `job_size` records; the last
# job additionally takes the remainder so that no record is left unassigned.
job_size = len(records) // n_jobs
job_start = job_id * job_size
job_end = len(records) if job_id == n_jobs - 1 else job_start + job_size
records = records[job_start:job_end]

# The sampling configuration is identical for every batch, so build it once.
sampling_params = SamplingParams(
    temperature=0.1,
    max_tokens=512,
)

# Make sure the output directory exists before spending time on generation.
Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Step by `n_batch` so the final, possibly shorter, batch is processed too.
for batch_start in tqdm(range(0, len(records), n_batch)):
    sampled_records = records[batch_start:batch_start + n_batch]

    # Prepare the prompts as token IDs. The last token of each encoding is
    # dropped -- presumably a trailing special token (e.g. EOS) appended by
    # the tokenizer; TODO confirm against the tokenizer configuration.
    prompts = [
        tokenizer.encode(record["prompt"])[:-1]
        for record in sampled_records
    ]

    # Run inference on pre-tokenized prompts rather than raw text.
    outputs = llm.generate(
        prompt_token_ids=prompts,
        sampling_params=sampling_params,
    )
    for record, output in zip(sampled_records, outputs):
        record["model_answer"] = output.outputs[0].text

        # Disabled: if the model answer equals the reference answer,
        # replace it with "分かりません" ("I don't know").
        # if record["model_answer"] == record["answer"]:
        #     record["model_answer"] = "分かりません"

    # Append this batch to the output file. UTF-8 is set explicitly because
    # `ensure_ascii=False` writes non-ASCII characters verbatim.
    with open(out_path, "a", encoding="utf-8") as f:
        for record in sampled_records:
            # The prompt is not part of the saved output.
            record.pop("prompt")
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

