# In [None]:
import os
import re
import pickle
import pandas as pd
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer


def find_max_strict_step(text):
    """Return the largest step number found in ``Step N:`` headers of *text*.

    A header counts only when its colon is immediately followed by a newline
    (hence "strict"); the word "step" is matched case-insensitively and may be
    separated from the number by optional whitespace.

    Returns:
        The highest step number as an ``int``, or ``None`` when *text*
        contains no such header.
    """
    step_header = re.compile(r'step\s*(\d+):\n', re.IGNORECASE)
    highest = None
    for found in step_header.finditer(text):
        number = int(found.group(1))
        if highest is None or number > highest:
            highest = number
    return highest
# System prompt sent as the "system" message of every request: it frames the
# task (fill in one missing step of a solution) and fixes the reply format.
system = """You are a mathematics teacher reviewing a solution that appears to be missing one step. Given the position of the missing step, your task is to fill in the missing step.
The steps in the solution are labeled from Step 0 (problem statement) to Step N.
Please format your response as:
The missing step is:
[Write the complete missing step here with necessary explanations and equations]
"""
# Restrict this process to GPUs 0-3; the count matches tensor_parallel_size=4
# passed to the LLM constructor below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
# Evaluation set; later code reads row["messages"][1]["content"] as the
# step-labelled solution text.
df = pd.read_json("ScaleQM+_test.json")
# NOTE(review): "CoT-Bridge-Random" is presumably a local checkpoint directory
# or model id — confirm it resolves in the run environment.
llm = LLM(model="CoT-Bridge-Random", tensor_parallel_size=4, gpu_memory_utilization=0.85)
# Tokenizer loaded from the same name as the model; in this script it is used
# only to render the chat template into prompt strings.
tokenizer = AutoTokenizer.from_pretrained("CoT-Bridge-Random")
# Build one chat-formatted prompt per candidate gap ("between Step j and
# Step j+1") for every solution in the dataset. `texts` is a flat list: all
# gaps of row 0 first, then all gaps of row 1, and so on.
texts = []
# The system message is identical for every prompt, so build it once.
system_message = {"role": "system", "content": system}
for row_idx, row_messages in enumerate(df["messages"]):
    # Index 1 is presumably the user turn carrying the step-labelled solution
    # (index 0 being the system turn) — verify against the dataset schema.
    solution = row_messages[1]["content"]
    max_step = find_max_strict_step(solution)
    if max_step is None:
        # No "Step N:" header was found. range(0, None) would raise TypeError
        # and abort the whole run, so report the row and move on instead.
        print(f"Skipping row {row_idx}: no step labels found in solution")
        continue
    for j in range(max_step):
        prompt1 = f"""There is a missing step between Step {j} and Step {j+1}.
{solution}
"""
        messages = [
            system_message,
            {"role": "user", "content": prompt1},
        ]
        # Render to a plain string (not token ids) and append the assistant
        # header so the model starts generating its answer directly.
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        texts.append(text)

# Decoding settings: temperature 0, at most 1024 new tokens per prompt,
# special tokens stripped from the decoded text.
sampling_params = SamplingParams(
    temperature=0,
    max_tokens=1024,
    skip_special_tokens=True,
)

# Run the whole prompt list through the model in one batched call.
outputs = llm.generate(texts, sampling_params)

# Keep only the text of the first completion returned for each prompt.
results = [request_output.outputs[0].text for request_output in outputs]

# Persist the generated texts as a pickled list, in the order the outputs
# were returned by llm.generate.
with open('results-sim.pkl', 'wb') as out_file:
    pickle.dump(results, out_file)