In [5]:
# ---------- Imports ----------
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import pandas as pd
import time
import warnings
import json
import sys

# ---------- Model path ----------
# Local directory the tokenizer and the model weights are loaded from.
MODEL_PATH = '/home/skoltsov/qwen_1_5b/qwen_model/'

# ---------- Inference parameters ----------
# Default upper bound on newly generated tokens per prompt.
MAX_NEW_TOKENS = 3000
# Fraction of the test set to evaluate; used as `frac` for DataFrame.sample (1 = every row).
test_percentage = 1
# Number of equations processed per generation call.
BATCH_SIZE = 62

# ---------- Ignore warnings ----------
# `message` is a regular expression matched against the start of the warning
# text, so the trailing "*" is a quantifier on the final "s", not a glob.
warnings.filterwarnings("ignore", message="Current model requires 128 bytes of buffer for offloaded layers*")

# ---------- Speed optimisation ----------
# Let cuDNN benchmark its algorithms and keep the fastest one.
torch.backends.cudnn.benchmark = True
# ---------- GPU memory report ----------
def print_gpu_memory():
    """Print the CUDA memory PyTorch has allocated and reserved, in MB.

    When CUDA is not available a short notice is printed instead.
    """
    if not torch.cuda.is_available():
        print("CUDA –Ω–µ–¥–æ—Å—Ç—É–ø–Ω–∞.")
        return

    bytes_per_mb = 1024 ** 2
    allocated_mb = torch.cuda.memory_allocated() / bytes_per_mb
    print(f"üß† –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: {allocated_mb:.2f} MB")
    reserved_mb = torch.cuda.memory_reserved() / bytes_per_mb
    print(f"üõ° –ó–∞—Ä–µ–∑–µ—Ä–≤–∏—Ä–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: {reserved_mb:.2f} MB")

# ---------- Load tokenizer and model ----------
# Resolve the target device first so that model placement and dtype follow it;
# on a machine without CUDA the model is loaded on the CPU in float32.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    # Pad on the left so that, in a batch, generated tokens directly follow
    # each prompt.
    padding_side="left",
)

# Batched tokenisation with padding=True needs a pad token. Fall back to EOS,
# which is also the pad id handed to generate().
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
    device_map=device.type,
)

# Switch sliding-window attention off when the config exposes this flag.
# NOTE(review): confirm the attribute name matches this model's config class;
# if it does not, the guard silently makes this a no-op.
if hasattr(model.config, "use_sliding_window_attention"):
    model.config.use_sliding_window_attention = False

model.eval()

# ---------- Try to compile the model ----------
# Best effort: when compilation is not supported the uncompiled model is used
# and the reason is printed.
try:
    model = torch.compile(model)
except Exception as e:
    print(f"torch.compile –Ω–µ —Å—Ä–∞–±–æ—Ç–∞–ª: {e}")

# ---------- Load data ----------
test_data = pd.read_excel("/home/skoltsov/qwen_1_5b/data_balansed/test_x_balans.xlsx")
# Shuffle (and optionally subsample) reproducibly, then renumber rows from 0.
test_data = test_data.sample(frac=test_percentage, random_state=42).reset_index(drop=True)
print('üìÑ –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É—Ä–∞–≤–Ω–µ–Ω–∏–π:', len(test_data))

# ---------- Prompt construction ----------
def build_prompts(equations):
    """Return one instruction prompt per equation, in input order.

    Each prompt asks the model to solve the given differential equation and
    to put the final result in LaTeX inside ``\\boxed{}``.
    """
    answer_format = 'It is must to be provided the final decision in LaTeX format, enclosed in \\boxed{}'
    return [
        f'Solve the differential equation: {equation}. ' + answer_format
        for equation in equations
    ]

# ---------- Batched equation solving ----------
def solve_equations_batch(equations_batch, max_new_tokens=MAX_NEW_TOKENS, *, include_prompt=True):
    """Generate model answers for a batch of equations with greedy decoding.

    Args:
        equations_batch: Iterable of equation strings; each one is turned into
            a prompt by ``build_prompts``.
        max_new_tokens: Upper bound on newly generated tokens per prompt.
        include_prompt: When True (the default) every returned text is the
            decoded prompt followed by the model's continuation. When False,
            the prompt tokens are cut off before decoding so only the
            continuation is returned. This is useful because the prompt
            itself contains an empty ``\\boxed{}``.

    Returns:
        A ``(generated_texts, inference_time)`` tuple: the decoded strings,
        one per equation in input order, and the seconds spent inside
        ``model.generate``. An empty batch yields ``([], 0.0)`` without
        calling the tokenizer or the model.
    """
    equations_batch = list(equations_batch)
    if not equations_batch:
        return [], 0.0

    prompts = build_prompts(equations_batch)

    inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # All rows share this padded length, so it marks where new tokens begin.
    prompt_length = inputs["input_ids"].shape[1]

    # perf_counter is monotonic, so the measured duration cannot be distorted
    # by system clock adjustments.
    start_time = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

    inference_time = time.perf_counter() - start_time

    if not include_prompt:
        outputs = outputs[:, prompt_length:]

    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return generated_texts, inference_time

# ---------- Progress bar ----------
def print_progress(current, total, elapsed_time):
    """Redraw a one-line progress bar with elapsed time and ETA on stdout.

    The line starts with a carriage return and has no trailing newline, so
    consecutive calls overwrite each other in a terminal.

    Args:
        current: Number of steps completed so far.
        total: Total number of steps. A value of zero or less is shown as 0%
            instead of raising ``ZeroDivisionError``.
        elapsed_time: Seconds elapsed since the first step started.
    """
    bar_length = 30
    progress = current / total if total > 0 else 0.0
    # Clamp only the drawn bar so it always stays bar_length cells wide; the
    # printed percentage still reflects the raw ratio.
    block = int(bar_length * min(max(progress, 0.0), 1.0))
    time_per_batch = elapsed_time / current if current else 0
    # Never report a negative remaining time when current exceeds total.
    eta = max(time_per_batch * (total - current), 0)

    bar = "‚ñà" * block + "-" * (bar_length - block)
    percent = progress * 100
    sys.stdout.write(f"\rüìà –ü—Ä–æ–≥—Ä–µ—Å—Å: |{bar}| {percent:.2f}% ({current}/{total}) | ‚è≥ –ü—Ä–æ—à–ª–æ: {elapsed_time:.1f}c | ‚è± ETA: {eta:.1f}c")
    sys.stdout.flush()



# Wall-clock start of the whole run; the total is reported at the very end.
t1 = time.time()

# ---------- Main processing loop + result collection ----------
output_filename = "Qwen_1_5b_inference_results_3000tok_balansed.json"
results = []

# Ceiling division: a trailing partial batch still counts as one batch.
num_batches = -(-len(test_data) // BATCH_SIZE)
print(f"üî¢ –û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π: {num_batches}")

start_global_time = time.time()

for batch_idx, start_idx in enumerate(range(0, len(test_data), BATCH_SIZE)):
    end_idx = start_idx + BATCH_SIZE
    batch = test_data.iloc[start_idx:end_idx]
    equations_batch = batch["equation"].tolist()

    solutions, inference_time = solve_equations_batch(equations_batch)

    # Pair each equation with its generated text and copy the optional
    # reference columns from the matching row ('' when a column is absent).
    for i, (eq, sol) in enumerate(zip(equations_batch, solutions)):
        row = batch.iloc[i]
        results.append(dict(
            equation=eq,
            true_answer=row.get('true_answer', ''),
            generated_answer=sol,
            type_eq=row.get('type_eq', ''),
        ))

    # The leading newline moves past the progress line, which ends without one.
    print(f"\n‚è±Ô∏è –í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –±–∞—Ç—á–∞: {inference_time:.2f} —Å–µ–∫—É–Ω–¥")
    print_gpu_memory()
    print("=" * 100)

    # Checkpoint: rewrite the complete result list after every batch so that
    # finished work is already on disk if the run is interrupted.
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    elapsed_global_time = time.time() - start_global_time
    print_progress(batch_idx + 1, num_batches, elapsed_global_time)

print("\n\n‚úÖ –í—Å–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã –≤", output_filename)
t2 = time.time()

print('–í—Ä–µ–º—è —Ä–∞–±–æ—Ç—ã –º–æ–¥–µ–ª–∏: ', t2 - t1)

torch.compile –Ω–µ —Å—Ä–∞–±–æ—Ç–∞–ª: Dynamo is not supported on Python 3.12+
üìÑ –ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —É—Ä–∞–≤–Ω–µ–Ω–∏–π: 1710
üî¢ –û–±—â–µ–µ –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ –±–∞—Ç—á–µ–π: 28

‚è±Ô∏è –í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –±–∞—Ç—á–∞: 196.06 —Å–µ–∫—É–Ω–¥
üß† –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 2953.28 MB
üõ° –ó–∞—Ä–µ–∑–µ—Ä–≤–∏—Ä–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 11962.00 MB
üìà –ü—Ä–æ–≥—Ä–µ—Å—Å: |‚ñà-----------------------------| 3.57% (1/28) | ‚è≥ –ü—Ä–æ—à–ª–æ: 197.0c | ‚è± ETA: 5320.2c
‚è±Ô∏è –í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –±–∞—Ç—á–∞: 195.17 —Å–µ–∫—É–Ω–¥
üß† –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 2953.28 MB
üõ° –ó–∞—Ä–µ–∑–µ—Ä–≤–∏—Ä–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 11962.00 MB
üìà –ü—Ä–æ–≥—Ä–µ—Å—Å: |‚ñà‚ñà----------------------------| 7.14% (2/28) | ‚è≥ –ü—Ä–æ—à–ª–æ: 392.7c | ‚è± ETA: 5104.9c
‚è±Ô∏è –í—Ä–µ–º—è –∏–Ω—Ñ–µ—Ä–µ–Ω—Å–∞ –±–∞—Ç—á–∞: 197.72 —Å–µ–∫—É–Ω–¥
üß† –ò—Å–ø–æ–ª—å–∑–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 2953.28 MB
üõ° –ó–∞—Ä–µ–∑–µ—Ä–≤–∏—Ä–æ–≤–∞–Ω–æ GPU –ø–∞–º—è—Ç–∏: 12534.00 MB