In [None]:
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
import os
import warnings

print('начало')
# Silence the FutureWarning / UserWarning categories for the rest of the run.
for _ignored_category in (FutureWarning, UserWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)
print('начало')

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

# Custom callback that reports training progress once per epoch
class ProgressCallback(TrainerCallback):
    """Drive a single tqdm bar that advances by one tick per finished epoch.

    Args:
        total_epochs: Number of ticks the bar spans (the planned epoch count).
    """

    def __init__(self, total_epochs):
        super().__init__()
        self.progress_bar = None
        self.current_epoch = 0
        self.total_epochs = total_epochs

    def on_train_begin(self, args, state, control, **kwargs):
        """Open a fresh progress bar and restart the epoch counter."""
        # Reset so that reusing the same callback for a second run does not
        # display counts such as "11/10".
        self.current_epoch = 0
        self.progress_bar = tqdm(total=self.total_epochs, desc="Training Progress")

    def on_epoch_end(self, args, state, control, **kwargs):
        """Advance the bar and show the most recent training loss."""
        self.current_epoch += 1
        if self.progress_bar is None:
            # on_train_begin was never called; nothing to draw on.
            return
        self.progress_bar.update(1)
        # The newest log record is not necessarily a training record (records
        # without a 'loss' key also land in log_history), so walk backwards to
        # the latest record that actually carries 'loss'. Falls back to 0 when
        # no such record exists yet.
        loss = next(
            (record['loss'] for record in reversed(state.log_history) if 'loss' in record),
            0,
        )
        self.progress_bar.set_postfix({'Epoch': f"{self.current_epoch}/{self.total_epochs}", 'Loss': loss})

    def on_train_end(self, args, state, control, **kwargs):
        """Close the progress bar if one was opened."""
        # Compare against None explicitly: a tqdm bar's truth value depends on
        # its total, so a truthiness test can skip the close().
        if self.progress_bar is not None:
            self.progress_bar.close()
            self.progress_bar = None

# Load the train/test spreadsheets, optionally keeping only a fraction of rows
def load_data(train_path, test_path, train_frac=0.05, test_frac=0.05):
    """Read the first sheet of both Excel files and optionally subsample them.

    Args:
        train_path: Path of the training workbook.
        test_path: Path of the test workbook.
        train_frac: Fraction of training rows to keep; values below 1.0
            trigger a seeded random sample, anything else keeps the frame
            as read.
        test_frac: Same as ``train_frac`` but for the test workbook.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.
    """
    # Both workbooks are read first, then each frame is sampled in turn.
    frames = [
        pd.read_excel(path, sheet_name=0, engine='openpyxl')
        for path in (train_path, test_path)
    ]
    fractions = (train_frac, test_frac)

    train_df, test_df = (
        frame.sample(frac=share, random_state=42).reset_index(drop=True)
        if share < 1.0
        else frame
        for frame, share in zip(frames, fractions)
    )
    return train_df, test_df

# Dataset that turns (equation, answer) rows into causal-LM training samples
class DifferentialEquationDataset(torch.utils.data.Dataset):
    """Tokenised ``Solve: <equation>\\nAnswer: <answer>`` samples.

    Args:
        tokenizer: Callable tokenizer; it is invoked with ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword
            arguments and must return a mapping containing ``input_ids``.
        df: DataFrame providing the ``equation`` and ``true_answer`` columns.
        max_len: Length passed to the tokenizer as ``max_length`` (with
            ``padding="max_length"`` and ``truncation=True``). The default
            is 512 because that is the length samples were effectively
            built with before this argument was honoured.
    """

    def __init__(self, tokenizer, df, max_len=512):
        self.tokenizer = tokenizer
        self.df = df
        self.max_len = max_len

    def _build_sample(self, eq, ans):
        """Build one sample whose labels only cover the answer tokens.

        Returns:
            The tokenizer output with the batch dimension removed, plus a
            ``labels`` tensor equal to ``input_ids`` where the answer is and
            ``-100`` on padding and prompt positions.
        """
        # For a causal LM the instruction and the answer form one sequence.
        prompt = f"Solve: {eq}\nAnswer:"
        full_text = prompt + " " + str(ans)
        enc = self.tokenizer(
            full_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        # Drop only the batch axis (a bare squeeze() would also collapse a
        # length-1 sequence axis).
        enc = {k: v.squeeze(0) for k, v in enc.items()}

        input_ids = enc["input_ids"]
        labels = input_ids.clone()

        # Never train on padding. When the pad token doubles as EOS the
        # padded tail would otherwise dominate the loss.
        start = 0
        attention = enc.get("attention_mask")
        if attention is not None:
            labels[attention == 0] = -100
            attended = attention.nonzero(as_tuple=True)[0]
            if attended.numel() > 0:
                # With left padding the real tokens start after the pad run.
                start = int(attended[0])

        # Mask the prompt: tokenize it the same way as the full text (so any
        # leading special token is counted) and ignore every position where
        # both token sequences still agree. Stopping at the first mismatch
        # keeps answer tokens trainable even if the tokenizer appends a
        # trailing special token to the stand-alone prompt.
        prompt_ids = self.tokenizer(prompt)["input_ids"]
        window = input_ids[start:start + len(prompt_ids)].tolist()
        shared = 0
        for expected, actual in zip(prompt_ids, window):
            if expected != actual:
                break
            shared += 1
        labels[start:start + shared] = -100

        enc["labels"] = labels
        return enc

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self._build_sample(self.df["equation"].iloc[idx], self.df["true_answer"].iloc[idx])

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,         # ★ use causal‑LM wrapper
    Trainer,
    TrainingArguments,
    TrainerCallback,
)

# 3. Training function --------------------------------------------------------
def train_model(
    train_df,
    val_df,
    model_name="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
    output_dir="./best_model_deepseek_qwen",
    epochs=10,
):
    """Fine-tune a causal LM on the equation/answer pairs and save it.

    Args:
        train_df: DataFrame wrapped in a ``DifferentialEquationDataset`` for
            training.
        val_df: DataFrame wrapped the same way and evaluated every epoch.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        output_dir: Directory for checkpoints, the final model and the
            tokenizer.
        epochs: Number of training epochs; also the length of the progress
            bar. Defaults to 10.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Padding to a fixed length needs a pad token; reuse EOS when the
    # tokenizer does not define one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
    ).to(device)
    # Keep the model config in agreement with the tokenizer's pad token.
    model.config.pad_token_id = tokenizer.pad_token_id

    train_dataset = DifferentialEquationDataset(tokenizer, train_df)
    val_dataset = DifferentialEquationDataset(tokenizer, val_df)

    training_args = TrainingArguments(
        output_dir=output_dir,
        eval_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=1,
        num_train_epochs=epochs,
        weight_decay=0.01,
        # Save once per epoch but keep at most two checkpoints on disk.
        save_total_limit=2,
        save_strategy="epoch",
        # Reload the checkpoint with the lowest evaluation loss at the end.
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False,
        bf16=True,
        optim="adamw_torch_fused",
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        callbacks=[ProgressCallback(total_epochs=epochs)],
    )
    trainer.train()
    # Persist model and tokenizer side by side so output_dir can be loaded
    # with from_pretrained later.
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# 4. Generation / evaluation --------------------------------------------------
def evaluate_model(test_df, model_path="./best_model_deepseek_qwen"):
    """Generate an answer for every test equation with the saved model.

    Args:
        test_df: DataFrame providing the ``equation`` and ``true_answer``
            columns.
        model_path: Directory holding the saved model and tokenizer.

    Returns:
        A DataFrame with the columns ``equation``, ``true_answer`` and
        ``generated_answer``, one row per row of ``test_df``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(
        model_path, torch_dtype=torch.bfloat16
    ).to(device)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.pad_token_id
    model.eval()

    results = []
    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Testing"):
        prompt = f"Solve: {row['equation']}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(device)
        prompt_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=5,
                early_stopping=True,
                # Pass the pad id explicitly instead of relying on the
                # generation defaults.
                pad_token_id=tokenizer.pad_token_id,
            )
        # The returned sequence starts with the prompt tokens. Decode only
        # what follows them, so an "Answer:" inside the equation or inside
        # the generated text cannot change which part is kept.
        new_tokens = outputs[0][prompt_len:]
        generated_answer = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        results.append(
            dict(
                equation=row["equation"],
                true_answer=row["true_answer"],
                generated_answer=generated_answer,
            )
        )
    return pd.DataFrame(results)
        


# Main program body
# Input/output locations
TRAIN_PATH = "./train_x_new.xlsx"
TEST_PATH = "./test_x.xlsx"
OUTPUT_CSV = './test_results__tmodel_fulldata_deepseek_qwen.csv'

TRAIN_FRACTION = 1  # e.g. 0.1 keeps 10% of the training rows
TEST_FRACTION = 1   # e.g. 0.05 keeps 5% of the test rows
VAL_FRACTION = 0.1  # share of the training rows held out for validation

# Load the data
train_df, test_df = load_data(
    TRAIN_PATH,
    TEST_PATH,
    train_frac=TRAIN_FRACTION,
    test_frac=TEST_FRACTION
)

# Build the validation split from the TRAINING rows. The validation loss
# picks the checkpoint that is kept, so drawing it from the test set would
# leak test data into model selection and inflate the reported results.
val_df = train_df.sample(frac=VAL_FRACTION, random_state=42) if len(train_df) > 0 else train_df
# Remove the held-out rows so they are not trained on as well.
train_df = train_df.drop(val_df.index)

# Train, then evaluate on the untouched test set
train_model(train_df, val_df)
print('закончили обучение')
results_df = evaluate_model(test_df)
results_df.to_csv(OUTPUT_CSV, sep=';', index=False)

начало
начало


Training Progress:   0%|          | 0/10 [01:20<?, ?it/s]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Training Progress:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss
