# Обучение модели

In [1]:
import sys
import os

PROJECT_ROOT = os.path.dirname(os.path.abspath(os.getcwd()))

if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [2]:
import os
os.environ['TRANSFORMERS_OFFLINE'] = '1'
os.environ['HF_DATASETS_OFFLINE'] = '1'

In [3]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
from src.model import run_experiment

In [5]:
import gc
import torch

def cleanup():
    if 'clara_model' in globals():
        del clara_model
    if 'peft_model' in globals():
        del peft_model
    if 'base_model' in globals():
        del base_model
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

cleanup()

In [6]:
model, tokenizer = run_experiment()

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.52it/s]
bitsandbytes library load error: Configured CUDA binary not found at /home/jovyan/.conda/envs/byakubson-nlp-absa/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
Traceback (most recent call last):
  File "/home/jovyan/.conda/envs/byakubson-nlp-absa/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 320, in <module>
    lib = get_native_library()
          ^^^^^^^^^^^^^^^^^^^^
  File "/home/jovyan/.conda/envs/byakubson-nlp-absa/lib/python3.11/site-packages/bitsandbytes/cextension.py", line 288, in get_native_library
    raise RuntimeError(f"Configured {BNB_BACKEND} binary not found at {cuda_binary_path}")
RuntimeError: Configured CUDA binary not found at /home/jovyan/.conda/envs/byakubson-nlp-absa/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda121.so
Epoch 0: 100%|██████████| 4945/4945 [1:14:41<00:00,  1.10it/s, L=0.083


Running validation for epoch 0...





Val Loss: 1.2124, Val ABSA: 0.5186




Epoch 0: checkpoint is saved in checkpoints/clara_epoch_0


Epoch 1: 100%|██████████| 4945/4945 [1:14:43<00:00,  1.10it/s, L=0.855, A=0.68, B=0.46] 


Running validation for epoch 1...





Val Loss: 0.8852, Val ABSA: 0.4671
Epoch 1: checkpoint is saved in checkpoints/clara_epoch_1


Epoch 2: 100%|██████████| 4945/4945 [1:14:43<00:00,  1.10it/s, L=0.238, A=0.52, B=0.64] 


Running validation for epoch 2...





Val Loss: 0.8079, Val ABSA: 0.4671
Epoch 2: checkpoint is saved in checkpoints/clara_epoch_2


Epoch 3: 100%|██████████| 4945/4945 [1:14:39<00:00,  1.10it/s, L=0.443, A=0.36, B=0.82] 


Running validation for epoch 3...





Val Loss: 0.7310, Val ABSA: 0.4696
Epoch 3: checkpoint is saved in checkpoints/clara_epoch_3


Epoch 4: 100%|██████████| 4945/4945 [1:14:59<00:00,  1.10it/s, L=0.475, A=0.2, B=1]     


Running validation for epoch 4...





Val Loss: 0.6533, Val ABSA: 0.4601
Epoch 4: checkpoint is saved in checkpoints/clara_epoch_4
Logs are saved locally in: research_logs/Phi3.5-MAMS-Joint-Training_1766876526.csv


# Оценка метрик модели

In [12]:
import torch
import numpy as np
from tqdm.auto import tqdm

@torch.no_grad()
def evaluate_metrics(clara_model, dataloader, device, name="Test"):
    clara_model.eval()
    all_preds = []
    all_labels = []
    
    mapping = {"positive": 0, "negative": 1, "neutral": 2}
    rev_mapping = {0: "positive", 1: "negative", 2: "neutral"}
    
    print(f"--- Evaluating {name} (Manual Calculation) ---")
    
    for batch in tqdm(dataloader):
        task_types = batch['task']
        input_batch = {k: v.to(device) for k, v in batch.items() if isinstance(v, torch.Tensor)}
        
        outputs = clara_model(**input_batch, task=task_types)
        logits = outputs["logits"]
        labels = input_batch["labels"]
        
        for i in range(len(task_types)):
            if task_types[i] == "reason":
                target_mask = (labels[i] != -100)
                if not target_mask.any(): continue
                
                target_logits = logits[i][target_mask]
                preds = torch.argmax(target_logits, dim=-1)
                
                pred_text = clara_model.tokenizer.decode(preds, skip_special_tokens=True).strip().lower()
                label_text = clara_model.tokenizer.decode(labels[i][target_mask], skip_special_tokens=True).strip().lower()
                
                p_id = mapping.get(pred_text, -1)
                l_id = mapping.get(label_text, -1)
                
                if l_id != -1: # Считаем только если золотая метка корректна
                    all_preds.append(p_id)
                    all_labels.append(l_id)


    y_true = np.array(all_labels)
    y_pred = np.array(all_preds)
    
    # 1. Accuracy
    accuracy = np.mean(y_true == y_pred)
    
    # 2. F1-Score per class
    f1_scores = []
    class_stats = {}
    
    for class_id in [0, 1, 2]:
        tp = np.sum((y_true == class_id) & (y_pred == class_id))
        fp = np.sum((y_true != class_id) & (y_pred == class_id))
        fn = np.sum((y_true == class_id) & (y_pred != class_id))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        f1_scores.append(f1)
        class_stats[rev_mapping[class_id]] = {"P": precision, "R": recall, "F1": f1, "Support": np.sum(y_true == class_id)}

    # 3. Macro F1
    macro_f1 = np.mean(f1_scores)

    # --- ПЕЧАТЬ ОТЧЕТА ---
    print(f"\nFinal Results for {name}:")
    print(f"{'='*40}")
    print(f"Accuracy: {accuracy:.44f}")
    print(f"Macro F1: {macro_f1:.44f}")
    print(f"{'='*40}")
    print(f"{'Class':<12} | {'P':<6} | {'R':<6} | {'F1':<6} | {'Support'}")
    print(f"{'-'*45}")
    for cls, s in class_stats.items():
        print(f"{cls:<12} | {s['P']:<6.3f} | {s['R']:<6.3f} | {s['F1']:<6.3f} | {s['Support']}")
    
    clara_model.train()
    return {"acc": accuracy, "f1": macro_f1}

In [17]:
@torch.no_grad()
def evaluate_metrics_pure_python(clara_model, dataloader, device, name="Test"):
    clara_model.eval()
    all_preds = []
    all_labels = []
    
    mapping = {"positive": 0, "negative": 1, "neutral": 2}
    rev_mapping = {0: "positive", 1: "negative", 2: "neutral"}
    
    print(f"--- Evaluating {name} (Generation Mode) ---")
    debug_count = 0
    
    for batch in tqdm(dataloader):
        task_types = batch['task']
        # Нам нужны только задачи 'reason'
        indices_to_process = [i for i, t in enumerate(task_types) if t == "reason"]
        if not indices_to_process: continue

        # Перенос на GPU
        enc_ids = batch["enc_input_ids"].to(device)
        enc_mask = batch["enc_mask"].to(device)
        dec_ids = batch["dec_input_ids"].to(device)
        labels = batch["labels"].to(device)

        # 1. Получаем векторы памяти из энкодера
        memory_states, _ = clara_model.get_encoder_memory_states(enc_ids, enc_mask)

        # 2. Готовим вход для генерации (только промпт без ответов)
        # Нам нужно отрезать от dec_ids ту часть, где начинаются лейблы (ответы)
        for i in indices_to_process:
            # Находим, где начинается ответ (первый индекс, где label != -100)
            label_mask = (labels[i] != -100)
            if not label_mask.any(): continue
            
            # Индекс начала ответа
            start_idx = label_mask.nonzero()[0].item()
            
            # Промпт для генерации: Memory Tokens + Текст промпта
            # Берем из dec_ids всё ДО начала ответа
            prompt_ids = dec_ids[i:i+1, clara_model.num_mem_tokens : start_idx]
            prompt_embeds = clara_model.model.get_input_embeddings()(prompt_ids)
            
            # Конкатенируем с памятью конкретно этого примера
            # memory_states[i:i+1] -> [1, num_mem, D]
            current_mem = memory_states[i:i+1]
            inputs_embeds = torch.cat([current_mem, prompt_embeds], dim=1)

            # 3. ГЕНЕРАЦИЯ (модель сама решит, какой токен выдать)
            gen_outputs = clara_model.model.generate(
                inputs_embeds=inputs_embeds,
                max_new_tokens=3, # "positive" это 1-2 токена
                pad_token_id=clara_model.tokenizer.pad_token_id,
                eos_token_id=clara_model.tokenizer.eos_token_id,
                do_sample=False # Жадный поиск для воспроизводимости
            )
            
            # Декодируем
            pred_text = clara_model.tokenizer.decode(gen_outputs[0], skip_special_tokens=True).strip().lower()
            
            # Берем золотую метку
            label_ids = labels[i][label_mask]
            label_text = clara_model.tokenizer.decode(label_ids, skip_special_tokens=True).strip().lower()

            # Сопоставляем
            p_id = -1
            for word, idx in mapping.items():
                if word in pred_text:
                    p_id = idx
                    break
            
            l_id = mapping.get(label_text, -1)

            if debug_count < 5:
                print(f"DEBUG | Gold: '{label_text}' | Pred: '{pred_text}' (ID: {p_id})")
                debug_count += 1

            if l_id != -1:
                all_preds.append(p_id)
                all_labels.append(l_id)

    # --- МАТЕМАТИКА (Accuracy / F1) ---
    y_true = np.array(all_labels)
    y_pred = np.array(all_preds)
    
    accuracy = np.mean(y_true == y_pred)
    f1_scores = []
    for cid in [0, 1, 2]:
        tp = np.sum((y_true == cid) & (y_pred == cid))
        fp = np.sum((y_true != cid) & (y_pred == cid))
        fn = np.sum((y_true == cid) & (y_pred != cid))
        p = tp / (tp + fp) if (tp + fp) > 0 else 0
        r = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_scores.append(2 * (p * r) / (p + r) if (p + r) > 0 else 0)

    print(f"\nFinal Results: Acc: {accuracy:.4f} | Macro F1: {np.mean(f1_scores):.4f}")
    return {"acc": accuracy, "f1": np.mean(f1_scores)}

In [18]:
from configs.base_config import ClaraConfig
from src.data_utils import MamsClaraDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
config = ClaraConfig() 



full_test_ds = MamsClaraDataset(
    "../data/test.xml", 
    tokenizer, 
    num_mem_tokens=config.num_mem_tokens, 
    max_enc_len=config.max_enc_len, 
    max_dec_len=config.max_dec_len
)

reason_only_samples = [s for s in full_test_ds.samples if s['task'] == "reason"]
full_test_ds.samples = reason_only_samples

print(f"Number of test reasoning examples for ABSA: {len(full_test_ds)}")

test_loader = torch.utils.data.DataLoader(
    full_test_ds,  
    batch_size=config.val_batch_size, 
    shuffle=False, 
    num_workers=4
)

test_results = evaluate_metrics_pure_python(model, test_loader, device, name="MAMS Test Set (Reasoning Only)")

# train_ds = MamsClaraDataset("data/train.xml", tokenizer, ...)
# train_loader = DataLoader(train_ds, batch_size=config.val_batch_size, shuffle=False)
# train_results = evaluate_metrics(clara_model, train_loader, device, name="MAMS Train Set")

Number of test reasoning examples for ABSA: 1336
--- Evaluating MAMS Test Set (Reasoning Only) (Generation Mode) ---


  0%|          | 0/84 [00:00<?, ?it/s]The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


DEBUG | Gold: 'neutral' | Pred: 'positive' (ID: 0)
DEBUG | Gold: 'positive' | Pred: 'positive' (ID: 0)
DEBUG | Gold: 'positive' | Pred: 'neutral' (ID: 2)
DEBUG | Gold: 'neutral' | Pred: 'negative' (ID: 1)
DEBUG | Gold: 'negative' | Pred: 'negative' (ID: 1)


100%|██████████| 84/84 [01:51<00:00,  1.33s/it]


Final Results: Acc: 0.6774 | Macro F1: 0.6730





In [19]:
full_train_ds = MamsClaraDataset(
    "../data/train.xml", 
    tokenizer, 
    num_mem_tokens=config.num_mem_tokens, 
    max_enc_len=config.max_enc_len, 
    max_dec_len=config.max_dec_len
)

train_reason_only_samples = [s for s in full_train_ds.samples if s['task'] == "reason"]
full_train_ds.samples = train_reason_only_samples

print(f"Number of train reasoning examples for ABSA: {len(full_test_ds)}")

train_loader = torch.utils.data.DataLoader(
    full_train_ds,  
    batch_size=config.val_batch_size, 
    shuffle=False, 
    num_workers=4
)

train_results = evaluate_metrics_pure_python(model, train_loader, device, name="MAMS Test Set (Reasoning Only)")


Number of train reasoning examples for ABSA: 1336
--- Evaluating MAMS Test Set (Reasoning Only) (Generation Mode) ---


  0%|          | 0/700 [00:00<?, ?it/s]

DEBUG | Gold: 'negative' | Pred: 'negative' (ID: 1)
DEBUG | Gold: 'positive' | Pred: 'positive' (ID: 0)
DEBUG | Gold: 'positive' | Pred: 'negative' (ID: 1)
DEBUG | Gold: 'neutral' | Pred: 'neutral' (ID: 2)
DEBUG | Gold: 'negative' | Pred: 'negative' (ID: 1)


100%|██████████| 700/700 [15:32<00:00,  1.33s/it]


Final Results: Acc: 0.7615 | Macro F1: 0.7600





# Тестирование инференса

In [56]:
@torch.no_grad()
def predict_absa_v4(clara_model, text, aspect, device):
    clara_model.eval()
    tokenizer = clara_model.tokenizer
    
    # 1. ENCODER: Сжимаем текст
    # Важно: используем ту же строку, что в Dataset
    mem_tokens_str = " ".join([f"[M{i}]" for i in range(clara_model.num_mem_tokens)])
    enc_text = f"{text} {mem_tokens_str}"
    enc_res = tokenizer(enc_text, return_tensors="pt", padding=True).to(device)
    
    memory_states, _ = clara_model.get_encoder_memory_states(enc_res.input_ids, enc_res.attention_mask)
    
    # 2. DECODER: Повторяем логику подготовки dec_input_ids из Dataset
    # Строка: "[M0] [M1] ... [M7] Sentiment of {aspect}?"
    dec_prompt_text = f"{mem_tokens_str} Sentiment of {aspect}?"
    
    # tokenizer добавит BOS автоматически, как и при обучении
    dec_res = tokenizer(dec_prompt_text, return_tensors="pt").to(device)
    dec_input_ids = dec_res.input_ids
    
    # ПОВТОРЯЕМ СРЕЗ ИЗ ТВОЕГО FORWARD:
    # Отрезаем первые num_mem_tokens (т.е. BOS и M0-M6)
    rest_of_dec_ids = dec_input_ids[:, clara_model.num_mem_tokens:]
    rest_of_dec_embeds = clara_model.model.get_input_embeddings()(rest_of_dec_ids)
    
    # СКЛЕИВАЕМ (как в твоем forward)
    # 8 векторов из энкодера + остаток (начиная с M7 и промпта)
    inputs_embeds = torch.cat([memory_states, rest_of_dec_embeds], dim=1)
    
    # Маска внимания
    attention_mask = torch.ones(inputs_embeds.shape[:2], dtype=torch.long, device=device)
    
    # 3. GENERATION
    outputs = clara_model.model.generate(
        inputs_embeds=inputs_embeds,
        attention_mask=attention_mask,
        max_new_tokens=3,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False
    )
    
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True).strip().lower()
    return prediction

In [57]:
text = "The pizza was absolutely delicious and hot, but the manager was incredibly rude even though the waitstaff tried their best to be helpful."

print(f"Pizza: {predict_absa_v4(model, text, 'pizza', device)}")
print(f"Manager: {predict_absa_v4(model, text, 'manager', device)}")
print(f"Waitstaff: {predict_absa_v4(model, text, 'waitstaff', device)}")

Pizza: positive
Manager: positive
Waitstaff: negative


# Расчет коэффициента сжатия текста

In [53]:
def calculate_compression_stats(dataset, tokenizer, num_mem_tokens):
    token_lengths = []
    
    for sample in dataset.samples:
        # Считаем только уникальные тексты (в MAMS один текст дублируется для каждого аспекта)
        # Но для статистики датасета можно считать и все вхождения
        tokens = tokenizer.encode(sample['text'], add_special_tokens=False)
        token_lengths.append(len(tokens))
    
    avg_len = np.mean(token_lengths)
    max_len = np.max(token_lengths)
    min_len = np.min(token_lengths)
    
    # Коэффициент сжатия: средняя длина / количество векторов памяти
    compression_ratio = avg_len / num_mem_tokens
    
    print(f"--- Статистика сжатия MAMS ---")
    print(f"Средняя длина текста: {avg_len:.2f} токенов")
    print(f"Максимальная длина:  {max_len} токенов")
    print(f"Минимальная длина:   {min_len} токенов")
    print(f"Используется памяти: {num_mem_tokens} токенов")
    print(f"{'='*30}")
    print(f"Коэффициент сжатия (CR): {compression_ratio:.2f}x")
    
    return avg_len, compression_ratio

# Запуск
new_train_ds = MamsClaraDataset(
    "../data/train.xml", 
    tokenizer, 
    num_mem_tokens=config.num_mem_tokens, 
    max_enc_len=config.max_enc_len, 
    max_dec_len=config.max_dec_len
)
avg_l, cr = calculate_compression_stats(new_train_ds, tokenizer, config.num_mem_tokens)

--- Статистика сжатия MAMS ---
Средняя длина текста: 33.31 токенов
Максимальная длина:  114 токенов
Минимальная длина:   5 токенов
Используется памяти: 8 токенов
Коэффициент сжатия (CR): 4.16x
