In [None]:
# !pip install transformers==4.44.2
!pip install datasets

In [1]:
from transformers import Qwen2ForCausalLM, AutoTokenizer, __version__, AutoModelForCausalLM, Qwen2ForSequenceClassification, Qwen2Model
import torch
from torch import nn
from copy import deepcopy
import numpy as np
import gc
import warnings
import sys
import os
from tqdm.auto import tqdm
from datasets import load_dataset

# Make the sibling ``src`` directory importable (resolved relative to the
# current working directory) without adding it to sys.path twice on re-runs.
src_path = os.path.abspath(os.path.join('..', 'src'))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

from utils import *
from xsqa.utils_multiple_choice import CollatorXSQA, MultipleChoiceDataset, Split
from xsqa.module import Qwen2ForMultipleChoice
from torch.utils.data import DataLoader,IterableDataset

%load_ext autoreload
%autoreload 2

In [2]:
# Number of answer options per question; passed to the multiple-choice head,
# the dataset (num_choices) and the collator (num_labels) below.
NUM_LABELS = 5
# Floating-point dtype used when loading the backbone and building the head.
DTYPE = torch.float32

In [3]:
# Display the installed transformers version (imported from `transformers` above).
__version__

'4.44.2'

In [3]:
# Hugging Face model id of the backbone; change it to try another checkpoint.
qwen_path = "Qwen/Qwen2-0.5B" # change it
# qwen_path = 'Qwen/Qwen2.5-1.5B'
# Load the bare Qwen2 backbone in DTYPE and move it to the GPU (requires CUDA).
qwen = Qwen2Model.from_pretrained(qwen_path, torch_dtype=DTYPE).cuda()
tokenizer = AutoTokenizer.from_pretrained(qwen_path)
# Reuse the end-of-sequence token as the padding token, both in the tokenizer
# and in the model config, so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token
qwen.config.pad_token_id = qwen.config.eos_token_id

In [4]:
# Wrap the backbone in the project's multiple-choice module (xsqa.module).
# NOTE: this rebinds `qwen`, so re-running the cell wraps the already wrapped model.
qwen = Qwen2ForMultipleChoice(qwen, NUM_LABELS, DTYPE)

Для начала нужно обучить сами модели для задачи X-CSQA. После этого можно раскомментировать ячейку ниже и перейти к эксперименту, в котором модели дообучаются только на нужных нейронах.

In [5]:
# Run only after the first training stage: restores the fine-tuned weights
# from the checkpoint file model_<domain>.pt in the working directory.
domain = 'eng'
qwen.load_state_dict(torch.load(f"model_{domain}.pt", weights_only=True))
qwen = qwen.to('cuda')

In [6]:
# Convert the model with the project helper from `utils` and move it to the GPU.
# NOTE(review): presumably this swaps in the neuron-detection ("ND") variants of
# the layers needed for the DSN experiment - confirm against utils.
qwen = convert_to_Qwen2_ND(qwen).to("cuda")


In [16]:
# qwen = impacts_off(qwen) # comment this to test impacts

Пример запроса вида «вопрос — варианты ответов» для нейросети

In [10]:
# Smoke test: score five answer options for a single question.
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
choice2 = "It is eaten while held in the hand."
choice3 = "It is eaten while held in the hand."
choice4 = "It is eaten while held in the hand."

with torch.no_grad():
    # The question is repeated once per option so each (question, option)
    # pair is tokenized as one sequence pair.
    encoding = tokenizer(
        [prompt, prompt, prompt, prompt, prompt],
        [choice0, choice1, choice2, choice3, choice4],
        return_tensors="pt",
        padding=True,
    )
    # Add a leading axis of size 1 to every tensor and move it to the GPU.
    inputs = {name: values.unsqueeze(0).cuda() for name, values in encoding.items()}
    output = qwen(**inputs)
    print(output)

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


{'logits': tensor([[-3.8873, -4.8008, -4.8008, -4.8008, -4.8008]], device='cuda:0')}


In [17]:
class Trainer:
    """Small train/eval harness for a model whose forward returns ``{"logits": ...}``.

    The loaders are expected to yield ``(batch, labels)`` pairs where ``batch``
    is a mapping of keyword arguments for the model and ``labels`` holds the
    class indices. Settings given to :meth:`train` are mirrored into
    ``train_config`` so later calls can reuse them.
    """

    def __init__(
        self,
        model=None,
        train_config=None
    ):
        """Store the model and the (optional) configuration dictionary.

        Keys read from ``train_config``: ``lr``, ``max_iters``, ``ignore_index``,
        ``evaluate_every``, ``domain``, ``optimizer`` and ``eval_metrics``.
        """
        self.model = model
        self.train_config = train_config if train_config is not None else {}

    def train(
        self,
        train_loader=None,
        val_loader=None,
        model=None,
        only_domain_specific=True,
        domain=None,
        max_iters=None,
        optimizer=None,
        lr=None,
        ignore_index=None,
        evaluate_every=-1,  # -1 for no evaluation
    ):
        """Run one pass over ``train_loader`` (stopping early at ``max_iters``).

        Args:
            train_loader: iterable of ``(batch, labels)`` pairs.
            val_loader: optional iterable used for periodic evaluation.
            model: replaces ``self.model`` when given.
            only_domain_specific: when True, ``dsn_model_grads_to_train`` is
                applied before training and ``dsn_model_mask_gradients`` after
                every backward pass.
            domain: domain name passed to the gradient mask and used in the
                checkpoint file name ``model_{domain}.pt``.
            max_iters: upper bound on the internal iteration counter.
            optimizer: an optimizer instance (used as is), an optimizer class
                (instantiated on the trainable parameters) or the name
                ``"Adam"`` / ``"SGD"``.
            lr: learning rate for optimizers built here.
            ignore_index: label value ignored by the loss; ``None`` falls back
                to the config and then to -100, the PyTorch default.
            evaluate_every: evaluate each time the counter passes a multiple of
                this value; non-positive disables evaluation. Because the
                default is -1 rather than ``None``, the config value is only
                consulted when ``None`` is passed explicitly.

        Returns:
            The trained model. Losses and evaluation metrics are kept in
            ``self._losses`` and ``self._metrics``.

        Raises:
            ValueError: no model / train_loader, or an unknown optimizer name.
        """
        if model is not None:
            self.model = model
        if self.model is None:
            raise ValueError("No model provided")
        if train_loader is None:
            raise ValueError("No train_loader provided")

        if lr is not None:
            self.train_config["lr"] = lr
        elif "lr" in self.train_config:
            lr = self.train_config["lr"]
        else:
            warnings.warn("No learning rate provided. Defaulting to 1e-3", UserWarning)
            lr = 1e-3

        if max_iters is not None:
            self.train_config["max_iters"] = max_iters
        elif "max_iters" in self.train_config:
            max_iters = self.train_config["max_iters"]
        else:
            # The default now matches the value announced in the warning.
            max_iters = 100
            self.train_config["max_iters"] = max_iters
            warnings.warn("No max_iters provided. Defaulting to 100", UserWarning)

        if ignore_index is None:
            ignore_index = self._resolved_ignore_index()
        else:
            self.train_config["ignore_index"] = ignore_index

        loss_function = nn.CrossEntropyLoss(ignore_index=ignore_index, reduction="sum")

        if evaluate_every is not None:
            self.train_config["evaluate_every"] = evaluate_every
        elif "evaluate_every" in self.train_config:
            evaluate_every = self.train_config["evaluate_every"]
        else:
            warnings.warn("No evaluate_every provided. Defaulting to -1 (no evaluation)", UserWarning)
            evaluate_every = -1

        if domain is not None:
            self.train_config["domain"] = domain
        elif "domain" in self.train_config:
            domain = self.train_config["domain"]
        else:
            warnings.warn("No domain provided. Defaulting to 'eng'", UserWarning)
            domain = "eng"

        # IMPORTANT!
        if only_domain_specific:
            # calculate gradients only for layers with possible occurence of DSN
            dsn_model_grads_to_train(self.model)

        # Built after the requires_grad flags are final, so only trainable
        # parameters are handed to the optimizer.
        optimizer = self._build_optimizer(optimizer, lr)

        losses = []
        metrics = []
        n_iter = 0
        device = next(self.model.parameters()).device

        self.model.train()
        max_acc = 0
        with tqdm(range(max_iters), desc="Training iters") as pbar:
            for batch, labels in train_loader:
                # as_tensor avoids the copy (and warning) when labels already is a tensor.
                labels = torch.as_tensor(labels, device=device)
                optimizer.zero_grad()

                output = self._forward(batch, device=device)["logits"]
                loss = loss_function(output.view(-1, output.shape[-1]), labels.view(-1))
                loss.backward()
                # IMPORTANT!
                if only_domain_specific:
                    dsn_model_mask_gradients(self.model, domain=domain)

                optimizer.step()

                losses.append(loss.detach().float().cpu().numpy())

                # NOTE(review): `batch` is consumed via .items() in _forward, so
                # len(batch) looks like the number of keys rather than the number
                # of samples - confirm the intended unit of max_iters/evaluate_every.
                n_iter += len(batch)
                pbar.update(len(batch))

                if evaluate_every > 0 and n_iter >= (len(metrics) + 1) * evaluate_every:
                    eval_metrics = self.evaluate(val_loader)
                    pbar.set_postfix(**eval_metrics)
                    metrics.append(eval_metrics)
                    # Accuracy may be missing (metric not configured or no
                    # validation data); only checkpoint when it improved.
                    accuracy = eval_metrics.get("accuracy")
                    if accuracy is not None and accuracy > max_acc:
                        max_acc = accuracy
                        torch.save(self.model.state_dict(), f"model_{domain}.pt")
                    self.model.train()

                if n_iter >= max_iters:
                    break

        self._metrics = metrics
        self._losses = losses

        return self.model

    def _resolved_ignore_index(self):
        """Return the configured ``ignore_index`` or -100 (PyTorch default) when unset."""
        configured = self.train_config.get("ignore_index", None)
        return -100 if configured is None else configured

    def _build_optimizer(self, optimizer, lr):
        """Turn an optimizer instance, class, name or ``None`` into an optimizer instance."""
        if isinstance(optimizer, torch.optim.Optimizer):
            # Ready-made instance: the caller chose its parameters and settings.
            return optimizer

        trainable = [param for param in self.model.parameters() if param.requires_grad]

        if isinstance(optimizer, type) and issubclass(optimizer, torch.optim.Optimizer):
            return optimizer(trainable, lr=lr)

        if optimizer is not None:
            opt_name = optimizer
            self.train_config["optimizer"] = optimizer
        elif "optimizer" in self.train_config:
            opt_name = self.train_config["optimizer"]
        else:
            warnings.warn("No optimizer provided. Defaulting to Adam", UserWarning)
            opt_name = "Adam"

        if opt_name == "Adam":
            return torch.optim.Adam(trainable, lr=lr)
        if opt_name == "SGD":
            return torch.optim.SGD(trainable, lr=lr)
        raise ValueError(f"Unknown optimizer: {opt_name}")

    def _forward(
        self,
        batch,
        device="cuda"
    ):
        """Move every value of ``batch`` to ``device`` and call the model with them as kwargs."""
        _batch = {}
        for k, v in batch.items():
            _batch[k] = v.to(device)
        preds = self.model(**_batch)

        return preds

    def _calulate_metrics(
        self,
        output,
        labels,
        device="cuda"
    ):
        """Compute the metrics listed in ``train_config["eval_metrics"]`` for one batch.

        Supported names are ``"loss"`` / ``"cross_entropy_loss"`` and
        ``"accuracy"``; anything else only triggers a warning. Values are
        returned as NumPy scalars keyed by metric name.
        """
        metrics_to_calulate = self.train_config.get("eval_metrics", [])
        metrics = {}
        flat_labels = labels.to(device).view(-1)

        for m in metrics_to_calulate:
            if m == "loss" or m == "cross_entropy_loss":
                loss_function = nn.CrossEntropyLoss(ignore_index=self._resolved_ignore_index())
                metrics[m] = loss_function(output.view(-1, output.shape[-1]), flat_labels).float().detach().cpu().numpy()
            elif m == "accuracy":
                predicted = torch.argmax(output, dim=-1).view(-1)
                metrics[m] = (predicted == flat_labels).float().mean().detach().cpu().numpy()
            else:
                warnings.warn(f"Metric: {m} NOT IMPLEMENTED", UserWarning)

        return metrics

    def evaluate(
        self,
        val_loader=None,
    ):
        """Average the per-batch metrics over ``val_loader``.

        The model is switched to eval mode for the whole pass and its previous
        train/eval mode is restored afterwards. Returns an empty dict (with a
        warning) when no loader is given or the loader yields no batches.
        """
        if val_loader is None:
            warnings.warn("No validation data provided, return empty val metrics", UserWarning)
            return {}

        device = next(self.model.parameters()).device
        was_training = self.model.training
        self.model.eval()

        metrics = []
        try:
            with torch.no_grad():
                for batch, labels in val_loader:
                    labels = torch.as_tensor(labels, device=device)
                    output = self._forward(batch, device=device)["logits"]
                    metrics.append(self._calulate_metrics(output, labels, device))
        finally:
            self.model.train(was_training)

        if not metrics:
            warnings.warn("No validation data provided, return empty val metrics", UserWarning)
            return {}

        # Unweighted mean of the per-batch values.
        return {name: np.mean([m[name] for m in metrics]) for name in metrics[0]}

# Загрузка датасета

In [None]:
# Download the X-CSR dataset archive and unpack it into ./xcr_dataset,
# the directory the dataset cells below read from.
! wget https://inklab.usc.edu/XCSR/xcsr_datasets.zip
! unzip xcsr_datasets.zip -d ./xcr_dataset

# Обучение

In [20]:
# Keyword arguments shared by both dataset objects (English X-CSQA files).
_xcsqa_en_kwargs = dict(
    data_dir='./xcr_dataset',
    task='xcsr',
    overwrite_cache=True,
    mode=Split.train,
    num_choices=NUM_LABELS,
    train_file='./xcr_dataset/X-CSR_datasets/X-CSQA/en/train.jsonl',
    val_file='./xcr_dataset/X-CSR_datasets/X-CSQA/en/dev.jsonl',
    test_file='./xcr_dataset/X-CSR_datasets/X-CSQA/en/test.jsonl',
)

# Full training split.
train_dataset = MultipleChoiceDataset(percentage=100, **_xcsqa_en_kwargs)

# NOTE(review): the validation set is also built with mode=Split.train (3% of
# it), so it appears to overlap the training data and the reported accuracy
# would not be a held-out estimate - confirm whether the dev split was intended.
val_dataset = MultipleChoiceDataset(percentage=3, **_xcsqa_en_kwargs)

8888


read data:   0%|          | 0/8888 [00:00<?, ?it/s]

convert examples to features:   0%|          | 0/8888 [00:00<?, ?it/s]

266


read data:   0%|          | 0/266 [00:00<?, ?it/s]

convert examples to features:   0%|          | 0/266 [00:00<?, ?it/s]

Можно поэкспериментировать с размерами батчей в зависимости от доступной памяти

In [21]:
# Project collator (xsqa.utils_multiple_choice) used by both loaders.
collator = CollatorXSQA(tokenizer, num_labels=NUM_LABELS)

# Training batches are reshuffled on every pass.
train_data_loader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=collator,
)
# Validation is NOT shuffled: Trainer.evaluate averages per-batch means, so with
# a smaller last batch a random order would make the metric vary between runs.
val_data_loader = DataLoader(
    val_dataset,
    batch_size=4,
    shuffle=False,
    collate_fn=collator,
)


Для второй части эксперимента

In [11]:
# Run the project's domain-specific-neuron detection (helper from `utils`) on
# batches from the training loader and keep the returned model.
# The second return value is bound to `_` and printed in the next cell.
# NOTE(review): the meaning of eps / num_elements / reset_* is defined in utils
# and is not visible here - confirm before tuning.
qwen, _ = detect_domain_specific_neurons(
    qwen,
    tokenizer,
    dataloader=train_data_loader,
    eps=1e-2,
    domain_name="eng",
    reset_impacts=False,
    reset_dsn=True, num_elements = 10
)


  0%|          | 0/10 [00:00<?, ?it/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [12]:
# Inspect the per-domain neuron masks stored on the first layer's MLP ...
print(qwen.model.layers[0].mlp.dsn)
# ... and the second value returned by detect_domain_specific_neurons.
print(_)

{'eng': {'up_proj': tensor([[True],
        [True],
        [True],
        ...,
        [True],
        [True],
        [True]], device='cuda:0'), 'down_proj': tensor([[True, True, True,  ..., True, True, True]], device='cuda:0')}}
tensor([[ -0.0931,  -5.9938,  -5.7302,  -3.5445,  -8.5399,  -1.3429,  -6.4582,
         -12.8190,   4.1807,  -9.8090,  -9.3872,  -5.3373,  -6.3403,  -7.6917,
          -7.1003,  -6.1945,  -7.4106, -12.4312,  -8.5482,  -2.4961, -10.3927,
          -8.6811,   0.5962,  -5.4542, -10.4397],
        [ -3.9861, -10.7410, -13.4053,   1.9875,  -8.8597,  -4.1294,  -4.1294,
          -9.0795,  -4.1294,  -7.2331,  -6.8675,  -7.3345,  -6.3442,   1.8365,
          -5.3510, -12.3396,  -5.4929,  -3.4525,  -4.4728, -10.0996, -10.2665,
          -3.8575,  -3.2697,  -6.2501, -10.0255],
        [ -9.8832, -10.5923,  -9.9432, -11.6652,   0.6376,   2.4556,  -7.9632,
          -8.5260, -11.6406, -10.2905,  -5.5375, -12.1440, -12.2223,  -3.6329,
          -4.2349,  -9.2002, -14.09

In [None]:
#for name, para in qwen.model.named_parameters():
#    para.requires_grad = False

В первой части эксперимента, где мы полностью обучаем сеть, параметр тренера должен быть
```
only_domain_specific=False
```
Во второй же нам нужно обучать только на специфических нейронах 
```
only_domain_specific=True
```

In [22]:
# Build the trainer; these config values are picked up by Trainer.train and
# Trainer.evaluate whenever the matching argument is not passed explicitly.
# NOTE(review): ignore_index is set to the tokenizer's pad token id, but the
# loss is computed over answer-class labels - confirm this is intended.
trainer = Trainer(
    model=qwen,
    train_config={
        "eval_metrics": ["loss", "accuracy"],
        "lr": 1e-4,
        "optimizer": "Adam",
        "max_iters": 10000,
        "ignore_index": tokenizer.pad_token_id
    }
)

In [11]:
# Stage 1: train the whole network (no domain-specific gradient masking).
qwen = trainer.train(
    train_loader=train_data_loader,
    val_loader=val_data_loader,
    evaluate_every=10,
    only_domain_specific=False, # switch depending on the experiment type
    domain="eng"
)

Training iters:   0%|          | 0/10000 [00:00<?, ?it/s]

In [23]:
# Stage 2: train only on the domain-specific neurons.
# Trainer.train also calls dsn_model_grads_to_train when
# only_domain_specific=True, so the explicit call here repeats that step.
dsn_model_grads_to_train(qwen)

qwen = trainer.train(
    train_loader=train_data_loader,
    val_loader=val_data_loader,
    evaluate_every=10,
    only_domain_specific=True,
    domain="eng"
)

Training iters:   0%|          | 0/10000 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 334.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 92.50 MiB is free. Process 15914 has 286.00 MiB memory in use. Process 88624 has 23.31 GiB memory in use. Of the allocated memory 16.44 GiB is allocated by PyTorch, and 6.57 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# torch  Version: 1.10.0+cu113
# torch vision  0.11.0+cu113

In [14]:
# Save the trainer's current weights. Trainer.train writes its best checkpoint
# for domain "eng" to the same file name, so this overwrites it.
torch.save(trainer.model.state_dict(), f"model_eng.pt")

Оценка сохранённой обученной модели

In [6]:
# Reload the saved checkpoint model_<domain>.pt into the current model object;
# `qwen` must already have the architecture the checkpoint was saved from.
domain = 'eng'
qwen.load_state_dict(torch.load(f"model_{domain}.pt", weights_only=True))
qwen = qwen.to('cuda')

In [12]:
# Evaluate the trainer's model on the validation loader; returns a dict of
# metric name -> mean over batches.
eval_result = trainer.evaluate(val_data_loader)

In [13]:
# Show the averaged validation metrics.
print(eval_result)

{'loss': 0.38228205, 'accuracy': 0.9073529}


Возможно, стоит использовать XLM, как в оригинальной работе, — тогда результат будет лучше

In [None]:
# Alternative backbone: XLM-RoBERTa with a multiple-choice head.
# NOTE: this rebinds the global `tokenizer`, which held the Qwen tokenizer
# until now; cells above that are re-run after this one will see the XLM-R one.
from transformers import AutoTokenizer, XLMRobertaForMultipleChoice
import torch

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
model = XLMRobertaForMultipleChoice.from_pretrained("FacebookAI/xlm-roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/xlm-roberta-base/6fd4797bc397c3b8b55d6bb5740366b57e6a3ce91c04c77f22aafc0c128e6feb?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1734081036&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczNDA4MTAzNn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby94bG0tcm9iZXJ0YS1iYXNlLzZmZDQ3OTdiYzM5N2MzYjhiNTVkNmJiNTc0MDM2NmI1N2U2YTNjZTkxYzA0Yzc3ZjIyYWFmYzBjMTI4ZTZmZWI%7EcmVzcG9uc2UtY29udGVudC1kaXNwb3NpdGlvbj0qIn1dfQ__&Signature=oFEVm7tn4jViv8LRQcw5TGyso9qg9LDJgtvka8s%7EfRPqFyFSuxb1FYcyuPGiHZLR04585xANUboxpA0F0r33CgW-QfX8VoYu93v1hwRb25eGEvaxa452Z4z02HaSNn%7EGZxfhzvOZcCD7j0SX4sFhNTo8IFIuQIqn3BgUpvgrSgtPNm9H54AXGh9w00Q67zqgSIlgzrz756jHPFdAvU5oKmEjTfX2nhPFNs341fegYsc3h7Lfm8nIP8wVGYDNo9Al1fv8dEmTb8rTi22kD5me13YSnwhqom5XB5jzoNBGmVDVCzE7njahOvjg8L9E45POGtV2UdRdbVwh0F4sZzMWKg__&Key-Pair-Id=K3RPWS32NSSJCE: HTTPSConnectionPool(host='cdn-

model.safetensors:  33%|###2      | 367M/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForMultipleChoice were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the module tree of the XLM-RoBERTa multiple-choice model.
print(model)

XLMRobertaForMultipleChoice(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tr