In [1]:
!pip install openpyxl
!pip install datasets
# !pip install bitsandbytes
# !pip install accelerate
# !pip install peft

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [1]:
import pandas as pd
import numpy as np
import torch as t

In [2]:
train_data = pd.read_excel("/content/sample_data/train_dataset.xlsx")
valid_data = pd.read_excel("/content/sample_data/valid_dataset.xlsx")

# Modeling

---

Подготовим модель, подберем параметры обучения и оценим модель на тестовой выборке

In [3]:
from dataclasses import dataclass

@dataclass
class Config:
    model_name = "cointegrated/rubert-tiny-sentiment-balanced"  # "blanchefort/rubert-base-cased-sentiment"
    new_model = "seq-cls-ft-system"
    wb_token = 'bab00ed6b8ec6a868aef6917554e2eee8a723676'

config = Config()

In [20]:
import wandb

# Зайдем в W&B
wandb.finish()
wandb.login(key=config.wb_token)

run = wandb.init(
    project='Fine-tune Pre-Trained SEQ-CLS',
    job_type="training"
)

0,1
eval/loss,█▁
eval/recall,█▁
eval/runtime,▁█
eval/samples_per_second,█▁
eval/steps_per_second,█▁
train/epoch,▁▂▂▃▃▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▄▄▄▅▅▆▇▇██
train/grad_norm,▄▄▄▃▂▂█▄▂▅▁▄
train/learning_rate,▁▂▂▃▄▄▅▅▆▇▇█
train/loss,▂▃▃▁▁▁█▁▂▃▂▂

0,1
eval/loss,0.92258
eval/recall,0.48292
eval/runtime,0.3178
eval/samples_per_second,305.211
eval/steps_per_second,31.465
train/epoch,1.88889
train/global_step,12.0
train/grad_norm,2.60927
train/learning_rate,0.0
train/loss,1.0348


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [28]:
from torch import nn

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Модель
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(config.model_name)

# Заменяем последний слой и остальные замораживаем
for name, param in model.named_parameters():
    if "bert.encoder.layer.2" in name:
        break
    else:
        param.requires_grad = False

# nn.init.kaiming_uniform_(model.classifier.weight)
nn.init.xavier_uniform_(model.classifier.weight)
nn.init.zeros_(model.classifier.bias);

In [29]:
from datasets import Dataset
from tqdm.notebook import tqdm


def preprocess_function(examples):
    inputs = tokenizer(
        examples['MessageText'],
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors="pt"
    )

    return inputs

train_dataset = Dataset.from_pandas(train_data).map(preprocess_function, batched=True)
valid_dataset = Dataset.from_pandas(valid_data).map(preprocess_function, batched=True)

Map:   0%|          | 0/1697 [00:00<?, ? examples/s]

Map:   0%|          | 0/97 [00:00<?, ? examples/s]

In [30]:
from transformers import (Trainer, TrainingArguments,
                          EarlyStoppingCallback, AdamW)

from sklearn.metrics import recall_score

# Зададим метрику
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    predictions = np.argmax(logits, axis=1)

    rec = recall_score(labels, predictions, average="macro")

    return {"eval_recall": rec}

# Адаптируем оптимизатор под нашу модель
class CustomTrainer(Trainer):
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = AdamW([
                {"params": self.model.bert.encoder.layer[2:].parameters(), "lr": 2e-5},
                {"params": self.model.classifier.parameters(), "lr": 1e-4},
            ], weight_decay=0.1)
        return self.optimizer

In [31]:
# Зададим параметры тренера
model.to('cuda')
model.train()

training_args = TrainingArguments(
    output_dir=config.new_model,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=10,
    gradient_accumulation_steps=4,
    dataloader_num_workers=2,
    num_train_epochs=50,
    # weight_decay=0.1,
    # learning_rate=3e-5,
    # optim="adamw_torch",
    lr_scheduler_type='cosine',
    warmup_steps=150,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_recall",
    greater_is_better=True,
    eval_steps=1,
    logging_steps=1,
    report_to="wandb"
)

trainer = CustomTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    args=training_args,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=15, early_stopping_threshold=0.001)]
)

In [32]:
trainer.train()



Epoch,Training Loss,Validation Loss,Recall
0,1.1527,1.201915,0.336524
1,1.1699,1.19785,0.336524
2,1.1279,1.190883,0.336524
3,1.1526,1.181293,0.343468
4,1.1135,1.169047,0.371246
5,1.0734,1.154346,0.408033
6,1.0753,1.137416,0.428866
7,1.0241,1.118517,0.410098
8,1.1157,1.097881,0.417042
9,1.0526,1.076105,0.423986


TrainOutput(global_step=300, training_loss=0.7589820127685865, metrics={'train_runtime': 153.8861, 'train_samples_per_second': 551.382, 'train_steps_per_second': 1.949, 'total_flos': 294770735087616.0, 'train_loss': 0.7589820127685865, 'epoch': 49.888888888888886})

# Оценим модель

---

Импортируем два файла с тестовыми данными и посмотрим на оценки метрик

In [33]:
from bs4 import BeautifulSoup

fst_test_data = pd.read_excel("/content/sample_data/dataset_comments_35.xlsx")
snd_test_data = pd.read_excel("/content/sample_data/dataset_comments_100_test.xlsx")

# Зададим таргеты в виде индексов
def html_to_text(html):
    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ")
    text = text.replace("\xa0", " ")

    return text.strip()

d = {
    'B': 0,
    'N': 1,
    'G': 2
}

fst_test_data.MessageText = fst_test_data.MessageText.apply(lambda x: html_to_text(x))
fst_test_data.labels = fst_test_data.labels.apply(lambda x: d[x.strip()])

snd_test_data.MessageText = snd_test_data.MessageText.apply(lambda x: html_to_text(x))
snd_test_data.labels = snd_test_data.labels.apply(lambda x: d[x.strip()])

In [34]:
from torch.nn import functional as f

for data in [fst_test_data, snd_test_data]:
    pred = np.empty(data.shape[0])

    model.eval()
    model.to('cpu')
    for i in range(data.shape[0]):
        text = html_to_text(data.iloc[i]['MessageText'])
        inputs = tokenizer(text, truncation=True, max_length=256, return_tensors='pt')
        with t.no_grad():
            logits = model(**inputs).logits
        probs = f.softmax(logits, dim=-1)
        pred[i] = t.argmax(probs, dim=-1).item()

    from sklearn.metrics import classification_report, recall_score

    print(classification_report(data.labels.astype(int), pred), end='\n\n\n')

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           1       0.67      0.67      0.67         3
           2       0.95      0.95      0.95        20

    accuracy                           0.88        34
   macro avg       0.81      0.81      0.81        34
weighted avg       0.88      0.88      0.88        34



              precision    recall  f1-score   support

           0       0.67      0.63      0.65        19
           1       0.71      0.68      0.70        22
           2       0.89      0.92      0.90        59

    accuracy                           0.81       100
   macro avg       0.76      0.74      0.75       100
weighted avg       0.81      0.81      0.81       100





In [42]:
# Сохраним модель
model.save_pretrained('/content/model')
tokenizer.save_pretrained('/content/tokenizer')

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')

In [27]:
!zip -r '/content/model.zip' '/content/model'
!zip -r '/content/tokenizer.zip' '/content/tokenizer'

  adding: content/model/ (stored 0%)
  adding: content/model/config.json (deflated 51%)
  adding: content/model/model.safetensors (deflated 8%)
  adding: content/tokenizer/ (stored 0%)
  adding: content/tokenizer/tokenizer.json (deflated 70%)
  adding: content/tokenizer/tokenizer_config.json (deflated 74%)
  adding: content/tokenizer/special_tokens_map.json (deflated 80%)
  adding: content/tokenizer/vocab.txt (deflated 52%)
