In [44]:
!pip install openpyxl
!pip install datasets
!pip install bitsandbytes
!pip install accelerate
!pip install peft

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.2-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [2]:
import pandas as pd
import numpy as np
import torch as t

In [136]:
import os
from pathlib import Path

# Folder holding the spreadsheets. Defaults to the original Colab location and can be
# redirected by setting the DATA_DIR environment variable before running the notebook.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/content/sample_data"))

# Spreadsheet used for training (presumably the augmented set, judging by the file name)
train_data = pd.read_excel(DATA_DIR / "aug_data.xlsx")
# Spreadsheet used for validation
valid_data = pd.read_excel(DATA_DIR / "dataset_comments_79.xlsx")

In [137]:
# Clean the validation texts and encode the targets as integer indices
def html_to_text(html):
    """Strip HTML markup from a message and return its plain text.

    Args:
        html: Raw message body. ``None`` and NaN (how pandas represents an empty
            spreadsheet cell) are treated as an empty message; any other
            non-string value is converted with ``str`` before parsing.

    Returns:
        The text content of the markup, with text fragments joined by a space,
        non-breaking spaces replaced by regular spaces, and leading/trailing
        whitespace removed.
    """
    # Imported here so the function does not depend on an import made in another cell.
    from bs4 import BeautifulSoup

    # `html != html` is only true for NaN.
    if html is None or html != html:
        return ""

    soup = BeautifulSoup(str(html), "html.parser")
    text = soup.get_text(separator=" ")
    text = text.replace("\xa0", " ")

    return text.strip()

# Replace the HTML message bodies with their plain text.
valid_data["MessageText"] = valid_data["MessageText"].apply(html_to_text)

# Class letter -> integer class index used as the training target.
d = {
    'B': 0,
    'N': 1,
    'G': 2
}

# Encode the labels. Values that are already indices pass through unchanged, so
# re-running this cell does not raise KeyError the way a plain `d[x]` lookup does.
valid_data["labels"] = valid_data["labels"].map(lambda label: d.get(label, label))

# Fail loudly on anything that is neither a known letter nor a known index.
unexpected_labels = set(valid_data["labels"]) - set(d.values())
assert not unexpected_labels, f"Unexpected label values: {unexpected_labels}"

# Modeling

---

Подготовим модель, подберем параметры обучения и оценим модель на тестовой выборке

In [138]:
# from sklearn.model_selection import train_test_split

# # Split into train and valid
# train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

In [139]:
import os
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Config:
    """Settings shared by the cells of this notebook.

    The fields are annotated so that ``@dataclass`` actually registers them and
    they can be overridden per instance, e.g. ``Config(model_name="...")``.
    """

    # Checkpoint to fine-tune (alternative tried: "blanchefort/rubert-base-cased-sentiment")
    model_name: str = "cointegrated/rubert-tiny-sentiment-balanced"
    # Name of the fine-tuned model; passed to TrainingArguments as the output directory
    new_model: str = "seq-cls-ft-system"
    # Weights & Biases API key, read from the WANDB_API_KEY environment variable when the
    # config is created; None when the variable is not set. Never hard-code the key here:
    # the one that used to live in this cell was stored in plain text and should be revoked.
    # repr=False keeps the key out of any displayed Config.
    wb_token: Optional[str] = field(
        default_factory=lambda: os.environ.get("WANDB_API_KEY"),
        repr=False,
    )


config = Config()

In [140]:
import wandb

# Authenticate with Weights & Biases using the key held in the config
wandb.login(key=config.wb_token)

# Start a W&B run of type "training" in the project below and keep its handle in `run`
run = wandb.init(job_type="training", project='Fine-tune Pre-Trained SEQ-CLS')



In [158]:
from torch import nn

# import bitsandbytes as bnb
from transformers import (
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
)

# Load the pretrained tokenizer and sequence-classification model named in the config
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
model = AutoModelForSequenceClassification.from_pretrained(config.model_name)

# Freeze the leading parameters: walk them in the order the model yields them and stop
# at the first one whose name contains "bert.encoder.layer.1" (alternative tried:
# "bert.encoder.layer.9"). That parameter and everything after it are left untouched.
for name, param in model.named_parameters():
    if "bert.encoder.layer.1" in name:
        break
    param.requires_grad = False

In [159]:
from datasets import Dataset
from tqdm.notebook import tqdm


def preprocess_function(examples):
    """Tokenize the ``MessageText`` entry of a batch of examples.

    Uses the module-level ``tokenizer``. Texts are truncated and padded to a
    fixed length of 256 tokens, and the encoding is requested as PyTorch tensors.

    Args:
        examples: Mapping that provides the texts under the ``MessageText`` key.

    Returns:
        The object returned by the tokenizer call for this batch.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors="pt",
    )
    return tokenizer(examples['MessageText'], **tokenizer_options)

# Wrap each DataFrame in a Dataset and tokenize it batch by batch
train_dataset = (
    Dataset.from_pandas(train_data)
    .map(preprocess_function, batched=True)
)
valid_dataset = (
    Dataset.from_pandas(valid_data)
    .map(preprocess_function, batched=True)
)

Map:   0%|          | 0/1913 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [160]:
from sklearn.metrics import recall_score

def compute_metrics(eval_pred):
    """Compute macro-averaged recall for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The logits are reduced with an
            argmax over axis 1, so they are assumed to be 2-D with one row per
            example — TODO confirm for models that return extra outputs.

    Returns:
        A dict with the single key ``"eval_recall"`` holding the macro recall.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit in each row
    predictions = np.argmax(logits, axis=1)

    return {"eval_recall": recall_score(labels, predictions, average="macro")}

# Use the GPU when one is available and fall back to the CPU otherwise, instead of
# raising on machines without CUDA.
model.to('cuda' if t.cuda.is_available() else 'cpu')

training_args = TrainingArguments(
    # Where checkpoints are written and where metrics are reported
    output_dir=config.new_model,
    report_to="wandb",
    logging_steps=1,
    # Batching and data loading
    per_device_train_batch_size=32,
    per_device_eval_batch_size=34,
    gradient_accumulation_steps=4,
    dataloader_num_workers=2,
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=1e-5,
    weight_decay=0.1,
    lr_scheduler_type='cosine_with_restarts',
    # NOTE(review): the run recorded in this notebook stopped at global_step=285,
    # i.e. before these 400 warm-up steps had finished — worth revisiting.
    warmup_steps=400,
    num_train_epochs=50,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the highest
    # "eval_recall" when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    eval_steps=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_recall",
    greater_is_better=True,
)

# Assemble the Trainer from the model, the two datasets, the metric function and an
# early-stopping callback (patience 10, threshold 0.001)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=10,
            early_stopping_threshold=0.001,
        )
    ],
)

In [161]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Recall
1,0.9531,0.679562,0.725694
2,1.1018,0.669685,0.725694
3,1.0052,0.654484,0.725694
4,1.1428,0.635557,0.740846
5,0.7851,0.615148,0.786301
6,0.8344,0.596345,0.772412
7,1.0807,0.581248,0.772412
8,0.9763,0.567795,0.808396
9,0.7793,0.55734,0.822285
10,0.6856,0.551111,0.808396


TrainOutput(global_step=285, training_loss=0.8573512388948809, metrics={'train_runtime': 79.6154, 'train_samples_per_second': 1201.401, 'train_steps_per_second': 9.42, 'total_flos': 134032624952832.0, 'train_loss': 0.8573512388948809, 'epoch': 19.0})

In [165]:
# Save the model and the tokenizer to separate directories
model.save_pretrained('/content/model')
tokenizer.save_pretrained('/content/tokenizer')

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')

In [166]:
# Archive the saved model and tokenizer directories into zip files
!zip -r '/content/model.zip' '/content/model'
!zip -r '/content/tokenizer.zip' '/content/tokenizer'

  adding: content/model/ (stored 0%)
  adding: content/model/config.json (deflated 51%)
  adding: content/model/model.safetensors (deflated 8%)
  adding: content/tokenizer/ (stored 0%)
  adding: content/tokenizer/tokenizer.json (deflated 70%)
  adding: content/tokenizer/tokenizer_config.json (deflated 74%)
  adding: content/tokenizer/special_tokens_map.json (deflated 80%)
  adding: content/tokenizer/vocab.txt (deflated 52%)
