In [1]:
# Environment setup. `%pip` (unlike `!pip`) installs into the environment of the
# running kernel; `-q` keeps the download log out of the notebook output.
%pip install -q openpyxl
%pip install -q -U datasets peft bitsandbytes wandb
# `dataclasses` has been part of the standard library since Python 3.7, so the
# PyPI backport is intentionally not installed on this Python 3.11 runtime.

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training spreadsheet and the second spreadsheet that is used later
# as the evaluation set.
data = pd.read_excel(io='/content/sample_data/dataset_comments.xlsx')
test_data = pd.read_excel(io="/content/sample_data/dataset_comments_35.xlsx")

# Обработка данных и аугментация

In [3]:
# Remove the sender id and submission date columns from both frames.
unused_columns = ['UserSenderId', 'SubmitDate']
data = data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [4]:
from bs4 import BeautifulSoup

# Strip HTML tags
def html_to_text(html):
    """Convert an HTML fragment to plain text.

    Parameters
    ----------
    html : str, bytes, or a missing/non-string cell value
        Markup of a single message. ``None`` and ``NaN`` (what an empty
        spreadsheet cell becomes) are treated as an empty message; any other
        non-string value (e.g. a numeric cell) is converted with ``str()``.

    Returns
    -------
    str
        The text of the document with text nodes joined by single spaces,
        non-breaking spaces replaced by regular spaces, and leading/trailing
        whitespace removed.
    """
    # NaN is the only float that is not equal to itself.
    if html is None or (isinstance(html, float) and html != html):
        return ""
    if not isinstance(html, (str, bytes)):
        html = str(html)

    soup = BeautifulSoup(html, "html.parser")
    text = soup.get_text(separator=" ")
    text = text.replace("\xa0", " ")
    return text.strip()

# Replace the raw HTML of every message with its plain-text form in both frames.
data["MessageText"] = data["MessageText"].apply(html_to_text)
test_data["MessageText"] = test_data["MessageText"].apply(html_to_text)

In [5]:
import re

# Partially get rid of "noise"
def is_noise(text):
    """Tell whether a message carries no usable text.

    A message is considered noise when it is shorter than three characters,
    or matches one of two patterns: only non-word characters, digits and
    underscores, or a single run of Latin letters.

    Parameters
    ----------
    text : str
        Message text to inspect.

    Returns
    -------
    bool
        ``True`` if the message should be discarded, ``False`` otherwise.
    """
    if len(text) < 3:
        return True

    noise_patterns = (
        r"^[\W\d_]+$",   # symbols and digits only
        r"^[a-zA-Z]+$",  # Latin letters only
    )
    return any(re.match(pattern, text) for pattern in noise_patterns)

# Keep only the training rows whose message is not flagged as noise.
data = data.loc[~data["MessageText"].apply(is_noise)]

In [None]:
import torch as t
from transformers import pipeline
from tqdm.notebook import trange, tqdm

# Paraphrasing pipeline used for augmentation. It runs on the first GPU when
# CUDA is available and falls back to CPU (device=-1) otherwise, instead of
# failing on a runtime without a GPU.
paraphrase_model = pipeline(
    "text2text-generation",
    model="cointegrated/rut5-base-paraphraser",
    device=0 if t.cuda.is_available() else -1
)

def paraphrase_text(text, num_return_sequences=3):
    """Generate sampled paraphrases of ``text`` with ``paraphrase_model``.

    Parameters
    ----------
    text : str
        Message to paraphrase.
    num_return_sequences : int, default 3
        Number of candidates requested from the pipeline.

    Returns
    -------
    list
        The ``generated_text`` entry of every candidate the pipeline returned.
    """
    generation_kwargs = dict(
        max_length=128,
        num_return_sequences=num_return_sequences,
        truncation=True,
        temperature=0.6,
        do_sample=True,
    )
    candidates = paraphrase_model(text, **generation_kwargs)

    return [candidate["generated_text"] for candidate in candidates]

# Number of paraphrases generated per source message, keyed by class label
# (presumably chosen to even out the class sizes — confirm).
paraphrases_per_class = {0: 10, 2: 3, 1: 2}

aug_texts = []
aug_labels = []

for class_ in data.labels.unique():
    # Fail loudly on an unexpected label instead of hitting a NameError or
    # silently reusing the count of the previously processed class.
    if class_ not in paraphrases_per_class:
        raise ValueError(f"No augmentation factor configured for class {class_!r}")
    return_sequences = paraphrases_per_class[class_]

    subdata = data[data.labels == class_]
    # Address the text column by name rather than by position so the loop does
    # not depend on the column order of the frame.
    for text in tqdm(subdata.MessageText, total=len(subdata)):
        aug_text = paraphrase_text(text, return_sequences)

        aug_texts.extend(aug_text)
        # One label per paraphrase actually returned keeps both lists aligned.
        aug_labels.extend([class_] * len(aug_text))

        t.cuda.empty_cache()

aug_data = pd.DataFrame({
    'MessageText': aug_texts,
    'labels': aug_labels
})

KeyboardInterrupt: 

In [None]:
# Shuffle the augmented rows; the fixed seed makes the row order reproducible
# between runs.
aug_data = aug_data.sample(frac=1, random_state=42)

# Append the paraphrases to the original data, drop exact duplicate rows and
# rebuild the index so that row labels are unique again after the concat.
data = (
    pd.concat([data, aug_data], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Check the class balance. The label column is called `labels` in the rest of
# the notebook (augmentation, label mapping), so it is read under that name.
data.labels.value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,673
2,656
0,584


# Моделирование

In [8]:
import numpy as np

from datasets import load_dataset, Dataset

from dataclasses import dataclass
from tqdm.notebook import tqdm

import torch as t
from torch import nn

from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer

from peft import LoraConfig, get_peft_model
import wandb

In [10]:
import os
from typing import Optional


@dataclass
class Config:
    """Run configuration for the fine-tuning notebook.

    The attributes are annotated so that ``@dataclass`` registers them as real
    fields (they can be overridden per instance via keyword arguments).
    """

    # Identifier of the pre-trained sequence-classification checkpoint.
    model_name: str = "cointegrated/rubert-tiny-sentiment-balanced"
    # Name used as the Trainer output directory.
    new_model: str = "seq-cls-ft-system"
    # W&B API key, taken from the WANDB_API_KEY environment variable rather
    # than stored in the notebook. It is None when the variable is unset.
    # NOTE(review): the key that used to be hard-coded here should be treated
    # as compromised and revoked.
    wb_token: Optional[str] = os.environ.get("WANDB_API_KEY")

config = Config()

In [11]:
# Log in to W&B with the key held in the config.
wandb.login(key=config.wb_token)

# Open a tracking run for this training session.
run = wandb.init(
    job_type="training",
    project='Fine-tune Pre-Trained Sequence-Classifier',
)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkhhha_pffuuuu[0m ([33mkhhha_pffuuuu-saint-petersburg-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [19]:
# Model: take the checkpoint name from the config so it is defined in one place.
model_name = config.model_name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Freeze every pre-trained weight; the replacement head created below is built
# afterwards, so its parameters stay trainable.
for param in model.parameters():
    param.requires_grad = False

# Read the head dimensions from the loaded config instead of hard-coding them,
# so the head always matches the encoder width and the model's label count.
hidden_size = model.config.hidden_size
num_labels = model.config.num_labels

model.classifier = nn.Sequential(
    nn.Linear(hidden_size, 4096),
    nn.ReLU(),
    nn.Linear(4096, 4096),
    nn.ReLU(),
    nn.Linear(4096, 512),
    nn.ReLU(),
    nn.Linear(512, num_labels)
)

In [15]:
# Letter label -> integer class id.
dict_classes = dict(G=2, N=1, B=0)

# Re-encode the labels of the test frame; an unknown letter raises KeyError.
test_data["labels"] = test_data["labels"].apply(lambda letter: dict_classes[letter])

In [16]:
def preprocess_function(examples):
    """Tokenize the ``MessageText`` entries of ``examples``.

    Every text is truncated or padded to exactly 128 tokens and the encoding
    is requested as PyTorch tensors.

    Parameters
    ----------
    examples : mapping
        Must provide the texts under the ``'MessageText'`` key.

    Returns
    -------
    The object returned by the module-level ``tokenizer`` call.
    """
    return tokenizer(
        examples['MessageText'],
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )

# Turn both frames into `Dataset` objects and tokenize them batch-wise.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame).map(preprocess_function, batched=True)
    for frame in (data, test_data)
)

Map:   0%|          | 0/1913 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [20]:
from sklearn.metrics import recall_score

def compute_metrics(eval_pred):
    """Compute macro-averaged recall for one evaluation pass.

    Parameters
    ----------
    eval_pred : pair
        ``(logits, labels)``; the predicted class of each row is the argmax of
        ``logits`` along axis 1.

    Returns
    -------
    dict
        ``{"eval_recall": <macro recall>}``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    rec = recall_score(labels, predictions, average="macro")

    return {"eval_recall": rec}

# Use the GPU when one is present and fall back to CPU otherwise.
model.to('cuda' if t.cuda.is_available() else 'cpu')

# Hyper-parameters of the fine-tuning run.
training_args = TrainingArguments(
    output_dir=config.new_model,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=34,
    gradient_accumulation_steps=3,
    dataloader_num_workers=2,
    num_train_epochs=100,
    weight_decay=0.1,
    learning_rate=1e-5,
    optim="adamw_torch",
    lr_scheduler_type='cosine',
    warmup_steps=400,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_recall",
    # Recall is a higher-is-better metric: with `False` the checkpoint with
    # the LOWEST recall would be restored at the end of training.
    greater_is_better=True,
    # NOTE(review): presumably ignored while eval_strategy="epoch" — confirm.
    eval_steps=1,
    logging_steps=1,
    report_to="wandb"
)

# Wire the model, both datasets, the hyper-parameters and the metric callback
# into a Trainer.
# NOTE(review): the test set doubles as the evaluation set that drives
# best-checkpoint selection — confirm this is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [21]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Recall
1,1.0954,1.102762,0.464646
2,1.0899,1.098511,0.434343
3,1.0871,1.091758,0.55101
4,1.0843,1.08238,0.69798
5,1.0682,1.07042,0.69798
6,1.0665,1.056125,0.714646
7,1.0575,1.038864,0.684343
8,1.0408,1.01797,0.731313
9,1.0167,0.994138,0.731313
10,0.9914,0.967898,0.731313


TrainOutput(global_step=1000, training_loss=0.6638491753339767, metrics={'train_runtime': 338.3692, 'train_samples_per_second': 565.359, 'train_steps_per_second': 2.955, 'total_flos': 3314830973568000.0, 'train_loss': 0.6638491753339767, 'epoch': 100.0})

In [22]:
# Save the model and the tokenizer.
# NOTE(review): `model.classifier` was replaced by a custom nn.Sequential head
# that the model config does not describe — confirm that the saved checkpoint
# restores this head when it is loaded back with `from_pretrained`.
model.save_pretrained(save_directory='/content/model')
tokenizer.save_pretrained(save_directory='/content/tokenizer')

('/content/tokenizer/tokenizer_config.json',
 '/content/tokenizer/special_tokens_map.json',
 '/content/tokenizer/vocab.txt',
 '/content/tokenizer/added_tokens.json',
 '/content/tokenizer/tokenizer.json')

In [23]:
# Pack the saved model and tokenizer directories into zip archives.
!zip -r '/content/model.zip' '/content/model'
!zip -r '/content/tokenizer.zip' '/content/tokenizer'

  adding: content/model/ (stored 0%)
  adding: content/model/config.json (deflated 51%)
  adding: content/model/model.safetensors (deflated 8%)
  adding: content/tokenizer/ (stored 0%)
  adding: content/tokenizer/tokenizer.json (deflated 70%)
  adding: content/tokenizer/tokenizer_config.json (deflated 74%)
  adding: content/tokenizer/special_tokens_map.json (deflated 80%)
  adding: content/tokenizer/vocab.txt (deflated 52%)


In [24]:
from google.colab import files

# Download both archives from the Colab runtime, model first.
for archive_path in ("/content/model.zip", "/content/tokenizer.zip"):
    files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>