In [10]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, pipeline
import re
import numpy as np
from collections import Counter
import nltk
import torch
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from datasets import Features, ClassLabel

# 1. Загрузка данных и токенизатора
name = "Geotrend/distilbert-base-ru-cased"
tokenizer = AutoTokenizer.from_pretrained(name)

# Загрузка датасета со всеми сплитами
ds = load_dataset("sagteam/cedr_v1", "main")  # Теперь ds содержит 'train' и 'test'

# id2label для справки
id2label = {
    0: "joy",
    1: "sadness",
    2: "surprise",
    3: "fear",
    4: "anger",
    5: "no_emotion"
}
label2id = {v: k for k, v in id2label.items()}


# 3. Очистка текста
nltk.download('stopwords')
ru_stopwords = set(stopwords.words('russian'))


def clean_text(text):
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'@\S+|#\S+', '', text)
    text = re.sub(r'_{5,}', '', text)
    text = re.sub(r'\)+', ')', text)
    text = re.sub(r'\s+', ' ', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in ru_stopwords and len(t) > 2]
    return ' '.join(tokens) if tokens else None


def process_text(example):
    example['cleaned_text'] = clean_text(example['text'])
    return example


def create_averaged_labels(example):
    # Создаём one-hot вектор с float значениями
    hard_label = [0] * 6
    if example['labels']:
        for label in example['labels']:
            if label < 6:
                hard_label[label] = 1
    if not example['labels']:
        hard_label[5] = 1
    hard_label = torch.tensor(hard_label, dtype=torch.float32)
    # Перезаписываем 'labels' вместо создания нового поля 'label'
    return {
        'label': hard_label
    }


# 5. Обработка каждого сплита
processed = {}
for split in ds:
    temp_ds = ds[split]

    # Очистка текста
    temp_ds = temp_ds.map(process_text)
    temp_ds = temp_ds.filter(lambda x: x['cleaned_text'] is not None)

    # Создание усреднённых меток
    temp_ds = temp_ds.map(create_averaged_labels)

    # Удаление ненужных колонок
    temp_ds = temp_ds.remove_columns(['text', 'source', 'labels'])
    temp_ds = temp_ds.rename_column('label', 'labels')

    processed[split] = temp_ds


# 6. Определение максимальной длины
def get_max_length(dataset, sample_size=1000):
    sample_texts = dataset.shuffle(seed=42).select(range(sample_size))['cleaned_text']
    tokenized = tokenizer(sample_texts, truncation=False, padding=False)
    lengths = [len(x) for x in tokenized['input_ids']]
    return max(lengths)


max_length = get_max_length(processed['train'])
print(max_length)

# 7. Токенизация
def tokenize_function(examples):
    return tokenizer(
        examples['cleaned_text'],
        padding=False,
        truncation=True,
        max_length=128,
        return_special_tokens_mask=True,
        return_attention_mask=True
    )


tokenized_datasets = {}
for split in processed:
    tokenized = processed[split].map(lambda x: tokenize_function(x), batched=True)
    tokenized = tokenized.remove_columns(['cleaned_text'])
    tokenized.set_format(type='torch', columns=['labels','input_ids','attention_mask'])
    tokenized_datasets[split] = tokenized

# 8. Сохранение
tokenized_datasets = DatasetDict(tokenized_datasets)
tokenized_datasets.save_to_disk('./processed_datasets')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Map:   0%|          | 0/7525 [00:00<?, ? examples/s]

Map:   0%|          | 0/1882 [00:00<?, ? examples/s]

77


Map:   0%|          | 0/7525 [00:00<?, ? examples/s]

Map:   0%|          | 0/1882 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7525 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1882 [00:00<?, ? examples/s]

In [18]:
print(tokenized_datasets['test'][0])

{'labels': tensor([0., 0., 0., 0., 0., 1.]), 'input_ids': tensor([   11, 12777,  3167,   175, 10934,  1293,  4590,   326,  4321,  1531,
         3155, 12777,  3167,  9664,   724,   172,  2493,    12]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}


In [27]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import get_scheduler, AutoTokenizer, AutoModelForSequenceClassification
from peft import LoraConfig, get_peft_model
from sklearn.metrics import f1_score, accuracy_score
from datasets import load_from_disk
from accelerate import Accelerator
from accelerate.utils import tqdm  # Используем tqdm из accelerate
from timm.layers import LayerNorm2d


# Инициализация Accelerate
accelerator = Accelerator(
    mixed_precision="fp16",
    gradient_accumulation_steps=2
)

# Загрузка модели и токенизатора
tokenizer = AutoTokenizer.from_pretrained("Geotrend/distilbert-base-ru-cased")
model = AutoModelForSequenceClassification.from_pretrained(
    "Geotrend/distilbert-base-ru-cased",
    num_labels=6,
    problem_type="multi_label_classification"
)

# Конфигурация LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_lin", "v_lin"],
    lora_dropout=0.05,
    bias="none",
    modules_to_save=["classifier"],
    use_dora=True
)
model = get_peft_model(model, peft_config)
#print(model)
# Загрузка датасета
tokenized_datasets = tokenized_datasets  # Укажите путь к вашему датасету
train_dataset = tokenized_datasets["test"]  # Используем тестовый набор как тренировочный для примера
eval_dataset = tokenized_datasets["test"]

# Создание DataLoaders
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8
)
eval_dataloader = DataLoader(
    eval_dataset,
    collate_fn=data_collator,
    batch_size=8
)

# Оптимизатор и планировщик
optimizer = AdamW(model.parameters(), lr=2e-4)
num_epochs = 35
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Подготовка компонентов с помощью Accelerate
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

# Тренировка с tqdm и отображением метрик
model.train()
total_loss = 0
total_accuracy = 0
total_f1 = 0
num_batches = 0

for epoch in range(num_epochs):
    # Оборачиваем тренировочный цикл в tqdm
    progress_bar = tqdm(train_dataloader, 
                       desc=f"Epoch {epoch+1}/{num_epochs}", 
                       disable=not accelerator.is_local_main_process)
    
    for step, batch in enumerate(progress_bar):
        with accelerator.accumulate(model):
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            # Вычисляем метрики для текущего батча
            logits = outputs.logits
            preds = torch.sigmoid(logits) > 0.5
            labels = batch["labels"]
            
            # Собираем данные с всех устройств
            preds_gathered = accelerator.gather(preds)
            labels_gathered = accelerator.gather(labels)
            
            # Вычисляем метрики
            accuracy = accuracy_score(labels_gathered.cpu(), preds_gathered.cpu())
            f1 = f1_score(labels_gathered.cpu(), preds_gathered.cpu(), average="micro")
            
            # Накапливаем метрики
            total_accuracy += accuracy
            total_f1 += f1
            num_batches += 1
            
            # Обновляем прогресс-бар с текущими метриками
            if accelerator.is_local_main_process:
                progress_bar.set_postfix({
                    "loss": loss.item(),
                    "accuracy": f"{accuracy:.4f}",
                    "f1": f"{f1:.4f}"
                })

    # Вычисляем средние метрики за эпоху
    avg_loss = total_loss / len(train_dataloader)
    avg_accuracy = total_accuracy / num_batches
    avg_f1 = total_f1 / num_batches
    
    # Выводим итоговые метрики за эпоху
    if accelerator.is_local_main_process:
        print(f"\nEpoch {epoch+1} completed:")
        print(f"Average Loss: {avg_loss:.4f}")
        print(f"Average Accuracy: {avg_accuracy:.4f}")
        print(f"Average F1: {avg_f1:.4f}\n")

# Оценка
model.eval()
all_preds, all_labels = [], []

for batch in eval_dataloader:
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    preds = torch.sigmoid(logits) > 0.5
    all_preds.append(accelerator.gather(preds))
    all_labels.append(accelerator.gather(batch["labels"]))

# Объединение результатов
all_preds = torch.cat(all_preds)
all_labels = torch.cat(all_labels)
all_preds_np = all_preds.cpu().numpy()
all_labels_np = all_labels.cpu().numpy()

# Вычисление метрик
f1 = f1_score(all_labels_np, all_preds_np, average="micro")
accuracy = accuracy_score(all_labels_np, all_preds_np)

print(f"Evaluation results - F1: {f1:.4f}, Accuracy: {accuracy:.4f}")
accelerator.wait_for_everyone()

# Сохраните модель (только LoRA-адаптеры и классификатор)
output_dir = "./lora_classifier_model"
model.save_pretrained(output_dir)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at Geotrend/distilbert-base-ru-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 1 completed:
Average Loss: 0.3810
Average Accuracy: 0.2399
Average F1: 0.3180



Epoch 2/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 2 completed:
Average Loss: 0.6918
Average Accuracy: 0.3480
Average F1: 0.4403



Epoch 3/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 3 completed:
Average Loss: 0.9859
Average Accuracy: 0.3891
Average F1: 0.4867



Epoch 4/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 4 completed:
Average Loss: 1.2653
Average Accuracy: 0.4163
Average F1: 0.5159



Epoch 5/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 5 completed:
Average Loss: 1.5198
Average Accuracy: 0.4396
Average F1: 0.5391



Epoch 6/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 6 completed:
Average Loss: 1.7483
Average Accuracy: 0.4645
Average F1: 0.5634



Epoch 7/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 7 completed:
Average Loss: 1.9583
Average Accuracy: 0.4859
Average F1: 0.5838



Epoch 8/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 8 completed:
Average Loss: 2.1472
Average Accuracy: 0.5080
Average F1: 0.6041



Epoch 9/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 9 completed:
Average Loss: 2.3203
Average Accuracy: 0.5289
Average F1: 0.6232



Epoch 10/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 10 completed:
Average Loss: 2.4720
Average Accuracy: 0.5491
Average F1: 0.6419



Epoch 11/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 11 completed:
Average Loss: 2.6097
Average Accuracy: 0.5677
Average F1: 0.6587



Epoch 12/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 12 completed:
Average Loss: 2.7368
Average Accuracy: 0.5850
Average F1: 0.6736



Epoch 13/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 13 completed:
Average Loss: 2.8481
Average Accuracy: 0.6020
Average F1: 0.6882



Epoch 14/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 14 completed:
Average Loss: 2.9531
Average Accuracy: 0.6169
Average F1: 0.7015



Epoch 15/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 15 completed:
Average Loss: 3.0445
Average Accuracy: 0.6313
Average F1: 0.7142



Epoch 16/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 16 completed:
Average Loss: 3.1290
Average Accuracy: 0.6454
Average F1: 0.7262



Epoch 17/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 17 completed:
Average Loss: 3.2040
Average Accuracy: 0.6586
Average F1: 0.7374



Epoch 18/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 18 completed:
Average Loss: 3.2713
Average Accuracy: 0.6708
Average F1: 0.7478



Epoch 19/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 19 completed:
Average Loss: 3.3378
Average Accuracy: 0.6818
Average F1: 0.7569



Epoch 20/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 20 completed:
Average Loss: 3.3958
Average Accuracy: 0.6927
Average F1: 0.7659



Epoch 21/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 21 completed:
Average Loss: 3.4451
Average Accuracy: 0.7035
Average F1: 0.7747



Epoch 22/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 22 completed:
Average Loss: 3.4918
Average Accuracy: 0.7134
Average F1: 0.7827



Epoch 23/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 23 completed:
Average Loss: 3.5357
Average Accuracy: 0.7225
Average F1: 0.7901



Epoch 24/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 24 completed:
Average Loss: 3.5748
Average Accuracy: 0.7312
Average F1: 0.7970



Epoch 25/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 25 completed:
Average Loss: 3.6118
Average Accuracy: 0.7396
Average F1: 0.8037



Epoch 26/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 26 completed:
Average Loss: 3.6470
Average Accuracy: 0.7472
Average F1: 0.8098



Epoch 27/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 27 completed:
Average Loss: 3.6821
Average Accuracy: 0.7544
Average F1: 0.8154



Epoch 28/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 28 completed:
Average Loss: 3.7119
Average Accuracy: 0.7614
Average F1: 0.8209



Epoch 29/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 29 completed:
Average Loss: 3.7392
Average Accuracy: 0.7680
Average F1: 0.8262



Epoch 30/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 30 completed:
Average Loss: 3.7657
Average Accuracy: 0.7743
Average F1: 0.8311



Epoch 31/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 31 completed:
Average Loss: 3.7915
Average Accuracy: 0.7802
Average F1: 0.8358



Epoch 32/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 32 completed:
Average Loss: 3.8130
Average Accuracy: 0.7859
Average F1: 0.8403



Epoch 33/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 33 completed:
Average Loss: 3.8398
Average Accuracy: 0.7910
Average F1: 0.8442



Epoch 34/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 34 completed:
Average Loss: 3.8600
Average Accuracy: 0.7962
Average F1: 0.8483



Epoch 35/35:   0%|          | 0/236 [00:00<?, ?it/s]


Epoch 35 completed:
Average Loss: 3.8790
Average Accuracy: 0.8012
Average F1: 0.8521

Evaluation results - F1: 0.9997, Accuracy: 0.9995
