### Предобработка данных

In [None]:
import pandas as pd
# Load the practice fragment of the corpus.
df = pd.read_csv('RuFoLa фрагмент для практики.csv', encoding='utf-8')
# Drop the C2 rows (level_number == 6): there are very few of them and they spoil the results.
c2_rows = df.loc[df['level_number'] == 6].index
df = df.drop(index=c2_rows)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# Raw texts as a plain Python list.
texts = df["text"].tolist()

# Encode the level annotations as class ids; `le` keeps the mapping back
# to the original level values (see `le.classes_`).
le = LabelEncoder()
labels = le.fit_transform(df["level"].tolist())

In [None]:
# смотрим распределение данных по классам
import matplotlib.pyplot as plt

# Visualization: one histogram bar per encoded class.
n_classes = len(le.classes_)
# Bin edges at -0.5, 0.5, 1.5, ... so that every integer class id sits in the middle of its bin.
bin_edges = np.arange(n_classes + 1) - 0.5
plt.hist(labels, bins=bin_edges, align='mid', rwidth=0.8, edgecolor='black')
plt.title('Распределение текстов по классам')
plt.xlabel('Классы')
plt.ylabel('Количество текстов')
plt.xticks(np.arange(n_classes))
plt.show()

In [None]:
# Number of texts in the dataset (shown as the cell output).
len(texts)

In [None]:
# Split the data into training, validation and test parts.
# Step 1: keep 70% for training and hold out the remaining 30%.
train_texts, vali_texts, train_labels, vali_labels = train_test_split(
    texts,
    labels,
    test_size=0.3,
    random_state=40,
)
# Step 2: cut the held-out part in half -> validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    vali_texts,
    vali_labels,
    test_size=0.5,
    random_state=40,
)
# Peek at the first three training texts (cell output).
train_texts[:3]

### TRANSFORMERS


In [None]:
# токенизация
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Alternative checkpoint, currently disabled:
# model_name = "sberbank-ai/ruRoberta-large"
# Checkpoint name; the tokenizer is loaded from it here and the same name is
# reused later when the classification model is created.
model_name = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

import torch
# Выбор модели и токенизатора (например, RuBERT)

# model = AutoModelForSequenceClassification.from_pretrained('roberta-large')
# tokenizer = AutoTokenizer.from_pretrained('roberta-large')

# Helper that prepares raw texts for the model
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples``.

    Sequences are truncated and padded to a fixed length of 512 tokens.

    Args:
        examples: Text input accepted by the tokenizer; this file passes
            lists of texts.

    Returns:
        Whatever the tokenizer call returns for ``examples``.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples, **tokenizer_options)

# Tokenize every split once, up front.
train_encodings, val_encodings, test_encodings = (
    tokenize_function(split_texts)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:

import torch
from torch.utils.data import Dataset

class TextClassificationDataset(Dataset):
    """Dataset pairing tokenizer encodings with class labels.

    ``encodings`` must provide ``.items()`` yielding ``(name, values)`` pairs
    where ``values`` can be indexed by example position; ``labels`` must be
    indexable the same way and support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        example = {}
        for field_name, field_values in self.encodings.items():
            example[field_name] = torch.tensor(field_values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example
# Создаём датасеты для PyTorch
# Keyword arguments make the encodings/labels pairing of each split explicit.
train_dataset = TextClassificationDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TextClassificationDataset(encodings=val_encodings, labels=val_labels)
test_dataset = TextClassificationDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# Sanity check: print the training example at index 1 to inspect what the dataset yields.
print(train_dataset[1])

### GPU
Точность на тестовой выборке 0.5895765472312704
F1-мера на тестовой выборке 0.5571348023827405

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Classification model.
# The number of output classes is taken from the fitted LabelEncoder instead of
# being hard-coded, so the classification head always matches the classes that
# are actually present in the labels (the plots in this file use the same source).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(le.classes_))

# Metric computation hook passed to the Trainer
def compute_metrics(pred):
    """Compute per-class and aggregate classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        dict: one entry per class, keyed by the class label as it appears in
        the scikit-learn report, each holding ``precision``, ``recall``,
        ``f1-score`` and ``support``; plus the aggregate entries
        ``'accuracy'``, ``'macro avg'`` and ``'weighted avg'``.

    Side effects:
        Overwrites ``classification_report.txt`` with the metrics (as JSON)
        on every call.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # Full metric report as a nested dict
    report = classification_report(labels, preds, output_dict=True)

    aggregate_rows = ('accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg')
    per_class_fields = ('precision', 'recall', 'f1-score', 'support')

    # Per-class rows. The report keys classes by their label text (e.g. '0',
    # '1' for integer class ids), so they are selected by excluding the
    # aggregate rows; filtering on a 'level_' name prefix would match nothing
    # for integer labels and silently drop all per-class metrics.
    metrics = {
        label: {field: values[field] for field in per_class_fields}
        for label, values in report.items()
        if label not in aggregate_rows and isinstance(values, dict)
    }

    # Aggregate metrics
    metrics['accuracy'] = report['accuracy']
    metrics['macro avg'] = report['macro avg']
    metrics['weighted avg'] = report['weighted avg']

    # Persist the results to a file
    with open("classification_report.txt", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=4, ensure_ascii=False)

    return metrics

# Training configuration
import inspect

# Validation after every epoch is requested through TrainingArguments. The
# option is named `eval_strategy` in recent transformers releases and
# `evaluation_strategy` in older ones, so use whichever name the installed
# version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='epoch',
    gradient_accumulation_steps=4,
    **{_eval_strategy_key: 'epoch'},
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model.
# A single train() call already runs all `num_train_epochs` epochs, so it must
# not be wrapped in a per-epoch loop (that would repeat the whole schedule on
# every iteration). Validation metrics are computed and logged by the Trainer
# at the end of each epoch via the strategy configured above.
trainer.train()

# Validation metrics of the final model
num_epochs = int(training_args.num_train_epochs)
eval_metrics = trainer.evaluate()
print(f"Метрики на валидации после эпохи {num_epochs}:", eval_metrics)

# Evaluate the model on the test set
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)



In [None]:
# Predict on the held-out test set.
predictions = trainer.predict(test_dataset)
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)


# Print the classification report
print("Classification Report:")
print(classification_report(labels, preds, digits=2))


# Confusion matrix.
# The full list of class ids is passed explicitly so the matrix always has one
# row/column per known class. Otherwise a class absent from both the test
# labels and the predictions would shrink the matrix and misalign the
# `le.classes_` tick labels on the heatmap below.
class_ids = np.arange(len(le.classes_))
cm = confusion_matrix(labels, preds, labels=class_ids)
print("\nConfusion Matrix:")
print(cm)

# Draw the confusion matrix; save before show() so the written file is not blank.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.savefig("confusion_matrix.png")
plt.show()