In [174]:
import pandas as pd
import re
import string
import numpy as np
import torch
from sklearn.metrics import roc_auc_score
from scipy.stats import ks_2samp
from datasets import load_dataset, Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from natasha import Doc, Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy

# Загрузка ресурсов
nltk.download('punkt')
nltk.download('stopwords')
russian_stopwords = set(stopwords.words("russian"))
nlp = spacy.load("ru_core_news_sm")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
df = pd.read_csv(r'/content/train.csv')
test = pd.read_csv(r'/content/test.csv')

In [192]:
# Инициализация компонентов Natasha
segmenter = Segmenter()
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

# Список месяцев
months = [
    "январь", "февраль", "март", "апрель", "май", "июнь",
    "июль", "август", "сентябрь", "октябрь", "ноябрь", "декабрь"
]

# Стоп-слова
russian_stopwords = set(stopwords.words("russian"))
russian_stopwords.discard('за')  # Убираем из стоп-слов "за"
russian_stopwords.update(['кг', 'сумма', 'тч', 'мл', 'счёт'])  # Добавляем свои слова

# Функция очистки текста
def clear(text, russian_stopwords, segmenter, morph_vocab, morph_tagger, months):
    # Удаляем даты из текста
    date_pattern = r"(\d{0,4}[./-]?)"
    cleaned_text = re.sub(date_pattern, "", text)

    # Токенизация и удаление стоп-слов
    tokens = word_tokenize(cleaned_text.lower())
    tokens = [token for token in tokens if token not in russian_stopwords]
    tokens = [token for token in tokens if token not in string.punctuation]
    cleaned_text = ' '.join(tokens)

    doc = Doc(cleaned_text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)

    for token in doc.tokens:
        token.lemmatize(morph_vocab)


    lemmas = [token.lemma for token in doc.tokens if token.lemma not in months and len(token.lemma) > 1]

    return ' '.join(lemmas)

In [82]:
class_ = list(df['class'].unique())
len_class = len(class_)

In [83]:
df['text'] = df['text'].apply(lambda x: clear(x,russian_stopwords, segmenter, morph_vocab, morph_tagger, months))
test['text'] = test['text'].apply(lambda x: chist(x,russian_stopwords, segmenter, morph_vocab, morph_tagger, months))
df['label_encoded'] = df['class'].apply(lambda x: class_.index(x))
test['label_encoded'] = test['class'].apply(lambda x: class_.index(x))

In [84]:
model_name = "sberbank-ai/ruBert-large"
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len_class
)
tokenizer = BertTokenizer.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sberbank-ai/ruBert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [85]:
model.config.id2label = {
    0:0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
    7: 7,
    8: 8
}

In [86]:
for param in model.bert.parameters():
    param.requires_grad = False

In [87]:
train_df = pd.DataFrame({"text": df['text'], "label": df['label_encoded']})
test_df = pd.DataFrame({"text": test['text'], "label": test['label_encoded']})

In [88]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [89]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=7)

# Токенизируем оба набора
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [90]:
train_dataset = train_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])
train_dataset = train_dataset.with_format("torch")
test_dataset = test_dataset.with_format("torch")

In [91]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    f1 = f1_score(labels, predictions, average="weighted")
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average="weighted")
    recall = recall_score(labels, predictions, average="weighted")

    return {
        "f1": f1,
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }

In [94]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=15,
    weight_decay=0.01,
    save_strategy="epoch",
    report_to=[]
)

In [95]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,1.433541,0.4924,0.54,0.485293,0.54
2,No log,1.015462,0.631605,0.62,0.715515,0.62
3,No log,1.073347,0.683464,0.7,0.737684,0.7
4,No log,0.712373,0.713921,0.73,0.852222,0.73
5,No log,0.496486,0.778173,0.78,0.814842,0.78
6,No log,0.481144,0.865338,0.86,0.901825,0.86
7,No log,0.399924,0.851464,0.88,0.905933,0.88
8,No log,0.303109,0.894051,0.9,0.910602,0.9
9,No log,0.321593,0.888725,0.89,0.91789,0.89
10,1.165800,0.315006,0.888339,0.89,0.901706,0.89


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=750, training_loss=0.9251091918945312, metrics={'train_runtime': 1470.5942, 'train_samples_per_second': 4.08, 'train_steps_per_second': 0.51, 'total_flos': 76449302748000.0, 'train_loss': 0.9251091918945312, 'epoch': 15.0})

In [99]:
model.save_pretrained("./rubert_model")
tokenizer.save_pretrained("./rubert_model")

('./rubert_model/tokenizer_config.json',
 './rubert_model/special_tokens_map.json',
 './rubert_model/vocab.txt',
 './rubert_model/added_tokens.json')

In [182]:
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [186]:
ind = 0
true_value =[]
other_value = []
text2 = test['text'].to_list()
label2 = test['label_encoded'].to_list()

for i in range(len(text2)):
  promb = classifier(text2[i],return_all_scores=True)[0][ind]['score']

  if label2[i] == ind:
    true_value.append(promb)
  else:
    other_value.append(promb)

In [187]:
stat,pvalue = ks_2samp(true_value, other_value)

In [188]:
print(f'Статистика Колмогорова-Смирнова: {stat}, p-value: {pvalue}')

Статистика Колмогорова-Смирнова: 1.0, p-value: 1.1553808469067748e-13


In [189]:
texts = test['text'].tolist()
true_labels = test['label_encoded'].tolist()

all_probs = []
for text in texts:
    processed_text = clear(text, russian_stopwords, segmenter, morph_vocab, morph_tagger, months)
    predict = classifier(processed_text, return_all_scores=True)
    probs = [label['score'] for label in predict[0]]
    all_probs.append(probs)


all_probs = np.array(all_probs)

unique_classes = list(set(true_labels))
label_to_index = {label: i for i, label in enumerate(unique_classes)}
true_labels_one_hot = np.zeros((len(true_labels), len(unique_classes)))

for i, label in enumerate(true_labels):
    true_labels_one_hot[i, label_to_index[label]] = 1


In [190]:
roc_auc = roc_auc_score(true_labels_one_hot, all_probs, multi_class='ovr')

In [191]:
print(f'Gini:{2*roc_auc - 1}')

Gini:0.9168993697734247
