In [1]:
!pip install -U transformers



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
# категории
categories = [
    "debitcards", "creditcards", "hypothec", "autocredits",
    "credits", "restructing", "deposits", "transfers",
    "remote", "other", "mobile_app", "individual"
]

In [4]:
def setup_bert_model() -> tuple[RobertaForSequenceClassification, AutoTokenizer]:
    num_labels = len(categories)
    model_name = "ai-forever/ruRoberta-large"

    base_model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"  # важно для BCEWithLogitsLoss
    )

    # Конфиг LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
    )

    model = get_peft_model(base_model, lora_config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [15]:
bert_model, bert_tokenizer = setup_bert_model()

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

In [18]:
class ReviewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item["text"]
        labels = item["categories"]

        # мультилейбл: 0/1 для каждой категории
        target = [1 if cat in labels else 0 for cat in categories]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(target, dtype=torch.float)
        }

In [23]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds)
    }

In [25]:
def train_model(train_dataset, eval_dataset):
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="steps",
        eval_steps=25,
        save_strategy="steps",
        save_steps=25,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro", #целевая метрика
        greater_is_better=True,
        num_train_epochs=7,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.15,
        label_smoothing_factor=0.05,
        fp16=True,
        logging_strategy="steps",
        logging_steps=25,
        dataloader_num_workers=2,
        report_to="none",
        seed=42,
        save_safetensors=True,
    )
    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=bert_tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer


In [17]:
def predict(texts, model, tokenizer, thresholds):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Перемещаем тензоры на то же устройство, что и модель
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()

    results = []
    for prob in probs:
        labels = [categories[i] for i, p in enumerate(prob) if p > thresholds[i]]
        results.append(labels)

    return results

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
file_path = "/content/drive/MyDrive/reviews_full.json"

import json
with open(file_path, "r", encoding="utf-8") as f:
    data_json = json.load(f)

In [26]:
reviews_list = list(data_json.values())
print(len(reviews_list))

50665


In [28]:
import random

dataset_list = random.sample(reviews_list, 5000)

Посмотрим на распределение категорий

In [29]:
from collections import Counter

all_categories = []
for item in dataset_list:
    all_categories.extend(item["categories"])  # т.к. категорий может быть несколько

category_counts = Counter(all_categories)
print(category_counts)

Counter({'debitcards': 2638, 'creditcards': 758, 'deposits': 656, 'remote': 622, 'mobile_app': 490, 'transfers': 351, 'credits': 345, 'hypothec': 342, 'other': 329, 'restructing': 112, 'autocredits': 45, 'individual': 12})


In [30]:
import pandas as pd

df_counts = pd.DataFrame.from_dict(category_counts, orient="index", columns=["count"])
df_counts = df_counts.sort_values("count", ascending=False)

print(df_counts)

             count
debitcards    2638
creditcards    758
deposits       656
remote         622
mobile_app     490
transfers      351
credits        345
hypothec       342
other          329
restructing    112
autocredits     45
individual      12


In [31]:
train_dataset = ReviewsDataset(dataset_list, bert_tokenizer)
eval_dataset = ReviewsDataset(dataset_list, bert_tokenizer)

In [32]:
trainer = train_model(train_dataset, eval_dataset)

  trainer = Trainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
25,0.5681,0.352165,0.447187,0.057386,0.359
50,0.3015,0.282119,0.427498,0.057597,0.3142
75,0.2787,0.26023,0.39292,0.059397,0.2668


KeyboardInterrupt: 

Попробуем получить предсказания

In [33]:
test_texts = [
    "Очень понравилась кредитная карта, удобный лимит и хорошие бонусы.",
    "Хочу взять ипотеку, но пока не понимаю условия.",
    "Приложение работает быстро, переводы проходят мгновенно!"
]

treshholds = [0.7] + [0.5] * (len(categories) - 1)

preds = predict(test_texts, bert_model, bert_tokenizer, treshholds)
print(preds)

[['debitcards'], ['hypothec'], ['debitcards']]


In [12]:
!pip install transformers accelerate sentencepiece



In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch

device = 0 if torch.cuda.is_available() else -1

model_name = "mistralai/Mistral-7B-Instruct-v0"  # instruction-tuned для русскоязычных задач

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

llm_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=50,
    temperature=0,
    device=device
)


OSError: mistralai/Mistral-7B-Instruct-v0 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [11]:
import json
import torch
from transformers import pipeline


device = 0 if torch.cuda.is_available() else -1
sentiment_pipe = pipeline(
    "text-classification",
    model="google/gemma-3-1b-it",
    device=device,
    tokenizer="google/gemma-3-1b-it",
    return_all_scores=True
)

def predict_sentiment(text):
    """Прогоняем текст через sentiment модель Gemma"""
    result = sentiment_pipe(text)
    # Предполагаем, что модель возвращает список с оценками для каждой категории
    sentiment_scores = {label: score for label, score in zip(result[0]['labels'], result[0]['scores'])}
    return sentiment_scores

# ---------------------------
# 2. Функция ABSA
# ---------------------------
def aspect_sentiment(text, predicted_categories):
    """
    Для каждой категории предсказываем тональность.
    Без синонимов, быстрое решение.
    """
    results = {}
    for cat in predicted_categories:
        sentiment_scores = predict_sentiment(text)
        # Выбираем тональность с максимальным баллом
        sentiment = max(sentiment_scores, key=sentiment_scores.get)
        results[cat] = sentiment
    return results

# ---------------------------
# 3. Пример на отзыве
# ---------------------------
sample_review = "Хочу взять ипотеку, но пока не понимаю условия. Очень понравилась кредитная карта, удобный лимит и хорошие бонусы."
predicted_categories = ["hypothec", "debitcards"]  # пример категорий из твоей модели

result = aspect_sentiment(sample_review, predicted_categories)
print(result)

results_all = []
for item in dataset_list[:10]:  # первые 10 отзывов для примера
    text = item["text"]
    # Здесь используем заранее предсказанные категории
    predicted_categories = item["categories"]
    sentiments = aspect_sentiment(text, predicted_categories)
    results_all.append({
        "id": item["id"],
        "text": text,
        "sentiments": sentiments
    })

# Печатаем результаты
for r in results_all:
    print(f"\nОтзыв: {r['text']}")
    for cat, sent in r["sentiments"].items():
        print(f"  - {cat}: {sent}")


OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.