In [1]:
!pip install -U transformers



In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
# категории
categories = [
    "debitcards", "creditcards", "hypothec", "autocredits",
    "credits", "restructing", "deposits", "transfers",
    "remote", "other", "mobile_app", "individual"
]

In [4]:
def setup_bert_model() -> tuple[RobertaForSequenceClassification, AutoTokenizer]:
    num_labels = len(categories)
    model_name = "ai-forever/ruRoberta-large"

    base_model = RobertaForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"  # важно для BCEWithLogitsLoss
    )

    # Конфиг LoRA
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
    )

    model = get_peft_model(base_model, lora_config)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

In [5]:
bert_model, bert_tokenizer = setup_bert_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ai-forever/ruRoberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
class ReviewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_len=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item["text"]
        labels = item["categories"]

        # мультилейбл: 0/1 для каждой категории
        target = [1 if cat in labels else 0 for cat in categories]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(target, dtype=torch.float)
        }

In [7]:
from sklearn.metrics import f1_score, accuracy_score

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds)
    }

In [8]:
def train_model(train_dataset, eval_dataset):
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="steps",
        eval_steps=25,
        save_strategy="steps",
        save_steps=25,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="f1_micro", #целевая метрика
        greater_is_better=True,
        num_train_epochs=7,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        gradient_accumulation_steps=2,
        learning_rate=2e-4,
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        warmup_ratio=0.15,
        label_smoothing_factor=0.05,
        fp16=True,
        logging_strategy="steps",
        logging_steps=25,
        dataloader_num_workers=2,
        report_to="none",
        seed=42,
        save_safetensors=True,
    )
    trainer = Trainer(
        model=bert_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=bert_tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer


In [9]:
def predict(texts, model, tokenizer, thresholds):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Перемещаем тензоры на то же устройство, что и модель
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.sigmoid(outputs.logits).cpu().numpy()

    results = []
    for prob in probs:
        labels = [categories[i] for i, p in enumerate(prob) if p > thresholds[i]]
        results.append(labels)

    return results

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
file_path = "/content/drive/MyDrive/reviews_full.json"

import json
with open(file_path, "r", encoding="utf-8") as f:
    data_json = json.load(f)

In [12]:
reviews_list = list(data_json.values())
print(len(reviews_list))

50665


In [13]:
import random

dataset_list = random.sample(reviews_list, 5000)

Посмотрим на распределение категорий

In [14]:
from collections import Counter

all_categories = []
for item in dataset_list:
    all_categories.extend(item["categories"])  # т.к. категорий может быть несколько

category_counts = Counter(all_categories)
print(category_counts)

Counter({'debitcards': 2723, 'creditcards': 780, 'deposits': 629, 'remote': 611, 'mobile_app': 450, 'credits': 332, 'other': 318, 'hypothec': 317, 'transfers': 301, 'restructing': 101, 'autocredits': 33, 'individual': 24})


In [15]:
import pandas as pd

df_counts = pd.DataFrame.from_dict(category_counts, orient="index", columns=["count"])
df_counts = df_counts.sort_values("count", ascending=False)

print(df_counts)

             count
debitcards    2723
creditcards    780
deposits       629
remote         611
mobile_app     450
credits        332
other          318
hypothec       317
transfers      301
restructing    101
autocredits     33
individual      24


In [16]:
train_dataset = ReviewsDataset(dataset_list[:len(dataset_list)*0.9], bert_tokenizer)
eval_dataset = ReviewsDataset(dataset_list[len(dataset_list)*0.9:], bert_tokenizer)

TypeError: slice indices must be integers or None or have an __index__ method

In [None]:
trainer = train_model(train_dataset, eval_dataset)

Попробуем получить предсказания

In [None]:
test_texts = [
    "Очень понравилась кредитная карта, удобный лимит и хорошие бонусы.",
    "Хочу взять ипотеку, но пока не понимаю условия.",
    "Приложение работает быстро, переводы проходят мгновенно!"
]

treshholds = [0.7] + [0.5] * (len(categories) - 1)

preds = predict(test_texts, bert_model, bert_tokenizer, treshholds)
print(preds)

In [17]:
from huggingface_hub import login
login("")

In [18]:
!pip install -U bitsandbytes
!pip install -U transformers accelerate



In [19]:
import json
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from tqdm import tqdm

categories = [
    "debitcards", "creditcards", "hypothec", "autocredits", "credits", "restructing",
    "deposits", "transfers", "remote", "other", "mobile_app", "individual"
]

sentiments = ["positive", "neutral", "negative"]

# ====== Промпт для анализа тональности ======
sentiment_prompt = (
    "Ты — ассистент для анализа тональности отзывов клиентов о банках.\n"
    "Проанализируй отзыв и определи тональность для каждой указанной категории.\n"
    "Доступные тональности: positive, neutral, negative.\n"
    "Формат ответа: категория1:тональность1, категория2:тональность2, ...\n"
    "Если отзыв не затрагивает категорию, верни для неё 'neutral'.\n"
    "Будь внимателен к контексту и эмоциональной окраске текста.\n"
    "Категории для анализа: {categories}\n"
    "Отзыв: {review}\n"
    "Тональность по категориям:"
)

def setup_llm(model_name="google/gemma-3-12b-it"):
    """Загрузка модели для анализа тональности"""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    quant_config = BitsAndBytesConfig(load_in_4bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="cuda",
        dtype="auto"
    )
    print("✅ Модель для тональности загружена с 4-бит квантизацией")
    return model, tokenizer

def predict_sentiment(review: str, categories_list: list, llm: tuple):
    """Предсказание тональности для каждой категории"""
    model, tokenizer = llm

    categories_str = ", ".join(categories_list)
    prompt = sentiment_prompt.format(
        categories=categories_str,
        review=review
    )

    inputs = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(model.device)

    with torch.inference_mode():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,
            do_sample=False,
            temperature=0.1,
            pad_token_id=tokenizer.eos_token_id
        )

    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Извлекаем только часть с тональностями
    if "Тональность по категориям:" in decoded:
        response = decoded.split("Тональность по категориям:")[-1].strip()
    else:
        response = decoded

    return parse_sentiment_response(response, categories_list)

def parse_sentiment_response(response: str, target_categories: list):
    """Парсинг ответа модели в словарь тональностей"""
    sentiment_dict = {cat: "neutral" for cat in target_categories}  # по умолчанию neutral

    try:
        pairs = [pair.strip() for pair in response.split(",")]
        for pair in pairs:
            if ":" in pair:
                cat, sent = pair.split(":", 1)
                cat = cat.strip().lower()
                sent = sent.strip().lower()
                if cat in target_categories and sent in sentiments:
                    sentiment_dict[cat] = sent
    except Exception as e:
        print(f"Ошибка парсинга тональности: {e}")

    return sentiment_dict

def analyze_reviews_with_sentiment(reviews_data, bert_model, bert_tokenizer, llm, thresholds):
    """
    Анализ отзывов: категории через BERT, тональность через LLM
    """
    predictions = []

    for i, item in enumerate(tqdm(reviews_data, desc="Анализ отзывов"), 1):
        review_text = item["text"]

        # Шаг 1: Предсказание категорий через BERT (ваш существующий код)
        predicted_categories = predict([review_text], bert_model, bert_tokenizer, thresholds)[0]

        # Шаг 2: Предсказание тональности для каждой категории через LLM
        sentiment_results = predict_sentiment(review_text, predicted_categories, llm)

        # Формируем результат в требуемом формате
        prediction = {
            "id": i,
            "topics": predicted_categories,  # английские названия
            "sentiments": [sentiment_results[cat] for cat in predicted_categories]  # английские тональности
        }

        predictions.append(prediction)

    # Обертываем в финальный JSON
    final_result = {
        "predictions": predictions
    }

    return final_result

In [None]:
llm = setup_llm()

model.safetensors.index.json:   0%|          | 0.00/109k [00:00<?, ?B/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/4.60G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

In [None]:
file_path = "/content/drive/MyDrive/sravni_reviews_full.json"

with open(file_path, "r", encoding="utf-8") as f:
    data_json = json.load(f)

test_data = data_json
print(test_data)