In [None]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments


# --- Model / tokenisation ---------------------------------------------------
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "sergeyzh/rubert-mini-frida"
# Every encoded example is padded or truncated to this many tokens.
MAX_LEN = 128

# --- Optimisation -----------------------------------------------------------
# Learning rate passed to TrainingArguments.
LEARNING_RATE = 2e-5
# Number of passes over the training split.
EPOCHS = 3
# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 32

In [None]:
# Load the raw messages; the code below reads the 'target' and 'text' columns.
df = pd.read_csv('spam.csv')

# Encode the textual class as an integer id; values outside the map become NaN.
label_map = {'ham': 0, 'spam': 1}
df['label'] = df['target'].map(label_map)

# Drop rows with missing text or an unmapped target, then pin the label dtype.
# Without the cast, `label` is float64 whenever at least one target was unmapped
# (NaN forces a float column) and int64 otherwise, so downstream code would see
# a dtype that depends on the data.
df = df.dropna(subset=['text', 'label']).astype({'label': 'int64'})

# 90% train / 10% test, stratified so both splits keep the same class ratio.
train_df, test_df = train_test_split(df, test_size=0.10, random_state=42, stratify=df['label'])

print(f"Размер обучающей выборки: {len(train_df)}")
print(f"Размер тестовой выборки: {len(test_df)}")

In [None]:
class SpamDataset(Dataset):
    """Map-style dataset that tokenises one message per access.

    Args:
        texts: Indexable sequence of messages; each item is passed through `str()`.
        labels: Indexable sequence of class ids, aligned with `texts`.
        tokenizer: Callable tokenizer (Hugging Face style) invoked as
            `tokenizer(text, ...)`.
        max_len: Length every example is padded or truncated to.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a mismatch would otherwise surface as an IndexError (or
        # silently ignored labels) somewhere in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, item):
        """Encode example `item` and return the tensors expected by Trainer.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output with
            its leading batch axis removed) and 'labels' (scalar int64 tensor).
        """
        text = str(self.texts[item])

        # Call the tokenizer directly: `encode_plus` is the legacy entry point
        # that the transformers documentation deprecates in favour of `__call__`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' is assumed to yield shape (1, max_len); drop the batch axis.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # int() normalises numpy ints/floats and raises on NaN instead of
            # letting it be coerced into an arbitrary integer.
            'labels': torch.tensor(int(self.labels[item]), dtype=torch.long),
        }

# Load the tokenizer that belongs to the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Wrap the text/label columns of each split in a SpamDataset
# (arguments in order: texts, labels, tokenizer, max_len).
train_dataset = SpamDataset(
    train_df['text'].to_numpy(),
    train_df['label'].to_numpy(),
    tokenizer,
    MAX_LEN,
)
test_dataset = SpamDataset(
    test_df['text'].to_numpy(),
    test_df['label'].to_numpy(),
    tokenizer,
    MAX_LEN,
)

In [None]:
# Seed torch before loading: any weights absent from the checkpoint (presumably
# the classification head — verify against the load-time warnings) are randomly
# initialised here, before the Trainer gets a chance to apply its own seed.
# 42 matches the random_state used for the train/test split.
torch.manual_seed(42)

# Two output classes: ham (0) and spam (1).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
)

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores whose last axis indexes the classes).

    Returns:
        dict with keys 'accuracy' and 'f1' (F1 averaged with class-support weights).
    """
    labels = pred.label_ids
    # The highest-scoring class along the last axis is the predicted label.
    preds = pred.predictions.argmax(-1)
    # Metric functions come from the notebook's top-level import cell rather
    # than being re-imported on every evaluation call.
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted'),
    }

# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling was deprecated in transformers 4.41 and is
    # rejected by later releases. Evaluation runs at the end of every epoch.
    eval_strategy="epoch",
    save_strategy="no",          # no checkpoints; weights are saved manually afterwards
    learning_rate=LEARNING_RATE,
    # Force CPU only when no CUDA device is visible.
    use_cpu=not torch.cuda.is_available(),
)

# Wire the model, hyper-parameters, datasets and metric callback into a Trainer.
# NOTE(review): the held-out test split doubles as the per-epoch evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; left as the cell's last expression so its result is displayed.
trainer.train()

In [None]:
# Run inference on the test split and pick the highest-scoring class per example.
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(axis=-1)

# Per-class precision / recall / F1 against the true labels of the test frame.
print("\n=== Отчет о классификации на тестовой выборке (10%) ===")
print(
    classification_report(
        test_df['label'].to_numpy(),
        preds,
        target_names=['ham', 'spam'],
    )
)

In [None]:

head_save_path = "frida_spam_head_only.bin"
state_dict = model.state_dict()

# Keep only the tensors whose key contains 'classifier'.
# They are moved to CPU first: torch.save records each tensor's device, so a
# head saved from a CUDA model could not be loaded on a CPU-only machine
# without passing map_location.
classifier_state_dict = {
    k: v.cpu() for k, v in state_dict.items() if 'classifier' in k
}

# An empty dict means the head uses a different parameter name; writing it
# would silently produce a useless file.
if not classifier_state_dict:
    raise RuntimeError(
        "No 'classifier' parameters found in the model state_dict; nothing to save."
    )

# NOTE(review): no parameters are frozen before training in this notebook, so
# the encoder body was updated together with the head. The head saved here
# matches that fine-tuned body, not the original MODEL_NAME checkpoint —
# confirm that head-only persistence is really what is wanted.
torch.save(classifier_state_dict, head_save_path)

print(f"Обученная голова сохранена в файл: {head_save_path}")
print("Сохраненные ключи:", classifier_state_dict.keys())

In [None]:
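# Example (kept commented out): restoring the saved head onto a freshly loaded model.
# NOTE: training in this notebook updates the encoder body as well as the head, so the
# head only reproduces the evaluated model when paired with the same fine-tuned body.
# new_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# head_weights = torch.load('frida_spam_head_only.bin')
# new_model.load_state_dict(head_weights, strict=False) # strict=False is required because only the head is loaded, not the whole body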