In [None]:
from google.colab import drive

# Mount Google Drive so the data files referenced below are reachable
# under /content/gdrive.
drive.mount(mountpoint='/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
import math
from datasets import load_dataset, Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          AutoModelForMaskedLM, Trainer, TrainingArguments,
                          DataCollatorForLanguageModeling)
import torch
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Turn off Weights & Biases logging for every Trainer created below.
os.environ.update({"WANDB_DISABLED": "true"})

# Input data locations on the mounted Google Drive.
TRAIN_CSV = "/content/gdrive/MyDrive/Colab Notebooks/train.csv"
TRAIN_TEST_TXT = "/content/gdrive/MyDrive/Colab Notebooks/train-test.txt"

# 1. Load the data
# Labelled examples for the classifier; the CSV is exposed under the "train" split.
dataset = load_dataset("csv", data_files=TRAIN_CSV)

# Unlabelled corpus for MLM pre-training: one example per non-empty line.
# The file is opened with an explicit encoding and closed deterministically;
# trailing newlines and blank lines are dropped so they do not become
# (near-)empty training examples.
with open(TRAIN_TEST_TXT, encoding="utf-8") as corpus_file:
    train_test_txt = [line.strip() for line in corpus_file if line.strip()]
if not train_test_txt:
    raise ValueError(f"No text lines found in {TRAIN_TEST_TXT}")

# Split train.csv 80/20 into train/validation with a fixed seed.
split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
valid_dataset = split_datasets["test"]

# 2. Tokenization
# Checkpoint name shared by the tokenizer and every model loaded below.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: mapping with a ``"text"`` entry, as supplied by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the batch (truncated, padded to max length).
    """
    # NOTE(review): padding="max_length" presumably pads every example to the
    # model's maximum input length; dynamic padding in the collator would
    # likely be cheaper -- confirm before changing.
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

# Tokenize both splits in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as torch tensors.
for _split in (train_dataset, valid_dataset):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 3. Train the classification model
# Baseline: binary classifier initialised from the stock checkpoint.
classification_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Hyper-parameters shared by both classification runs:
# evaluate once per epoch, never write checkpoints, no external reporting.
training_args = TrainingArguments(
    output_dir="./classification_results",
    logging_dir='./logs',
    report_to="none",
    save_strategy="no",
    evaluation_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
)

def compute_metrics(eval_pred):
    """Compute accuracy, weighted F1 and the confusion matrix for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)`` as handed over by ``Trainer``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (plain floats) and
        ``"confusion_matrix"`` (nested list of ints). Only built-in types are
        returned so the metrics remain JSON-serializable if the trainer state
        is ever written to disk (a NumPy array in the log history is not).
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    cm = confusion_matrix(labels, predictions)
    return {
        "accuracy": float(acc),
        "f1": float(f1),
        "confusion_matrix": cm.tolist(),
    }

# Baseline run: fine-tune the stock checkpoint on the labelled data.
trainer = Trainer(
    model=classification_model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer.train()

# 4. Evaluate the model
# Metrics are computed on the validation split passed as eval_dataset.
results = trainer.evaluate()
print(f"Initial Classification Model - Accuracy: {results['eval_accuracy']}, F1: {results['eval_f1']}")

# 5. Pre-training with Masked Language Modeling
# Build the MLM dataset from the raw text lines.
train_test_dataset = Dataset.from_dict({"text": train_test_txt})
mlm_dataset = train_test_dataset.map(tokenize_function, batched=True)
mlm_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

# MLM model loaded from the same checkpoint as the classifier.
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)

# Training parameters: no periodic evaluation, no checkpoints, no reporting.
mlm_args = TrainingArguments(
    output_dir="./mlm_results",
    report_to="none",
    save_strategy="no",
    evaluation_strategy="no",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
)

# The collator is configured for MLM with a masking probability of 0.15.
mlm_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    tokenizer=tokenizer,
    data_collator=mlm_collator,
    train_dataset=mlm_dataset,
)
mlm_trainer.train()

# Evaluate the MLM model.
# evaluate() is used instead of predict(): only the loss is needed here, and
# with no compute_metrics set evaluate() does not collect the per-token logits
# (sequence length x vocabulary size per example) that predict() would return.
# NOTE: the loss is measured on the same texts the model was trained on, so
# the resulting perplexity is an optimistic estimate, not a held-out score.
mlm_results = mlm_trainer.evaluate(eval_dataset=mlm_dataset)
mlm_loss = mlm_results["eval_loss"]
mlm_perplexity = math.exp(mlm_loss)
print(f"MLM Model - Loss: {mlm_loss}, Perplexity: {mlm_perplexity}")

# 6. Transfer the MLM weights into the classification model
# Start from a fresh classifier, then overwrite its DistilBERT encoder with
# the encoder of the MLM-trained model.
classification_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
pretrained_encoder_weights = mlm_model.distilbert.state_dict()
classification_model.distilbert.load_state_dict(pretrained_encoder_weights)

# Same arguments, data and metrics as the baseline run.
trainer_with_mlm = Trainer(
    model=classification_model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer_with_mlm.train()

# Evaluate the new classification model
final_results = trainer_with_mlm.evaluate()
print(f"Final Classification Model - Accuracy: {final_results['eval_accuracy']}, F1: {final_results['eval_f1']}")

Map:   0%|          | 0/1182 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Confusion Matrix
1,No log,0.187959,0.899323,0.899046,[[561 27]  [ 92 502]]


Epoch,Training Loss,Validation Loss,Accuracy,F1,Confusion Matrix
1,No log,0.187959,0.899323,0.899046,[[561 27]  [ 92 502]]
2,0.249000,0.157049,0.913706,0.913661,[[525 63]  [ 39 555]]
3,0.249000,0.169991,0.916244,0.91624,[[536 52]  [ 47 547]]


Initial Classification Model - Accuracy: 0.916243654822335, F1: 0.9162403573355115


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  mlm_trainer = Trainer(


Step,Training Loss


MLM Model - Loss: 2.9081809520721436, Perplexity: 18.323437022945583


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_with_mlm = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Confusion Matrix
1,No log,0.229124,0.866328,0.863992,[[588 0]  [158 436]]
2,0.262200,0.164142,0.91286,0.912851,[[532 56]  [ 47 547]]
3,0.262200,0.14907,0.923858,0.923852,[[550 38]  [ 52 542]]


Final Classification Model - Accuracy: 0.9238578680203046, F1: 0.9238517638309485


In [None]:
# Save all three models to Google Drive, in this order:
# classifier without MLM, the MLM model, classifier with MLM.
for _fitted_trainer, _target_dir in (
    (trainer, "/content/gdrive/MyDrive/Colab Notebooks/classification_without_mlm"),
    (mlm_trainer, "/content/gdrive/MyDrive/Colab Notebooks/mlm_model"),
    (trainer_with_mlm, "/content/gdrive/MyDrive/Colab Notebooks/classification_with_mlm"),
):
    _fitted_trainer.save_model(_target_dir)

**Итоговые метрики:**

Начальная модель классификации (без MLM):

*   Точность (Accuracy): 91.62%
*   F1-Score: 91.62%

MLM (Masked Language Model):

*   Loss: 2.91
*   Perplexity: 18.32

Финальная модель классификации (с MLM):

*  Точность (Accuracy): 92.39%
*  F1-Score: 92.39%

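MLM-предобучение немного улучшило метрики классификации на валидационной выборке: точность (Accuracy) выросла на 0,76 п.п. (с 91,62% до 92,39%), F1-score — также на 0,76 п.п. Разница получена на одном запуске, поэтому может находиться в пределах случайного разброса.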