In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Загрузка данных
data = pd.read_csv('/content/drive/MyDrive/ML_2024/hw_4/train.csv')

# Разделение на train и valid выборки
train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

# Вывод количества классов для проверки баланса
print(train_data['label'].value_counts())
print(valid_data['label'].value_counts())

label
1    2410
0    2317
Name: count, dtype: int64
label
0    613
1    569
Name: count, dtype: int64


In [4]:
!pip install transformers datasets scikit-learn

from datasets import Dataset

# Преобразуем данные в формат Huggingface Dataset
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)

# Загрузка токенизатора
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Функция токенизации
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# Токенизация данных
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

# Удаляем ненужные столбцы
train_dataset = train_dataset.remove_columns(["text", "__index_level_0__"])
valid_dataset = valid_dataset.remove_columns(["text", "__index_level_0__"])

# Указываем, какие колонки использовать
train_dataset.set_format("torch")
valid_dataset.set_format("torch")

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4727 [00:00<?, ? examples/s]

Map:   0%|          | 0/1182 [00:00<?, ? examples/s]

In [5]:
# 3. Обучение классификационной модели
classification_model = AutoModelForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2)

training_args = TrainingArguments(
    output_dir="./classification_results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    save_strategy="no",
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    cm = confusion_matrix(labels, predictions)
    return {"accuracy": acc, "f1": f1, "confusion_matrix": cm}

trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Confusion Matrix
1,No log,0.23824,0.856176,0.856219,[[513 100]  [ 70 499]]
2,0.252300,0.212159,0.886633,0.886599,[[550 63]  [ 71 498]]
3,0.252300,0.20112,0.897631,0.89749,[[566 47]  [ 74 495]]


TrainOutput(global_step=888, training_loss=0.2183414837261578, metrics={'train_runtime': 178.6511, 'train_samples_per_second': 79.378, 'train_steps_per_second': 4.971, 'total_flos': 469630045085184.0, 'train_loss': 0.2183414837261578, 'epoch': 3.0})

In [6]:
train_test_data ="/content/drive/MyDrive/ML_2024/hw_4/train-test.txt"

train_test_txt = open(train_test_data).readlines()


# Создание Dataset
mlm_dataset = Dataset.from_dict({"text": train_test_txt})

# Токенизация данных
mlm_dataset = mlm_dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=128), batched=True)

# Подготовка датаколлатора
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [7]:


# Создаём MLM модель
mlm_model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")

# Настраиваем параметры обучения
mlm_args = TrainingArguments(
    output_dir="./mlm_results",
    evaluation_strategy="no",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    save_strategy="no",
    report_to="none"
)

mlm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

mlm_trainer = Trainer(
    model=mlm_model,
    args=mlm_args,
    train_dataset=mlm_dataset,
    data_collator=mlm_collator,
    tokenizer=tokenizer
)

mlm_trainer.train()
# Сохранение модели
mlm_trainer.save_model("./mlm_model")

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  mlm_trainer = Trainer(


Step,Training Loss


In [8]:
# Загрузка модели с новыми весами
model_with_mlm = AutoModelForSequenceClassification.from_pretrained("./mlm_model", num_labels=2)

# Тренер с обновленной моделью
trainer_with_mlm = Trainer(
    model=model_with_mlm,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

# Обучение
trainer_with_mlm.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./mlm_model and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Confusion Matrix
1,No log,0.238334,0.866328,0.863255,[[612 1]  [157 412]]
2,0.258600,0.230181,0.871404,0.869716,[[594 19]  [133 436]]
3,0.258600,0.23463,0.873942,0.872211,[[597 16]  [133 436]]


TrainOutput(global_step=888, training_loss=0.24010803463222744, metrics={'train_runtime': 175.2439, 'train_samples_per_second': 80.922, 'train_steps_per_second': 5.067, 'total_flos': 469630045085184.0, 'train_loss': 0.24010803463222744, 'epoch': 3.0})

In [9]:
# Оценка начальной модели
initial_metrics = trainer.evaluate()

# Оценка модели с MLM
mlm_metrics = trainer_with_mlm.evaluate()

print("Initial Model Metrics:", initial_metrics)
print("Model with MLM Metrics:", mlm_metrics)

Initial Model Metrics: {'eval_loss': 0.2011197954416275, 'eval_accuracy': 0.8976311336717429, 'eval_f1': 0.8974901642634883, 'eval_confusion_matrix': array([[566,  47],
       [ 74, 495]]), 'eval_runtime': 4.0286, 'eval_samples_per_second': 293.404, 'eval_steps_per_second': 36.738, 'epoch': 3.0}
Model with MLM Metrics: {'eval_loss': 0.2346300333738327, 'eval_accuracy': 0.873942470389171, 'eval_f1': 0.8722107459944028, 'eval_confusion_matrix': array([[597,  16],
       [133, 436]]), 'eval_runtime': 3.9386, 'eval_samples_per_second': 300.106, 'eval_steps_per_second': 37.577, 'epoch': 3.0}


In [10]:
# Сохранение начальной модели
trainer.save_model("/content/drive/MyDrive/ML_2024/hw_4/initial_model")

# Сохранение модели с MLM
trainer_with_mlm.save_model("/content/drive/MyDrive/ML_2024/hw_4/final_model")