# Код для повторения на Kaggle

In [1]:
import torch
import random
import numpy as np


from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"Random seed set as {seed}")


set_seed(42)

Random seed set as 42


# Загрузка датасета и модели

In [3]:
dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [4]:
model_name = "google-bert/bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

In [6]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [7]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [8]:
trainer = Trainer(
    model=model,
    args=TrainingArguments(
        "test_trainer",
        eval_strategy="no",
        report_to="none",
    ),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [9]:
initial_metrics = trainer.evaluate()



In [10]:
initial_metrics

{'eval_loss': 0.6912773251533508,
 'eval_accuracy': 0.5196998123827392,
 'eval_f1': 0.4454775993237532,
 'eval_precision': 0.5424012607111199,
 'eval_recall': 0.5196998123827392,
 'eval_runtime': 18.3935,
 'eval_samples_per_second': 57.955,
 'eval_steps_per_second': 3.643}

# Дообучение модели

In [11]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="no",
    save_strategy="no",
    report_to="none",
    logging_steps=100,
    gradient_accumulation_steps=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [12]:
trainer.train()



Step,Training Loss
100,0.4188


TrainOutput(global_step=133, training_loss=0.396829640954957, metrics={'train_runtime': 433.9346, 'train_samples_per_second': 19.657, 'train_steps_per_second': 0.306, 'total_flos': 2239601303224320.0, 'train_loss': 0.396829640954957, 'epoch': 0.9962546816479401})

In [13]:
final_metrics = trainer.evaluate()

In [14]:
final_metrics

{'eval_loss': 0.35613584518432617,
 'eval_accuracy': 0.849906191369606,
 'eval_f1': 0.8498929818632421,
 'eval_precision': 0.8500294025472818,
 'eval_recall': 0.849906191369606,
 'eval_runtime': 19.2137,
 'eval_samples_per_second': 55.481,
 'eval_steps_per_second': 1.77,
 'epoch': 0.9962546816479401}

In [15]:
print("Улучшение:")

for metric in initial_metrics:
    if metric in final_metrics:
        diff = final_metrics[metric] - initial_metrics[metric]
        print(f"{metric}: {diff:+.4f}")

Улучшение:
eval_loss: -0.3351
eval_accuracy: +0.3302
eval_f1: +0.4044
eval_precision: +0.3076
eval_recall: +0.3302
eval_runtime: +0.8202
eval_samples_per_second: -2.4740
eval_steps_per_second: -1.8730
