In [3]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
device

device(type='cuda')

In [5]:
df =  pd.read_csv("/kaggle/input/dynasent/data_sentiment.csv", index_col = False)
df

Unnamed: 0,review,sentiment
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative
...,...,...
147196,"Для нас было плюсом, что в этом отеле можно ос...",positive
147197,Очень понравился отель.Хорошие номера с кухней...,positive
147198,"В Петербурге бываю очень часто, поэтому появле...",positive
147199,Изумительное место! Я просто не ожидала такого...,positive


In [6]:
df = df.rename(columns={'review': 'text', 'sentiment': 'label'})

In [7]:
import numpy as np

In [8]:
df['text'].replace('', np.nan, inplace=True)
df = df.dropna()
df

Unnamed: 0,text,label
0,качество плохое пошив ужасный (горловина напер...,negative
1,"Товар отдали другому человеку, я не получила п...",negative
2,"Ужасная синтетика! Тонкая, ничего общего с пре...",negative
3,"товар не пришел, продавец продлил защиту без м...",negative
4,"Кофточка голая синтетика, носить не возможно.",negative
...,...,...
147196,"Для нас было плюсом, что в этом отеле можно ос...",positive
147197,Очень понравился отель.Хорошие номера с кухней...,positive
147198,"В Петербурге бываю очень часто, поэтому появле...",positive
147199,Изумительное место! Я просто не ожидала такого...,positive


In [9]:
from sklearn.model_selection import train_test_split

# разделение на тестовую и тренировочную выборки
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)



In [10]:
import datasets
from datasets import Dataset, DatasetDict

tds = Dataset.from_pandas(train_data)
vds = Dataset.from_pandas(test_data)

ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 117638
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 29410
    })
})


In [11]:
# токенизатор для отзывов
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("blinoff/roberta-base-russian-v0")

Downloading tokenizer_config.json:   0%|          | 0.00/16.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [12]:
# переводим классы из строк в числа
from datasets import ClassLabel

labels = ClassLabel(num_classes = 3,names=["negative", "neutral", "positive"])

In [13]:
def tokenize(batch):
    tokens = tokenizer(batch['text'], padding=True, truncation=True)
    tokens['label'] = labels.str2int(batch['label'])
    return tokens

In [14]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
tokenized_corpus_train = tds.map(tokenize, batched=True)
tokenized_corpus_test = vds.map(tokenize, batched=True)

tokenized_corpus_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_corpus_test.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

  0%|          | 0/118 [00:00<?, ?ba/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

In [16]:
import evaluate

accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [17]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [18]:
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

In [19]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "blinoff/roberta-base-russian-v0",
    num_labels=3,
    id2label=id2label,
    label2id=label2id,
    attention_probs_dropout_prob=0.1,
    hidden_dropout_prob=0.1
)

Downloading model.safetensors:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at blinoff/roberta-base-russian-v0 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [21]:
# Определение параметров тренировки и тренировка модели
training_args = TrainingArguments(
    report_to=None,
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir='./logs',
    save_steps = 4000,
    save_total_limit = 10,
    logging_steps=2000,
    learning_rate=2e-5,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_corpus_train,
    eval_dataset=tokenized_corpus_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
2000,0.7152
4000,0.6026
6000,0.5732
8000,0.5636
10000,0.5396
12000,0.5372
14000,0.5292
16000,0.4856
18000,0.4658
20000,0.4669


TrainOutput(global_step=44115, training_loss=0.47920149373289384, metrics={'train_runtime': 19425.5585, 'train_samples_per_second': 18.168, 'train_steps_per_second': 2.271, 'total_flos': 9.28564087040594e+16, 'train_loss': 0.47920149373289384, 'epoch': 3.0})

In [25]:
trainer.save_model("/kaggle/working/rubert_fin")

In [26]:
trainer

<transformers.trainer.Trainer at 0x7dd56981bbe0>