<a href="https://colab.research.google.com/github/kateburovova/Communication_Patterns_Investigation/blob/mainbranch/finetune_RuBert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
df_labels = pd.read_csv('/content/final_labels.csv', index_col=[0])

In [None]:
# Take explicit copies: both frames get a new `labels` column later on, and writing
# into a plain column selection of `df_labels` triggers pandas' SettingWithCopyWarning.
df_mention = df_labels[['text', 'Mention']].copy()
df_emotion = df_labels[['text', 'Emotion']].copy()

In [None]:
# Annotator answer -> class id for the "Mention" question:
# 'так' (yes) -> 1, 'ні' (no) -> 0, "can't decide" -> -1 (sentinel, filtered out afterwards).
mention_label_mapping = {'так': 1, 'ні': 0, 'не можу визначитись із відповіддю': -1}
# `assign` builds a new frame instead of writing a column into what may be a slice of
# `df_labels`, so no SettingWithCopyWarning is raised and re-running the cell is safe.
df_mention = df_mention.assign(labels=df_mention['Mention'].map(mention_label_mapping))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mention['labels'] = df_mention['Mention'].map(mention_label_mapping)


In [None]:
# Discard rows carrying the -1 sentinel (labelers were unsure), then show how many rows remain.
df_mention = df_mention.loc[df_mention['labels'].ne(-1)]
len(df_mention)

3518

In [None]:
# Annotator answer -> class id for the "Emotion" question:
# negative evaluation present -> 1, no evaluation -> 0, positive evaluation present -> 2,
# "can't decide" -> -1 (sentinel, filtered out afterwards).
emotion_label_mapping = {'так, присутня негативна': 1,
                         'ні, оцінка не присутня': 0,
                         'не можу визначитись з правильною відповіддю': -1,
                         'так, присутня позитивна': 2}
# `assign` builds a new frame instead of writing a column into what may be a slice of
# `df_labels`, so no SettingWithCopyWarning is raised and re-running the cell is safe.
df_emotion = df_emotion.assign(labels=df_emotion['Emotion'].map(emotion_label_mapping))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_emotion['labels'] = df_emotion['Emotion'].map(emotion_label_mapping)


In [None]:
# Discard rows carrying the -1 sentinel (labelers were unsure).
df_emotion = df_emotion.loc[df_emotion['labels'].ne(-1)]

In [None]:
# Number of emotion rows currently held in `df_emotion`.
len(df_emotion)


3495

In [None]:
pip install wandb


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import wandb

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import torch
import random
from transformers import set_seed
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

In [None]:
def set_random_seeds(seed):
    """Seed the random number generators used in this notebook.

    Calls, in order: `torch.manual_seed`, `np.random.seed`, `random.seed` and the
    `set_seed` name that is in scope when this function runs.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed, set_seed):
        seed_fn(seed)


seed = 42
set_random_seeds(seed)

# Mention

In [None]:
# Token limit handed to the tokenizer.
max_length = 512
# Hold out 20% of the mention rows for validation; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(
    df_mention,
    test_size=0.2,
    random_state=42,
)

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("DeepPavlov/rubert-base-cased")

# Both splits are tokenized with identical settings: padded, and truncated to `max_length` tokens.
tokenize_options = dict(padding=True, truncation=True, max_length=max_length)
train_encodings = tokenizer(train_data["text"].tolist(), **tokenize_options)
val_encodings = tokenizer(val_data["text"].tolist(), **tokenize_options)

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
class BinaryClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> collection indexable by example position
        # labels: collection with one label per example
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its label list.
train_dataset = BinaryClassificationDataset(
    encodings=train_encodings,
    labels=train_data["labels"].tolist(),
)
val_dataset = BinaryClassificationDataset(
    encodings=val_encodings,
    labels=val_data["labels"].tolist(),
)

In [None]:
# RuBERT checkpoint with a two-class sequence-classification head (mention: 0 = no, 1 = yes).
model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=2,
)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
# Configure training: arguments, metric functions and the Trainer
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score

training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch; afterwards reload the checkpoint
    # that scored best on "f1_score".
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    # Logging
    logging_steps=10,
    report_to="wandb",
)

def compute_accuracy(predictions, labels):
    """Return {"accuracy": share of rows whose arg-max class (last axis) equals the label}."""
    predicted_classes = predictions.argmax(-1)
    hit_rate = (predicted_classes == labels).mean()
    return {"accuracy": hit_rate}

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     return compute_accuracy(predictions, labels)

def compute_f1_score(predictions, labels):
    """Weighted F1 between `labels` and the arg-max class (last axis) of `predictions`."""
    predicted_classes = predictions.argmax(-1)
    return f1_score(y_true=labels, y_pred=predicted_classes, average="weighted")


def compute_metrics(eval_pred):
    """Metric hook for `Trainer`: unpack (predictions, labels) and report the weighted F1 as "f1_score"."""
    model_outputs, gold = eval_pred
    return {"f1_score": compute_f1_score(model_outputs, gold)}

# Bundle model, data, tokenizer and the F1 metric hook into one Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Open the Weights & Biases run for the mention fine-tuning.
wandb.init(project="binary_sentence_classification", name="rubert_mention_finetuning_colab")


[34m[1mwandb[0m: Currently logged in as: [33mkate-burovova[0m ([33mbrvva_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fine-tune the mention classifier and keep the value returned by `train()` for inspection.
best_model_TrainOutput = trainer.train()



Epoch,Training Loss,Validation Loss,F1 Score
1,0.0785,0.359001,0.949725
2,0.0003,0.320047,0.958571


In [None]:
# Display the summary object returned by `trainer.train()`.
best_model_TrainOutput

TrainOutput(global_step=704, training_loss=0.03006206781190651, metrics={'train_runtime': 150.8492, 'train_samples_per_second': 37.309, 'train_steps_per_second': 4.667, 'total_flos': 1304366890281840.0, 'train_loss': 0.03006206781190651, 'epoch': 2.0})

In [None]:
# Close the current W&B run.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.020 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.105926…

0,1
eval/f1_score,▄█▅▄▆▇▆▃▅▆▆▆█▇▇▇▆▇▇▇▆▇▇▆▁▁▆▆▆▆
eval/loss,▁▁▃▄▂▃▃▄▄▄▃▃▂▃▃▃▃▃▄▄▄▄▄▄██▅▅▅▅
eval/runtime,▄▅▆▃▄▅▄▃▅▄▄▂▁▁▁▂▂▁▃▂▁▁▄▃▃▄█▃▂▂
eval/samples_per_second,▅▄▃▆▅▄▅▆▄▅▅▇███▇▇█▆▇██▅▆▆▅▁▆▇▇
eval/steps_per_second,▅▄▃▆▅▄▅▆▄▅▅▇███▇▇█▆▇██▅▆▆▅▁▆▇▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,▂▄█▁▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁

0,1
eval/f1_score,0.94833
eval/loss,0.57586
eval/runtime,5.6582
eval/samples_per_second,124.422
eval/steps_per_second,15.553
train/epoch,30.0
train/global_step,10560.0
train/learning_rate,0.0
train/loss,0.0
train/total_flos,1.95655033542276e+16


In [None]:
# Model object held by the trainer after fine-tuning.
# NOTE(review): with load_best_model_at_end=True this is presumably the best checkpoint — confirm.
trained_model = trainer.model

In [None]:
# Write the fine-tuned mention model to disk.
output_directory = "./saved_model"
trained_model.save_pretrained(save_directory=output_directory)

# Emotion

In [None]:
max_length = 512
# Hold out 20% of the emotion rows for validation; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(
    df_emotion,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Load the fast BERT tokenizer for the DeepPavlov RuBERT checkpoint.
tokenizer = BertTokenizerFast.from_pretrained("DeepPavlov/rubert-base-cased")

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [None]:
# Both splits are tokenized with identical settings: padded, and truncated to 512 tokens.
tokenize_options = dict(padding=True, truncation=True, max_length=512)
train_encodings = tokenizer(train_data["text"].tolist(), **tokenize_options)
val_encodings = tokenizer(val_data["text"].tolist(), **tokenize_options)

In [None]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with one label per example."""

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> collection indexable by example position
        # labels: collection with one label per example
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under 'labels'."""
        sample = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its label list.
train_dataset = CustomDataset(
    encodings=train_encodings,
    labels=train_data['labels'].tolist(),
)
val_dataset = CustomDataset(
    encodings=val_encodings,
    labels=val_data['labels'].tolist(),
)

In [None]:
# RuBERT checkpoint with a three-class sequence-classification head (emotion labels 0, 1, 2).
model = BertForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=3,
)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
# Emotion run: one epoch, evaluated and checkpointed at the end of the epoch.
training_args = TrainingArguments(
    # Output locations
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation / checkpoint selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    # Logging
    logging_steps=10,
    report_to="wandb",
)


def compute_f1_score(predictions, labels):
    """Weighted F1 between `labels` and the arg-max class (last axis) of `predictions`."""
    predicted_classes = predictions.argmax(-1)
    return f1_score(y_true=labels, y_pred=predicted_classes, average="weighted")


def compute_metrics(eval_pred):
    """Metric hook for `Trainer`: unpack (predictions, labels) and report the weighted F1 as "f1_score"."""
    model_outputs, gold = eval_pred
    return {"f1_score": compute_f1_score(model_outputs, gold)}


trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Open the Weights & Biases run for the one-epoch emotion fine-tuning.
wandb.init(project="binary_sentence_classification", name="1_epoch_rubert_emotion_finetuning_colab")

0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▅▅▆▃▆▄▃▃▂▅▅▂▆▆▄▃▄▂▁▄▂▇▂▁▄▁▅▃▃▂▅▂▃▂

0,1
train/epoch,1.0
train/global_step,350.0
train/learning_rate,0.0
train/loss,0.2623


In [None]:
# Run the emotion fine-tuning and keep the value returned by `train()`.
training_output = trainer.train()

In [None]:
# Close the current W&B run.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▇▆▇▅▄▃▄▄▂▆▃▂▆▅▃▄▄▃▁▅▃▅▃▂▄▁▅▂▄▃▃▂▂▃

0,1
train/epoch,1.0
train/global_step,350.0
train/learning_rate,0.0
train/loss,0.3398


We could either stop at 1 epoch or try more regularization in addition to the weight decay that we already use.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig

# config = AutoConfig.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=3, hidden_dropout_prob=0.5)
# Build the three-class model from an explicit config object (no dropout override applied).
config = AutoConfig.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    num_labels=3,
)
model = AutoModelForSequenceClassification.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    config=config,
)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
from datetime import datetime

# Timestamp (local clock, second resolution) used to build a unique run name.
current_time = f"{datetime.now():%Y-%m-%d_%H-%M-%S}"
run_name = "run_" + current_time

In [None]:
from sklearn.metrics import f1_score

# Same one-epoch emotion configuration as before, now tagged with the timestamped `run_name`.
training_args = TrainingArguments(
    # Output locations and run identity
    output_dir="./results",
    logging_dir="./logs",
    run_name=run_name,
    # Optimisation
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluation / checkpoint selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    # Logging
    logging_steps=10,
    report_to="wandb",
)


def compute_f1_score(predictions, labels):
    """Weighted F1 between `labels` and the arg-max class (last axis) of `predictions`."""
    predicted_classes = predictions.argmax(-1)
    return f1_score(y_true=labels, y_pred=predicted_classes, average="weighted")


def compute_metrics(eval_pred):
    """Metric hook for `Trainer`: unpack (predictions, labels) and report the weighted F1 as "f1_score"."""
    model_outputs, gold = eval_pred
    return {"f1_score": compute_f1_score(model_outputs, gold)}


trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Open the Weights & Biases run for this experiment.
# NOTE(review): TrainingArguments also receives `run_name`; which of the two names the
# W&B dashboard ends up showing should be confirmed.
wandb.init(project="binary_sentence_classification", name="rubert_emotion_finetuning_colab_no_dropout_1ep")

VBox(children=(Label(value='0.002 MB of 0.019 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.114472…

0,1
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,▆▆▅▆▅▃▄▄▃▁▇▆▄██▄▃▅▃▁▇▅▇▄▂▃▂▇▅█▅▇▅▅▆

0,1
train/epoch,1.0
train/global_step,350.0
train/learning_rate,0.0
train/loss,0.37


In [None]:
# Run the fine-tuning, keep the value returned by `train()`, then close the W&B run.
training_output = trainer.train()
wandb.finish()

Epoch,Training Loss,Validation Loss,F1 Score
1,0.4892,0.44962,0.896527


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1_score,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,▄▆▂▄▁▂▂▆▂▁█▅▂▆▆▃▃▃▃▂▄▄▄▃▁▂▂▆▅▇▃▇▄▅█
train/total_flos,▁

0,1
eval/f1_score,0.89653
eval/loss,0.44962
eval/runtime,2.622
eval/samples_per_second,266.592
eval/steps_per_second,33.562
train/epoch,1.0
train/global_step,350.0
train/learning_rate,0.0
train/loss,0.4892
train/total_flos,735665115967488.0


In [None]:
# Model object held by the trainer after fine-tuning.
# NOTE(review): the saved output of this cell is a NameError, so it was last run in a
# kernel where `trainer` did not exist — re-run the training cells first.
trained_model = trainer.model

NameError: ignored

In [None]:
# Write the fine-tuned emotion model to disk.
output_directory = "./saved_2model"
trained_model.save_pretrained(save_directory=output_directory)

In [None]:
# Prefer the first GPU but fall back to CPU, so this cell does not crash on a CPU-only
# runtime; `.eval()` disables dropout for the inference checks that follow.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device).eval()

In [None]:
input_text = 'Тут понимаешь всё про метафизику этой войны. «Укропы» под покровом Запада вышли не против России — они вышли на войну с Богом.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

In [None]:
# Forward pass without gradient tracking, then take the highest-scoring class per row.
with torch.no_grad():
    model_output = trained_model(**input_tokens)
logits = model_output.logits

predictions = logits.argmax(dim=-1)
predictions[0]

tensor(1, device='cuda:0')

In [None]:
input_text = 'Ну и мудила украинец конечно.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    logits = trained_model(**input_tokens).logits

predictions = torch.argmax(logits, dim=-1)
predictions[0]

tensor(1, device='cuda:0')

In [None]:
input_text = 'Красавчик украинец, супер.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    logits = trained_model(**input_tokens).logits

predictions = torch.argmax(logits, dim=-1)
predictions[0]

tensor(0, device='cuda:0')

In [None]:
input_text = 'Ну и мудила конечно.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    logits = trained_model(**input_tokens).logits

predictions = torch.argmax(logits, dim=-1)
predictions[0]

tensor(0, device='cuda:0')

In [None]:
input_text = 'Какие хорошенькие котики, прелесть'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    logits = trained_model(**input_tokens).logits

predictions = torch.argmax(logits, dim=-1)
predictions[0]

tensor(0, device='cuda:0')

## Emotion from finetuned

In [None]:
import torch
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast

In [None]:
import random
import numpy as np
import torch

def set_seed(seed):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) and pin the cuDNN flags.

    NOTE(review): this reuses the name `set_seed`, which the notebook's import cell also
    binds to `transformers.set_seed`; once this cell has run, the name refers to this function.
    """
    # cuDNN flags: deterministic mode on, benchmark mode off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # Seed every generator with the same value.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


seed = 42
set_seed(seed)


In [None]:
# Tokenizer and classifier both come from the same pretrained sentiment checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment',
)
model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment',
    return_dict=True,
)


def predict(text):
    """Classify `text` with the module-level `tokenizer` and `model`.

    Args:
        text: Input handed straight to `tokenizer` (padded, truncated to 512 tokens).

    Returns:
        NumPy array holding, for each input row, the index of the largest logit along dim 1.
    """
    inputs = tokenizer(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # Keep the inputs on the model's device so the call also works once `model` has been
    # moved to a GPU.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    # softmax is monotonic, so the arg-max of the logits equals the arg-max of the
    # probabilities; `.cpu()` is required before `.numpy()` for CUDA tensors.
    predicted = torch.argmax(outputs.logits, dim=1).cpu().numpy()
    return predicted

In [None]:
# Three probe sentences for a quick look at the pretrained model's class ids:
# a friendly one, an insulting one and one that literally says "neutral sentence".
text1, text2, text3 = (
    'какая милая кошечка',
    'какая мразь конечно, ну мудила!',
    'нейтральное предложение',
)

In [None]:
# Class id the pretrained model assigns to the friendly probe sentence.
predict(text1)

array([1])

In [None]:
# Class id the pretrained model assigns to the insulting probe sentence.
predict(text2)

array([2])

In [None]:
# Class id the pretrained model assigns to the neutral probe sentence.
predict(text3)

array([0])

In [None]:
# Independent copy, so changing its labels leaves `df_emotion` untouched.
df_emotion_remapped = df_emotion.copy(deep=True)


In [None]:
# Swap class ids 1 and 2 (0 stays 0) — presumably to line the annotation scheme up with
# the class ids the pretrained sentiment model returned for the probe sentences; confirm
# against the model card.
# The swap is computed from the untouched `df_emotion` labels, so re-running this cell
# is idempotent; applying it to already-swapped labels would silently undo it.
df_emotion_remapped['labels'] = df_emotion['labels'].replace({0: 0,
                                                              1: 2,
                                                              2: 1})


In [None]:
max_length = 512
# Hold out 20% of the remapped emotion rows for validation, seeded with the notebook-wide `seed`.
train_data, val_data = train_test_split(
    df_emotion_remapped,
    test_size=0.2,
    random_state=seed,
)

In [None]:
def preprocess_data(data):
    """Tokenize the 'text' column of `data` and turn its 'labels' column into a tensor.

    Returns an `(encodings, labels)` pair: the output of the module-level `tokenizer`
    as PyTorch tensors (padded, truncated to 512 tokens) and a tensor built from the labels.
    """
    texts = data['text'].tolist()
    label_tensor = torch.tensor(data['labels'].tolist())
    batch = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors='pt')
    return batch, label_tensor


# Same preprocessing for both splits, training split first.
(train_encodings, train_labels), (val_encodings, val_labels) = map(
    preprocess_data, (train_data, val_data)
)

In [None]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings and their labels.

    Args:
        encodings: Mapping of field name to a collection indexable by example position
            (tensors or plain Python lists).
        labels: Collection with one label per example (tensor or plain Python list).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_new_tensor(value):
        """Return an independent tensor holding `value`.

        `torch.tensor(existing_tensor)` emits a UserWarning that recommends
        `clone().detach()`, so existing tensors are copied that way; anything
        else is converted with `torch.tensor`.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, with the label under 'labels'."""
        item = {key: self._as_new_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_new_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Wrap each split's encodings together with its label tensor.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)

In [None]:
from sklearn.metrics import f1_score

from transformers import EarlyStoppingCallback

# Stop training as soon as one evaluation fails to improve the monitored metric.
early_stopping_callback = EarlyStoppingCallback(
    # A threshold of zero means any improvement at all counts.
    early_stopping_threshold=0.0,
    # Number of evaluations without improvement tolerated before stopping.
    early_stopping_patience=1,
)


In [None]:
# Start again from the pretrained sentiment checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'blanchefort/rubert-base-cased-sentiment',
    return_dict=True,
)

# Up to 8 epochs; evaluation and checkpointing happen every epoch so that early
# stopping and best-checkpoint selection can act on the weighted F1.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation
    num_train_epochs=8,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    seed=seed,
    # Evaluation / checkpoint selection
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_score",
    greater_is_better=True,
    # Logging
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",
)


def compute_f1_score(predictions, labels):
    """Weighted F1 between `labels` and the arg-max class (last axis) of `predictions`."""
    predicted_classes = predictions.argmax(-1)
    return f1_score(y_true=labels, y_pred=predicted_classes, average="weighted")


def compute_metrics(eval_pred):
    """Metric hook for `Trainer`: unpack (predictions, labels) and report the weighted F1 as "f1_score"."""
    model_outputs, gold = eval_pred
    return {"f1_score": compute_f1_score(model_outputs, gold)}


trainer = Trainer(
    args=training_args,
    model=model,
    callbacks=[early_stopping_callback],
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)


In [None]:
# Open the Weights & Biases run for fine-tuning the pretrained sentiment model.
wandb.init(project="binary_sentence_classification", name="rubert_pretrained_em_ft_colab")

[34m[1mwandb[0m: Currently logged in as: [33mkate-burovova[0m ([33mbrvva_team[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fine-tune (the early-stopping callback may end the run before all epochs are used),
# evaluate once more on the validation set, then close the W&B run.
# The dict returned by `evaluate()` is not displayed because it is not the last expression.
trainer.train()
trainer.evaluate()
wandb.finish()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Epoch,Training Loss,Validation Loss,F1 Score
1,0.5929,0.683727,0.626597
2,0.4751,0.537509,0.777585
3,0.5057,0.526947,0.797182
4,0.359,0.46914,0.818214
5,0.5261,0.460688,0.812196


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/f1_score,▁▇▇███
eval/loss,█▃▃▁▁▁
eval/runtime,█▇▇█▆▁
eval/samples_per_second,▁▂▂▁▃█
eval/steps_per_second,▁▂▂▁▃█
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▅▅▆▄▆▆▄▄▄▅▃▄▃▄▃▃▂▄▃▄▂▃▂▃▂▂▄▂▂▂▂▂▂▂▁▂▂▂▁
train/total_flos,▁

0,1
eval/f1_score,0.81821
eval/loss,0.46914
eval/runtime,11.2129
eval/samples_per_second,62.339
eval/steps_per_second,7.848
train/epoch,5.0
train/global_step,1750.0
train/learning_rate,1e-05
train/loss,0.5261
train/total_flos,3678325579837440.0


In [None]:
# from transformers import AutoModelForSequenceClassification

# best_model_path = training_args.output_dir + "/best_model"
# best_model = AutoModelForSequenceClassification.from_pretrained(best_model_path, return_dict=True)

In [None]:
# Model object held by the trainer after fine-tuning.
# NOTE(review): with load_best_model_at_end=True this is presumably the best checkpoint — confirm.
trained_model = trainer.model

In [None]:
# Write the fine-tuned model to disk.
# NOTE(review): "./saved_2model" is also the directory the earlier emotion model was saved
# to in this notebook, so this call overwrites those files — confirm that is intended.
output_directory = "./saved_2model"
trained_model.save_pretrained(save_directory=output_directory)

In [None]:
# Prefer the first GPU but fall back to CPU, so this cell does not crash on a CPU-only
# runtime; `.eval()` disables dropout for the inference checks that follow.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
trained_model = trained_model.to(device).eval()
input_text = 'Тут понимаешь всё про метафизику этой войны. «Укропы» под покровом Запада вышли не против России — они вышли на войну с Богом.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Inputs must sit on the same device as the model.
input_tokens = {k: v.to(device) for k, v in input_tokens.items()}

In [None]:
# Forward pass without gradient tracking, then take the highest-scoring class per row.
with torch.no_grad():
    model_output = trained_model(**input_tokens)
logits = model_output.logits

predictions = logits.argmax(dim=-1)
predictions[0]


tensor(2, device='cuda:0')

In [None]:
input_text = 'Плохой украинец.'
input_tokens = tokenizer(input_text, padding=True, max_length=512, truncation=True, return_tensors="pt")
# Send the inputs to whatever device the model lives on instead of assuming 'cuda:0'.
input_tokens = {k: v.to(trained_model.device) for k, v in input_tokens.items()}

# Forward pass without gradient tracking; the predicted class is the arg-max logit.
with torch.no_grad():
    logits = trained_model(**input_tokens).logits

predictions = torch.argmax(logits, dim=-1)
predictions[0]

tensor(2, device='cuda:0')

In [None]:
# Preview the remapped frame; a handful of rows is enough to check the new label ids
# without dumping the whole table into the notebook.
df_emotion_remapped.head(10)

Unnamed: 0,text,Emotion,labels
1219,"Всвязи с этим немного поправлю коллег ⤵️ ""Они...","так, присутня негативна",2
1218,Литературный критик Галина Юзефович о новом ро...,"ні, оцінка не присутня",0
1591,Почему на базах неонацистов стоят языческие ис...,"так, присутня негативна",2
1198,Группа добровольцев-медиков из Чеченской Респу...,"так, присутня негативна",2
3247,"ВСУшники, переходите на сторону добра, у нас т...","ні, оцінка не присутня",0
...,...,...,...
3613,Утренний брифинг Минобороны России: ▪️ россий...,"ні, оцінка не присутня",0
3612,И понеслась мазепинщино-петлюровщино-бандеровщ...,"так, присутня негативна",2
4121,Наш соратник по русскому движению Алексей Сели...,"так, присутня негативна",2
4120,Хорошее видео от 4 бригады НМ ЛНР https://t.me...,"ні, оцінка не присутня",0
