## 01. Libraries Import

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
!huggingface-cli login --token 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


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
!pip install accelerate>=0.21.0

In [4]:
!pip install datasets



In [5]:
!pip install transformers evaluate accelerate



In [6]:
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from transformers import TrainerCallback
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback

from datasets import Features, Sequence, Value, ClassLabel

from datasets import Dataset

import evaluate

In [7]:
metric = evaluate.load("f1", average='macro')

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [9]:
def calculate_metrics(true_labels, predictions):
    metrics = {
    'Accuracy': accuracy_score(true_labels, predictions)
    }

    for avg in ['micro', 'macro', 'weighted']:
        metrics['Precision_'+avg.title()] = precision_score(true_labels, predictions, average=avg)
        metrics['Recall_'+avg.title()] = recall_score(true_labels, predictions, average=avg)
        metrics['F1-ratio_'+avg.title()] = f1_score(true_labels, predictions, average=avg)

    return metrics

In [10]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2", ignore_mismatched_sizes=True)
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2", ignore_mismatched_sizes=True)
 # model.cuda()  # uncomment it if you have a GPU

In [11]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 02. Importing data

In [None]:
dataset_df = pd.read_csv('./repurchase_dataset.csv')

In [None]:
dataset_df.sample(5)

Unnamed: 0,review_id,review_text
100833,XEBL96631447634557,"Программа отличная, экскурсовод грамотный , ав..."
230126,LZHD00705217174029,От Колизея дух захватывает: гид очень старалас...
133545,PALF14663192579370,Очень понравился рассказ экскурсовода Елены. Ч...
68039,GJSB63634646559669,Все интересно и увлекательно
256092,AMSE63255390284404,"Купила тур супер Сафари тазы Каньон, селге, ка..."


In [None]:
melted_df = pd.read_csv('./melted_anon_may24.csv')

In [None]:
melted_df.sample(5)

Unnamed: 0,REVIEW_ID,TOUR_ID,TOURIST_ID,GUIDE_ID,REVIEW_DETAIL,DETAIL_CONTENT,DETAIL_RATING
264885,AVOJ80698961934435,KFBP08201442763641,LAWH48635806619390,BSTP73772171355668,overall,Большую часть экскурсии проплывали в пустынной...,2.0
343729,SHFQ99988678126158,SQSF86685053241337,NXWC83717244154170,IOAZ86898760525143,personal,"Гид Эдмон - добрый, интересный, весёлый и очен...",5.0
224841,VKSX85551632147003,XYKD60462397373952,PAVU78636999826960,PXDY74260453404101,overall,Экскурсия не понравилась. Началась со слов экс...,1.0
77494,JBVQ46555005524815,FGWM21535456721591,XIAR05801485257334,TVSK17714633269360,overall,Экскурсия очень понравилась. Отдельное спасибо...,5.0
238680,BKIK92456455932962,ODMU40328928586325,VQLG76761604348084,ARIQ78166110953828,overall,"Не всё было идеально, поэтому 4/5. У нас был п...",4.0


In [None]:
dataset_df = dataset_df.merge(melted_df[['REVIEW_ID', 'REVIEW_DETAIL', 'DETAIL_RATING']], how='left', left_on='review_id', right_on='REVIEW_ID')

In [None]:
dataset_df.drop(columns=['REVIEW_ID'], inplace=True)

In [None]:
dataset_df = dataset_df.drop_duplicates('review_text')

In [None]:
dataset_df[dataset_df.REVIEW_DETAIL=='overall'].DETAIL_RATING.value_counts()

DETAIL_RATING
5.0    244246
4.0     29988
3.0     23395
1.0     21398
2.0     11479
Name: count, dtype: int64

In [None]:
dataset_df.DETAIL_RATING = dataset_df.DETAIL_RATING.astype(int)

In [None]:
dataset_df = pd.read_csv('./for_sentiment.csv')

In [None]:
train_df = dataset_df[(dataset_df.REVIEW_DETAIL=='overall')&
                      dataset_df.DETAIL_RATING.isin([2, 3, 4, 5])].groupby('DETAIL_RATING').sample(
    2500, random_state=42).reset_index(drop=True)

In [None]:
train_df = train_df.sample(frac=1, random_state=42) #shuffling

In [None]:
train_df.DETAIL_RATING.value_counts()

DETAIL_RATING
4    2500
3    2500
2    2500
5    2500
Name: count, dtype: int64

In [None]:
train_df['polarity'] = np.nan

In [None]:
train_df.loc[train_df.DETAIL_RATING.isin([4, 5]), 'polarity'] = 1
train_df.loc[train_df.DETAIL_RATING.isin([2, 3]), 'polarity'] = 0

In [None]:
train_df.polarity = train_df.polarity.astype(int)

In [None]:
train_df.polarity.value_counts()

polarity
1    5000
0    5000
Name: count, dtype: int64

In [None]:
train_df

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,polarity
6252,CXDY53207468553676,"Прмятная женщина, с хорошими знаниями.",overall,4,1
4684,CBCJ73773620889894,Программа растянуть. Если едешь только одну оп...,overall,3,0
1731,NLGF03338661359966,Не понравилось. Санаторий Орджоникидзе закрыт ...,overall,2,0
4742,ZPIN85955727324520,Билет был очень дорогой,overall,3,0
4521,LNYG73544411318501,"Увидела, что где расположено в Праге, на каком...",overall,3,0
...,...,...,...,...,...
5734,VKWX00493069665146,Советую посетить это замечательное место всем ...,overall,4,1
5191,BOTM20305155644501,"Экскурсия замечательная. Повезло с гидом, Тиму...",overall,4,1
5390,YPYQ16018639049381,"Весело, интересно, несколько не скучали , вели...",overall,4,1
860,GJEN71034531524385,Все те же причины! Гид везде торопит. И на тар...,overall,2,0


In [None]:
for_training = train_df[['review_text', 'polarity']]

In [None]:
for_training.columns  = ['text', 'label']

Splitting dataset into train and test (test = 15% of the whole dataset, 9337 rows)

In [None]:
test_sample = for_training.groupby('label').sample(frac=0.15, random_state=42)
train_sample = for_training[for_training.index.isin(test_sample.index.to_list())==False]

In [None]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

Formatting datasets - mapping to label ids, resetting indices.



In [None]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [None]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [None]:
mapper = {0:0, 1:1}

Turning dataframes into transformers dataset type.

In [None]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=tr_df.label.unique().tolist())})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [None]:
tr_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1], id=None)}

Preprocessing text - tokenization:

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
tok_tr_dataset[2]

{'text': 'Билет был очень дорогой',
 'label': 0,
 'input_ids': [2, 29311, 1588, 991, 6003, 35908, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [None]:
MODEL_NAME = 'cointegrated/rubert-tiny2'

Loading pretrained model:

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mapper), id2label=mapper, label2id=mapper, ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up retraining parameters:

In [None]:
training_args = TrainingArguments(
    output_dir="polarity_train",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### Round 1 of retraining

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
0,0.4824,0.445037,0.807959
2,0.3559,0.444611,0.820849
4,0.2872,0.523933,0.810205
6,0.2408,0.644775,0.816485
7,0.2231,0.661467,0.816759


TrainOutput(global_step=8496, training_loss=0.32158054021343224, metrics={'train_runtime': 432.1827, 'train_samples_per_second': 157.341, 'train_steps_per_second': 19.658, 'total_flos': 99083877698976.0, 'train_loss': 0.32158054021343224, 'epoch': 7.996235294117647})

In [None]:
trainer.evaluate()

{'eval_loss': 0.4803033769130707,
 'eval_f1': 0.8212525177508969,
 'eval_runtime': 1.8518,
 'eval_samples_per_second': 810.015,
 'eval_steps_per_second': 202.504,
 'epoch': 7.996235294117647}

Let's view a more detailed report for resulting classification model: one that would include F-score, Precision and Recall for each of the classes.

To see, on which class labels model performs best, and whether there are classes needing additional retraining.

In [None]:
from sklearn.metrics import classification_report


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]

    print(classification_report(labels, predictions))
    return metric.compute(predictions=predictions, references=labels)


In [None]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/polarity_train")


config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Setting up features, true and predicted labels:

In [None]:
X = test_sample.text.to_list()
y_act = test_sample.label.to_list()

In [None]:
labels = test_sample.label.unique()


In [None]:
y_pred = [result["label"] for result in text_classification_pipeline(X)]

Outputting classification report:

In [None]:
print(classification_report(y_pred, y_act, labels=labels))


              precision    recall  f1-score   support

           0       0.87      0.79      0.83       834
           1       0.76      0.86      0.81       666

    accuracy                           0.82      1500
   macro avg       0.82      0.82      0.82      1500
weighted avg       0.82      0.82      0.82      1500



The model performs solidly, with majority of metrics for separate classes reaching 0.8.

Let's see how well rubert-tiny2 performs "out of the box"

In [None]:
original_pipeline = pipeline("text-classification", model="cointegrated/rubert-tiny2")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up features, true and predicted labels:

In [None]:
y_pred = [result["label"] for result in original_pipeline(X)]

Outputting classification report:

In [None]:
y_pred[:5]

['LABEL_0', 'LABEL_1', 'LABEL_1', 'LABEL_1', 'LABEL_1']

In [None]:
y_pred = [int(p[-1]) for p in y_pred]

In [None]:
print(classification_report(y_pred, y_act, labels=labels))


              precision    recall  f1-score   support

           0       0.30      0.38      0.33       594
           1       0.51      0.42      0.46       906

    accuracy                           0.40      1500
   macro avg       0.40      0.40      0.40      1500
weighted avg       0.42      0.40      0.41      1500



Our new retrained model performs much better (twice as good) as an original one; therefore, we are going to feed our model some more data, hoping to improve its performance further

### Round 2 of retraining - more data

Splitting dataset into train and test (test = 15% of the whole dataset, 9337 rows)

In [None]:
train_df = dataset_df[(dataset_df.REVIEW_DETAIL=='overall')&
                      dataset_df.DETAIL_RATING.isin([2, 3, 4, 5])].reset_index(drop=True)

In [None]:
train_df = train_df.sample(frac=1, random_state=42) #shuffling

In [None]:
train_df.DETAIL_RATING.value_counts()

DETAIL_RATING
5    10000
3    10000
4    10000
2    10000
Name: count, dtype: int64

In [None]:
train_df['polarity'] = np.nan

In [None]:
train_df.loc[train_df.DETAIL_RATING.isin([4, 5]), 'polarity'] = 1
train_df.loc[train_df.DETAIL_RATING.isin([2, 3]), 'polarity'] = 0

In [None]:
train_df.polarity = train_df.polarity.astype(int)

In [None]:
train_df.polarity.value_counts()

polarity
1    20000
0    20000
Name: count, dtype: int64

In [None]:
for_training = train_df[['review_text', 'polarity']]

In [None]:
for_training.columns  = ['text', 'label']

In [None]:
test_sample = for_training.groupby('label').sample(frac=0.15, random_state=42)
train_sample = for_training[(for_training.index.isin(test_sample.index.to_list())==False)]

In [None]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [None]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

Turning dataframes into transformers dataset type.

In [None]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=tr_df.label.unique().tolist())})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [None]:
tr_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[1, 0], id=None)}

Preprocessing text - tokenization:

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/34000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
tok_tr_dataset[2]

{'text': 'Экскурсия понравилась. Но в снегопад подниматься на вершину смысла нет. Ничего не видно. зря деньги потратила. Экскурсовод могла бы и предупридить.',
 'label': 1,
 'input_ids': [2,
  78700,
  42664,
  18,
  5634,
  314,
  65539,
  53482,
  548,
  54227,
  33923,
  10030,
  18,
  37556,
  769,
  31116,
  18,
  37601,
  21760,
  77259,
  18,
  60018,
  42557,
  16226,
  6796,
  320,
  30322,
  19491,
  3604,
  18,
  3],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [None]:
MODEL_NAME = 'laskovey/polarity_train'

In [None]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

Loading pretrained model:

In [None]:
mapper = {0:0, 1:1}

In [None]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mapper), id2label=mapper, label2id=mapper, ignore_mismatched_sizes=True
)

Setting up retraining parameters:

In [None]:
training_args = TrainingArguments(
    output_dir="polarity_train2",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.4054,0.383168,0.840633
2,0.3501,0.425171,0.83877
3,0.3164,0.50339,0.837544
4,0.2847,0.554083,0.832327
5,0.2588,0.651437,0.833969


Epoch,Training Loss,Validation Loss,F1
1,0.4054,0.383168,0.840633
2,0.3501,0.425171,0.83877
3,0.3164,0.50339,0.837544
4,0.2847,0.554083,0.832327
5,0.2588,0.651437,0.833969
6,0.2336,0.725854,0.829108
7,0.2141,0.775199,0.830409


In [None]:
trainer.evaluate()

In [None]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/polarity_train2")


config.json:   0%|          | 0.00/839 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Setting up features, true and predicted labels:

In [None]:
X = test_sample.text.to_list()
y_act = test_sample.label.to_list()

In [None]:
labels = test_sample.label.unique()


In [None]:
y_pred = [result["label"] for result in text_classification_pipeline(X)]

In [None]:
%%time
y_proba = [result["score"] for result in text_classification_pipeline(X)]

CPU times: user 2min 13s, sys: 10.2 s, total: 2min 23s
Wall time: 3min 48s


Outputting classification report:

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_pred, y_act, labels=labels))


              precision    recall  f1-score   support

           0       0.85      0.81      0.83      3126
           1       0.81      0.84      0.82      2874

    accuracy                           0.83      6000
   macro avg       0.83      0.83      0.83      6000
weighted avg       0.83      0.83      0.83      6000



#### Let's see how this polarity corresponds to rating from original dataset

In [None]:
dataset_df.sample()

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING
34259,INEO10273597193284,Отличная экскурсия...хорошо организована и дос...,overall,5


In [None]:
test_sample.sample()

Unnamed: 0,text,label
11297,Не могла ответить на вопросы по зданиям и памя...,0


In [None]:
relabeling_df = test_sample.merge(dataset_df[['review_text', 'DETAIL_RATING']], left_on='text', right_on='review_text', how='left')

In [None]:
relabeling_df.sample()

Unnamed: 0,text,label,review_text,DETAIL_RATING
5111,Экскурсия была очень приятная. Теплоход ухожен...,1,Экскурсия была очень приятная. Теплоход ухожен...,4


In [None]:
len(relabeling_df)

6000

In [None]:
len(y_pred)

6000

In [None]:
relabeling_df['predicted_label'] = y_pred

In [None]:
relabeling_df['predicted_cont'] = y_proba

In [None]:
relabeling_df.sample()

Unnamed: 0,text,label,review_text,DETAIL_RATING,predicted_label,predicted_cont
5672,"Коса-это то, что обязательно посещается при пе...",1,"Коса-это то, что обязательно посещается при пе...",4,1,0.999426


In [None]:
relabeling_df.groupby('DETAIL_RATING').predicted_cont.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
DETAIL_RATING,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2,1492.0,0.974968,0.075089,0.511958,0.992264,0.997641,0.998874,0.99954
3,1508.0,0.967527,0.083846,0.500505,0.987414,0.997202,0.998713,0.999576
4,1502.0,0.96996,0.078626,0.503039,0.98843,0.997853,0.99897,0.999588
5,1498.0,0.990181,0.049578,0.509329,0.998424,0.99907,0.999294,0.999584


#### Now let's see whether the model is capable of identlifying reviews marked 1 star by mistake (the review is highly positive and would expected to have 5 stars, but apparently the user clicked wrong)

This will also help us pick "truly" 1 star reviews for model training

In [None]:
with_1star_df = dataset_df[(dataset_df.REVIEW_DETAIL=='overall')&(dataset_df.DETAIL_RATING==1)].reset_index(drop=True)

In [None]:
with_1star_df.sample(3)

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING
866,BXSC40994691640364,Экскурсию Ваш сотрудник не провёл. Деньги взял...,overall,1.0
12125,KECJ47244848994674,Отвратительный сервис. Сотрудники в офисе на М...,overall,1.0
7681,CPMV39188705747145,"Ужасно, джаз играл только на нижней палубе, на...",overall,1.0


In [None]:
len(with_1star_df)

21398

In [None]:
X = with_1star_df.review_text.to_list()

In [None]:
y_act = [0] * len(with_1star_df)


In [None]:
labels = [0, 1]

In [None]:
%%time
predictions = text_classification_pipeline(X)

CPU times: user 6min 28s, sys: 20 s, total: 6min 48s
Wall time: 3min 38s


In [None]:
y_pred = [result["label"] for result in predictions]

In [None]:
%%time
y_proba = [result["score"] for result in predictions]

CPU times: user 4.58 ms, sys: 8.13 ms, total: 12.7 ms
Wall time: 14 ms


In [None]:
print(classification_report(y_pred, y_act, labels=labels))


              precision    recall  f1-score   support

           0       0.92      1.00      0.96     19791
           1       0.00      0.00      0.00      1607

    accuracy                           0.92     21398
   macro avg       0.46      0.50      0.48     21398
weighted avg       0.86      0.92      0.89     21398



Presumably, these 1607 instances where model predicted 1 could be exactly the case where users put 1 star by mistake

We will look at these examples, and specifically onto ones __with high prediction score__ - where model was "certain" with its prediction of positive polarity

In [None]:
with_1star_df['prediction'] = y_pred
with_1star_df['prediction_score'] = y_proba

In [None]:
with_1star_df[with_1star_df.prediction==1]

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,prediction,prediction_score
31,YHZQ00957658946878,пролог Начну с предостережения! Садясь на любо...,overall,1.0,1,0.877259
102,OLYA60335656802705,Супер!!!,overall,1.0,1,0.998643
126,UQPV79206644088159,Безобразно!!!,overall,1.0,1,0.992779
128,HYJW05663426361063,Добрый день Экскурсия по Казани оставила очень...,overall,1.0,1,0.999317
177,YMAO28782154267197,Ваша фирма Спутник тут вообще не причем. Когда...,overall,1.0,1,0.980237
...,...,...,...,...,...,...
21364,KMTV66803928763314,безопасности на судне как будто вообще не было...,overall,1.0,1,0.842647
21366,WYPU49007208955228,сложно оценить безопасность. на каждого челове...,overall,1.0,1,0.933073
21373,YWYG93077358788698,Никого из организаторов на борту не было.,overall,1.0,1,0.935343
21381,RNOY94697101990551,"Не очень понятно, что имеется в виду под безоп...",overall,1.0,1,0.917693


In [None]:
with_1star_df[with_1star_df.prediction==1].sort_values('prediction_score', ascending=False).head(10)

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,prediction,prediction_score
512,FCTI15368982299170,На этой Экскурсии нашим экскурсоводом была Дин...,overall,1.0,1,0.999529
8282,KKOZ77438733397320,"В Арзамасе ремонт, шум строительной техники. Г...",overall,1.0,1,0.999514
7903,AHGV70450489474167,Экскурсия можно сказать ПРОВАЛЬНАЯ- внутрь не ...,overall,1.0,1,0.999501
7316,HSUI58313121445703,Посетили Принцевы острова в Стамбуле. Заплаттл...,overall,1.0,1,0.999492
4809,VDPM38857435753790,Поездка в Каир с Шарма. Ехали колонной 9 автоб...,overall,1.0,1,0.999488
4800,PLFS49215268637553,"С великолепным экскурсоводом Ильёй, очень инте...",overall,1.0,1,0.999483
3598,LWLI03329004892892,В первую очередь хотелось бы отметить нашего з...,overall,1.0,1,0.999482
5268,WOKE24789910826426,Великолепная экскурсия! Замечательный экскурсо...,overall,1.0,1,0.999481
2904,BQPC32197635673986,"Дворец - великолепен, захватывает дух от экспо...",overall,1.0,1,0.999474
5482,CYNF88154795790029,"Всем, здравствуйте! Хочу оставить отзыв об экс...",overall,1.0,1,0.999472


It can be seen that some of these predictions are very positive ("великолепно"), and some extremely negative. Partially, it could be explained by the fact that the model did not train on 1-star examples (as they wer "dirty")

Current model doesn't do a good enough job at splitting such instances. Let us manually label these examples.

Additionally, from now on we will develop multi-class classification, predicting the number of stars - from 1 to 5.

In [None]:
with_1star_df.to_csv('./1star_for_labeling.csv', index=False)

#### Defining 1-5 star reviews by rules

What we know about how to define 1 and 5 star reviews:
* positive reviews tend to be shorter, negative - longer (the user will describe problems extensively)
* negative reviews are more likely to contain upper case, expressing extreme dissatisfaction
* positive reviews tend to contain gratitude ("спасибо большое") - unless it's sarcastic
* positive reviews tend to contain words such as "великолепно", "замечательно", "рекомендую" - unless it's sarcastic

In [None]:
with_1star_df['length'] = with_1star_df.review_text.str.len()

#### Creating dataset for new training (multi-class)

Let's only take "certain" predictions and manually labeled 1star ones

In [None]:
dataset_df.head()

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING
0,IBEL40458932288368,"Добрый день! Была на экскурсии театральной, оч...",overall,5.0
1,XJQL86924315279991,"Спасибо за прекрасное утро, за концерт птиц, г...",overall,5.0
2,WYTJ13913662608790,Праздновал с друзьями свой День рождения на эк...,overall,5.0
3,TPSP07717133573269,Большое спасибо за экскурсию! Было очень профе...,overall,5.0
4,LRHK42247441970821,"Ходили на эту экскурсию в среду, оказывается, ...",overall,5.0


In [None]:
dataset_df.DETAIL_RATING.value_counts()

DETAIL_RATING
5.0    246129
4.0     30487
3.0     24598
1.0     22036
2.0     11850
Name: count, dtype: int64

In [None]:
trunc_dataset_df = dataset_df.groupby('DETAIL_RATING').sample(4000, random_state=42)

In [None]:
trunc_dataset_df.DETAIL_RATING.value_counts()

DETAIL_RATING
1.0    4000
2.0    4000
3.0    4000
4.0    4000
5.0    4000
Name: count, dtype: int64

In [None]:
X = trunc_dataset_df.review_text.to_list()

In [None]:
len(X)

20000

In [None]:
y_act = [int(np.floor(star/4)) for star in trunc_dataset_df.DETAIL_RATING.to_list()]


In [None]:
y_act[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [None]:
labels = [0, 1]

In [None]:
%%time
predictions = text_classification_pipeline(X)

CPU times: user 6min, sys: 16.9 s, total: 6min 17s
Wall time: 3min 25s


In [None]:
y_pred = [result["label"] for result in predictions]

In [None]:
%%time
y_proba = [result["score"] for result in predictions]

CPU times: user 2.78 ms, sys: 278 µs, total: 3.06 ms
Wall time: 3.23 ms


In [None]:
print(classification_report(y_pred, y_act, labels=labels))


              precision    recall  f1-score   support

           0       0.91      0.89      0.90     12315
           1       0.83      0.86      0.84      7685

    accuracy                           0.88     20000
   macro avg       0.87      0.87      0.87     20000
weighted avg       0.88      0.88      0.88     20000



In [None]:
trunc_dataset_df['pred_polarity'] = y_pred
trunc_dataset_df['pred_score'] = y_proba

In [None]:
trunc_dataset_df.sample(3)

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,pred_polarity,pred_score
664507,SBEL65878183675315,Гид экскурсию не вела кроме как разговоров в а...,overall,2.0,0,0.99814
635402,BVZS93147444076999,"умна,эмоциональна, знает материал.",overall,4.0,1,0.968268
533683,EWNZ62476505336182,Плохая организация экскурсии.,overall,2.0,0,0.993133


In [None]:
trunc_dataset_df[trunc_dataset_df.pred_score>=.98].DETAIL_RATING.value_counts(normalize=True)

DETAIL_RATING
5.0    0.216182
2.0    0.207898
1.0    0.197798
3.0    0.190763
4.0    0.187358
Name: proportion, dtype: float64

Let's take for training data with prediction score >= 98%) + manually labeled

In [None]:
for_training_df = trunc_dataset_df[trunc_dataset_df.pred_score>=.98].reset_index(drop=True)

In [None]:
ml = pd.read_csv('./1 star - labeled_1star (1).csv')

In [None]:
ml

Unnamed: 0,review_id,review_text,true_rating
0,DVKI26451643730516,Организационный шедевр или как пережить эту эк...,1.0
1,LBFJ93287604653580,Добрый день. О данной компании я узнал в интер...,1.0
2,DGHR58806958857585,Почему такая оценка? Аргументирую и начнём мы ...,1.0
3,RUWW23375207863701,Про саму долину Вади-эль-Вишваши и окружающие ...,1.0
4,UPWL49183051015400,"А вот организация, к сожалению, кошмар. Чудови...",1.0
...,...,...,...
759,VPVU80331708364787,"Автобус со сломанными креслами, ремонт тут же ...",1.0
760,PBHZ17776087173894,Безопасность? Не уверен что я могу ее оценить,1.0
761,ULAE21642962608045,Паре людей стало плохо на экскурсии. Абсолютно...,1.0
762,PWRD18099931863557,Билеты не выдавали,1.0


In [None]:
for_training_df.sort_values('review_id', inplace=True)

In [None]:
for_training_df.drop_duplicates('review_id', inplace=True)

In [None]:
for_training_df.drop_duplicates('review_id')[for_training_df.review_id.isin(ml.review_id.to_list())]

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,pred_polarity,pred_score,true_rating
3091,AAYP51608048844769,Обычное судно,overall,1.0,0,0.993025,overall
962,AHXP69327040866154,"Ни экскурсии, ни гида не было, и не требовалос...",overall,1.0,0,0.998467,overall
217,AKEF56337265380255,"Какие могут быть впечатления, если официальный...",overall,1.0,0,0.996175,overall
2830,ANQL31206781401548,Очень скучная. Просто очень. Причём дело не в ...,overall,1.0,0,0.999245,overall
701,AQTI51172664452944,Никто не собирался проводить экскурсию,overall,1.0,0,0.992144,overall
...,...,...,...,...,...,...,...
2427,ZSTL76658061477822,Скучно. Возили туда сюда. Экскурсионная запись...,overall,1.0,0,0.998514,overall
2534,ZUDT86396480516073,Скучно. Без захватывающих подробностей. Почти ...,overall,1.0,0,0.998689,overall
2315,ZUKD25849083584161,"Билет прислали за 4 минуты до начала сеанса, в...",overall,1.0,0,0.999046,overall
685,ZUOQ45701141569126,"Так не было её, отменили, деньги верните 600 ,...",overall,1.0,0,0.999060,overall


In [None]:
ml_ids = for_training_df[for_training_df.review_id.isin(ml.review_id.to_list())].review_id.to_list()

In [None]:
ml.sort_values('review_id', inplace=True)

In [None]:
ml.drop_duplicates('review_id', inplace=True)

In [None]:
ml.drop_duplicates('review_id')[ml.review_id.isin(ml_ids)]

Unnamed: 0,review_id,review_text,true_rating
758,AAYP51608048844769,"Были открыты двери на заднюю палубу, ребёнок п...",1.0
360,AHXP69327040866154,Отвратительная экскурсия. Водитель сильно опоз...,1.0
221,AKEF56337265380255,Начало экскурсии было задержано на час. Мы час...,1.0
600,ANQL31206781401548,"Завышенная цена, заказывайте на оффициальном с...",1.0
397,AQTI51172664452944,Потрясающая экскурсия с замечательным гидом Ан...,5.0
...,...,...,...
313,ZSTL76658061477822,Приехали в назначенное время и в указанное мес...,1.0
72,ZUDT86396480516073,Экскурсию я бы разделила на две части. Первая-...,1.0
500,ZUKD25849083584161,"Хорошая познавательная экскурсия, благодаря пр...",5.0
432,ZUOQ45701141569126,"Ужасная экскурсия, остановка была возле Исакие...",1.0


In [None]:
for_training_df['true_rating'] = for_training_df['DETAIL_RATING']

In [None]:
for_training_df.loc[for_training_df.review_id.isin(ml_ids), 'true_rating'] = ml[ml.review_id.isin(ml_ids)].true_rating.to_list()

In [None]:
ml[ml.review_id.isin(ml_ids)==False].true_rating.value_counts()

true_rating
1.0    478
5.0     65
Name: count, dtype: int64

In [None]:
ml.sample(3)

Unnamed: 0,review_id,review_text,true_rating
187,ATXR31247916260403,Добрый день. Итак отзыв о посещении каких-то д...,1.0
637,RFGT30046533362917,"Всё было замечено, огромное спасибо девушке,пр...",5.0
9,HJUO25275092463016,Самая худшая экскурсия. Я и вся остальная груп...,1.0


In [None]:
for_training_df.sample(3)

Unnamed: 0,review_id,review_text,REVIEW_DETAIL,DETAIL_RATING,pred_polarity,pred_score,true_rating
17188,NDOF89599403744829,"Хороший гид, грамотная речь.",overall,5.0,1,0.990577,5.0
12957,AHKC27349784774617,Недостатки: 1.Заезд в выставочный центр BMW не...,overall,4.0,0,0.994991,4.0
8434,XOXM83986550888026,Экскурсовод держал нас на ветру так что жутко ...,overall,3.0,0,0.997591,3.0


In [None]:
for_training_df.drop(columns=['REVIEW_DETAIL', 'DETAIL_RATING', 'pred_polarity', 'pred_score'], inplace=True)

In [None]:
for_training_df.sample(3)

Unnamed: 0,review_id,review_text,true_rating
13189,NHAF26401717867929,Прекрасная экскурсия на остров-град Свияжск/ х...,4.0
11668,FPYO06858118894394,"Добрый день. Ожила интереснее, доставка до и п...",4.0
15932,CIDX72782435808624,Впечатления от экскурсии превзошли мои ожидани...,5.0


In [None]:
for_training_df = pd.concat([for_training_df, ml[ml.review_id.isin(ml_ids)==False]], axis=0, ignore_index=True)

In [None]:
for_training_df.true_rating.value_counts(normalize=True)

true_rating
5.0    0.237694
1.0    0.213358
4.0    0.193596
3.0    0.184020
2.0    0.171333
Name: proportion, dtype: float64

In [None]:
for_training_df = for_training_df[for_training_df.true_rating.notna()].reset_index(drop=True)

In [None]:
for_training_df.true_rating = for_training_df.true_rating.astype(int)

In [None]:
for_training_df.sample(5)

Unnamed: 0,review_id,review_text,true_rating
16394,ZZKT73883327200485,Пустая трата денег. Экскурсовод Лидия - ярчайш...,1
7585,MJDG79866613655020,"Брестская крепость это конечно история, точнее...",4
379,APQL79105267571869,"Очень интересная экскурсия по вечерней Москве,...",5
8906,ORXN31804203867282,"Тур одного дня, Скантур, гид Вахтанг. Понравил...",4
4760,HXJA44254117147817,"Скучно, монотонно и ничего авторского.",1


In [None]:
for_training_df.to_csv('./for_multiclass_sentiment.csv', index=False)

## Getting ready for multi-class modeling

In [12]:
for_training_df = pd.read_csv('./for_multiclass_sentiment.csv')

In [13]:
for_training_df.true_rating = for_training_df.true_rating.astype(str)

In [14]:
test_sample = for_training_df.groupby('true_rating').sample(frac=0.15, random_state=42)
train_sample = for_training_df[for_training_df.index.isin(test_sample.index.to_list())==False]

In [15]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [16]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

Formatting datasets - mapping to label ids, resetting indices.



In [17]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [18]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [19]:
tr_df = tr_df.drop(columns=['review_id']).rename(columns={'review_text': 'text', 'true_rating': 'label'})
te_df = te_df.drop(columns=['review_id']).rename(columns={'review_text': 'text', 'true_rating': 'label'})

In [20]:
mapper = {'1': 1, '2': 2, '3': 3, '4': 4, '5': 5}

In [21]:
id_mapper = {1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}

Turning dataframes into transformers dataset type.

In [22]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=tr_df.label.unique().tolist())})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [23]:
tr_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['5', '4', '2', '3', '1'], id=None)}

Preprocessing text - tokenization:

In [24]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [25]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/13935 [00:00<?, ? examples/s]

Map:   0%|          | 0/2460 [00:00<?, ? examples/s]

In [26]:
tok_tr_dataset[2]

{'text': 'Нет четкой организации...',
 'label': 2,
 'input_ids': [2, 30616, 79109, 7200, 18, 18, 18, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [47]:
#MODEL_NAME = 'cointegrated/rubert-tiny2'
MODEL_NAME = 'laskovey/polarity_train2'

Loading pretrained model:

In [48]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mapper), id2label=mapper, label2id=mapper, ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at laskovey/polarity_train2 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([2, 312]) in the checkpoint and torch.Size([5, 312]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up retraining parameters:

In [49]:
training_args = TrainingArguments(
    output_dir="polarity_train3",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### Round 1 of retraining

In [50]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.1295,1.032053,0.538243
2,0.9546,1.037642,0.55283
3,0.8688,1.056596,0.542422
4,0.7866,1.09397,0.551514
5,0.7139,1.135747,0.550653
6,0.6576,1.182217,0.545549
7,0.6126,1.212406,0.544857
8,0.5859,1.227771,0.540787


TrainOutput(global_step=13936, training_loss=0.7886799484936159, metrics={'train_runtime': 824.0543, 'train_samples_per_second': 135.282, 'train_steps_per_second': 16.912, 'total_flos': 201495948802188.0, 'train_loss': 0.7886799484936159, 'epoch': 8.0})

In [51]:
trainer.evaluate()

{'eval_loss': 1.0376417636871338,
 'eval_f1': 0.5528299428612437,
 'eval_runtime': 3.9392,
 'eval_samples_per_second': 624.489,
 'eval_steps_per_second': 156.122,
 'epoch': 8.0}

Let's view a more detailed report for resulting classification model: one that would include F-score, Precision and Recall for each of the classes.

To see, on which class labels model performs best, and whether there are classes needing additional retraining.

In [56]:
from sklearn.metrics import classification_report


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]

    print(classification_report(labels, predictions))
    return metric.compute(predictions=predictions, references=labels)


In [57]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/polarity_train3")


In [58]:
text_classification_pipeline('Всё отлично, спасибо!')

KeyError: 0

In [59]:
MODEL_NAME = 'cointegrated/rubert-tiny2'
#MODEL_NAME = 'laskovey/polarity_train2'

Loading pretrained model:

In [61]:
id_mapper

{1: '1', 2: '2', 3: '3', 4: '4', 5: '5'}

In [62]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mapper), id2label=id_mapper, label2id=mapper, ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Setting up retraining parameters:

In [65]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [66]:
training_args = TrainingArguments(
    output_dir="polarity_train4",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### Round 2 of retraining (rubert-tiny2)

In [67]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.9892,1.088013,0.515918
2,0.9385,1.082916,0.538723
3,0.8641,1.10714,0.535298
4,0.7906,1.142945,0.543485
5,0.7282,1.188981,0.545152
6,0.6745,1.232486,0.533491
7,0.6325,1.267604,0.533274
8,0.6064,1.275729,0.536568


TrainOutput(global_step=13936, training_loss=0.7780180097858066, metrics={'train_runtime': 982.1403, 'train_samples_per_second': 113.507, 'train_steps_per_second': 14.189, 'total_flos': 201495948802188.0, 'train_loss': 0.7780180097858066, 'epoch': 8.0})

In [68]:
trainer.evaluate()

{'eval_loss': 1.1889806985855103,
 'eval_f1': 0.5451517121535752,
 'eval_runtime': 4.8679,
 'eval_samples_per_second': 505.353,
 'eval_steps_per_second': 126.338,
 'epoch': 8.0}

In [69]:
X = te_df.text.to_list()
y_act = te_df.label.to_list()

In [70]:
labels = te_df.label.unique()


In [71]:
len(X)

2460

In [73]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/polarity_train4")


config.json:   0%|          | 0.00/923 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [74]:
%%time
predictions = text_classification_pipeline(X, top_k=5)

KeyError: 0

### Adding more data

In [78]:
new_data_df = pd.read_csv('./polarity_retaining_new.csv')

In [79]:
new_for_training_df = new_data_df[['review_text', 'DETAIL_RATING']]
new_for_training_df.columns = ['text', 'label']
new_for_training_df.label = new_for_training_df.label.astype(str)

In [80]:
test_sample = new_for_training_df.groupby('label').sample(frac=0.15, random_state=42)
train_sample = new_for_training_df[new_for_training_df.index.isin(test_sample.index.to_list())==False]

Formatting datasets - mapping to label ids, resetting indices.



In [81]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [82]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [83]:
mapper = {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4}

In [84]:
id_mapper = {0: '1', 1: '2', 2: '3', 3: '4', 4: '5'}

Turning dataframes into transformers dataset type.

In [85]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=tr_df.label.unique().tolist())})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [86]:
tr_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['5', '1', '4', '3', '2'], id=None)}

Preprocessing text - tokenization:

In [87]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [88]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10166 [00:00<?, ? examples/s]

Map:   0%|          | 0/1795 [00:00<?, ? examples/s]

In [92]:
tok_tr_dataset[3]

{'text': 'Ужасная экскурсия. Организатор выступает только как трансфер. Экскурсовод в первую дорогу еще пытался абсолютно скушно что то рассказать. На самой экскурсии выяснилось, что билеты для льготных граждан не куплены. Их приобритали непосредственно в день экскурсии на другое время чем основная группа, очень долго регали этот вопрос. Экскурсовод абсолютно не умеет организовывать людей. На обратно дороге просто ехали молча. В качестве трансфера можно взять такси, быстрее и дешевле. Очень разочарована',
 'label': 1,
 'input_ids': [2,
  64702,
  34811,
  55469,
  18,
  59155,
  21932,
  2768,
  1150,
  41264,
  18,
  60018,
  42557,
  314,
  12712,
  32626,
  21523,
  29280,
  30528,
  31791,
  13770,
  1046,
  1619,
  33490,
  18,
  1041,
  14382,
  39182,
  33534,
  16,
  1046,
  35193,
  871,
  66899,
  22800,
  769,
  75879,
  700,
  18,
  13028,
  29744,
  52956,
  1044,
  20948,
  314,
  4045,
  39182,
  548,
  28260,
  1614,
  5151,
  35257,
  7173,
  16,
  6003,
  27811,
  483

In [93]:
#MODEL_NAME = 'cointegrated/rubert-tiny2'
MODEL_NAME = 'laskovey/polarity_train4'

Loading pretrained model:

In [94]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(mapper), id2label=id_mapper, label2id=mapper, ignore_mismatched_sizes=True
)

Setting up retraining parameters:

In [95]:
training_args = TrainingArguments(
    output_dir="polarity_train5",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [96]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,1.0749,0.881078,0.602623
2,0.8348,0.878932,0.604303
3,0.7475,0.893225,0.60327
4,0.6781,0.937063,0.610481
5,0.607,0.983889,0.599475
6,0.5567,1.027319,0.598881
7,0.5111,1.050943,0.59358
8,0.4863,1.06241,0.595145


TrainOutput(global_step=10168, training_loss=0.6870417861278183, metrics={'train_runtime': 661.0706, 'train_samples_per_second': 123.025, 'train_steps_per_second': 15.381, 'total_flos': 174179471986320.0, 'train_loss': 0.6870417861278183, 'epoch': 8.0})

In [97]:
trainer.evaluate()

{'eval_loss': 0.9370633959770203,
 'eval_f1': 0.610480603101462,
 'eval_runtime': 3.0641,
 'eval_samples_per_second': 585.821,
 'eval_steps_per_second': 146.537,
 'epoch': 8.0}