## 01. Libraries Import

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
!pip install datasets



In [3]:
!pip install transformers evaluate accelerate



In [4]:
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

from transformers import TrainerCallback
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import EarlyStoppingCallback

from datasets import Features, Sequence, Value, ClassLabel

from datasets import Dataset

import evaluate

In [5]:
metric = evaluate.load("f1", average='macro')

In [6]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [7]:
def calculate_metrics(true_labels, predictions):
    metrics = {
    'Accuracy': accuracy_score(true_labels, predictions)
    }

    for avg in ['micro', 'macro', 'weighted']:
        metrics['Precision_'+avg.title()] = precision_score(true_labels, predictions, average=avg)
        metrics['Recall_'+avg.title()] = recall_score(true_labels, predictions, average=avg)
        metrics['F1-ratio_'+avg.title()] = f1_score(true_labels, predictions, average=avg)

    return metrics

In [8]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("laskovey/review_train3", ignore_mismatched_sizes=True)
model = AutoModel.from_pretrained("laskovey/review_train3", ignore_mismatched_sizes=True)
model.cuda()  # uncomment it if you have a GPU

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(83828, 312, padding_idx=0)
    (position_embeddings): Embedding(2048, 312)
    (token_type_embeddings): Embedding(2, 312)
    (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-2): 3 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=312, out_features=312, bias=True)
            (key): Linear(in_features=312, out_features=312, bias=True)
            (value): Linear(in_features=312, out_features=312, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=312, out_features=312, bias=True)
            (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)

In [9]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 02. Importing manually labeled "certain" labels

In [10]:
ml = pd.read_csv('./certain_labels.csv')

In [11]:
ml

Unnamed: 0,sentence,pred_cluster,pred_clus_label,pred_clus_label_name,prediction_score,correct_predicted_label,words
0,Если бы можно было поставить 100...,13,16,unrelated,0.991362,unrelated,"['если', 'бы', 'можно', 'быть', 'поставлять']"
1,"Казалось бы, такие места есть в любом городе, ...",16,12,guide's proficiency,0.994126,guide's proficiency,"['казаться', 'бы', 'такой', 'место', 'быть', '..."
2,"Интересный рассказ, внимательное и обходительн...",16,12,guide's proficiency,0.976098,guide's proficiency,"['интересный', 'рассказ', 'внимательный', 'и',..."
3,Ни сервиса ни вежливости..,7,0,customer service,0.998966,customer service,"['ни', 'сервис', 'ни', 'вежливость']"
4,Экскурсия очень понравилась.,0,8,appraisal,0.999557,appraisal,"['экскурсия', 'очень', 'понравиться']"
...,...,...,...,...,...,...,...
62276,"Прогулка на катере не заставила долго ждать, о...",2,1,delay,0.980207,delay,"['прогулка', 'на', 'катер', 'не_заставлять', '..."
62277,Экскурсия была аудио.,13,16,unrelated,0.998328,unrelated,"['экскурсия', 'быть', 'аудио']"
62278,Огромная Вам благодарность за Ваше гостеприимс...,0,8,appraisal,0.992342,appraisal,"['огромный', 'вы', 'благодарность', 'за', 'ваш..."
62279,Связи с ними вообще нет.,7,0,customer service,0.984219,customer service,"['связь', 'с', 'они', 'вообще', 'нет']"


In [12]:
len(ml.groupby("correct_predicted_label"))

32

Original mapper, on which the best previous model was trained:

In [13]:
mapper = {'customer service': 0,
 'delay': 1,
 'no substance': 2,
 'unmet expectations': 3,
 'coordination': 4,
 "guide's attitude": 5,
 'conditioning': 6,
 'vehicle': 7,
 'appraisal': 8,
 'overpriced': 9,
 'cancelation': 10,
 'stray': 11,
 "guide's proficiency": 12,
 'offer disparity': 13,
 'program change': 14,
 'other tourists': 15,
 'unrelated': 16,
 'haste': 17,
 'dirty windows': 18,
 'bland delivery': 19,
 'hazard': 20,
 'price lift': 21,
 'time shortage': 22,
 'theme divergence': 23,
 'food': 24,
 'fact misstatement': 25,
 'rudeness & bigotry': 26,
 'shallow narration': 27,
 'facilities': 28,
 'humble route': 29,
 'weather': 30,
 'refund': 31,
 'product': 32,
 'weather | advice': 33}

In [14]:
ml = ml[['sentence', 'correct_predicted_label']]

In [15]:
ml.columns  = ['text', 'label']

Splitting dataset into train and test (test = 15% of the whole dataset, 9337 rows)

In [16]:
test_sample = ml.groupby('label').sample(frac=0.15, random_state=42)
train_sample = ml[ml.index.isin(test_sample.index.to_list())==False]

In [None]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

Formatting datasets - mapping to label ids, resetting indices.

Important to note: there are less labels than were in train dataset at previous stages (32 against 34 previously; thus, we need to remap the values and set up model training in such way that it ignores original mappings).

In [17]:
tr_df = train_sample.copy(deep=True)
te_df = test_sample.copy(deep=True)

In [18]:
tr_df.label = tr_df.label.map(mapper)
te_df.label = te_df.label.map(mapper)

In [19]:
tr_df.reset_index(inplace=True, drop=True)
te_df.reset_index(inplace=True, drop=True)

In [20]:
new_mapper = dict(zip(tr_df.label.unique().tolist(),[i for i in range(tr_df.label.nunique())]))

In [21]:
tr_df.label = tr_df.label.map(new_mapper)
te_df.label = te_df.label.map(new_mapper)

In [22]:
corr_mapper = dict()
for m in mapper:
    if mapper[m] in new_mapper:
        corr_mapper[m] = new_mapper[mapper[m]]


In [23]:
len(corr_mapper)

32

Turning dataframes into transformers dataset type.

In [24]:
ftrs= Features({'text': Value(dtype='string'), 'label': ClassLabel(num_classes=tr_df.label.nunique(),
                           names=tr_df.label.unique().tolist())})

tr_dataset = Dataset.from_pandas(tr_df, features=ftrs)
te_dataset = Dataset.from_pandas(te_df, features=ftrs)

In [25]:
tr_dataset.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], id=None)}

Preprocessing text - tokenization:

In [26]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [27]:
tok_tr_dataset = tr_dataset.map(preprocess_function, batched=True)
tok_te_dataset = te_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/52944 [00:00<?, ? examples/s]

Map:   0%|          | 0/9337 [00:00<?, ? examples/s]

In [28]:
tok_tr_dataset[3]

{'text': 'Ни сервиса ни вежливости..',
 'label': 2,
 'input_ids': [2, 30180, 32705, 5548, 52820, 1948, 18, 18, 3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [29]:
id_mapper = dict(zip(list(corr_mapper.values()), list(corr_mapper.keys())))

Choosing previous best nodel for further retraining:

In [30]:
MODEL_NAME = 'laskovey/review_train3'

In [31]:
MODEL_NAME

'laskovey/review_train3'

Loading pretrained model:

In [39]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME, num_labels=len(corr_mapper), id2label=id_mapper, label2id=corr_mapper, ignore_mismatched_sizes=True
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at laskovey/review_train3 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([34]) in the checkpoint and torch.Size([32]) in the model instantiated
- classifier.weight: found shape torch.Size([34, 312]) in the checkpoint and torch.Size([32, 312]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
!huggingface-cli login --token #MY_HuggingFace_TOKEN_HERE


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Setting up retraining parameters:

In [40]:
training_args = TrainingArguments(
    output_dir="review_train5",
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    logging_strategy = "epoch",
    save_total_limit=5,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    num_train_epochs=8,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
    #fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_tr_dataset,
    eval_dataset=tok_te_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

### Round 1 of retraining

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
0,1.5057,0.683765,0.675412
2,0.3318,0.248373,0.893138
4,0.1777,0.163144,0.941402
6,0.1291,0.154689,0.941096
7,0.118,0.147893,0.943216


TrainOutput(global_step=13232, training_loss=0.4011227712665821, metrics={'train_runtime': 647.6866, 'train_samples_per_second': 653.946, 'train_steps_per_second': 20.43, 'total_flos': 222782389068288.0, 'train_loss': 0.4011227712665821, 'epoch': 7.997582351163493})

In [42]:
trainer.evaluate()

{'eval_loss': 0.14789295196533203,
 'eval_f1': 0.9432155521927994,
 'eval_runtime': 3.6312,
 'eval_samples_per_second': 2571.324,
 'eval_steps_per_second': 160.828,
 'epoch': 7.997582351163493}

Let's view a more detailed report for resulting classification model: one that would include F-score, Precision and Recall for each of the classes.

To see, on which class labels model performs best, and whether there are classes needing additional retraining.

In [43]:
from sklearn.metrics import classification_report


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    if task != "stsb":
        predictions = np.argmax(predictions, axis=1)
    else:
        predictions = predictions[:, 0]

    print(classification_report(labels, predictions))
    return metric.compute(predictions=predictions, references=labels)


In [44]:
from transformers import pipeline
text_classification_pipeline = pipeline("text-classification", model="laskovey/review_train5")


config.json:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Setting up features, true and predicted labels:

In [46]:
X = test_sample.text.to_list()
y_act = test_sample.label.to_list()

In [47]:
labels = test_sample.label.unique()


In [48]:
y_pred = [result["label"] for result in text_classification_pipeline(X)]

Outputting classification report:

In [49]:
print(classification_report(y_pred, y_act, labels=labels))


                     precision    recall  f1-score   support

          appraisal       0.98      0.96      0.97      1440
     bland delivery       0.93      0.95      0.94       221
        cancelation       0.97      0.91      0.94        64
       conditioning       0.97      0.89      0.93        63
       coordination       0.96      0.97      0.97       434
   customer service       0.97      0.96      0.97       320
              delay       0.97      0.94      0.96       340
      dirty windows       0.90      1.00      0.95        27
  fact misstatement       0.85      1.00      0.92        51
               food       0.82      1.00      0.90        28
   guide's attitude       0.94      0.96      0.95       412
guide's proficiency       0.97      0.96      0.97      1449
              haste       0.97      0.94      0.95        64
             hazard       0.94      0.97      0.95        87
       humble route       0.95      0.94      0.95       217
       no substance    

The model performs solidly, with majority of metrics for separate classes reaching 0.9.

The lowest score is Precision=0.77 for class "other tourists", which is a challenging class to identify, since the text content may vary greatly depending on what those _other tourists_ done to deserve mentioning by the reviewer.

This class was decided to be identified to avoid inclusing such sentences into further analysis, considering that other tourists' behaviour is out of company's control.

Still, the Recall for class reaches 0.94, signyfying that the model identifies most "other tourists" instances, which is desirable. Overall F-score for this class is 0.85, which surpassess the initial goal of 0.8.