## 1. **ABTE**
#### 1.1 Запускаем на тестовых данных

Комментарии см. в readme репозитория

In [None]:
!pip install adapters --q

In [None]:
from utils_for_abte import clean_data, clean_idx, spans_to_tokens, predictor
from abte import ABTEDataset, ABTEBert, ABTEModel
from transformers import BertModel, BertTokenizer
import torch

In [None]:
# Fetch the file with this Google Drive id and save it as fine_model.pkl (wget output is suppressed).
!wget -O fine_model.pkl "drive.google.com/u/3/uc?id=1-LjUKtk4ejuFFgDg6PBH0ung0o25XWhR&export=download&confirm=yes" --quiet

In [None]:
# Fetch the file with this Google Drive id and save it as adapter_model.pkl (wget output is suppressed).
!wget -O adapter_model.pkl "drive.google.com/u/3/uc?id=1-upUgxGn__M-WZHAdOIsbyN0Lf-ULrCy&export=download&confirm=yes" --quiet

In [None]:
text_path = 'dev_reviews.txt'
asp_path = 'dev_aspects.txt'
# Bring the data into the format the ABTE helpers expect
dev_ids = clean_idx(text_path)
dev_raw = clean_data(text_path, asp_path)

# Initialise the models: both share one tokenizer and differ only in the second
# constructor argument (presumably the adapter switch - confirm in abte.ABTEModel)
tokenizer_path = 'cointegrated/rubert-tiny2'
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
fine_model = ABTEModel(tokenizer, False)
adapter_model = ABTEModel(tokenizer, True)

In [None]:
# сразу записывает в файл в нужном формате
# predictor writes its results straight to the output file in the required format
# NOTE(review): DEVICE is defined here but not passed to predictor - check whether it is still needed
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
predictor(dev_raw, 'fine_model.pkl', fine_model, dev_ids, 'fine-tuning_preds.txt')

In [None]:
# Same call for the adapter model, using adapter_model.pkl and adapter_preds.txt
predictor(dev_raw, 'adapter_model.pkl', adapter_model, dev_ids, 'adapter_preds.txt')

#### 1.2 Оцениваем предсказания

Обернём код из `Evaluation.ipynb` в функцию.

In [None]:
from collections import defaultdict

def _match_prediction(gold, start, end, category):
    """Score one predicted span against the gold aspects of a single review.

    Args:
        gold: dict with parallel lists "starts", "ends" and "cats".
        start: predicted span start.
        end: predicted span end.
        category: predicted category.

    Returns:
        Tuple of increments (full_match, partial_match, full_cat_match,
        partial_cat_match) contributed by this prediction.
    """
    starts, ends, cats = gold["starts"], gold["ends"], gold["cats"]

    # Exact span match: the category is counted as "full" when it agrees and
    # as "partial" when it does not.
    if start in starts:
        i = starts.index(start)
        if ends[i] == end:
            same_category = cats[i] == category
            return 1, 0, int(same_category), int(not same_category)

    partial_match = 0
    partial_cat_match = 0
    for s_pos in starts:
        # index() returns the first gold aspect with this start, so duplicated
        # starts always resolve to the same gold entry.
        i = starts.index(s_pos)
        if start <= s_pos:
            if ends[i] == end:
                partial_match += 1
                partial_cat_match += int(cats[i] == category)
                # NOTE: the scan deliberately goes on after an end-aligned
                # match, so one prediction may add several partial matches.
                continue
            # The predicted end falls between this gold start and the end of
            # this or any later gold aspect.
            if any(s_pos <= end <= e_pos for e_pos in ends[i:]):
                partial_match += 1
                partial_cat_match += int(cats[i] == category)
                break
        elif start < ends[i] <= end:
            # The prediction starts inside the gold span and covers its end.
            partial_match += 1
            partial_cat_match += int(cats[i] == category)
            break
    return 0, partial_match, 0, partial_cat_match


def evaluator(gold_test_path, pred_test_path):
    """Compare predicted aspect spans with the gold annotation and print the metrics.

    Both files are tab-separated with one aspect per line. The columns read are
    0 (review id), 1 (category), 3 (span start) and 4 (span end). Blank lines
    are skipped. Files are read as UTF-8.

    Args:
        gold_test_path: path to the gold aspect file.
        pred_test_path: path to the predicted aspect file.

    Returns:
        None. Prints full-match precision and recall, the share of predictions
        with at least a partial match, and the two category accuracies. A ratio
        with an empty denominator is reported as 0.0.
    """
    gold_aspect_cats = {}
    with open(gold_test_path, encoding="utf-8") as fg:
        for line in fg:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if fields[0] not in gold_aspect_cats:
                gold_aspect_cats[fields[0]] = {"starts": [], "ends": [], "cats": []}
            # Every gold line adds an aspect, not only the first one of a review.
            doc = gold_aspect_cats[fields[0]]
            doc["starts"].append(int(fields[3]))
            doc["ends"].append(int(fields[4]))
            doc["cats"].append(fields[1])

    full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
    total = 0
    with open(pred_test_path, encoding="utf-8") as fp:
        for line in fp:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            total += 1
            fields = line.split('\t')
            start, end = int(fields[3]), int(fields[4])
            gold = gold_aspect_cats.get(fields[0])
            if gold is None:
                # A prediction for a review without gold aspects is a plain miss.
                continue
            full, partial, full_cat, partial_cat = _match_prediction(
                gold, start, end, fields[1]
            )
            full_match += full
            partial_match += partial
            full_cat_match += full_cat
            partial_cat_match += partial_cat

    gold_size = sum(len(doc["cats"]) for doc in gold_aspect_cats.values())

    def ratio(numerator, denominator):
        # Empty input yields 0.0 instead of a ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    print(f"""
    Full match precision: {ratio(full_match, total)}
    Full match recall: {ratio(full_match, gold_size)}
    Partial match ratio in pred: {ratio(full_match + partial_match, total)}
    Full category accuracy: {ratio(full_cat_match, total)}
    Partial category accuracy: {ratio(full_cat_match + partial_cat_match, total)}
    """)

# 2. **ABSA**

Двумя способами:
1. out-of-the-box решение с помощью [PyABSA](https://pyabsa.readthedocs.io/en/latest/#).<br>**Проблемы**: есть только три класса тональности (нет *both*), ~~нельзя~~ очень сложно дообучить (свой формат датасетов...)
2. при помощи BERT и маскирования аспектных слов<br>**Проблемы**: обучение идёт долго, лосс начинает стагнировать, нельзя использовать большие батчи.

## 2.1. *BERT*

In [None]:
# !pip install transformers -q

In [None]:
from absa_bert import ABSADataset, ABSABert, ABSAModel
from utils_for_absa import get_join, accuracy
import pandas as pd

Вот ссылки на обученные модельки:

In [None]:
# Fetch the two files from Google Drive and save them under the given names (wget output is suppressed).
!wget -O model_epochs3_batch5.pkl 'drive.google.com/u/3/uc?id=1Zsadd-x4ZiODUSj9kIORDQdTMDTgpBcj&export=download&confirm=yes' --quiet
!wget -O model_epochs15_batch5.pkl 'drive.google.com/u/3/uc?id=1m62fpVcmZSdiBpJzpiLzVfam918SncMq&export=download&confirm=yes' --quiet

Так выглядит тренировка:

In [None]:
# Hyperparameters handed to modelABSA.train
n_epoch = 15
batch_size = 5

# Join the training reviews with their aspect annotation
df_train = get_join('train_reviews.txt', 'train_aspects.txt')

In [None]:
# New ABSAModel instance for the training run
modelABSA = ABSAModel()

In [None]:
# Train on df_train with the n_epoch and batch_size set earlier
modelABSA.train(df_train, n_epoch=n_epoch, batch_size=batch_size)

Можно взять готовую модельку. Тогда сначала нужно объединить файл с отзывами и файл с разметкой по категориям (т.е. результат работы ABTE) при помощи `utils_for_absa.get_join()`, а потом подать в `predict`:
* полученное объединение (например, `df_finetuning`)
* название файла, куда запишутся тональности (например, `'fine-tuning_preds_absa.txt'`)
* готовую модель (например, `'model_epochs15_batch5.pkl'`)

In [None]:
# New ABSAModel instance; the weights file is named in each predict call
modelABSA = ABSAModel()

In [None]:
# Join the dev reviews with the aspects from fine-tuning_preds.txt, then write
# the predicted sentiments to fine-tuning_preds_absa.txt using model_epochs15_batch5.pkl
df_finetuning = get_join('dev_reviews.txt', 'fine-tuning_preds.txt')
modelABSA.predict(df_finetuning, 'fine-tuning_preds_absa.txt', 'model_epochs15_batch5.pkl')

  0%|          | 0/259 [00:00<?, ?it/s]

  return self._call_impl(*args, **kwargs)


Чтобы посчитать accuracy, надо вызвать функцию `utils_for_absa.accuracy()`, которая получает на вход названия двух файлов: с реальными тональностями и с предсказанными.

In [None]:
# Predict sentiments for the gold dev aspects and compare the written file
# (dev_absa.txt) with the gold annotation (dev_aspects.txt)
df_dev = get_join('dev_reviews.txt', 'dev_aspects.txt')
modelABSA.predict(df_dev, 'dev_absa.txt', 'model_epochs15_batch5.pkl')
print('Accuracy (epoch 15):', accuracy('dev_aspects.txt', 'dev_absa.txt'))

  0%|          | 0/397 [00:00<?, ?it/s]

  return self._call_impl(*args, **kwargs)


Accuracy (epoch 15): 0.6686291000841043


Соответственно, запуск для тестового файла (для двух аутпутов ABTE):

In [None]:
modelABSA = ABSAModel()

# NOTE(review): earlier cells write fine-tuning_preds.txt and adapter_preds.txt
# from the dev data; confirm they were regenerated for the test set before
# joining them with test_reviews.txt.

# Fine-tuned ABTE output: predict on the frame joined here (previously df_dev
# was passed, so the dev data was scored instead of the test data).
df_train_ft = get_join('test_reviews.txt', 'fine-tuning_preds.txt')
modelABSA.predict(df_train_ft, 'test_fine-tuning_absa.txt', 'model_epochs15_batch5.pkl')
print('Accuracy (epoch 15, fine-tuning):', accuracy('test_aspects.txt', 'test_fine-tuning_absa.txt'))

# Adapter ABTE output: same fix, predict on df_train_ad instead of df_dev.
df_train_ad = get_join('test_reviews.txt', 'adapter_preds.txt')
modelABSA.predict(df_train_ad, 'test_adapter_absa.txt', 'model_epochs15_batch5.pkl')
print('Accuracy (epoch 15, adapter):', accuracy('test_aspects.txt', 'test_adapter_absa.txt'))

## 2.2. *PyABSA*
**NB!** Работает только в Kaggle.

In [None]:
# !pip install transformers==4.29.0 -q
# !pip install pyabsa==1.16.27 -q

In [None]:
from absa_pyabsa import PyABSAModel

from utils_for_abte import clean_data

Надо подать в функцию `utils_for_abte.clean_data()` два файла: с текстами отзывов и с правильной разметкой. После чего запустить `predict_and_accuracy()` от полученного датафрейма, передав название файла, куда запишется результат модели.

In [None]:
# Build the frame from the dev reviews and their gold annotation, then predict;
# 'dev_pyabsa.txt' is the file the model output is written to
df_test = clean_data('dev_reviews.txt', 'dev_aspects.txt')
pyabsa_model = PyABSAModel()
print('Accuracy Dev:', pyabsa_model.predict_and_accuracy(df_test, 'dev_pyabsa.txt'))

Accuracy Dev: 0.6218487394957983


Соответственно, запуск для тестового файла:

In [None]:
# Same run on the test split; the score is labelled "Test" because df_test is
# built from the test files here (the label used to say "Dev").
# NOTE(review): the output file is still called 'train_pyabsa.txt' although it
# holds test predictions - rename once nothing else depends on that name.
df_test = clean_data('test_reviews.txt', 'test_aspects.txt')
pyabsa_model = PyABSAModel()
print('Accuracy Test:', pyabsa_model.predict_and_accuracy(df_test, 'train_pyabsa.txt'))

# 3. **Cats**

In [1]:
import re
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
#знакомьтесь франкенштейн

# Meet Frankenstein: a separate tokenizer/classifier pair is loaded for each category

model_name_food = 'numblilbug/food-rubert-sentiment-model'
model_name_interior = 'numblilbug/interior-rubert-sentiment-model'
model_name_price = 'numblilbug/price-rubert-sentiment-model'
model_name_service = 'numblilbug/service-rubert-sentiment-model'
# NOTE(review): this is the same checkpoint id as model_name_service, so the
# "whole" predictions will repeat the "service" ones - looks like a copy-paste
# slip; confirm the intended model id for the "whole" category.
model_name_whole = 'numblilbug/service-rubert-sentiment-model'

food_tokenizer = AutoTokenizer.from_pretrained(model_name_food)
food_model = AutoModelForSequenceClassification.from_pretrained(model_name_food)

interior_tokenizer = AutoTokenizer.from_pretrained(model_name_interior)
interior_model = AutoModelForSequenceClassification.from_pretrained(model_name_interior)

price_tokenizer = AutoTokenizer.from_pretrained(model_name_price)
price_model = AutoModelForSequenceClassification.from_pretrained(model_name_price)

service_tokenizer = AutoTokenizer.from_pretrained(model_name_service)
service_model = AutoModelForSequenceClassification.from_pretrained(model_name_service)

whole_tokenizer = AutoTokenizer.from_pretrained(model_name_whole)
whole_model = AutoModelForSequenceClassification.from_pretrained(model_name_whole)


tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/711M [00:00<?, ?B/s]

In [None]:
numbers = []
text_list = []

file_path = 'test_reviews.txt'

# Each non-empty line is "<review id>\t<review text>". The file is read as
# UTF-8 explicitly so that decoding does not depend on the platform default,
# and blank lines (e.g. a trailing newline) are skipped instead of raising.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        parts = line.split('\t', 1)
        numbers.append(int(parts[0]))
        text_list.append(parts[1])


In [None]:
def predict(text, tokenizer, model):
    """Return the index of the class with the highest score for `text`.

    Args:
        text: input string handed to `tokenizer`.
        tokenizer: callable that turns `text` into the keyword arguments of
            `model`; it is called with `return_tensors='pt'` and
            `truncation=True`.
        model: callable returning an object with a `logits` tensor; only the
            first row of the logits is used.

    Returns:
        int: position of the largest logit in the first row.
    """
    # Truncate over-long texts so that they cannot exceed the input size the
    # model accepts.
    tokenized_input = tokenizer(text, return_tensors='pt', truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**tokenized_input)

    # Softmax keeps the ordering of the logits, so the arg-max can be taken
    # from the logits directly.
    return int(torch.argmax(model_output.logits, dim=1)[0])


In [None]:
# One list of predicted class ids per category, filled in review order.
food_preds, interior_preds, price_preds, service_preds, whole_preds = [], [], [], [], []

# Each target list is paired with the tokenizer and model that feed it; the
# order matches the order in which the categories are evaluated per review.
category_heads = (
    (food_preds, food_tokenizer, food_model),
    (interior_preds, interior_tokenizer, interior_model),
    (price_preds, price_tokenizer, price_model),
    (service_preds, service_tokenizer, service_model),
    (whole_preds, whole_tokenizer, whole_model),
)

for text in text_list:
    for bucket, head_tokenizer, head_model in category_heads:
        bucket.append(predict(text, head_tokenizer, head_model))


In [None]:
class_labels = {'positive': 0, 'negative': 1, 'both': 2, 'neutral': 3, 'absence': 4}


def map_numbers_to_labels(predictions):
    """Translate predicted class ids into label names, one label per prediction.

    The result has the same length and order as `predictions`. The previous
    version returned only the distinct labels that occurred anywhere in the
    input, which lost the link between a review and its label.

    Args:
        predictions: iterable of class ids as listed in `class_labels`.

    Returns:
        list[str]: label name for every element of `predictions`.

    Raises:
        KeyError: if an id is not among the values of `class_labels`.
    """
    # Inverted on every call so that it always reflects the current class_labels.
    id_to_label = {class_id: label for label, class_id in class_labels.items()}
    return [id_to_label[class_id] for class_id in predictions]

# Convert the class ids of every category to label names, in the same order
# as the individual assignments: food, interior, price, service, whole.
(
    food_labels,
    interior_labels,
    price_labels,
    service_labels,
    whole_labels,
) = (
    map_numbers_to_labels(category_preds)
    for category_preds in (
        food_preds,
        interior_preds,
        price_preds,
        service_preds,
        whole_preds,
    )
)

In [None]:
output_file = 'test_cats_predictions.txt'

# Walk the review ids and the five label lists in lockstep so that every review
# gets exactly one line per category. The earlier nested loops paired each
# review id with the labels of all reviews.
with open(output_file, 'w') as file:
    for number, interior, food, price, whole, service in zip(
        numbers, interior_labels, food_labels, price_labels, whole_labels, service_labels
    ):
        file.write(f"{number}\tInterior\t{interior}\n")
        file.write(f"{number}\tFood\t{food}\n")
        file.write(f"{number}\tPrice\t{price}\n")
        file.write(f"{number}\tWhole\t{whole}\n")
        file.write(f"{number}\tService\t{service}\n")