In [32]:
from sklearn.model_selection import train_test_split

import spacy
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline, TrainingArguments, Trainer
from datasets import Dataset

import numpy as np
import pandas as pd
import stanza

## Load and split data

In [33]:
texts, ids = [], []
with open('train_reviews.txt') as f:
    for line in f:
        text_id, text = line.rstrip('\r\n').split('\t')
        texts.append(text)
        ids.append(text_id)

In [34]:
train_texts, dev_texts, train_ids, dev_ids = train_test_split(texts, ids)

In [35]:
train_aspects, dev_aspects = [], []
with open('train_aspects.txt') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_aspects.append(line)
        if text_id in dev_ids:
            dev_aspects.append(line)

In [36]:
train_sentiment, dev_sentiment = [], []
with open('train_cats.txt') as f:
    for line in f:
        line = line.rstrip('\r\n')
        text_id = line.split('\t')[0]
        if text_id in train_ids:
            train_sentiment.append(line)
        if text_id in dev_ids:
            dev_sentiment.append(line)

In [37]:
with open('train_split_aspects.txt', 'w') as f:
    for l in train_aspects:
        print(l, file=f)
with open('dev_aspects.txt', 'w') as f:
    for l in dev_aspects:
        print(l, file=f)
with open('train_split_reviews.txt', 'w') as f:
    for i, l in zip(train_ids, train_texts):
        print(i, l, sep="\t", file=f)
with open('dev_reviews.txt', 'w') as f:
    for i, l in zip(dev_ids, dev_texts):
        print(i, l, sep="\t", file=f)
with open('train_split_cats.txt', 'w') as f:
    for l in train_sentiment:
        print(l, file=f)
with open('dev_cats.txt', 'w') as f:
    for l in dev_sentiment:
        print(l, file=f)

## Prepare aspects dataset

In [38]:
data = []

for elem in train_aspects:
    cur = list(elem.split('\t'))
    row = {
        'token': cur[2],
        'category': cur[1],
    }
    data.append(row)

asp_df = pd.DataFrame(data)

In [39]:
asp_df.category.value_counts()

Food        1426
Service      927
Whole        604
Interior     516
Price        100
Name: category, dtype: int64

In [40]:
stanza.download('ru')

stanza_nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-17 14:24:27 INFO: Downloading default packages for language: ru (Russian) ...
2023-01-17 14:24:28 INFO: File exists: /Users/ceo/stanza_resources/ru/default.zip
2023-01-17 14:24:31 INFO: Finished downloading models and saved to /Users/ceo/stanza_resources.
2023-01-17 14:24:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

2023-01-17 14:24:32 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| lemma     | syntagrus |

2023-01-17 14:24:32 INFO: Use device: cpu
2023-01-17 14:24:32 INFO: Loading: tokenize
2023-01-17 14:24:32 INFO: Loading: lemma
2023-01-17 14:24:32 INFO: Done loading processors!


In [41]:
def normalize(text):
    doc = stanza_nlp(text)
    words = [word.lemma for sent in doc.sentences for word in sent.words]
    return words

In [42]:
norm_tokens = []
for elem in asp_df.iterrows():
    norm_tokens.append(normalize(elem[1]['token']))

In [43]:
asp_df['norm'] = norm_tokens

In [44]:
asp_df.head()

Unnamed: 0,token,category,norm
0,ресторане,Whole,[ресторан]
1,ресторанах,Whole,[ресторан]
2,ресторане,Whole,[ресторан]
3,Столик бронировали,Service,"[столик, бронировали]"
4,администратор,Service,[администратор]


## Aspects extraction

In [45]:
nlp = spacy.load("ru_core_news_lg")

In [46]:
aspects = []
start_pos = []
end_pos = []
descr = []
asp_ids = []

for t, idx in tqdm(zip(dev_texts, dev_ids)):
    doc = nlp(t)

    for token in doc:
        if token.pos_ == 'NOUN':
            for j in token.lefts:
                if j.dep_ == 'amod' and j.pos_ == 'ADJ':
                    flag = False
                    for k in j.lefts:
                        if k.dep_ == 'advmod':
                            flag = True
                            aspects.append(token)
                            start_pos.append(token.idx)
                            end_pos.append(token.idx + len(token))
                            descr.append(k.lemma_ + ' ' + j.lemma_)
                            asp_ids.append(idx)
                            
                    if not flag:
                        aspects.append(token)
                        start_pos.append(token.idx)
                        end_pos.append(token.idx + len(token))
                        descr.append(j.lemma_)
                        asp_ids.append(idx)
                    
        if token.pos_ == 'VERB':
            for j in token.lefts:
                if j.dep_ == 'advmod' and j.pos_ == 'ADV':
                    aspects.append(token)
                    start_pos.append(token.idx)
                    end_pos.append(token.idx + len(token))
                    descr.append(j.lemma_)
                    asp_ids.append(idx)
                    
            for j in token.rights:
                if j.dep_ == 'advmod'and j.pos_ == 'ADV':
                    aspects.append(token)
                    start_pos.append(token.idx)
                    end_pos.append(token.idx + len(token))
                    descr.append(j.lemma_)
                    asp_ids.append(idx)

71it [00:02, 24.32it/s]


## Aspects classification

In [47]:
weights = {}

label2id = {
    'Food': 0,
    'Whole': 1,
    'Service': 2,
    'Interior': 3,
    'Price': 4,
}

id2label = {
    0: 'Food',
    1: 'Whole',
    2: 'Service',
    3: 'Interior',
    4: 'Price',
    5: 'None',
}

for elem in asp_df.iterrows():
    for lemm in elem[1]['norm']:
        if lemm not in weights:
            weights[lemm] = {
                0: 0,
                1: 0,
                2: 0,
                3: 0,
                4: 0,
            }
        
        weights[lemm][label2id[elem[1]['category']]] += 1
        
for lemm in weights:
    total = weights[lemm][0] + weights[lemm][1] + weights[lemm][2] + weights[lemm][3] + weights[lemm][4]
    weights[lemm][0] /= total
    weights[lemm][1] /= total
    weights[lemm][2] /= total
    weights[lemm][3] /= total
    weights[lemm][4] /= total

In [48]:
labels = []

for token in aspects:
    lemm = token.lemma_
    if lemm not in weights:
        labels.append('None')
    else:
        cur_label = 5
        best = 0
        for i in range(5):
            if weights[lemm][i] > 0.4 and weights[lemm][i] > best:
                best = weights[lemm][i]
                cur_label = i
                
        labels.append(id2label[cur_label])

In [49]:
pred_asp = pd.DataFrame(
    {
        'text_id': asp_ids,
        'category': labels,
        'mention': aspects,
        'start': start_pos,
        'end': end_pos,
    }
)

## Aspect sentiment analysis

In [50]:
sentiment_pipe = pipeline("sentiment-analysis", model='Tatyana/rubert-base-cased-sentiment-new')

In [51]:
sent_scores = []
sent_labels = []

for elem in tqdm(descr):
    res = sentiment_pipe(elem)
    sent_scores.append(res[0]['score'])
    sent_labels.append(res[0]['label'].lower())

100%|███████████████████████████████████████| 1034/1034 [00:39<00:00, 26.08it/s]


In [52]:
pred_asp['sentiment'] = sent_labels
pred_asp['sent_score'] = sent_scores

## Aspects finish

In [53]:
pred_asp.drop(pred_asp[pred_asp['category'] == 'None'].index, inplace=True)

In [54]:
pred_asp.head()

Unnamed: 0,text_id,category,mention,start,end,sentiment,sent_score
1,16568,Service,посетителем,87,98,neutral,0.810643
2,16568,Whole,мест,172,176,positive,0.9796
4,16568,Food,еда,254,257,positive,0.97675
5,16568,Food,рыба,276,280,negative,0.749643
6,16568,Food,рыба,276,280,neutral,0.748429


In [55]:
with open('final_pred_aspects.txt', 'w') as f:
    for elem in pred_asp.drop(columns='sent_score', axis=1).iterrows():
        vals = list(map(str, elem[1].values))
        print('\t'.join(vals), file=f)

## Text sentiment analysis

In [56]:
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

sent_ids = []
sent_cats = []
text_sent = []

for text_id in dev_ids:
    for cat in CATEGORIES:
        positive = 0
        negative = 0
        neutral = 0
        cnt_pos = 0
        cnt_neg = 0
        cnt_neu = 0
        for elem in pred_asp[pred_asp['text_id'] == text_id][pred_asp['category'] == cat].iterrows():
            if elem[1].sentiment == 'positive':
                positive += elem[1].sent_score * elem[1].sent_score
                cnt_pos += 1
            elif elem[1].sentiment == 'negative':
                negative += elem[1].sent_score * elem[1].sent_score
                cnt_neg += 1
            else:
                neutral += elem[1].sent_score * elem[1].sent_score
                cnt_neu += 1
            
        if cnt_pos > 1 and cnt_neg > 1:
            verdict = 'both'
        elif cnt_pos + cnt_neg + cnt_neu == 0:
            verdict = 'absence'
        else:
            verdict = 'positive'
            best = positive
            
            if negative > best:
                best = negative
                verdict = 'negative'
            
            if neutral > best:
                best = neutral
                verdict = 'neutral'
        
        sent_ids.append(text_id)
        sent_cats.append(cat)
        text_sent.append(verdict)

  for elem in pred_asp[pred_asp['text_id'] == text_id][pred_asp['category'] == cat].iterrows():


In [57]:
final_pred_cats = pd.DataFrame(
    {
        'text_id': sent_ids,
        'category': sent_cats,
        'sentiment': text_sent,
    }
)

In [58]:
with open('final_pred_cats.txt', 'w') as f:
    for elem in final_pred_cats.iterrows():
        vals = list(map(str, elem[1].values))
        print('\t'.join(vals), file=f)

## Отчет

### Использованные методы

* Для выделения aspects и opinions была использована библиотека spacy
* Для классификации аспектов планировалось применение bert-base-multilingual-cased + fine-tune с помощью библиотеки transformers от hugging-face, но к сожалению она быстро не завелась (а время сильно поджимало) и пришлось использовать dummy-классификацию
* Для оценки sentiment скора была применена предобученная модель Tatyana/rubert-base-cased-sentiment-new с помощью библиотеки transformers
* text/aspect окрас был предсказан на основе полученных opinions при выделении аспектов с помощью скора sentiment анализа по аспектам
* Для лемматизации также кое-где была использована stanza

### Потенциальные лучшения

* Самое тривиальное, но действенное – улучшить выбор кандидатов в список тематических аспектов. Необходимо добавить больше синтаксических паттернов в разбор выражений
* Как было описано выше – использовать предобученную модель + fine-tune для классификации аспектов
* Аналогично, можно сделать fine-tune для модели оценки окраса (в нашем случае это tatyana/rubert-base-cased-sentiment-new)
* Еще один простой способ повысить скор – смержить бейзлайн модель с нашей (что не стало делаться из уважения, вместо этого были попытки успеть завести качественную модель)

### Инструкция для тестирования

Данный файл содержит в себе весь пайплайн. Для тестирования на других данных но при той же обучающей выборке нужно подставить необходимые dev_texts и dev_ids.
Оценка скора производится скриптом из условия.