# Train/dev split

In [None]:
!git clone https://github.com/named-entity/hse-nlp/
!cp -r ./hse-nlp/4th_year/Project/* .

Cloning into 'hse-nlp'...
remote: Enumerating objects: 2841, done.[K
remote: Counting objects: 100% (2671/2671), done.[K
remote: Compressing objects: 100% (2544/2544), done.[K
remote: Total 2841 (delta 278), reused 2390 (delta 119), pack-reused 170[K
Receiving objects: 100% (2841/2841), 69.13 MiB | 12.31 MiB/s, done.
Resolving deltas: 100% (362/362), done.
Checking out files: 100% (2128/2128), done.


# Baseline 1,2: категория и тональность упоминаний

Выделяем только аспекты, встретившиеся в train'е, приписываем самую частотную категорию.

In [None]:
import pandas as pd

In [None]:
train_asp = pd.read_csv(
    'train_split_aspects.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
train_texts = pd.read_csv('train_split_reviews.txt', delimiter='\t', names=['text_id','text'])

In [None]:
train_asp.head(3)

Unnamed: 0,text_id,category,mention,start,end,sentiment
0,30808,Whole,ресторане,16,25,neutral
1,30808,Interior,первом этаже,43,55,neutral
2,30808,Whole,руководству ресторана,124,145,positive


In [None]:
!pip install -qq stanza

[K     |████████████████████████████████| 432 kB 4.8 MB/s 
[K     |████████████████████████████████| 170 kB 42.3 MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [None]:
import stanza
stanza.download('ru')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2021-12-27 13:46:52 INFO: Downloading default packages for language: ru (Russian)...


Downloading https://huggingface.co/stanfordnlp/stanza-ru/resolve/v1.3.0/models/default.zip:   0%|          | 0…

2021-12-27 13:47:13 INFO: Finished downloading models and saved to /root/stanza_resources.


In [None]:
nlp = stanza.Pipeline('ru', processors='tokenize,pos,lemma')

2021-12-27 13:47:13 INFO: Loading these models for language: ru (Russian):
| Processor | Package   |
-------------------------
| tokenize  | syntagrus |
| pos       | syntagrus |
| lemma     | syntagrus |

2021-12-27 13:47:13 INFO: Use device: cpu
2021-12-27 13:47:13 INFO: Loading: tokenize
2021-12-27 13:47:13 INFO: Loading: pos
2021-12-27 13:47:14 INFO: Loading: lemma
2021-12-27 13:47:14 INFO: Done loading processors!


In [None]:
import nltk
from nltk.corpus import stopwords
import string
import re

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop = stopwords.words('russian')
exclude = set(string.punctuation)

In [None]:
exclude.add('№')

In [None]:
exclude = exclude | set(string.digits)

In [None]:
for i in range(len(train_asp['mention'])):
  train_asp['mention'][i] = re.sub(r'[^а-яА-Я]', ' ', train_asp['mention'][i].lower())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
def normalize(text):
    doc = nlp(text)
    words = [word for sent in doc.sentences for word in sent.words]
    words = [word for word in words if str(word) not in stop]
    words = [word.lemma for word in words if str(word) not in exclude]
    return words

In [None]:
from collections import defaultdict, Counter

In [None]:
train_asp['norm_mention'] = [tuple(normalize(m)) for m in train_asp['mention']]

In [None]:
train_asp

Unnamed: 0,text_id,category,mention,start,end,sentiment,norm_mention
0,30808,Whole,ресторане,16,25,neutral,"(ресторан,)"
1,30808,Interior,первом этаже,43,55,neutral,"(первый, этаж)"
2,30808,Whole,руководству ресторана,124,145,positive,"(руководство, ресторан)"
3,30808,Service,обслуживающему персоналу,147,171,positive,"(обслуживать, персонал)"
4,30808,Service,сотрудникам,189,200,positive,"(сотрудник,)"
...,...,...,...,...,...,...,...
3568,16630,Service,обслуживание,85,97,positive,"(обслуживание,)"
3569,16630,Food,еда,99,102,positive,"(еда,)"
3570,16630,Service,персоналу,244,253,positive,"(персонал,)"
3571,16630,Whole,ресторан,294,302,positive,"(ресторан,)"


Строим частотный словарь "токенизированное упоминание + категория"

Категория - аспектная категория или тональность

In [None]:
def get_mention_category(data, cat_type):
    mention_categories = data.value_counts(subset=['norm_mention', cat_type])
    mention_categories_dict = defaultdict(dict)
    for key, value in mention_categories.items():
        mention_categories_dict[key[0]][key[1]] = value
    return {k: Counter(v).most_common(1)[0][0] for k, v in mention_categories_dict.items()}

In [None]:
best_mention_cat = get_mention_category(train_asp, 'category')

In [None]:
best_mention_sentiment = get_mention_category(train_asp, 'sentiment')

In [None]:
dev_texts = pd.read_csv('dev_reviews.txt', delimiter='\t', names=['text_id', 'text'])

In [None]:
dev_texts

Unnamed: 0,text_id,text
0,13823,"Зашли в""аппетит"" случайно.Не смотря на то,что ..."
1,1427,Здравствуйте!Посетили ваше заведение вчера пер...
2,16714,"Были в пятницу (19.03.10), заказывали столик д..."
3,797,"Были в ресторане 2 раза. Один раз днем, все по..."
4,34710,Удивляюсь отзывам про хорошее обслуживание. Бы...
...,...,...
66,9216,Вы брали этот ресторан так как он близко от до...
67,8996,"Были с друзьями в пабе Метрополь, всё очень по..."
68,38299,"Случайно увидели акцию на сайте купонов, решил..."
69,37819,Очень долго выбирали ресторан на Новогодний ка...


Длины упоминаний аспектов в трейне:

In [None]:
Counter([len(x) for x in best_mention_cat.keys()])

Counter({0: 1, 1: 479, 2: 322, 3: 144, 4: 48, 5: 22, 6: 7, 7: 7, 8: 2})

Для каждого существительного добавим в словарь упоминания схожих слов. Будем считать их словами той же категории, что и исходное слово

In [None]:
!pip install git+https://github.com/lopuhin/python-adagram.git

Collecting git+https://github.com/lopuhin/python-adagram.git
  Cloning https://github.com/lopuhin/python-adagram.git to /tmp/pip-req-build-icseh7tf
  Running command git clone -q https://github.com/lopuhin/python-adagram.git /tmp/pip-req-build-icseh7tf
Building wheels for collected packages: adagram
  Building wheel for adagram (setup.py) ... [?25l[?25hdone
  Created wheel for adagram: filename=adagram-0.0.1-cp37-cp37m-linux_x86_64.whl size=449886 sha256=1f84db6fd783596b7b92c4b07357446d11b6c38cc0afc63843f93f14acae5ae8
  Stored in directory: /tmp/pip-ephem-wheel-cache-7pa4y4rq/wheels/5a/8c/f9/7dee902dd325a3317e768f126aa6f7aa085c79a6e763ed2cb8
Successfully built adagram
Installing collected packages: adagram
Successfully installed adagram-0.0.1


In [None]:
!curl "https://s3.amazonaws.com/kostia.lopuhin/all.a010.p10.d300.w5.m100.nonorm.slim.joblib" > all.a010.p10.d300.w5.m100.nonorm.slim.joblib

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1394M  100 1394M    0     0  48.3M      0  0:00:28  0:00:28 --:--:-- 45.4M


In [None]:
import adagram

vm = adagram.VectorModel.load('all.a010.p10.d300.w5.m100.nonorm.slim.joblib')

In [None]:
for t in list(best_mention_cat.keys()):
  if len(t) != 1:
    continue
  if nlp(t[0]).sentences[0].words[0].upos != "NOUN":
    continue
  try:
    for neighbor in vm.sense_neighbors(t[0], 0)[:1]:
      best_mention_cat[tuple([neighbor[0]])] = best_mention_cat[t]
      best_mention_sentiment[tuple([neighbor[0]])] = best_mention_sentiment[t]
  except KeyError:
    continue

  sim_matrix = np.dot(self.In, s_v) / self.InNorms


Установим спелл-чекер и будем пытаться исправить слово перед тем, как проверять его наличие в словаре:

In [None]:
!sudo apt-get install swig
!sudo pip install jamspell
!wget https://github.com/bakwc/JamSpell-models/raw/master/ru.tar.gz
!tar -xzf ru.tar.gz

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  swig3.0
Suggested packages:
  swig-doc swig-examples swig3.0-examples swig3.0-doc
The following NEW packages will be installed:
  swig swig3.0
0 upgraded, 2 newly installed, 0 to remove and 37 not upgraded.
Need to get 1,100 kB of archives.
After this operation, 5,822 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig3.0 amd64 3.0.12-1 [1,094 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 swig amd64 3.0.12-1 [6,460 B]
Fetched 1,100 kB in 2s (689 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 2.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (T

In [None]:
import jamspell

corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('ru_small.bin')

True

Будем учитывать только упоминания длиной 1-5 токенов. Исключим слова, которые входят в найденные раньше n-граммы:

In [None]:
def label_texts(text, mentions, sentiments, max_len=5):
    tokenized = [word for sent in nlp(text).sentences for word in sent.words]
    text_end = len(tokenized)
    prev_end = 0
    for i, token in enumerate(tokenized):
        for l in reversed(range(max_len)):
            if i + l > text_end:
                continue
            span = tokenized[i:i + l]
            key = tuple([corrector.FixFragment(t.lemma) for t in span])
            if key in mentions and span:
                start, end = span[0].start_char, span[-1].end_char
                if start < prev_end:
                    continue
                prev_end = end
                yield mentions[key], text[start:end], start, end, sentiments[key]
                break

Применяем полученные данные к текстам из dev:

In [None]:
with open('dev_pred_aspects.txt', 'w') as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for asp in label_texts(text, best_mention_cat, best_mention_sentiment):
            print(idx, *asp, sep="\t", file=f)

# Baseline 3

Посчитаем упоминания аспектов с предсказанной тональностью, припишем
- `absence` - если нет упоминаний данной категории
- `both` - если есть упоминания с разной тональностью
- `positive/neutral/negative` - если все упоминания одной тональности

In [None]:
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']

In [None]:
def get_full_sentiment(text, mentions, sentiment, max_len=5):
    asp_counter = defaultdict(Counter)
    for asp in label_texts(text, best_mention_cat, best_mention_sentiment, max_len):
        category, *_, sentiment = asp
        asp_counter[category][sentiment] += 1
    for c in CATEGORIES:
        if not asp_counter[c]:
            s = 'absence'
        elif len(asp_counter[c]) == 1:
            s = asp_counter[c].most_common(1)[0][0]
        else:
            s = 'both'
        yield c, s

Применяем частотные данные к текстам из dev:

In [None]:
with open('dev_pred_cats.txt', 'w') as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for c, s in get_full_sentiment(text, best_mention_cat, best_mention_sentiment):
            print(idx, c, s, sep="\t", file=f)

# Test

In [None]:
gold_test_path = "dev_aspects.txt"
pred_test_path = "dev_pred_aspects.txt"

# Оценка 1: accuracy по выделению упоминаний с категориями

In [None]:
from collections import defaultdict

In [None]:
gold_aspect_cats = {}
with open(gold_test_path) as fg:
    for line in fg:
        line = line.rstrip('\r\n').split('\t')
        if line[0] not in gold_aspect_cats:
            gold_aspect_cats[line[0]] = {"starts":[], "ends":[], "cats":[], "sents":[]}
        gold_aspect_cats[line[0]]["starts"].append(int(line[3]))
        gold_aspect_cats[line[0]]["ends"].append(int(line[4]))
        gold_aspect_cats[line[0]]["cats"].append(line[1])
        gold_aspect_cats[line[0]]["sents"].append(line[5])

In [None]:
full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
total = 0
fully_matched_pairs = []
partially_matched_pairs = []
with open(pred_test_path) as fp:
    for line in fp:    
        total += 1
        line = line.rstrip('\r\n').split('\t')
        start, end = int(line[3]), int(line[4])
        category = line[1]
        doc_gold_aspect_cats = gold_aspect_cats[line[0]]
        if start in doc_gold_aspect_cats["starts"]:
            i = doc_gold_aspect_cats["starts"].index(start)
            if doc_gold_aspect_cats["ends"][i] == end:
                full_match += 1
                if doc_gold_aspect_cats["cats"][i] == category:
                    full_cat_match += 1
                else:
                    partial_cat_match += 1
                fully_matched_pairs.append(
                    (
                        [
                            doc_gold_aspect_cats["starts"][i], 
                            doc_gold_aspect_cats["ends"][i], 
                            doc_gold_aspect_cats["cats"][i],
                            doc_gold_aspect_cats["sents"][i]
                        ],
                        line
                    )
                )
                continue
        for s_pos in doc_gold_aspect_cats["starts"]:
            if start <= s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if doc_gold_aspect_cats["ends"][i] == end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    continue
                matched = False
                for e_pos in doc_gold_aspect_cats["ends"][i:]:
                    if s_pos <= end <= e_pos:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats["starts"][i], 
                                    doc_gold_aspect_cats["ends"][i], 
                                    doc_gold_aspect_cats["cats"][i],
                                    doc_gold_aspect_cats["sents"][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats["cats"][i] == category:
                            partial_cat_match += 1
                        matched = True
                        break
                if matched:
                    break
            if start > s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                if start < doc_gold_aspect_cats["ends"][i] <= end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    break

In [None]:
gold_size = sum([len(gold_aspect_cats[x]["cats"]) for x in gold_aspect_cats])

In [None]:
print(f"""
Full match precision: {full_match / total}
Full match recall: {full_match / gold_size}
Partial match ratio in pred: {(full_match + partial_match)  / total}
Full category accuracy: {full_cat_match / total}
Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
""")


Full match precision: 0.484985835694051
Full match recall: 0.719327731092437
Partial match ratio in pred: 0.6118980169971672
Full category accuracy: 0.4594900849858357
Partial category accuracy: 0.5988668555240794



# Оценка 2: accuracy по тональности упоминаний

In [None]:
def sentiment_accuracy(matches):
    matched_sentiment = 0.
    for pair in matches:
        *_, gold_s = pair[0]
        *_, pred_s = pair[1]
        if gold_s == pred_s:
            matched_sentiment += 1
    print(f"Mention sentiment accuracy: {matched_sentiment / len(matches)}")

Accuracy по полностью совпавшим упоминаниям:

In [None]:
sentiment_accuracy(fully_matched_pairs)

Mention sentiment accuracy: 0.6518691588785047


Accuracy по частично совпавшим упоминаниям:

In [None]:
sentiment_accuracy(partially_matched_pairs)

Mention sentiment accuracy: 0.5848214285714286


# Оценка 3: accuracy по тональности категории

In [None]:
gold_test_cats_path = "dev_cats.txt"
pred_test_cats_path = "dev_pred_cats.txt"

In [None]:
with open(gold_test_cats_path) as gc, open(pred_test_cats_path) as pc:
    gold_labels = set(gc.readlines())
    pred_labels = set(pc.readlines())
    print(
        "Overall sentiment accuracy:",
        len(gold_labels & pred_labels) / len(gold_labels)
    )

Overall sentiment accuracy: 0.476056338028169
