<a href="https://colab.research.google.com/github/hineri-san/project_Karimova/blob/main/%D0%9F%D1%80%D0%BE%D0%B5%D0%BA%D1%82_%D0%9A%D0%B0%D1%80%D0%B8%D0%BC%D0%BE%D0%B2%D0%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pymorphy2
!pip install stanza

import stanza
stanza.download('ru')
nlp = stanza.Pipeline('ru', processors='tokenize,lemma')

import pandas as pd

from collections import defaultdict, Counter

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()

In [None]:
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_reviews.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_aspects.txt
!wget https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/dev_reviews.txt

In [None]:
# Training aspect annotations: tab-separated file read without a header row,
# so the column names are supplied explicitly.
train_aspects = pd.read_csv(
    'train_aspects.txt',
    sep='\t',
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment'],
)

In [None]:
# Preview the first rows of the training aspect annotations.
train_aspects.head()

In [None]:
# Training reviews: tab-separated file read without a header row
# (document id, review text).
train_texts = pd.read_csv(
    'train_reviews.txt',
    sep='\t',
    names=['text_id', 'text'],
)

In [None]:
# Preview the first rows of the training reviews.
train_texts.head()

In [None]:
# Development reviews, read with the same layout as the training reviews
# (document id, review text).
dev_texts = pd.read_csv(
    'dev_reviews.txt',
    sep='\t',
    names=['text_id', 'text'],
)

### Задание 1 и 2 | Baseline

Экспериментировать будем на основе бейзлайна, поэтому сразу подгружаем оттуда все необходимые функции

In [None]:
def normalize(text):
    """Return the lemmas of ``text`` as a flat list.

    The text is run through the module-level ``nlp`` pipeline; the lemma of
    every word of every sentence is collected in document order.
    """
    lemmas = []
    for sentence in nlp(text).sentences:
        lemmas.extend(token.lemma for token in sentence.words)
    return lemmas

# Store each mention as a tuple of lemmas so it can be used as a dictionary key.
train_aspects['norm_mention'] = [
    tuple(normalize(mention))
    for mention in train_aspects['mention']
]

In [None]:
def get_mention_category(data, cat_type):
    """Map every normalized mention to its most frequent ``cat_type`` label.

    Args:
        data: DataFrame with a ``norm_mention`` column and a ``cat_type`` column.
        cat_type: name of the label column to aggregate.

    Returns:
        dict ``{norm_mention: label}`` holding, for each mention, the label
        with the highest co-occurrence count.
    """
    pair_counts = data.value_counts(subset=['norm_mention', cat_type])
    labels_by_mention = defaultdict(Counter)
    for (mention, label), count in pair_counts.items():
        labels_by_mention[mention][label] = count
    best = {}
    for mention, label_counts in labels_by_mention.items():
        (top_label, _), = label_counts.most_common(1)
        best[mention] = top_label
    return best
    
# Lookup tables built from the training annotations: for every normalized
# mention, its most frequent category and its most frequent sentiment.
best_mention_cat = get_mention_category(train_aspects, 'category')
best_mention_sentiment = get_mention_category(train_aspects, 'sentiment')


In [None]:
def label_texts(text, mentions, sentiments, max_len=5):
    """Yield the dictionary mentions found in ``text``.

    The text is tokenized with the module-level ``nlp`` pipeline. At every
    token position the longest lemma span (at most ``max_len`` tokens) that
    is a key of ``mentions`` is reported.

    Args:
        text: raw review text.
        mentions: dict mapping a tuple of lemmas to a category.
        sentiments: dict mapping the same tuples to a sentiment label.
        max_len: maximum span length in tokens (inclusive).

    Yields:
        ``(category, mention_text, start, end, sentiment)`` where ``start``
        and ``end`` are character offsets into ``text``.
    """
    tokens = [word for sent in nlp(text).sentences for word in sent.words]
    n_tokens = len(tokens)
    for i in range(n_tokens):
        # Candidate lengths run from max_len down to 1, clipped to the tokens
        # left in the text. The previous range stopped at max_len - 1 and
        # ended with an empty span, which could never be reported and raised
        # IndexError if the empty tuple happened to be a dictionary key.
        for length in range(min(max_len, n_tokens - i), 0, -1):
            span = tokens[i:i + length]
            key = tuple(t.lemma for t in span)
            if key in mentions:
                start, end = span[0].start_char, span[-1].end_char
                yield mentions[key], text[start:end], start, end, sentiments[key]
                break

## add food info

Добавим информацию о блюдах, которая собрана с платформы рецептов eda.ru. Данные уже были собраны неизвестными коллегами и размещены в репозитории https://github.com/Alenush/dish_id_sirius

In [None]:
# Load the recipe table and keep only the dish-name and ingredient-keyword columns.
# NOTE(review): no visible cell downloads 'eda_all_recipes.csv'; presumably it has
# to be fetched by hand from the repository linked above — confirm.
data = pd.read_csv('eda_all_recipes.csv')[['name', "ingridient_keywords"]]

В датасете содержится 40122 уникальных названия блюд и 39961 уникальный набор ингредиентов.

In [None]:
# Summary statistics of the two recipe columns.
data.describe()

In [None]:
# Preview the first ten recipes.
data.head(10)

Вытащим названия блюд, слишком длинные обрежем (не очень обоснованно, но попробуем)

In [None]:
#food = list(data['name'])
#food = [x for x in food if isinstance(x, str)]
#food = [item if len(item.split()) < 4 else ' '.join(item.split()[:3]) for item in food]

Соберем все ингредиенты, выбросим стоп-слова и слишком короткие названия (это не продукты, а граммы и прочее), оставим только существительные и лемматизируем.

In [None]:
# 1. Split every stringified keyword list into its individual keywords.
keyword_lists = data["ingridient_keywords"].apply(lambda cell: cell[2:-2].split('\', \''))
# 2. Flatten and drop keywords of one or two characters.
long_keywords = [word for keywords in keyword_lists for word in keywords if len(word) > 2]
# 3. Deduplicate and remove Russian stop words.
unique_keywords = list(set(long_keywords) - set(stopwords.words("russian")))
# 4. Keep only keywords whose first pymorphy2 parse is a noun.
ingredients = [word for word in unique_keywords if morph.parse(word)[0].tag.POS == 'NOUN']
# 5. Lemmatize with the same normalizer that is applied to the training mentions.
ingredients_norm = [tuple(normalize(str(word))) for word in ingredients]


In [None]:
# Number of normalized ingredient entries.
len(ingredients_norm)

Сами названия блюд было решено не добавлять — это не улучшало качество. Есть предположение, что в отзывах люди не пишут такие подробные названия блюд.

In [None]:
#additional = ingredients + food
#additional = [tuple(normalize(str(m))) for m in additional]

В первую очередь ищем аспекты на основе частотного подхода из бейзлайна. После пробуем найти на основе наших данных об ингредиентах. Возникает, конечно, вопрос, какую тональность приписывать аспектам, обнаруженным в списке ингредиентов.

In [None]:
# Sentiment distribution among the training aspects labelled 'Food'.
is_food = train_aspects['category'] == 'Food'
train_aspects.loc[is_food, 'sentiment'].value_counts()

В тренировочной выборке люди чаще всего писали о еде положительно. Окей, попробуем каждому упоминанию еды не из тренировочной выборки присвоить положительную тональность.

In [None]:
def label_texts(text, mentions, sentiments, additional, max_len=5):
    """Yield the aspect mentions found in ``text``.

    The text is tokenized with the module-level ``nlp`` pipeline. At every
    token position the longest lemma span (at most ``max_len`` tokens) that
    is known is reported. A span found in ``mentions`` takes its category
    and sentiment from the dictionaries; otherwise a span found in
    ``additional`` is reported as a positive 'Food' mention.

    Args:
        text: raw review text.
        mentions: dict mapping a tuple of lemmas to a category.
        sentiments: dict mapping the same tuples to a sentiment label.
        additional: collection of extra lemma tuples (e.g. ingredients).
        max_len: maximum span length in tokens (inclusive).

    Yields:
        ``(category, mention_text, start, end, sentiment)`` where ``start``
        and ``end`` are character offsets into ``text``.
    """
    # A list would be scanned linearly for every candidate span; convert it
    # once per call to a hash-based container with the same membership result.
    if not isinstance(additional, (set, frozenset, dict)):
        try:
            additional = frozenset(additional)
        except TypeError:
            pass  # unhashable entries: keep the container exactly as given

    tokens = [word for sent in nlp(text).sentences for word in sent.words]
    n_tokens = len(tokens)
    for i in range(n_tokens):
        # Candidate lengths run from max_len down to 1, clipped to the tokens
        # left in the text. The previous range stopped at max_len - 1 and
        # ended with an empty span, which raised IndexError if the empty
        # tuple happened to be a known key.
        for length in range(min(max_len, n_tokens - i), 0, -1):
            span = tokens[i:i + length]
            key = tuple(t.lemma for t in span)
            if key in mentions:
                category, sentiment = mentions[key], sentiments[key]
            elif key in additional:
                category, sentiment = 'Food', 'positive'
            else:
                continue
            start, end = span[0].start_char, span[-1].end_char
            yield category, text[start:end], start, end, sentiment
            break

In [None]:
# Membership tests against a list of tuples are linear scans, and the tagger
# performs one per candidate span. Build a set once so every lookup is a hash probe.
ingredient_keys = set(ingredients_norm)

# Write one tab-separated line per predicted aspect:
# document id, category, mention text, start offset, end offset, sentiment.
with open('dev_pred_aspects.txt', 'w') as f:
    for text, idx in zip(dev_texts['text'], dev_texts['text_id']):
        for asp in label_texts(text, best_mention_cat, best_mention_sentiment, ingredient_keys):
            print(idx, *asp, sep="\t", file=f)


# Evaluation 1 & 2

In [None]:
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_aspects.txt
# Gold aspect annotations for the dev set, and the prediction file written above.
gold_test_path = "dev_aspects.txt"
pred_test_path = "dev_pred_aspects.txt"

In [None]:
# Score the predicted aspects against the gold aspects.
# Gold lines are tab-separated: field 0 is the document id, field 1 the category,
# fields 3 and 4 the character offsets and field 5 the sentiment. Prediction lines
# are split the same way; fields 0, 1, 3 and 4 are used here.

# Collect the gold spans per document as parallel lists (same index = same aspect).
gold_aspect_cats = {}
with open(gold_test_path) as fg:
    for line in fg:
        line = line.rstrip('\r\n').split('\t')
        if line[0] not in gold_aspect_cats:
            gold_aspect_cats[line[0]] = {"starts":[], "ends":[], "cats":[], "sents":[]}
        gold_aspect_cats[line[0]]["starts"].append(int(line[3]))
        gold_aspect_cats[line[0]]["ends"].append(int(line[4]))
        gold_aspect_cats[line[0]]["cats"].append(line[1])
        gold_aspect_cats[line[0]]["sents"].append(line[5])
# Counters for span matches (full / partial) and for category agreement on them.
full_match, partial_match, full_cat_match, partial_cat_match = 0, 0, 0, 0
total = 0
# (gold, prediction) pairs kept for the sentiment evaluation further down.
fully_matched_pairs = []
partially_matched_pairs = []
with open(pred_test_path) as fp:
    for line in fp:    
        total += 1
        line = line.rstrip('\r\n').split('\t')
        start, end = int(line[3]), int(line[4])
        category = line[1]
        # NOTE(review): raises KeyError when a predicted document id has no gold aspects.
        doc_gold_aspect_cats = gold_aspect_cats[line[0]]
        # Exact match: the prediction has the same start and end as a gold span.
        if start in doc_gold_aspect_cats["starts"]:
            # list.index returns the first gold aspect with this start offset.
            i = doc_gold_aspect_cats["starts"].index(start)
            if doc_gold_aspect_cats["ends"][i] == end:
                full_match += 1
                if doc_gold_aspect_cats["cats"][i] == category:
                    full_cat_match += 1
                else:
                    # NOTE(review): an exact span match with a different category
                    # increments partial_cat_match, so it still counts towards
                    # "Partial category accuracy" — confirm this is intended.
                    partial_cat_match += 1
                fully_matched_pairs.append(
                    (
                        [
                            doc_gold_aspect_cats["starts"][i], 
                            doc_gold_aspect_cats["ends"][i], 
                            doc_gold_aspect_cats["cats"][i],
                            doc_gold_aspect_cats["sents"][i]
                        ],
                        line
                    )
                )
                continue
        # No exact match: look for a gold span that overlaps the prediction.
        for s_pos in doc_gold_aspect_cats["starts"]:
            # Case 1: the prediction starts at or before this gold span.
            if start <= s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                # Same end offset as the gold span.
                if doc_gold_aspect_cats["ends"][i] == end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    # NOTE(review): `continue` moves on to the next gold start, so the
                    # same prediction can be counted against further gold spans —
                    # confirm this is intended (the other branches use `break`).
                    continue
                matched = False
                # NOTE(review): scans gold ends from index i onward, but the pair
                # recorded below is always gold aspect i.
                for e_pos in doc_gold_aspect_cats["ends"][i:]:
                    if s_pos <= end <= e_pos:
                        partial_match += 1
                        partially_matched_pairs.append(
                            (
                                [
                                    doc_gold_aspect_cats["starts"][i], 
                                    doc_gold_aspect_cats["ends"][i], 
                                    doc_gold_aspect_cats["cats"][i],
                                    doc_gold_aspect_cats["sents"][i]
                                ],
                                line
                            )
                        )
                        if doc_gold_aspect_cats["cats"][i] == category:
                            partial_cat_match += 1
                        matched = True
                        break
                if matched:
                    break
            # Case 2: the prediction starts after this gold span starts.
            if start > s_pos:
                i = doc_gold_aspect_cats["starts"].index(s_pos)
                # The gold span ends after the prediction starts and no later than it ends.
                if start < doc_gold_aspect_cats["ends"][i] <= end:
                    partial_match += 1
                    partially_matched_pairs.append(
                        (
                            [
                                doc_gold_aspect_cats["starts"][i], 
                                doc_gold_aspect_cats["ends"][i], 
                                doc_gold_aspect_cats["cats"][i],
                                doc_gold_aspect_cats["sents"][i]
                            ],
                            line
                        )
                    )
                    if doc_gold_aspect_cats["cats"][i] == category:
                        partial_cat_match += 1
                    break
# Total number of gold aspects over all documents.
gold_size = sum([len(gold_aspect_cats[x]["cats"]) for x in gold_aspect_cats])
# NOTE(review): the ratios divide by `total` and `gold_size`; an empty prediction
# file or an empty gold file raises ZeroDivisionError here.
print(f"""
Full match precision: {full_match / total}
Full match recall: {full_match / gold_size}
Partial match ratio in pred: {(full_match + partial_match)  / total}
Full category accuracy: {full_cat_match / total}
Partial category accuracy: {(full_cat_match + partial_cat_match) / total}
""")

In [None]:
def sentiment_accuracy(matches):
    """Print the share of matched pairs whose sentiment labels agree.

    Args:
        matches: sequence of ``(gold, pred)`` pairs; the last element of
            each member is read as its sentiment label.

    Prints the accuracy and returns nothing. When ``matches`` is empty a
    placeholder line is printed instead of raising ZeroDivisionError, so a
    run without (partial) matches does not abort the notebook.
    """
    if len(matches) == 0:
        print("Mention sentiment accuracy: n/a (no matched pairs)")
        return
    agreeing = 0
    for pair in matches:
        *_, gold_s = pair[0]
        *_, pred_s = pair[1]
        if gold_s == pred_s:
            agreeing += 1
    print(f"Mention sentiment accuracy: {agreeing / len(matches)}")

# Sentiment agreement on predictions whose span matches a gold span exactly.
sentiment_accuracy(fully_matched_pairs)


# Sentiment agreement on predictions that only partially overlap a gold span.
sentiment_accuracy(partially_matched_pairs)

# Задание 3

In [None]:
# Categories reported for every review, in output order.
CATEGORIES = ['Whole', 'Interior', 'Service', 'Food', 'Price']
def get_full_sentiment(text, mentions, sentiment, max_len=5, additional=None):
    """Yield one overall sentiment label per category for ``text``.

    Aspects are extracted with ``label_texts`` and their sentiments are
    collected per category.

    Args:
        text: raw review text.
        mentions: dict mapping a tuple of lemmas to a category.
        sentiment: dict mapping the same tuples to a sentiment label.
        max_len: maximum span length forwarded to ``label_texts``.
        additional: extra lemma tuples forwarded to ``label_texts``;
            defaults to the module-level ``ingredients_norm``.

    Yields:
        ``(category, label)`` for every entry of ``CATEGORIES``: 'absence'
        when no aspect of the category was found, the sentiment itself when
        all found aspects share it, 'both' otherwise.
    """
    # Previously the `mentions` and `sentiment` arguments were ignored in
    # favour of module globals, and `sentiment` was overwritten by the loop
    # variable. The arguments are now the dictionaries actually used.
    if additional is None:
        additional = ingredients_norm
    asp_counter = defaultdict(Counter)
    for category, *_, asp_sentiment in label_texts(text, mentions, sentiment, additional, max_len):
        asp_counter[category][asp_sentiment] += 1
    for c in CATEGORIES:
        found = asp_counter.get(c)
        if not found:
            s = 'absence'
        elif len(found) == 1:
            s = next(iter(found))
        else:
            s = 'both'
        yield c, s

# Write one tab-separated line per (document, category):
# document id, category, overall sentiment label.
with open('dev_pred_cats.txt', 'w') as out_file:
    for doc_id, review in zip(dev_texts['text_id'], dev_texts['text']):
        category_labels = get_full_sentiment(review, best_mention_cat, best_mention_sentiment)
        for category, label in category_labels:
            print(doc_id, category, label, sep="\t", file=out_file)

In [None]:
!wget https://raw.githubusercontent.com/named-entity/hse-nlp/master/4th_year/Project/dev_cats.txt
gold_test_cats_path = "dev_cats.txt"
pred_test_cats_path = "dev_pred_cats.txt"
# Lines are compared as raw strings, so a predicted line counts only when it is
# character-for-character equal to a gold line.
with open(gold_test_cats_path) as gold_file, open(pred_test_cats_path) as pred_file:
    gold_labels = set(gold_file)
    pred_labels = set(pred_file)
shared_labels = gold_labels.intersection(pred_labels)
print(
    "Overall sentiment accuracy:",
    len(shared_labels) / len(gold_labels)
)