## data analysis

In [None]:
# Load both corpora, one text per line. The context managers close the file
# handles as soon as the content has been read.
with open(r'..\..\1. crawling & parsing\vk_test_queries\test_data.txt', encoding='utf-8') as vk_file:
    vk_data = vk_file.read().split('\n')
with open(r'..\..\1. crawling & parsing\wiki_film_descriptions\film_plots.txt', encoding='utf-8') as wiki_file:
    wiki_data = wiki_file.read().split('\n')

# NOTE(review): split('\n') yields a trailing empty string when a file ends with
# a newline, and ''.split(' ') has length 1, so such a line is counted below as
# a one-token text — confirm whether that is intended.

# Length of every text in space-separated tokens.
vk_len = [len(x.split(' ')) for x in vk_data]
wiki_len = [len(x.split(' ')) for x in wiki_data]

# Mean length per corpus.
vk_mean = sum(vk_len) / len(vk_len)
wiki_mean = sum(wiki_len) / len(wiki_len)

# Population variance (divides by N, not N - 1).
vk_variance = sum((x - vk_mean) ** 2 for x in vk_len) / len(vk_len)
wiki_variance = sum((x - wiki_mean) ** 2 for x in wiki_len) / len(wiki_len)

print(vk_mean, wiki_mean)
print(vk_variance, wiki_variance)

In [None]:
import spacy
from tqdm.notebook import tqdm

# Russian spaCy pipeline; the functions below read token.dep_ from its output.
nlp = spacy.load('ru_core_news_sm')

# Dependency labels that get_deps leaves out of its per-sentence count.
# The punctuation label is spelled 'punct' (not 'punkt', which matches no
# token), so punctuation is now actually excluded from the count.
# NOTE(review): 'subj' does not appear to be a label of this pipeline either;
# it is kept because it is harmless — confirm and remove if unneeded.
not_main = ['nsubj', 'ROOT', 'nsubj:pass', 'punct', 'subj', 'csubj', 'csubj:pass', 'xcomp', 'ccomp']
def get_deps(data):
    """Count, per sentence, the tokens whose dependency label is not in ``not_main``.

    Args:
        data: iterable of texts; it is wrapped in ``tqdm`` and fed to ``nlp.pipe``.

    Returns:
        A flat list with one integer per sentence, in document order.
    """
    counts = []
    for parsed in nlp.pipe(tqdm(data)):
        counts.extend(
            sum(1 for tok in sentence if tok.dep_ not in not_main)
            for sentence in parsed.sents
        )
    return counts

def get_deps_distr(data):
    """Collect the dependency label of every token in ``data``.

    Args:
        data: iterable of texts; it is wrapped in ``tqdm`` and fed to ``nlp.pipe``.

    Returns:
        A flat list of ``token.dep_`` strings, in document / sentence / token order.
    """
    return [
        tok.dep_
        for parsed in nlp.pipe(tqdm(data))
        for sentence in parsed.sents
        for tok in sentence
    ]

In [None]:
# Per-sentence counts from get_deps for both corpora; only the first 1000
# wiki lines are parsed.
vk_deps = get_deps(vk_data)
wiki_sample = wiki_data[:1000]
wiki_deps = get_deps(wiki_sample)
# sum(vk_deps) / len(vk_deps), sum(wiki_deps) / len(wiki_deps)

In [None]:
# Flat lists of dependency labels for both corpora; only the first 1000
# wiki lines are parsed.
vk_distr = get_deps_distr(vk_data)
wiki_head = wiki_data[:1000]
wiki_distr = get_deps_distr(wiki_head)

In [None]:
import pandas as pd

# Relative frequency of each dependency label in the VK texts.
vk_label_frame = pd.DataFrame(vk_distr)
vk_label_frame.value_counts(normalize=True)

In [None]:
# Relative frequency of each dependency label in the wiki sample.
wiki_label_frame = pd.DataFrame(wiki_distr)
wiki_label_frame.value_counts(normalize=True)

## prediction analysis

In [None]:
import json

import pandas as pd

def _load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and close the file afterwards."""
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Search results of both methods, stored in the "100" and "1000" result files.
minilm_json_100 = _load_json('MiniLM search result 100.json')
tfidf_json_100 = _load_json('TD-IDF search result 100.json')
minilm_json_1000 = _load_json('MiniLM search result 1000.json')
tfidf_json_1000 = _load_json('TD-IDF search result 1000.json')

In [None]:
def _match_row(method, text, query, title):
    """Build one result row: (method, text length, query length, query, title).

    Both lengths are counted in space-separated tokens.
    """
    return (method, len(text.split(' ')), len(query.split(' ')), query, title)


def get_scores_detailed(minilm_json, tfidf_json):
    """Compare MiniLM and TF-IDF search results query by query.

    The two mappings are walked in parallel with ``zip``, so they are assumed
    to list the same queries in the same order. Each value must provide
    ``'true'`` and ``'predicted'`` entries that support iteration over titles,
    ``title in ...`` and ``...[title]`` lookups returning a text.

    Args:
        minilm_json: query -> result mapping produced by the MiniLM search.
        tfidf_json: query -> result mapping produced by the TF-IDF search.

    Returns:
        A 6-tuple ``(scores, minilm, tfidf, not_found, only_minilm, only_tfidf)``:
        ``scores`` is the counter dict; ``minilm`` / ``tfidf`` hold one row per
        true title found by that method; ``not_found`` holds rows for titles
        found by neither; ``only_minilm`` / ``only_tfidf`` hold rows for titles
        found by exactly one method. Rows have the layout of ``_match_row``.
    """
    scores = {
        'Общие правильные': 0,          # found by both methods
        'Общие неправильные': 0,        # found by neither method
        'Только TF-IDF правильные': 0,  # found by TF-IDF only
        'Только MiniLM правильные': 0,  # found by MiniLM only
        'Всего': 0                      # total number of true titles
    }
    minilm, only_minilm = [], []
    tfidf, only_tfidf = [], []
    not_found = []

    for (query, minilm_result), (_, tfidf_result) in zip(minilm_json.items(), tfidf_json.items()):
        minilm_pred = minilm_result['predicted']
        tfidf_pred = tfidf_result['predicted']

        for title in minilm_result['true']:
            in_minilm = title in minilm_pred
            in_tfidf = title in tfidf_pred

            if in_minilm and in_tfidf:
                scores['Общие правильные'] += 1
                minilm.append(_match_row('MiniLM', minilm_pred[title], query, title))
                tfidf.append(_match_row('TF-IDF', tfidf_pred[title], query, title))
            elif in_minilm:
                scores['Только MiniLM правильные'] += 1
                row = _match_row('MiniLM', minilm_pred[title], query, title)
                minilm.append(row)
                only_minilm.append(row)
            elif in_tfidf:
                scores['Только TF-IDF правильные'] += 1
                row = _match_row('TF-IDF', tfidf_pred[title], query, title)
                tfidf.append(row)
                only_tfidf.append(row)
            else:
                scores['Общие неправильные'] += 1
                # Neither method returned the title, so its text is taken
                # from the 'true' entry instead of 'predicted'.
                not_found.append(_match_row('NA', tfidf_result['true'][title], query, title))
            scores['Всего'] += 1

    return scores, minilm, tfidf, not_found, only_minilm, only_tfidf


def get_scores(minilm_json, tfidf_json):
    """Return ``(scores, minilm, tfidf, not_found)`` for the two result sets.

    Thin wrapper around ``get_scores_detailed`` that keeps the original
    4-tuple return shape.
    """
    scores, minilm, tfidf, not_found, _, _ = get_scores_detailed(minilm_json, tfidf_json)
    return scores, minilm, tfidf, not_found


# only_minilm / only_tfidf are read by later cells, so they have to be bound
# at notebook level here rather than staying local to the function.
scores, minilm, tfidf, not_found, only_minilm, only_tfidf = get_scores_detailed(minilm_json_100, tfidf_json_100)
# get_scores(minilm_json_1000, tfidf_json_1000)

In [None]:
def _column_mean(rows, column):
    """Arithmetic mean of ``row[column]`` over ``rows``."""
    return sum(row[column] for row in rows) / len(rows)


# Mean description length of films found correctly by only one method.
# The neural model is a poor fit for long texts: it lacks expressive capacity.
print(_column_mean(only_minilm, 1), _column_mean(only_tfidf, 1))

# Mean query length for films found correctly by only one method.
print(_column_mean(only_minilm, 2), _column_mean(only_tfidf, 2))

In [None]:
# Mean of column 1 (description length) over all hits of MiniLM, then TF-IDF.
minilm_desc_lens = [row[1] for row in minilm]
tfidf_desc_lens = [row[1] for row in tfidf]
print(sum(minilm_desc_lens) / len(minilm_desc_lens), sum(tfidf_desc_lens) / len(tfidf_desc_lens))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Description-length distributions: films found by MiniLM, by TF-IDF, and by neither.
for rows, label, color in (
    (minilm, 'MiniLM', 'r'),
    (tfidf, 'TF-IDF', 'g'),
    (not_found, 'not found', 'b'),
):
    sns.histplot([row[1] for row in rows], label=label, bins=20, kde=True, color=color, stat='density')
plt.legend()

In [None]:
# Description-length distributions of films found correctly by only one method.
for rows, label, color in (
    (only_minilm, 'MiniLM', 'r'),
    (only_tfidf, 'TF-IDF', 'g'),
):
    sns.histplot([row[1] for row in rows], label=label, kde=True, color=color, stat='density')
plt.legend()

In [None]:
import pandas as pd

# Correlation between the number of title variants listed as correct answers
# for a query and the share of them each method found.
def _hit_rate(result):
    """Share of the titles in ``result['true']`` that also occur in ``result['predicted']``."""
    hits = [int(title in result['predicted']) for title in result['true']]
    return sum(hits) / len(hits)


res = [
    (len(minilm_result['true']), _hit_rate(minilm_result), _hit_rate(tfidf_result))
    for minilm_result, tfidf_result in zip(minilm_json_100.values(), tfidf_json_100.values())
]

res_df = pd.DataFrame(res, columns=['n_true', 'frac_minilm', 'frac_tfidf'])
res_df.corr()