## data analysis

In [1]:
def _read_lines(path):
    """Read the UTF-8 text file at ``path`` and return its content split on line feeds.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the content has been read.
    """
    with open(path, encoding='utf-8') as fh:
        return fh.read().split('\n')


# NOTE: these relative paths use Windows-style backslash separators.
vk_data = _read_lines(r'..\..\1. crawling & parsing\vk_test_queries\test_data.txt')
wiki_data = _read_lines(r'..\..\1. crawling & parsing\wiki_film_descriptions\film_plots.txt')

# Length of every line, measured in space-separated tokens.
vk_len = [len(x.split(' ')) for x in vk_data]
wiki_len = [len(x.split(' ')) for x in wiki_data]

# Mean and population variance (divides by N) of the line lengths.
vk_mean = sum(vk_len) / len(vk_len)
wiki_mean = sum(wiki_len) / len(wiki_len)
vk_variance = sum((x - vk_mean) ** 2 for x in vk_len) / len(vk_len)
wiki_variance = sum((x - wiki_mean) ** 2 for x in wiki_len) / len(wiki_len)
print(vk_mean, wiki_mean)
print(vk_variance, wiki_variance)

50.891 208.14355245075583
1462.7231189999995 76891.80566388102


In [15]:
import spacy
from tqdm.notebook import tqdm

nlp = spacy.load('ru_core_news_sm')
# Dependency labels that get_deps() leaves out of its per-sentence count.
# The punctuation label is spelled 'punct' (as in the label distribution
# printed further down in this notebook); the earlier spelling 'punkt' never
# matched any token, so punctuation was being counted.
not_main = ['nsubj', 'ROOT', 'nsubj:pass', 'punct', 'subj', 'csubj', 'csubj:pass', 'xcomp', 'ccomp']
def get_deps(data):
    """Count, per sentence, the tokens whose dependency label is not in ``not_main``.

    ``data`` is fed through ``nlp.pipe`` (wrapped in a ``tqdm`` progress bar).
    The result is one integer per sentence, in document order, across all
    parsed documents.
    """
    return [
        sum(1 for token in sent if token.dep_ not in not_main)
        for doc in nlp.pipe(tqdm(data))
        for sent in doc.sents
    ]

def get_deps_distr(data):
    """Collect the dependency label of every token in ``data``.

    ``data`` is fed through ``nlp.pipe`` (wrapped in a ``tqdm`` progress bar).
    While iterating, the tokens whose text is exactly "не" are counted; that
    count and its share of all tokens are printed.

    Returns the flat list of ``token.dep_`` values, in document order.
    For empty input the share is printed as 0.0 instead of dividing by zero.
    """
    deps = []
    negations = 0
    for doc in nlp.pipe(tqdm(data)):
        for sent in doc.sents:
            for token in sent:
                deps.append(token.dep_)
                if token.text == "не":
                    negations += 1
    # No tokens at all means there is nothing to divide by.
    share = negations / len(deps) if deps else 0.0
    print(negations, share)
    return deps

In [None]:
# Per-sentence counts of dependencies whose label is not in `not_main`.
# All VK queries are parsed; only the first 1000 wiki entries are.
vk_deps = get_deps(vk_data)
wiki_deps = get_deps(wiki_data[:1000])
# Mean count per sentence for each corpus (left disabled):
# sum(vk_deps) / len(vk_deps), sum(wiki_deps) / len(wiki_deps)

In [16]:
# Flat lists of dependency labels for every token; each call also prints the
# count and share of "не" tokens. Only the first 1000 wiki entries are parsed.
vk_distr = get_deps_distr(vk_data)
wiki_distr = get_deps_distr(wiki_data[:1000])

  0%|          | 0/1000 [00:00<?, ?it/s]

594 0.009709372650299128


  0%|          | 0/1000 [00:00<?, ?it/s]

1690 0.006603909983626992


In [17]:
import pandas as pd

# Relative frequency of each dependency label among the VK query tokens.
pd.DataFrame(vk_distr).value_counts(normalize=True)

punct            0.139478
case             0.105381
nsubj            0.101049
obl              0.080813
conj             0.076760
advmod           0.072052
obj              0.055576
nmod             0.054546
ROOT             0.054497
cc               0.051505
amod             0.038543
det              0.037383
xcomp            0.018471
mark             0.015479
parataxis        0.013927
appos            0.011213
nummod           0.010788
iobj             0.010543
fixed            0.008304
acl:relcl        0.006931
cop              0.006555
advcl            0.006375
ccomp            0.006277
acl              0.004610
nummod:gov       0.003498
nsubj:pass       0.002632
flat:name        0.001487
discourse        0.001406
csubj            0.001357
aux              0.001062
aux:pass         0.000932
orphan           0.000114
obl:agent        0.000098
flat             0.000098
expl             0.000065
flat:foreign     0.000065
nummod:entity    0.000049
csubj:pass       0.000049
compound    

In [18]:
# Relative frequency of each dependency label among the wiki plot tokens.
pd.DataFrame(wiki_distr).value_counts(normalize=True)

punct            0.165176
case             0.104916
obl              0.085386
nsubj            0.080716
nmod             0.071111
amod             0.060084
obj              0.053523
ROOT             0.051299
conj             0.051284
advmod           0.046024
cc               0.039737
appos            0.023532
xcomp            0.022293
det              0.021984
mark             0.016916
advcl            0.013989
iobj             0.012235
acl              0.010934
acl:relcl        0.010898
parataxis        0.010547
flat:name        0.010269
fixed            0.008581
ccomp            0.007085
nummod           0.004314
nsubj:pass       0.003951
flat:foreign     0.002700
nummod:gov       0.002657
csubj            0.001817
cop              0.001536
aux:pass         0.001375
obl:agent        0.001121
aux              0.000617
discourse        0.000598
flat             0.000231
orphan           0.000195
expl             0.000148
nummod:entity    0.000090
csubj:pass       0.000086
compound    

## prediction analysis

In [None]:
import json

import pandas as pd

def _load_json(path):
    """Parse the UTF-8 JSON file at ``path``, closing the file handle afterwards."""
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


# Search results of both methods for the 100- and 1000-query runs.
minilm_json_100 = _load_json('MiniLM search result 100.json')
tfidf_json_100 = _load_json('TD-IDF search result 100.json')
minilm_json_1000 = _load_json('MiniLM search result 1000.json')
tfidf_json_1000 = _load_json('TD-IDF search result 1000.json')

In [None]:
def _hit_record(label, description, query, title):
    """Build one ``(label, description_len, query_len, query, title)`` row.

    Both lengths are counts of space-separated tokens.
    """
    return (label, len(description.split(' ')), len(query.split(' ')), query, title)


def get_scores(minilm_json, tfidf_json, return_exclusive=False):
    """Compare the MiniLM and TF-IDF search results query by query.

    Both arguments map a query string to a dict with the keys ``'true'`` and
    ``'predicted'``; each of those is indexed by title and yields a text that
    is split on spaces to measure its length. The two mappings are walked in
    parallel with ``zip``, so they are assumed to list the same queries in
    the same order (not verified here).

    For every title in a query's ``'true'`` entry the function records whether
    it occurs in the MiniLM predictions, the TF-IDF predictions, both, or
    neither.

    Parameters
    ----------
    minilm_json, tfidf_json : dict
        Search results of the two methods.
    return_exclusive : bool, default False
        When true, two extra lists are appended to the result: the rows found
        by MiniLM only and the rows found by TF-IDF only.

    Returns
    -------
    tuple
        ``(scores, minilm, tfidf, not_found)`` or, with ``return_exclusive``,
        ``(scores, minilm, tfidf, not_found, only_minilm, only_tfidf)``.
        ``scores`` is a dict of counters; every list holds rows of the form
        ``(label, description_len, query_len, query, title)``.
    """
    scores = {
        'Общие правильные': 0,
        'Общие неправильные': 0,
        'Только TF-IDF правильные': 0,
        'Только MiniLM правильные': 0,
        'Всего': 0
    }
    minilm, only_minilm = [], []
    tfidf, only_tfidf = [], []
    not_found = []

    for (query, minilm_res), (_, tfidf_res) in zip(minilm_json.items(), tfidf_json.items()):
        for true in minilm_res['true']:
            in_minilm = true in minilm_res['predicted']
            in_tfidf = true in tfidf_res['predicted']

            if in_minilm:
                row = _hit_record('MiniLM', minilm_res['predicted'][true], query, true)
                minilm.append(row)
                if not in_tfidf:
                    only_minilm.append(row)
            if in_tfidf:
                row = _hit_record('TF-IDF', tfidf_res['predicted'][true], query, true)
                tfidf.append(row)
                if not in_minilm:
                    only_tfidf.append(row)

            if in_minilm and in_tfidf:
                scores['Общие правильные'] += 1
            elif in_minilm:
                scores['Только MiniLM правильные'] += 1
            elif in_tfidf:
                scores['Только TF-IDF правильные'] += 1
            else:
                scores['Общие неправильные'] += 1
                # Look the description up in the dict being iterated, so the
                # key is guaranteed to exist.
                not_found.append(_hit_record('NA', minilm_res['true'][true], query, true))
            scores['Всего'] += 1

    if return_exclusive:
        return scores, minilm, tfidf, not_found, only_minilm, only_tfidf
    return scores, minilm, tfidf, not_found

scores, minilm, tfidf, not_found = get_scores(minilm_json_100, tfidf_json_100)
# Alternative run on the 1000-query results:
# get_scores(minilm_json_1000, tfidf_json_1000)

# The cells below read `only_minilm` / `only_tfidf`, which get_scores() does
# not hand back in its 4-tuple. Derive them here: a row belongs to exactly one
# method when its (query, title) pair is absent from the other method's hits.
_minilm_hits = {(rec[3], rec[4]) for rec in minilm}
_tfidf_hits = {(rec[3], rec[4]) for rec in tfidf}
only_minilm = [rec for rec in minilm if (rec[3], rec[4]) not in _tfidf_hits]
only_tfidf = [rec for rec in tfidf if (rec[3], rec[4]) not in _minilm_hits]

In [None]:
# Mean description length of the films that only one of the methods found.
# Author's note: the neural model is a poor fit for long texts; it lacks the
# expressive capacity.
print(
    sum(rec[1] for rec in only_minilm) / len(only_minilm),
    sum(rec[1] for rec in only_tfidf) / len(only_tfidf),
)

# Mean query length for the films that only one of the methods found.
print(
    sum(rec[2] for rec in only_minilm) / len(only_minilm),
    sum(rec[2] for rec in only_tfidf) / len(only_tfidf),
)

In [None]:
# Mean description length over every film each method found.
print(
    sum(rec[1] for rec in minilm) / len(minilm),
    sum(rec[1] for rec in tfidf) / len(tfidf),
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Description lengths (element 1 of each row) of the films found by each
# method and of the films neither method found, drawn as overlaid density
# histograms with a KDE curve.
sns.histplot([x[1] for x in minilm], label='MiniLM', bins=20, kde=True, color='r', stat='density')
sns.histplot([x[1] for x in tfidf], label='TF-IDF', bins=20, kde=True, color='g', stat='density')
sns.histplot([x[1] for x in not_found], label='not found', bins=20, kde=True, color='b', stat='density')
plt.legend()

In [None]:
# Description lengths of the films that only one of the methods found,
# drawn as overlaid density histograms with a KDE curve.
sns.histplot([x[1] for x in only_minilm], label='MiniLM', kde=True, color='r', stat='density')
sns.histplot([x[1] for x in only_tfidf], label='TF-IDF', kde=True, color='g', stat='density')
plt.legend()

In [None]:
import pandas as pd

# Correlation between the number of accepted titles for a query and the share
# of those titles that each method returned.
res = [
    (
        len(result_i['true']),
        sum(title in result_i['predicted'] for title in result_i['true']) / len(result_i['true']),
        sum(title in result_j['predicted'] for title in result_j['true']) / len(result_j['true']),
    )
    for result_i, result_j in zip(minilm_json_100.values(), tfidf_json_100.values())
]

res_df = pd.DataFrame(res, columns=['n_true', 'frac_minilm', 'frac_tfidf'])
res_df.corr()