In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

import pandas as pd

import nltk
from nltk.corpus import stopwords
import re
import pymorphy3

from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tiv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [147]:
products_data =  pd.read_csv('data/link_product_sentences.csv')
print('products: ', products_data.shape)

pages_data = pd.read_csv('data/link_page_sentences.csv')
print('base: ',pages_data.shape)

pages_data = pd.concat([products_data, pages_data],  ignore_index=True)
pages_data.dropna(inplace=True)
pages_data.set_index(['link', 'sentence_order'], inplace=True)


print('base: ', pages_data.shape)

products:  (1018, 3)
base:  (2532, 3)
base:  (3541, 1)


In [148]:
pages_data.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence_text
link,sentence_order,Unnamed: 2_level_1
https://em-russia.ru/shop/all/vozrozhdennoe-budushchee-t-khiga,0,возродить будущее так хиго
https://em-russia.ru/shop/all/broshyura-prirodnoe-zemledelie-s-em,0,брошюра природный земледелие с эм
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,0,эм био концентрат 1 восток эм 1 набор для само...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,1,концентрат эм-био ем·1 микробиологический удоб...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,2,срок годность 12 месяц с дата изготовление сос...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,3,состав концентрат вода питьевой патока комплек...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,4,в 1 см3 содержаться не менее 1 106 кой молочно...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,5,применение продукт применяться только после ак...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,0,эм 5 биорегулятор болезнь вредитель предназнач...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,1,состав вода питьевой патока сахар-песок спирт ...


In [169]:
pages_data.tail(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence_text
link,sentence_order,Unnamed: 2_level_1
https://em-russia.ru/base/osennie-em-podskazki/,0,осенний эм-подсказка 04.09.2020осень очень важ...
https://em-russia.ru/base/osennie-em-podskazki/,1,так что это дело по право считаться один из ва...
https://em-russia.ru/base/osennie-em-podskazki/,2,офэма это дополнительный органика который спос...
https://em-russia.ru/base/osennie-em-podskazki/,3,этот вариант наиболее приемлемый для внесение ...
https://em-russia.ru/base/osennie-em-podskazki/,4,обработка почва и стенка в теплица обязательны...
https://em-russia.ru/base/osennie-em-podskazki/,5,или что наиболее эффективно и удобный внести о...
https://em-russia.ru/base/osennie-em-podskazki/,6,здесь важный накопительный эффект
https://em-russia.ru/base/osennie-em-podskazki/,7,весна обязательно при появление первый листоче...
https://em-russia.ru/base/osennie-em-podskazki/,8,эм-5 природный биорегулятор на основа эффектив...
https://em-russia.ru/base/osennie-em-podskazki/,9,если задумать высадить дерево и кустарник осен...


In [150]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))
page_indices_array = vectorizer.fit_transform(pages_data['sentence_text']).toarray()
columns = list(vectorizer.vocabulary_.keys())

page_indices = pd.DataFrame(page_indices_array, columns=columns)
page_indices.index = pages_data.index
page_indices.shape

(3541, 10496)

In [151]:
ranker = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker.fit(page_indices)

In [152]:
MORPH = pymorphy3.MorphAnalyzer()

def preprocess_query(query):
    query = query.lower()
    query = re.sub('\s\s+', ' ', query)
    
    wrds = []
    for wrd in nltk.word_tokenize(query):
        if wrd in punctuation:
            continue

        wrd = MORPH.parse(wrd)[0].normal_form
        wrds.append(wrd)

    clear_query = ' '.join(wrds)
    
    return clear_query

In [184]:
query = preprocess_query('Устраняет неприятные запахи')
query_vect = vectorizer.transform([query]).toarray()

scores, indices = ranker.kneighbors(query_vect)
scores = scores[0]
indices = indices[0]



In [185]:
scores, indices

(array([0.57489308, 0.60342912, 0.60342912, 0.60342912, 0.60342912,
        0.60754055, 0.60754055, 0.60754055, 0.61045341, 0.6371671 ,
        0.6461252 , 0.67007915, 0.69117316, 0.70420729, 0.71056406,
        0.71560138, 0.71560138, 0.71560138, 0.71560138, 0.73035727,
        0.74614267, 0.74819152, 0.75114171, 0.75394298, 0.75990838,
        0.75990838, 0.760624  , 0.7606545 , 0.76103399, 0.77027303,
        0.77311228, 0.77543573, 0.77711588, 0.78307409, 0.78447118,
        0.78528405, 0.79774335, 0.79894499, 0.80228613, 0.8029201 ,
        0.80312149, 0.80561174, 0.8070911 , 0.80774208, 0.80931978,
        0.81210536, 0.81629603, 0.82206109, 0.82207708, 0.82412049,
        0.82919158, 0.82933032, 0.8297431 , 0.82990153, 0.83091799,
        0.83172822, 0.83310908, 0.83385772, 0.83449963, 0.84025518,
        0.84981629, 0.85109046, 0.85109046, 0.85216797, 0.8531642 ,
        0.85434482, 0.85520496, 0.85524067, 0.8562847 , 0.85635398,
        0.85692041, 0.85905701, 0.86291344, 0.86

In [186]:
page_indices.iloc[indices]

Unnamed: 0_level_0,Unnamed: 1_level_0,возродить,будущее,хиго,брошюра,природный,земледелие,эм,био,концентрат,восток,...,коломенское,днище,голосовой,ecostandard,group,интерактивный,отбирать,непрофессионал,ecostandart,патруль
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
https://em-russia.ru/shop/all/dezodorant-antibakterialnyy-500-ml,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-05,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-pomeshcheniy-em-sprey-0-5-l,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://em-russia.ru/base/otchyet-ob-ispolzovanii-produktsii-ooo-primorskiy-em-tsentr-v-ramkakh-prirodookhrannogo-lagerya-na-o/,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/base/otchyet-ob-ispolzovanii-produktsii-ooo-primorskiy-em-tsentr-v-ramkakh-prirodookhrannogo-lagerya-na-o/,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-05,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-pomeshcheniy-em-sprey-0-5-l,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [187]:
candidate_links = page_indices.iloc[indices].reset_index()['link']
score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})

score_df.iloc[1]['candidate_link']

'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10'

In [188]:
group_score_df = score_df.groupby('candidate_link').mean()['score']
group_score_df = group_score_df.sort_values()

best_link = group_score_df.index[0]
best_link

'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025'

In [189]:
group_score_df[:10].index.tolist()

['https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-10',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-1',
 'https://em-russia.ru/shop/all/preparat-dlya-ochistki-vody-aqua-em-1-1-l',
 'https://em-russia.ru/shop/all/dezodorant-antibakterialnyy-500-ml',
 'https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita',
 'https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-05',
 'https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-10-l',
 'https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-025',
 'https://em-russia.ru/base/ustranenie-zapakha-pochvogrunta-i-obezvozhennogo-osadka-na-kanalizatsionnykh-ochistnykh-sooruzheniya/']

In [190]:
pca = PCA(n_components=60)
pca_indices_array = pca.fit_transform(page_indices_array)

In [191]:
ranker_pca = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker_pca.fit(pca_indices_array)

In [175]:
pca.transform(vectorizer.transform([query]).toarray())

array([[-4.29272875e-02,  2.84659442e-02, -2.53190120e-02,
        -6.43516017e-03,  2.44565581e-02,  1.21904801e-02,
        -3.85741170e-02, -1.20961809e-02,  3.01127243e-02,
        -4.48208465e-02, -1.07225327e-02, -2.49854518e-02,
         1.10550141e-02,  3.49880596e-02, -4.98734575e-04,
         1.61152500e-02, -4.71206423e-03, -3.32728894e-02,
        -8.52745097e-04,  1.15504661e-02, -1.99501519e-02,
        -5.49564653e-03, -9.94922611e-03,  3.28388085e-02,
        -3.09897960e-03, -5.10116508e-03,  1.13347255e-02,
        -2.44424047e-02, -1.67250535e-03, -8.22212503e-03,
         1.51633502e-02,  1.51290172e-03, -3.61023130e-02,
        -2.45787848e-02, -6.22896372e-02,  9.65246032e-03,
        -4.32660947e-02,  1.46145360e-02, -8.19175971e-03,
        -2.66694973e-03,  4.35239570e-02,  1.05647757e-03,
         7.74504020e-02,  5.95300922e-02,  3.48955980e-05,
         4.86963744e-03, -2.36725529e-02, -6.96443188e-02,
         1.46616644e-02, -1.99262130e-02, -4.32391799e-0

In [174]:
def search_page(query, vectorizer, pca, ranker, page_count=10, score_type='sum'):
    query = preprocess_query(query)
    query_vect = vectorizer.transform([query]).toarray()
    query_vect = pca.transform(query_vect)
    
    scores, indices = ranker.kneighbors(query_vect)
    scores = scores[0]
    indices = indices[0]
    
    candidate_links = page_indices.iloc[indices].reset_index()['link']
    score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})
    
    if score_type == 'mean':
        group_score_df = score_df.groupby('candidate_link').mean()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    elif score_type == 'min':
        group_score_df = score_df.groupby('candidate_link').min()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    else:
        score_df['score'] = 1 - score_df['score']
        group_score_df = score_df.groupby('candidate_link').sum()['score']
        group_score_df = group_score_df.sort_values(ascending=False)
    
    return group_score_df[:page_count].index.tolist()

In [173]:
search_page('средство для уборки', vectorizer, pca, ranker_pca, page_count=15, score_type='mean')

['https://em-russia.ru/shop/all/em-mylo-khozyaystvennoe',
 'https://em-russia.ru/base/vebinar-ekologiya-na-bazakh-otdykha-s-em/',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-pomeshcheniy-em-sprey-0-5-l',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-05',
 'https://em-russia.ru/base/kompleksnoe-primenenie-em-preparatov-v-gostinitse-art-kongress-otel-ekvator-delitsya-opytom-i-rezult/',
 'https://em-russia.ru/base/kak-izbavitsya-ot-pyaten/',
 'https://em-russia.ru/shop/all/dezodorant-antibakterialnyy-500-ml',
 'https://em-russia.ru/base/vliyanie-bioem-aktiv-na-patogennuyu-mikrofloru/',
 'https://em-russia.ru/base/pyl-kak-umenshit-eye-kolichestvo/',
 'https://em-russia.ru/base/rezultaty-primeneniya-em-preparata-dlya-ustraneniya-zapakhov-i-uluchsheniya-kachestva-stokov-na-goro/',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqu

In [176]:
import pickle

with open('search_model', 'wb') as f:
    pickle.dump({'vectorizer': vectorizer, 'pca': pca, 'ranker': ranker_pca}, f)

In [177]:
with open('search_model', 'rb') as f:
    search_model = pickle.load(f)
    
search_model

{'vectorizer': TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 'pca': PCA(n_components=60),
 'ranker': NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)}

In [178]:
vectorizer = search_model['vectorizer']
pca = search_model['pca']
ranker_pca = search_model['ranker']

vectorizer, pca, ranker_pca

(TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 PCA(n_components=60),
 NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100))

In [192]:
search_page('возродить', vectorizer, pca, ranker_pca, page_count=15, score_type='sum')

['https://em-russia.ru/base/snizhenie-kontsentratsii-tyazhelykh-metallov-v-vode-i-v-donnykh-osadkakh-s-pomoshchyu-em-kak-rabotae/',
 'https://em-russia.ru/base/tass-kotoryy-upolnomochen-zayavit-o-tom-kak-v-primore-priruchayut-mikrobov/',
 'https://em-russia.ru/base/vliyanie-bioem-aktiv-na-patogennuyu-mikrofloru/',
 'https://em-russia.ru/base/rassada-vsem-za-zavist/',
 'https://em-russia.ru/base/poyavilsya-malysh-teper-byt-dolzhen-stat-ekologichnym/',
 'https://em-russia.ru/base/vesti24-v-programme-senat-o-plodorodii-s-em/',
 'https://em-russia.ru/base/ii-mezhdunarodnaya-nauchno-prakticheskaya-konferentsiya-klimat-plodorodie-pochv-agrotekhnologii-/',
 'https://em-russia.ru/base/konferentsiya-samara-pochvy-uglerodnaya-neytralnost/',
 'https://em-russia.ru/base/em-dlya-resheniya-ekologicheskikh-problem-rezultaty/',
 'https://em-russia.ru/base/samostoyatelno-aktiviruem-em-kontsentrat/',
 'https://em-russia.ru/base/glavnyy-selskokhozyaystvennyy-vuz-rossii-teper-tozhe-znakom-s-em-tekhnologie