In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

import pandas as pd

import nltk
from nltk.corpus import stopwords
import re
import pymorphy3

from string import punctuation
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tiv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
pages_data = pd.read_csv('data/link_page_sentences.csv')

pages_data.dropna(inplace=True)
pages_data.set_index(['link', 'sentence_order'], inplace=True)

pages_data.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence_text
link,sentence_order,Unnamed: 2_level_1
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,0,набор для самостоятельный активирование пригот...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,1,концентрат эм-био ем·1 микробиологический удоб...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,2,срок годность 12 месяц с дата изготовление сос...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,3,состав концентрат вода питьевой патока комплек...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,4,в 1 см3 содержаться не менее 1 106 кой молочно...
https://em-russia.ru/shop/all/em-bio-kontsentrat-em1-vostok-em-1,5,применение продукт применяться только после ак...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,0,предназначить для предотвращение заболевание и...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,1,состав вода питьевой патока сахар-песок спирт ...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,2,в 1 см3 содержаться не менее 1 103 кой молочно...
https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli,3,применение в профилактический цель цветок овощ...


In [4]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))
page_indices_array = vectorizer.fit_transform(pages_data['sentence_text']).toarray()
columns = list(vectorizer.vocabulary_.keys())

page_indices = pd.DataFrame(page_indices_array, columns=columns)
page_indices.index = pages_data.index
page_indices.shape

(1016, 2261)

In [5]:
ranker = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker.fit(page_indices)

In [8]:
MORPH = pymorphy3.MorphAnalyzer()

def preprocess_query(query):
    query = query.lower()
    query = re.sub('\s\s+', ' ', query)
    
    wrds = []
    for wrd in nltk.word_tokenize(query):
        if wrd in punctuation:
            continue

        wrd = MORPH.parse(wrd)[0].normal_form
        wrds.append(wrd)

    clear_query = ' '.join(wrds)
    
    return clear_query

In [12]:
query = preprocess_query('средство для уборки')
query_vect = vectorizer.transform([query]).toarray()

scores, indices = ranker.kneighbors(query_vect)
scores = scores[0]
indices = indices[0]



In [13]:
scores, indices

(array([0.63134332, 0.63134332, 0.63134332, 0.63134332, 0.7058981 ,
        0.73118279, 0.75842602, 0.79566068, 0.80788199, 0.80788199,
        0.80788199, 0.82122761, 0.82122761, 0.82923646, 0.83550843,
        0.84268238, 0.84745595, 0.85191578, 0.85275916, 0.86087773,
        0.87290508, 0.90295148, 0.9052923 , 0.91267118, 0.95666574,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 1.        , 1.        , 1.  

In [14]:
page_indices.iloc[indices]

Unnamed: 0_level_0,Unnamed: 1_level_0,набор,самостоятельный,активирование,приготовление,литр,эм,био,ем,микробиологический,удобрение,...,прикорм,улья,зимовка,сезон,начать,поэтапный,заливка,100000,признак,октябрь
link,sentence_order,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-pomeshcheniy-em-sprey-0-5-l,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-05,1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/ozdoravlivayushchee-maslo-2,1,0.0,0.0,0.0,0.000000,0.327071,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-025,11,0.0,0.0,0.0,0.245113,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-025,12,0.0,0.0,0.0,0.000000,0.000000,0.459479,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.314365
https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-025,13,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
https://em-russia.ru/shop/all/kormovaya-dobavka-em-vita-025,27,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [15]:
candidate_links = page_indices.iloc[indices].reset_index()['link']
score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})

score_df.head(5)

Unnamed: 0,candidate_link,score
0,https://em-russia.ru/shop/all/sredstvo-dlya-ub...,0.631343
1,https://em-russia.ru/shop/all/sredstvo-dlya-ub...,0.631343
2,https://em-russia.ru/shop/all/sredstvo-dlya-ub...,0.631343
3,https://em-russia.ru/shop/all/sredstvo-dlya-ub...,0.631343
4,https://em-russia.ru/shop/all/ozdoravlivayushc...,0.705898


In [16]:
group_score_df = score_df.groupby('candidate_link').sum()['score']
group_score_df = group_score_df.sort_values()

best_link = group_score_df.index[0]
best_link

'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10'

In [17]:
group_score_df[:10].index.tolist()

['https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-10',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-pomeshcheniy-em-sprey-0-5-l',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-025',
 'https://em-russia.ru/shop/all/sredstvo-dlya-uborki-em-sprey-05',
 'https://em-russia.ru/shop/all/em-5-bioregulyator-bolezni-vrediteli',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-10',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-1',
 'https://em-russia.ru/shop/all/preparat-dlya-ochistki-vody-aqua-em-1-1-l',
 'https://em-russia.ru/shop/all/em-zhidkoe-organicheskoe',
 'https://em-russia.ru/shop/all/sredstvo-dezinfitsiruyushchee-em-vita-pro']

In [18]:
pca = PCA(n_components=60)
pca_indices_array = pca.fit_transform(page_indices_array)

In [19]:
ranker_pca = NearestNeighbors(n_neighbors=100, metric='cosine', n_jobs=-1)
ranker_pca.fit(pca_indices_array)

In [20]:
pca.transform(vectorizer.transform([query]).toarray())

array([[-4.52917542e-02,  2.55122901e-02, -5.22958065e-02,
        -5.57161676e-03, -4.05391166e-03, -4.37157494e-02,
         2.55881084e-03,  2.21116241e-03,  1.74116589e-02,
        -2.26752708e-02, -4.74056930e-02, -2.88351978e-03,
        -7.77403659e-03,  3.83111691e-03, -1.43960089e-02,
        -3.40149785e-02, -2.34254897e-02, -1.03048174e-02,
        -1.74061103e-02,  3.05164568e-02, -9.36629644e-03,
        -4.74763784e-03,  2.33886193e-02, -3.17115899e-02,
        -1.03958330e-02, -1.93589291e-02,  3.49152733e-03,
         4.18240911e-02,  1.10455227e-02,  1.24219386e-02,
         1.35622271e-03, -3.87486532e-02, -4.27758252e-03,
        -3.37259786e-03,  2.68240161e-02,  2.45825265e-02,
         1.49829551e-02, -5.65609562e-02,  1.85656720e-02,
        -1.58534355e-02,  1.45582728e-02, -5.67728766e-02,
        -3.73029352e-02,  5.09004394e-05, -1.39849528e-02,
         3.42751286e-02,  4.29819748e-02, -4.61535494e-02,
        -2.79463638e-02, -8.14370107e-02,  1.97415535e-0

In [21]:
def search_page(query, vectorizer, pca, ranker, page_count=10, score_type='sum'):
    query = preprocess_query(query)
    query_vect = vectorizer.transform([query]).toarray()
    query_vect = pca.transform(query_vect)
    
    scores, indices = ranker.kneighbors(query_vect)
    scores = scores[0]
    indices = indices[0]
    
    candidate_links = page_indices.iloc[indices].reset_index()['link']
    score_df = pd.DataFrame({'candidate_link': candidate_links, 'score': scores})
    
    if score_type == 'mean':
        group_score_df = group_score_df.groupby('candidate_link').mean()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    elif score_type == 'min':
        group_score_df = score_df.groupby('candidate_link').min()['score']
        group_score_df = group_score_df.sort_values(ascending=True)
    else:
        score_df['score'] = 1 - score_df['score']
        group_score_df = score_df.groupby('candidate_link').sum()['score']
        group_score_df = group_score_df.sort_values(ascending=False)
    
    return group_score_df[:page_count].index.tolist()

In [22]:
search_page('средство для уборки', vectorizer, pca, ranker_pca, page_count=15, score_type='sum')

['https://em-russia.ru/shop/all/sredstvo-dezinfitsiruyushchee-em-vita-pro',
 'https://em-russia.ru/shop/all/dezodorant-antibakterialnyy-500-ml',
 'https://em-russia.ru/shop/all/zubnaya-pasta',
 'https://em-russia.ru/shop/all/em-mylo-zhidkoe-dlya-ruk-i-tela',
 'https://em-russia.ru/shop/all/em-zhidkoe-organicheskoe',
 'https://em-russia.ru/shop/all/em-keramika',
 'https://em-russia.ru/shop/all/ozdoravlivayushchee-maslo-2',
 'https://em-russia.ru/shop/all/biodezinfektant-bioem-aktiv',
 'https://em-russia.ru/shop/all/krem-vokrug-glaz-i-ugolkov-rta',
 'https://em-russia.ru/shop/all/em-poroshok',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-10',
 'https://em-russia.ru/shop/all/preparat-dlya-ochistki-vody-aqua-em-1-1-l',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-1',
 'https://em-russia.ru/shop/all/em-mylo-khozyaystvennoe',
 'https://em-russia.ru/shop/all/shampun-ot-vypadeniya-volos']

In [23]:
import pickle

with open('search_model', 'wb') as f:
    pickle.dump({'vectorizer': vectorizer, 'pca': pca, 'ranker': ranker_pca}, f)

In [24]:
with open('search_model', 'rb') as f:
    search_model = pickle.load(f)
    
search_model

{'vectorizer': TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 'pca': PCA(n_components=60),
 'ranker': NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100)}

In [25]:
vectorizer = search_model['vectorizer']
pca = search_model['pca']
ranker_pca = search_model['ranker']

vectorizer, pca, ranker_pca

(TfidfVectorizer(stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                             'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                             'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                             'по', 'только', 'ее', 'мне', ...]),
 PCA(n_components=60),
 NearestNeighbors(metric='cosine', n_jobs=-1, n_neighbors=100))

In [26]:
search_page('средство для уборки', vectorizer, pca, ranker_pca, page_count=15, score_type='sum')

['https://em-russia.ru/shop/all/sredstvo-dezinfitsiruyushchee-em-vita-pro',
 'https://em-russia.ru/shop/all/dezodorant-antibakterialnyy-500-ml',
 'https://em-russia.ru/shop/all/zubnaya-pasta',
 'https://em-russia.ru/shop/all/em-mylo-zhidkoe-dlya-ruk-i-tela',
 'https://em-russia.ru/shop/all/em-zhidkoe-organicheskoe',
 'https://em-russia.ru/shop/all/em-keramika',
 'https://em-russia.ru/shop/all/ozdoravlivayushchee-maslo-2',
 'https://em-russia.ru/shop/all/biodezinfektant-bioem-aktiv',
 'https://em-russia.ru/shop/all/krem-vokrug-glaz-i-ugolkov-rta',
 'https://em-russia.ru/shop/all/em-poroshok',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-10',
 'https://em-russia.ru/shop/all/dlya-ochistki-vody-aqua-em-1-1',
 'https://em-russia.ru/shop/all/preparat-dlya-ochistki-vody-aqua-em-1-1-l',
 'https://em-russia.ru/shop/all/em-mylo-khozyaystvennoe',
 'https://em-russia.ru/shop/all/shampun-ot-vypadeniya-volos']