In [1]:
import json
import typing as t
from pathlib import Path

import itertools as it
import abc
import time
from overrides import overrides

import asyncio
import logging


from config import CONFIGURATION
from utils.logger import setup_logging


from copy import deepcopy
from operator import itemgetter

from elasticsearch import Elasticsearch
from overrides import overrides


import numpy as np
import tensorflow as tf
import keras, keras.layers as L
from utils.utils import Vocab, load, infer_mask, read_conll
from utils.conlleval import evaluate
from tqdm import tqdm
from IPython.display import clear_output
import matplotlib.pyplot as plt
from collections import defaultdict
%matplotlib inline
from textblob import TextBlob
import gensim
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api
from sklearn.model_selection import train_test_split


Using TensorFlow backend.


In [2]:
def preprocessing(texts):
    """Lower-case every text and drop its newline characters, in place.

    Args:
        texts: mutable sequence of strings; each element is overwritten
            with its normalised form.

    Returns:
        The same ``texts`` object that was passed in.
    """
    for position in range(len(texts)):
        lowered = texts[position].lower()
        texts[position] = lowered.replace('\n', '')
    return texts

In [3]:
# Scores how strongly a word belongs to the Food or the Sport category; the
# score is used to pick the Item slot of a search query.
class IntentClassifier():
    """Word-vector classifier that assigns a word to one of two categories.

    Category 0 is anchored on the vectors of 'food' / 'еда', category 1 on
    'sport' / 'спорт'. A Russian and an English embedding model are loaded in
    the constructor; `classify` picks one of them from the detected language.
    """

    # (category, similarity) pair returned when a word cannot be scored.
    _NO_MATCH = (-1, -2)

    def __init__(self, w2v_ru_fpath="all.norm-sz100-w10-cb0-it1-min100.w2v", w2v_en_model = 'glove-twitter-50'):
        """Load both embedding models and build the category anchor matrices.

        Args:
            w2v_ru_fpath: path to the Russian vectors in binary word2vec format.
            w2v_en_model: name of the English model fetched through
                ``gensim.downloader``.
        """
        self.w2v_ru = gensim.models.KeyedVectors.load_word2vec_format(w2v_ru_fpath, binary=True, unicode_errors='ignore')
        self.w2v_ru.init_sims(replace=True)
        self.w2v_en = api.load(w2v_en_model)

        # One row per category; the widths (50 / 100) must match the
        # dimensionality of the default models named in the signature.
        self.catigories_en = np.zeros((2,50))
        self.catigories_en[0] = self.w2v_en.get_vector('food')
        self.catigories_en[1] = self.w2v_en.get_vector('sport')

        self.catigories_ru = np.zeros((2,100))
        self.catigories_ru[0] = self.w2v_ru.get_vector('еда')
        self.catigories_ru[1] = self.w2v_ru.get_vector('спорт')

    def classify(self, word):
        """Return ``np.array((category_index, similarity))`` for ``word``.

        The category is the row of the anchor matrix with the highest cosine
        similarity to the word's vector. When the language cannot be detected
        or the word cannot be scored, a message is printed and
        ``np.array((-1, -2))`` is returned instead.

        Only ``Exception`` subclasses are treated as "cannot score";
        ``KeyboardInterrupt`` and ``SystemExit`` now propagate instead of
        being silently swallowed.
        """
        # NOTE(review): `detect_language` is, as far as I know, an online call
        # in TextBlob and was dropped in later releases - confirm the pinned version.
        try:
            language = TextBlob(word).detect_language()
        except Exception:
            print ('short word')
            return np.array(self._NO_MATCH)

        try:
            # Anything other than Russian falls back to the English model.
            if language == 'ru':
                vectors, categories = self.w2v_ru, self.catigories_ru
            else:
                vectors, categories = self.w2v_en, self.catigories_en
            sim = vectors.cosine_similarities(vectors.get_vector(word), categories)
            return np.array((np.argmax(sim), np.max(sim)))
        except Exception:
            print ('unknown word - ', word)
            return np.array(self._NO_MATCH)

# Named-entity recognition
Инициализация и обучение

In [4]:
# Load the CoNLL-formatted file '123g.txt'. In the cells shown here `data` is
# only referenced by the commented-out train/dev split in the next cell.
data = read_conll('123g.txt', lower_words=True)

In [5]:
# Start from an empty default graph and open the session that every later
# cell uses through the module-level name `sess`.
tf.reset_default_graph()
sess = tf.InteractiveSession()

# Training and development corpora are read from two separate files.
train_data = read_conll('gggggg.txt', lower_words=True)
dev_data = read_conll('123g.txt', lower_words=True)
# Alternative (disabled): split the single `data` corpus instead.
# train_data, dev_data = train_test_split(data, test_size=0.2, random_state=42)

# One vocabulary per field, built from the training rows only.
vocabs = {
    key: Vocab.from_lines([row[key] for row in train_data])
    for key in ['word', 'pos', 'ne']
}

def prepare_batch(data):
    """Convert a sequence of rows into one matrix per field.

    The fields are taken from the first row; each field's values are encoded
    with the module-level vocabulary of the same name.

    Returns:
        dict mapping field name -> ``vocabs[field].to_matrix(...)`` result.
    """
    batch = {}
    for field in data[0].keys():
        batch[field] = vocabs[field].to_matrix(row[field] for row in data)
    return batch

class SimpleModel:
    """Tagger made of an embedding, a bidirectional LSTM and a dense layer.

    Layer sizes come from the module-level `vocabs` (word vocabulary for the
    embedding, 'ne' vocabulary for the output layer).
    """

    def __init__(self, name, emb_size=16, hid_size=16):
        # `name` is accepted for the caller's convenience but is not used.
        # The layers are created in a fixed order: embedding, LSTM wrapper, dense.
        self.emb = L.Embedding(len(vocabs['word']), emb_size)
        recurrent = L.LSTM(hid_size, return_sequences=True)
        self.lstm = L.Bidirectional(recurrent)
        self.logits = L.Dense(len(vocabs['ne']))

    def __call__(self, input_ix):
        """Apply the layers to ``input_ix`` and return ``{'ne': logits}``."""
        hidden_states = self.lstm(self.emb(input_ix))
        return {'ne': self.logits(hidden_states)}
    
# Build the tagger and run the global variable initializer in the shared session.
model = SimpleModel('mod1')
sess.run(tf.global_variables_initializer())

class trainer:
    """ A bunch of tensorflow operations used for model training """
    
    # int32 placeholders with two unknown dimensions: one for the input
    # tokens and one for the reference named-entity tags.
    ph = {
        'word': tf.placeholder('int32', [None, None], name='input_tokens'),
        'ne': tf.placeholder('int32', [None, None], name='named_entities'),
    }

    # `model(...)` returns a dict; its 'ne' entry holds the tag logits.
    logits = model(ph['word'])
    # presumably 1 for real tokens and 0 for padding - TODO confirm in utils.utils.infer_mask
    mask = infer_mask(ph['word'])
    
    # Negative log-softmax of the logits, kept only at the reference tag
    # through the one-hot encoding of ph['ne'] ...
    loss = -tf.nn.log_softmax(logits['ne'], -1) * tf.one_hot(ph['ne'], len(vocabs['ne']))
    # ... then weighted by the mask, summed, and divided by the mask total.
    loss = tf.reduce_sum(loss * mask[:, :, None]) / tf.reduce_sum(mask)

    # One optimisation step of Adam (default settings) on the loss above.
    step = tf.train.AdamOptimizer().minimize(loss)

def iterate_minibatches(data, batch_size=128, shuffle=True, cycle=False, max_batches=None):
    """Yield prepared minibatches drawn from ``data``.

    Args:
        data: sized container that supports indexing with an integer array
            (``data[batch_ix]``), e.g. a numpy array of rows.
        batch_size: maximum number of rows per batch; the last batch of a
            pass may be smaller.
        shuffle: draw a new random permutation at the start of every pass.
        cycle: keep making passes over ``data`` instead of stopping after one.
        max_batches: stop after this many batches in total; a falsy value
            means no limit.

    Yields:
        The result of ``prepare_batch`` for each slice of rows.

    Empty ``data`` yields nothing. Previously ``cycle=True`` combined with
    empty data spun forever without yielding.
    """
    sample_count = len(data)
    if sample_count == 0:
        return

    indices = np.arange(sample_count)
    total_batches = 0
    while True:
        if shuffle:
            indices = np.random.permutation(indices)
        for start_i in range(0, sample_count, batch_size):
            batch_ix = indices[start_i: start_i + batch_size]
            yield prepare_batch(data[batch_ix])
            total_batches += 1
            if max_batches and total_batches >= max_batches:
                return
        if not cycle:
            break
            

def compute_error_rate(trainer, data, batch_size=128, key='ne'):
    """Return the masked per-token error rate, in percent, of ``trainer`` on ``data``.

    Batches are visited once, in order. A prediction is the argmax over the
    last axis of the logits for ``key``; only positions with a non-zero mask
    contribute to the count.
    """
    correct = total = 0.0
    for batch in iterate_minibatches(data, batch_size, shuffle=False, cycle=False):
        fetches = [trainer.logits[key], trainer.mask]
        feed = {trainer.ph['word']: batch['word']}
        batch_ne_logits, batch_mask = sess.run(fetches, feed)
        predicted = batch_ne_logits.argmax(-1)
        correct += np.sum((batch[key] == predicted) * batch_mask)
        total += batch_mask.sum()
    accuracy = correct / total
    return (1.0 - accuracy) * 100

def decode_greedy(trainer, data, vocabs, batch_size=128, key='ne'):
    """Decode ``data`` by taking the highest-scoring tag at every position.

    Returns:
        A list with the lines produced by ``vocabs[key].to_lines`` for every
        batch, concatenated in input order.
    """
    decoded = []
    for batch in iterate_minibatches(data, batch_size, shuffle=False, cycle=False):
        feed = {trainer.ph['word']: batch['word']}
        best_tag_ids = sess.run(trainer.logits[key], feed).argmax(-1)
        decoded.extend(vocabs[key].to_lines(best_tag_ids))
    return decoded

# Every compute_stats() call appends its (predicted, reference) pair here.
results = []


def compute_stats(trainer, data, vocabs, batch_size=128, key='ne', verbose=False):
    """Decode ``data`` greedily and score the result with ``evaluate``.

    Side effect: the (predicted, reference) sequences are appended to the
    module-level ``results`` list.

    Returns:
        The ``(precision, recall, f1)`` triple reported by ``evaluate``.
    """
    predicted = decode_greedy(trainer, data, vocabs, batch_size, key)
    reference = [row[key] for row in data]
    results.append((predicted, reference))
    precision, recall, f1 = evaluate(reference, predicted, verbose)
    return precision, recall, f1
    
class StatsHistory:
    """Three independent, initially empty lists: precision, recall and f1."""

    def __init__(self):
        self.precision, self.recall, self.f1 = [], [], []

# The (currently disabled) training loop below evaluates on the dev set
# every `eval_every` recorded loss values.
eval_every = 50

# Per-step training losses and per-evaluation metric histories.
loss_history = []
dev_stats_history = StatsHistory()
indomain_stats_history = StatsHistory()
outdomain_stats_history = StatsHistory()

# Runs the variable initializer again (it was already run right after the
# model was built) and creates the Saver used for checkpoints.
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
# Training loop, disabled by wrapping it in a string literal: as written it
# would run up to 10000 minibatches, evaluate on dev_data every `eval_every`
# steps and redraw the loss / dev-F1 plots. Being the last expression of the
# cell, the string is echoed as the cell output.
''' 
for batch in tqdm(iterate_minibatches(train_data, cycle=True, max_batches=10000)):
    loss_t, _ = sess.run([trainer.loss, trainer.step],
                         {trainer.ph[key]: batch[key] for key in trainer.ph})
    loss_history.append(loss_t)
    
    if len(loss_history) % eval_every == 0:
        precision, recall, f1 = compute_stats(trainer, dev_data, vocabs, verbose=True)
        dev_stats_history.precision.append(precision)
        dev_stats_history.recall.append(recall)
        dev_stats_history.f1.append(f1)
        clear_output(True)
        plt.figure(figsize=[12, 6])
        plt.subplot(1, 2, 1)
        plt.plot(loss_history)
        plt.title('train loss'), plt.grid()
        plt.subplot(1, 2, 2)       
        plt.plot(np.arange(1, len(dev_stats_history.f1) + 1) * eval_every, dev_stats_history.f1, label="dev f1")
        plt.legend()
        plt.title('dev stats %'), plt.grid()
        plt.show()
'''

' \nfor batch in tqdm(iterate_minibatches(train_data, cycle=True, max_batches=10000)):\n    loss_t, _ = sess.run([trainer.loss, trainer.step],\n                         {trainer.ph[key]: batch[key] for key in trainer.ph})\n    loss_history.append(loss_t)\n    \n    if len(loss_history) % eval_every == 0:\n        precision, recall, f1 = compute_stats(trainer, dev_data, vocabs, verbose=True)\n        dev_stats_history.precision.append(precision)\n        dev_stats_history.recall.append(recall)\n        dev_stats_history.f1.append(f1)\n        clear_output(True)\n        plt.figure(figsize=[12, 6])\n        plt.subplot(1, 2, 1)\n        plt.plot(loss_history)\n        plt.title(\'train loss\'), plt.grid()\n        plt.subplot(1, 2, 2)       \n        plt.plot(np.arange(1, len(dev_stats_history.f1) + 1) * eval_every, dev_stats_history.f1, label="dev f1")\n        plt.legend()\n        plt.title(\'dev stats %\'), plt.grid()\n        plt.show()\n'

In [6]:
# Load the trained weights from the checkpoint into the shared session
# instead of training in this notebook.
saver = tf.train.Saver()
saver.restore(sess, "model/model2.ckpt")

INFO:tensorflow:Restoring parameters from model/model2.ckpt


Получение запроса из NERa

In [7]:
# Values used when turning NER tags into a search form.
_DEFAULT_PRICE_TO = 9999999   # upper price bound when the query names no price
_MENTION_ONLY_VALUE = 0.5     # recorded for an I-CB / I-SL tag, which carries no number
_PERCENT_UPPER_BOUND = 100    # tagged numbers at or above this are treated as prices


def _tag_query(query):
    """Run the NER model on one query string and return its decoded tag lines."""
    # The model only reads the 'word' field; 'ne' and 'pos' are filled with
    # one 'O' per token so that prepare_batch can encode every field.
    filler = ' '.join(['O'] * len(query.split()))
    sample = np.array([{'word': query, 'ne': filler, 'pos': filler}])

    tag_lines = list()
    for batch in iterate_minibatches(sample, 1, shuffle=False, cycle=False):
        batch_logits = sess.run(trainer.logits['ne'],
                                {trainer.ph['word']: batch['word']})
        tag_lines.extend(vocabs['ne'].to_lines(batch_logits.argmax(-1)))
    return tag_lines


def _parse_number(token):
    """Return ``float(token)``, or ``None`` when the token is not a number."""
    try:
        return float(token)
    except ValueError:
        return None


def _build_form(query, tag_line):
    """Turn one query and its space-separated tag line into a search form."""
    names, prices, cashbacks, sales = [], [], [], []

    # zip() pairs tokens with tags and ignores any surplus on either side, so
    # a tag without a matching token is skipped for every tag type.
    for token, tag in zip(query.split(), tag_line.split()):
        if tag == 'I-Name':
            names.append(token)
        elif tag == 'I-PRICE':
            value = _parse_number(token)
            if value is not None:
                prices.append(value)
        elif tag == 'I-CB':
            cashbacks.append(_MENTION_ONLY_VALUE)
        elif tag == 'I-SL':
            sales.append(_MENTION_ONLY_VALUE)
        elif tag in ('I-CB_V', 'I-SL_V'):
            value = _parse_number(token)
            if value is None:
                continue
            if value < _PERCENT_UPPER_BOUND:
                (cashbacks if tag == 'I-CB_V' else sales).append(value)
            else:
                # Too large for a cashback / sale value: treat it as a price.
                prices.append(value)

    return {
        'Offer_type': 0,
        'Attributes': '',
        'Price_from': 0,
        'Item': names,
        'Price_to': max(prices) if prices else _DEFAULT_PRICE_TO,
        'Cashback': max(cashbacks) if cashbacks else 0,
        'Sale': max(sales) if sales else 0,
    }


def Extractor(queries):
    """Extract a search form from every query using the NER model.

    Args:
        queries: iterable of query strings (whitespace-tokenised).

    Returns:
        One dict per query with the keys 'Offer_type', 'Attributes',
        'Price_from', 'Item', 'Price_to', 'Cashback' and 'Sale'.
        'Item' is the list of tokens tagged 'I-Name'. 'Price_to', 'Cashback'
        and 'Sale' hold the largest value collected for the slot, or the
        defaults 9999999, 0 and 0 when nothing was collected.

    Tokens that fail to parse as numbers are skipped. The duplicated
    (unreachable) 'I-PRICE' branch of the earlier version is gone.
    """
    # Tag every query first, then build the forms, as before.
    tagged = [(query, _tag_query(query)) for query in queries]
    # Only the first decoded line is used; each query is tagged as a single row.
    return [_build_form(query, tag_lines[0]) for query, tag_lines in tagged]

# Поисковик

In [8]:
class Ranker(metaclass=abc.ABCMeta):
    """Interface for anything that can turn a search form into ranked records."""

    @abc.abstractmethod
    def rank(self, search_form: t.Dict[str, t.Any]) -> t.List[t.Dict[str, t.Any]]:
        """Return the records matching ``search_form``; subclasses must implement this."""

In [9]:
class ElasticsearchRanker(Ranker):
    """Ranker that answers search forms with an Elasticsearch bool query.

    Two query templates are used: one that requires a fuzzy match on the
    'Item' field (when the form contains an 'Item') and one without that
    requirement. Both share the same 'should' clause and range filters.
    """

    __DEFAULT_INDEX = 'offer_search'
    __DEFAULT_DOC_TYPE = 'product'

    # Template for forms with an 'Item'. Every None is filled in per request
    # through the __KEYS_TO_SET_* paths defined further down.
    __SEARCH_QUERY_ITEM = {
        'query': {
            'bool': {
                'must': [
                    {
                        'fuzzy': {
                            'Item': {
                                'value': None,
                                'prefix_length': 0,
                            }
                        },
                    },
                ],
                'should': [
                    {
                        'multi_match': {
                                'query': None,
                                'fields': ['Item', 'Attributes', 'Advert_text'],
                                'type': 'best_fields'
                            }
                    },
                ],
                'filter': [
                    {
                        'range': {
                            'Price': {
                                'gte': None,
                                'lte': None,
                            }
                        },
                    },
                    {
                        'range': {
                            'Cashback': {
                                'gte': None,
                            }
                        },
                    },
                    {
                        'range': {
                            'Offer_type': {
                                'gte' : None,
                                'lte' : None,
                            }
                        }
                    },
                ]
            },
        },
        'size': 10000,
    }

    # Template for forms without an 'Item': same as above minus the 'must' clause.
    __SEARCH_QUERY_ATTRIBUTES = {
        'query': {
            'bool': {
                'should': [
                    {
                        'multi_match': {
                                'query': None,
                                'fields': ['Item', 'Attributes', 'Advert_text'],
                                'type': 'best_fields'
                            }
                    },
                ],
                'filter': [
                    {
                        'range': {
                            'Price': {
                                'gte': None,
                                'lte': None,
                            }
                        },
                    },
                    {
                        'range': {
                            'Cashback': {
                                'gte': None,
                            }
                        },
                    },
                    {
                        'range': {
                            'Offer_type': {
                                'gte' : None,
                                'lte' : None,
                            }
                        }
                    },
                ]
            },
        },
        'size': 10000,
    }

    # Paths (dict keys and list indices) to the placeholders in the templates.
    __KEYS_TO_SET_ITEM = ('query', 'bool', 'must', 0, 'fuzzy', 'Item', 'value')
    __KEYS_TO_SET_ATTRIBUTES = ('query', 'bool', 'should', 0, 'multi_match', 'query')
    __KEYS_TO_SET_PRICE_FROM = ('query', 'bool', 'filter', 0, 'range', 'Price', 'gte')
    __KEYS_TO_SET_PRICE_TO = ('query', 'bool', 'filter', 0, 'range', 'Price', 'lte')
    __KEYS_TO_SET_CASHBACK = ('query', 'bool', 'filter', 1, 'range', 'Cashback', 'gte')
    __KEYS_TO_SET_OFFER_TYPE_FROM = ('query', 'bool', 'filter', 2, 'range', 'Offer_type', 'gte')
    __KEYS_TO_SET_OFFER_TYPE_TO = ('query', 'bool', 'filter', 2, 'range', 'Offer_type', 'lte')

    def __init__(
        self, 
        es_host: str = 'localhost', 
        es_port: int = 9200,
        index: str = __DEFAULT_INDEX,
        doc_type: str = __DEFAULT_DOC_TYPE,
        preset: t.Optional[t.List[t.Dict[str, t.Any]]] = None,
    ) -> None:
        """Connect to Elasticsearch and optionally index a preset of records.

        Args:
            es_host: Elasticsearch host name.
            es_port: Elasticsearch port.
            index: index that is searched and written to.
            doc_type: document type passed along with every request.
            preset: records to index right away; ``None`` indexes nothing.

        Raises:
            ValueError: if the server does not answer a ping.
        """
        self.__elasticsearch = Elasticsearch([{
            'host': es_host,
            'port': es_port,
        }])

        if not self.__elasticsearch.ping():
            raise ValueError(f"Can not connect to Elasticsearch: {es_host}:{es_port}")

        self.__index = index
        self.__doc_type = doc_type

        if preset is not None:
            self.__preset(preset)

    @overrides
    def rank(self, search_form: t.Dict[str, t.Any]) -> t.List[t.Dict[str, t.Any]]:
        """Search with ``search_form`` and return the ``_source`` of every hit.

        ``search_form`` must provide 'Attributes', 'Price_from', 'Price_to',
        'Cashback', 'Offer_type_from' and 'Offer_type_to'; 'Item' is optional
        and selects the stricter query template.
        """
        search_result = self.__elasticsearch.search(
            index=self.__index,
            doc_type=self.__doc_type,
            body=self.__build_search_query(search_form),
        )

        return [record['_source'] for record in search_result['hits']['hits']]

    def __preset(self, preset: t.List[t.Dict[str, t.Any]]) -> None:
        """Index every record of ``preset``, one request per record."""
        # No document id is passed, so loading the same preset again adds the
        # records again.
        for record in preset:
            self.__elasticsearch.index(
                index=self.__index, 
                doc_type=self.__doc_type, 
                body=record,
            )

    @classmethod
    def __build_search_query(cls, search_form: t.Dict[str, t.Any]) -> t.Dict[str, t.Any]:
        """Return a filled-in copy of the template matching ``search_form``.

        Raises:
            KeyError: if a required key is missing from ``search_form``.
        """
        has_item = 'Item' in search_form
        template = cls.__SEARCH_QUERY_ITEM if has_item else cls.__SEARCH_QUERY_ATTRIBUTES
        # Deep copy so the class-level templates are never modified.
        search_query = deepcopy(template)

        assignments = [
            (cls.__KEYS_TO_SET_ATTRIBUTES, search_form['Attributes']),
            (cls.__KEYS_TO_SET_PRICE_FROM, search_form['Price_from']),
            (cls.__KEYS_TO_SET_PRICE_TO, search_form['Price_to']),
            (cls.__KEYS_TO_SET_CASHBACK, search_form['Cashback']),
            (cls.__KEYS_TO_SET_OFFER_TYPE_FROM, search_form['Offer_type_from']),
            (cls.__KEYS_TO_SET_OFFER_TYPE_TO, search_form['Offer_type_to']),
        ]
        if has_item:
            # Only the item template has the 'must' clause this path points into.
            assignments.insert(0, (cls.__KEYS_TO_SET_ITEM, search_form['Item']))

        for query_keys, query_value in assignments:
            cls.__set_query_value(search_query, query_keys, query_value)

        return search_query

    @staticmethod
    def __set_query_value(
        query: t.Dict[str, t.Any],
        keys: t.Tuple[t.Union[str, int], ...],
        value: t.Any,
    ) -> None:
        """Set ``value`` at the nested location of ``query`` described by ``keys``."""
        # Walk down to the container that holds the last key, then assign
        # through it; descending all the way would lose the reference needed
        # to store the new value.
        container = query
        for key in keys[:-1]:
            container = container[key]

        container[keys[-1]] = value


In [10]:
logger = logging.getLogger('Searcher')


class Searcher:
    """Pipeline: extract a search form per text, pick the item, rank, group by offer."""

    def __init__(
        self,
        extractor: t.Callable[[t.List[str]], t.List[t.Dict[str, t.Any]]],
        ranker: 'Ranker',
        classifier: 'IntentClassifier',
    ) -> None:
        """Store the collaborators.

        Args:
            extractor: callable mapping a list of texts to one form per text.
            ranker: object whose ``rank(form)`` returns the matching records.
            classifier: object whose ``classify(word)`` returns a
                ``(category, score)`` pair; the score decides which extracted
                word becomes the item.
        """
        self.__extractor = extractor
        self.__classifier = classifier
        self.__ranker = ranker

    def search(
        self,
        texts: t.List[str],
        n_top: int = 5,
        delay: float = 2.0,
    ) -> t.List[t.List[t.Dict[str, t.Any]]]:
        """Search for every text and return its best offers.

        Args:
            texts: the queries.
            n_top: maximum number of offers kept per query.
            delay: seconds to sleep after each query so the Elasticsearch
                server can keep up; ``0`` disables the pause.

        Returns:
            One list of offers per query, in query order.
        """
        forms = self.__extractor(texts)
        queries = []
        for query_index, form in enumerate(forms):
            candidates = form['Item']
            if len(candidates) > 1:
                # Several candidate words: the best-scoring one becomes the
                # item, all the others are joined into the attributes.
                # (Alternative: keep only the second-best word as attribute.)
                scores = [self.__classifier.classify(word)[1] for word in candidates]
                item_index = int(np.argmax(scores))
                form['Attributes'] = ' '.join(
                    word for position, word in enumerate(candidates) if position != item_index
                )
                form['Item'] = candidates[item_index]
            elif len(candidates) == 1:
                form['Item'] = candidates[0]
            else:
                # No item at all: search by attributes, falling back to the raw text.
                form.pop('Item')
                if 0 == len(form['Attributes']):
                    form['Attributes'] = texts[query_index]

            form['Offer_type_from'] = form['Offer_type']
            form['Offer_type_to'] = form['Offer_type']

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f"slots:\t{json.dumps(form, ensure_ascii=False, indent=4)}")

            print (form)

            ranking = self.__rank_with_fallbacks(form)
            offers = self.__group_product_ranking_by_offer(ranking)
            queries.append(offers[:n_top])

            if delay > 0:
                time.sleep(delay)
        return queries

    def __rank_with_fallbacks(self, form: t.Dict[str, t.Any]) -> t.List[t.Dict[str, t.Any]]:
        """Rank ``form``, relaxing it step by step while nothing is found.

        ``form`` is modified in place when a constraint is relaxed.
        """
        widened = False
        if 0 == form['Offer_type'] and form['Cashback'] > 0:
            ranking = self.__ranker.rank(form)
            if not ranking:
                # Nothing with that much cashback: drop the cashback constraint.
                form['Cashback'] = 0
                ranking = self.__ranker.rank(form)
        elif 1 == form['Offer_type']:
            ranking = self.__ranker.rank(form)
        else:
            form['Offer_type_from'] = 0
            form['Offer_type_to'] = 1
            widened = True
            ranking = self.__ranker.rank(form)

        # Last resort: accept both offer types. Skipped when the form was
        # already widened, because the request would be identical.
        if not ranking and not widened:
            form['Offer_type_from'] = 0
            form['Offer_type_to'] = 1
            ranking = self.__ranker.rank(form)
        return ranking

    @staticmethod
    def __group_product_ranking_by_offer(
        ranking: t.List[t.Dict[str, t.Any]],
        n_top: int = 3,
    ) -> t.List[t.Dict[str, t.Any]]:
        """Group consecutive products of the same offer.

        Only the first run of each offer is kept; later runs of an offer that
        was already seen are dropped. The offer-level fields are popped from
        every product (the records in ``ranking`` are modified) and the values
        of the last product in the run are reported for the offer.

        Returns:
            A list of ``{'offer': {...}, 'products': [...]}`` dicts with at
            most ``n_top`` products each.
        """
        seen_offers = set()
        offers = []
        for offer, group in it.groupby(ranking, key=lambda product: product['Offer']):
            if offer in seen_offers:
                continue
            seen_offers.add(offer)

            products = []
            offer_info = {}
            for product in group:
                product.pop('Offer')
                offer_info = {
                    'offer': offer,
                    'web': product.pop('Web'),
                    'cashback': product.pop('Cashback'),
                    'period': product.pop('Period'),
                    'offer_type': product.pop('Offer_type'),
                    'advert_text': product.pop('Advert_text'),
                }
                products.append(product)

            offers.append({
                'offer': offer_info,
                # What is left of each product after the offer fields were removed.
                'products': products[:n_top],
            })

        return offers


In [11]:
def create_ranker(
    elasticsearch_host: str = 'localhost',
    elasticsearch_port: int = 9200,
    preset_path: t.Optional[Path] = Path('./resources/ranking/preset_small.json'),
) -> 'Ranker':
    """Create an ElasticsearchRanker, optionally loading a preset into it.

    Args:
        elasticsearch_host: Elasticsearch host name.
        elasticsearch_port: Elasticsearch port.
        preset_path: JSON file with the records to index on creation;
            ``None`` skips the preset entirely.

    Returns:
        The ranker connected to the given server.
    """
    preset: t.Optional[t.List[t.Dict[str, t.Any]]] = None

    if preset_path is not None:
        # JSON is UTF-8 by specification; do not depend on the locale's
        # default encoding (the preset may contain non-ASCII text).
        with preset_path.open('r', encoding='utf-8') as preset_file:
            preset = json.load(preset_file)

    return ElasticsearchRanker(
        elasticsearch_host,
        elasticsearch_port,
        preset=preset,
    )

def __create_searcher(classifier: t.Optional['IntentClassifier'] = None) -> Searcher:
    """Build a Searcher wired to the Elasticsearch server named in CONFIGURATION.

    Args:
        classifier: classifier to reuse; when ``None`` a new IntentClassifier
            is created with its default models.

    The earlier version called ``create_extractor``, which is not defined in
    this notebook, and left out the ``classifier`` argument that ``Searcher``
    requires. ``Extractor`` is passed the same way the notebook builds its
    own ``searcher``.
    """
    if classifier is None:
        classifier = IntentClassifier()

    return Searcher(
        Extractor,
        create_ranker(
            CONFIGURATION['RANKER_ELASTICSEARCH_HOST'],
            CONFIGURATION['RANKER_ELASTICSEARCH_PORT'],
        ),
        classifier,
    )


In [12]:
# Create the classifier with its default models; the constructor loads both
# the Russian and the English word vectors.
classifier = IntentClassifier()

In [13]:
# On the very first run, remove the `preset_path = None` argument so that
# create_ranker loads its default preset file into Elasticsearch.
searcher = Searcher(
    Extractor, 
    create_ranker(preset_path = None),
    classifier    
)


In [14]:
# Read the queries, one per line, dropping the line terminator.
# The queries echoed in the run output are Russian text, so decode explicitly
# as UTF-8 rather than depending on the platform's default encoding.
with open('query_text_final.txt', encoding='utf-8') as f:
    queries = [line.replace('\n', '') for line in f]
# preprocessing() lower-cases every query and strips newlines; it rewrites the
# list in place and returns the same list.
queries = preprocessing(queries)


In [29]:
# Run the search over all preprocessed queries and keep the per-query results.
# NOTE(review): the token lists and Item/Attributes dicts in this cell's output
# appear to be debug prints emitted during this call — confirm and consider
# silencing them, since they flood the notebook.
results = searcher.search(queries)

['приобрести', 'поесть'] ['приобрести'] [ True False]
{'Offer_type': 0, 'Attributes': 'приобрести', 'Price_from': 0, 'Item': 'поесть', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
short word
['туристическое', 'снаряжение', 'по'] ['снаряжение' 'по'] [False  True  True]
{'Offer_type': 0, 'Attributes': 'снаряжение по', 'Price_from': 0, 'Item': 'туристическое', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['туристическое', 'снаряжение'] ['снаряжение'] [False  True]
{'Offer_type': 0, 'Attributes': 'снаряжение', 'Price_from': 0, 'Item': 'туристическое', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  кешбеком
['мясо', 'кешбеком'] ['кешбеком'] [False  True]
{'Offer_type': 0, 'Attributes': 'кешбеком', 'Price_from': 0, 'Item': 'мясо', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['приобрести', 'можно', 'бег

unknown word -  кроссофки
['беговые', 'кроссофки', 'для', 'бега'] ['кроссофки' 'для' 'бега'] [False  True  True  True]
{'Offer_type': 0, 'Attributes': 'кроссофки для бега', 'Price_from': 0, 'Item': 'беговые', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['ананасы', 'заказать'] ['заказать'] [False  True]
{'Offer_type': 0, 'Attributes': 'заказать', 'Price_from': 0, 'Item': 'ананасы', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0.5, 'Offer_type_from': 0, 'Offer_type_to': 0}
['хочу', 'скушать', 'грузинская'] ['хочу' 'грузинская'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'хочу грузинская', 'Price_from': 0, 'Item': 'скушать', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  тайская
['тайская', 'еда'] ['тайская'] [ True False]
{'Offer_type': 0, 'Attributes': 'тайская', 'Price_from': 0, 'Item': 'еда', 'Price_to': 500.0, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to'

['велосипед', 'merida'] ['merida'] [False  True]
{'Offer_type': 0, 'Attributes': 'merida', 'Price_from': 0, 'Item': 'велосипед', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['хочу', 'свежие', 'фрукты'] ['хочу' 'свежие'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'хочу свежие', 'Price_from': 0, 'Item': 'фрукты', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['нужны', 'теннисные'] ['нужны'] [ True False]
{'Offer_type': 0, 'Attributes': 'нужны', 'Price_from': 0, 'Item': 'теннисные', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['мраморная', 'говядина'] ['мраморная'] [ True False]
{'Offer_type': 0, 'Attributes': 'мраморная', 'Price_from': 0, 'Item': 'говядина', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['гантели', 'можно'] ['можно'] [False  True]
{'Offer_type': 0, 'Attributes': 'можно', 'Price_from': 0, '

['поесть', 'рядом'] ['рядом'] [False  True]
{'Offer_type': 0, 'Attributes': 'рядом', 'Price_from': 0, 'Item': 'поесть', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  коровьесвадебный
['молоко', 'коровьесвадебный', 'торт', 'хочу'] ['коровьесвадебный' 'торт' 'хочу'] [False  True  True  True]
{'Offer_type': 0, 'Attributes': 'коровьесвадебный торт хочу', 'Price_from': 0, 'Item': 'молоко', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['хочу', 'можно'] ['можно'] [False  True]
{'Offer_type': 0, 'Attributes': 'можно', 'Price_from': 0, 'Item': 'хочу', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['спецпредложение', 'молоко'] ['спецпредложение'] [ True False]
{'Offer_type': 0, 'Attributes': 'спецпредложение', 'Price_from': 0, 'Item': 'молоко', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
{'Offer_type': 0, 

['фрукты', 'заказать'] ['заказать'] [False  True]
{'Offer_type': 0, 'Attributes': 'заказать', 'Price_from': 0, 'Item': 'фрукты', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0.5, 'Offer_type_from': 0, 'Offer_type_to': 0}
['самая', 'быстрая'] ['быстрая'] [False  True]
{'Offer_type': 0, 'Attributes': 'быстрая', 'Price_from': 0, 'Item': 'самая', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['приобрести', 'беговые', 'лыжи'] ['приобрести' 'лыжи'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'приобрести лыжи', 'Price_from': 0, 'Item': 'беговые', 'Price_to': 15.0, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['рестораны', 'поблизости'] ['поблизости'] [False  True]
{'Offer_type': 0, 'Attributes': 'поблизости', 'Price_from': 0, 'Item': 'рестораны', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['профессиональные', 'ролики', 'для', 'бега'] ['ролики' 'для' 'бега'] [False  True  Tr

['беговые', 'беговые', 'кроссовки'] ['беговые' 'кроссовки'] [False  True  True]
{'Offer_type': 0, 'Attributes': 'беговые кроссовки', 'Price_from': 0, 'Item': 'беговые', 'Price_to': 10000.0, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  дартс
['дартс', 'для', 'бега'] ['дартс' 'для'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'дартс для', 'Price_from': 0, 'Item': 'бега', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['рестораны', 'рядом'] ['рядом'] [False  True]
{'Offer_type': 0, 'Attributes': 'рядом', 'Price_from': 0, 'Item': 'рестораны', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['недорогой', 'беговой'] ['недорогой'] [ True False]
{'Offer_type': 0, 'Attributes': 'недорогой', 'Price_from': 0, 'Item': 'беговой', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  гантелиа
['хочу', 'одежду', 'для', 'га

unknown word -  дартс
['дартс', 'очень'] ['дартс'] [ True False]
{'Offer_type': 0, 'Attributes': 'дартс', 'Price_from': 0, 'Item': 'очень', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['горные', 'лыжи'] ['горные'] [ True False]
{'Offer_type': 0, 'Attributes': 'горные', 'Price_from': 0, 'Item': 'лыжи', 'Price_to': 9999999, 'Cashback': 10.0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['свежие', 'яблочный', 'пирог'] ['свежие' 'пирог'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'свежие пирог', 'Price_from': 0, 'Item': 'яблочный', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  дешевоом
['рестораны', 'поблизости', 'дешевоом'] ['поблизости' 'дешевоом'] [False  True  True]
{'Offer_type': 0, 'Attributes': 'поблизости дешевоом', 'Price_from': 0, 'Item': 'рестораны', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  тренаж

unknown word -  топовая
['топовая', 'теннисная'] ['топовая'] [ True False]
{'Offer_type': 0, 'Attributes': 'топовая', 'Price_from': 0, 'Item': 'теннисная', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['грузинская', 'кухня'] ['кухня'] [False  True]
{'Offer_type': 0, 'Attributes': 'кухня', 'Price_from': 0, 'Item': 'грузинская', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  тренажерного
['одежду', 'для', 'тренажерного'] ['одежду' 'тренажерного'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'одежду тренажерного', 'Price_from': 0, 'Item': 'для', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['недорогой', 'беговой'] ['недорогой'] [ True False]
{'Offer_type': 0, 'Attributes': 'недорогой', 'Price_from': 0, 'Item': 'беговой', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
{'Offer_type': 0, 'Attribute

['фигурный', 'коньки', 'максимальной'] ['фигурный' 'максимальной'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'фигурный максимальной', 'Price_from': 0, 'Item': 'коньки', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0.5, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  топовая
['топовая', 'теннисная'] ['топовая'] [ True False]
{'Offer_type': 0, 'Attributes': 'топовая', 'Price_from': 0, 'Item': 'теннисная', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['кроссовки', 'максимальным'] ['максимальным'] [False  True]
{'Offer_type': 0, 'Attributes': 'максимальным', 'Price_from': 0, 'Item': 'кроссовки', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['беговую', 'дорожку'] ['дорожку'] [False  True]
{'Offer_type': 0, 'Attributes': 'дорожку', 'Price_from': 0, 'Item': 'беговую', 'Price_to': 20000.0, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  тайская
['тайска

['приобрести', 'можно', 'беговую'] ['приобрести' 'можно'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'приобрести можно', 'Price_from': 0, 'Item': 'беговую', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['перчатки', 'для', 'бокса'] ['перчатки' 'для'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'перчатки для', 'Price_from': 0, 'Item': 'бокса', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0.5, 'Offer_type_from': 0, 'Offer_type_to': 0}
['можно', 'поесть'] ['можно'] [ True False]
{'Offer_type': 0, 'Attributes': 'можно', 'Price_from': 0, 'Item': 'поесть', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['футбольный', 'мяч'] ['мяч'] [False  True]
{'Offer_type': 0, 'Attributes': 'мяч', 'Price_from': 0, 'Item': 'футбольный', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0.5, 'Offer_type_from': 0, 'Offer_type_to': 0}
['можно', 'беговую', 'дорожку'] ['можно' 'дорожку'] [ True False  True]
{'Offer_

['заказ', 'спортивного'] ['заказ'] [ True False]
{'Offer_type': 0, 'Attributes': 'заказ', 'Price_from': 0, 'Item': 'спортивного', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['беговую', 'дорожку'] ['дорожку'] [False  True]
{'Offer_type': 0, 'Attributes': 'дорожку', 'Price_from': 0, 'Item': 'беговую', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['биг', 'мак', 'приобрести'] ['биг' 'мак'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'биг мак', 'Price_from': 0, 'Item': 'приобрести', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  кроссофки
['можно', 'кроссофки', 'для', 'бега'] ['можно' 'кроссофки' 'для'] [ True  True  True False]
{'Offer_type': 0, 'Attributes': 'можно кроссофки для', 'Price_from': 0, 'Item': 'бега', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
{'Offer_type': 0, 'Attributes':

['можно', 'свежие', 'овощи'] ['можно' 'свежие'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'можно свежие', 'Price_from': 0, 'Item': 'овощи', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  2015
['велосипед', 'горный', '2015'] ['горный' '2015'] [False  True  True]
{'Offer_type': 0, 'Attributes': 'горный 2015', 'Price_from': 0, 'Item': 'велосипед', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['беговые', 'кроссовки'] ['кроссовки'] [False  True]
{'Offer_type': 0, 'Attributes': 'кроссовки', 'Price_from': 0, 'Item': 'беговые', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
short word
unknown word -  партеров
['рестораны', 'рядом', 'у', 'партеров'] ['рядом' 'у' 'партеров'] [False  True  True  True]
{'Offer_type': 0, 'Attributes': 'рядом у партеров', 'Price_from': 0, 'Item': 'рестораны', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0.5, 'Offe

short word
['по', 'акции', 'еду'] ['по' 'акции'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'по акции', 'Price_from': 0, 'Item': 'еду', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['ресторан', 'грузинской'] ['грузинской'] [False  True]
{'Offer_type': 0, 'Attributes': 'грузинской', 'Price_from': 0, 'Item': 'ресторан', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['конфеты', 'шоколадные'] ['шоколадные'] [False  True]
{'Offer_type': 0, 'Attributes': 'шоколадные', 'Price_from': 0, 'Item': 'конфеты', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['доставка', 'еды'] ['доставка'] [ True False]
{'Offer_type': 0, 'Attributes': 'доставка', 'Price_from': 0, 'Item': 'еды', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
short word
unknown word -  партеров
['пиццу', 'у', 'партеров'] ['у' 'партеров'] [False  True  True]


['ресторан', 'грузинской', 'кухни'] ['ресторан' 'грузинской'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'ресторан грузинской', 'Price_from': 0, 'Item': 'кухни', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  тренажерного
['одежду', 'для', 'тренажерного'] ['одежду' 'тренажерного'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'одежду тренажерного', 'Price_from': 0, 'Item': 'для', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['специальное', 'предложение', 'спорт'] ['предложение' 'спорт'] [False  True  True]
{'Offer_type': 0, 'Attributes': 'предложение спорт', 'Price_from': 0, 'Item': 'специальное', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['очки', 'для', 'плавания'] ['очки' 'плавания'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'очки плавания', 'Price_from': 0, 'Item': 'для', 'Price_to': 9999999, 'Cashback': 0.5, 

short word
['можно', 'свежие', 'овощи'] ['можно' 'свежие'] [ True  True False]
{'Offer_type': 0, 'Attributes': 'можно свежие', 'Price_from': 0, 'Item': 'овощи', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
unknown word -  дартс
['дартс', 'дешевле', 'чем'] ['дартс' 'чем'] [ True False  True]
{'Offer_type': 0, 'Attributes': 'дартс чем', 'Price_from': 0, 'Item': 'дешевле', 'Price_to': 9999999, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
short word
['фигурный', 'коньки'] ['фигурный'] [ True False]
{'Offer_type': 0, 'Attributes': 'фигурный', 'Price_from': 0, 'Item': 'коньки', 'Price_to': 9999999, 'Cashback': 0.5, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['горные', 'лыжи'] ['горные'] [ True False]
{'Offer_type': 0, 'Attributes': 'горные', 'Price_from': 0, 'Item': 'лыжи', 'Price_to': 40000.0, 'Cashback': 0, 'Sale': 0, 'Offer_type_from': 0, 'Offer_type_to': 0}
['одежду', 'для', 'сноуборда'] ['одежду' 'для'] [ True 

In [30]:
# Persist each query's result as its own JSON file: out2/queri<index>.json.
# Create the output directory first so a clean run does not fail with
# FileNotFoundError when out2/ is missing.
Path('out2').mkdir(parents=True, exist_ok=True)
for i, result in enumerate(results):
    with open('out2/queri{}.json'.format(i), 'w') as outfile:
        # json.dump escapes non-ASCII characters by default, so the written
        # file is plain ASCII regardless of the platform encoding.
        json.dump(result, outfile)

{'offer': {'offer': 'Tramontana', 'web': 'https://tramontana.ru/', 'cashback': 9, 'period': 0, 'offer_type': 0, 'advert_text': 'the north face, arcteryx, salomon и десятки других брендов в outdoor-центре «трамонтана». профессиональные консультации, бесплатная доставка, быстрый и простой обмен и возврат товара, и любые формы оплаты сделают покупку удобной и безопасной. спецпредложение распространяется при оплате картой на сайте, а так же в магазине на ул. бронницкая, 24.'}, 'products': [{'Item': 'походное снаряжение~туристические палатки~двухместные палатки~палатка nemo chogori 2p', 'Attributes': 'эта палатка относится к полупрофессиональной серии четырёхсезонных палаток под названием chogori tent, которая проектировалась непосредственно для использования в горных условиях. в серии изначально представлено только две версии — двухместный и трёхместный вариант. среди основных преимуществ новинки, производитель выделяет относительно быструю установку, прочный внешний каркас, вместительный 