In [1]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
import os

In [2]:
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [3]:
es.indices.create(index='byweb')

RequestError: RequestError(400, 'resource_already_exists_exception', 'index [byweb/f3aGFTHtTqO2PYHZrbnQ5A] already exists')

In [4]:
index_settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
            },
            'title': {
                'type': 'text'
            },
            'id': {
                'type': 'keyword'
            }
        }
    },
    'settings': {
       'analysis': {
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'white_20',
                    'filter': [
                        'lowercase'
                    ]
                }
            },
            'tokenizer': {
                'white_20': {
                    'type': 'whitespace'
                }
            },
        }
    }
}

In [5]:
def recreate_index():
    es.indices.delete(index='byweb')
    es.indices.create(index='byweb', body=index_settings)

In [6]:
%time
recreate_index()

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs


In [10]:
def create_es_action(index, doc_id, document):
    with open(document) as doc:
        data = {
            'content': doc.read(),
            'title': 'TODO title',
            'id': doc_id
        }
        
        return {
            '_index': index,
            '_id': doc_id,
            '_source': data
        }

In [11]:
def es_actions_generator(docs ):
    for doc in docs:
        data = doc['text']
        doc_id = doc['id']
        
        yield create_es_action('byweb', doc_id, data)


In [25]:
file = '/home/karvozavr/Documents/data/out/all.json'
with open(file) as f:
    docs = json.load(f)

generator = es_actions_generator(docs)

In [26]:
for ok, result in tqdm(parallel_bulk(es, generator, queue_size=4, thread_count=4, chunk_size=100)):
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

KeyboardInterrupt: 

In [14]:
def search(query, *args):
    pretty_print_result(es.search(index='byweb', body=query, size=20), args)
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20
                        
def pretty_print_result(search_result, fields=[]):
    # fields is a list of fields names which we want to be printed
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    for hit in res['hits']:
        print(f'Doc {hit["_id"]}, score is {hit["_score"]}')
        for field in fields:
            print(f'{field}: {hit["_source"][field]}')
                  
def get_doc_by_id(doc_id):
    return es.get(index='byweb', id=doc_id)['_source']

In [27]:
query = {
    'query': {
         'bool': {
            'must': 
                {
                    'match': {
                        'content': 'трансформатор'
                    }
                }
            
        }
    }
}

search(query, 'id')

Total documents: 48
Doc 1083569, score is 15.73395
id: 1083569
Doc 1427930, score is 14.766399
id: 1427930
Doc 1474909, score is 13.700911
id: 1474909
Doc 1368363, score is 12.53838
id: 1368363
Doc 1295560, score is 12.53838
id: 1295560
Doc 1175316, score is 12.065858
id: 1175316
Doc 1323317, score is 11.847053
id: 1323317
Doc 1135771, score is 11.561841
id: 1135771
Doc 1188808, score is 11.55632
id: 1188808
Doc 1440056, score is 11.178662
id: 1440056
Doc 1189078, score is 10.974306
id: 1189078
Doc 117420, score is 10.319637
id: 117420
Doc 1174708, score is 10.102556
id: 1174708
Doc 1243251, score is 10.102556
id: 1243251
Doc 1353402, score is 10.102556
id: 1353402
Doc 1353403, score is 10.102556
id: 1353403
Doc 1247156, score is 9.894417
id: 1247156
Doc 1188482, score is 9.840713
id: 1188482
Doc 1105686, score is 9.694682
id: 1105686
Doc 1091696, score is 9.6908
id: 1091696
