In [7]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
from collections import namedtuple
import os
import xml.etree.ElementTree as ET

In [8]:
# Input locations for the notebook. Each entry can be overridden through an
# environment variable so the notebook runs on another machine without editing
# this cell; when the variable is unset the original local path is used.
settings = {
    # path to queries xml
    'queries_xml': os.environ.get(
        'BYWEB_QUERIES_XML', '/home/karvozavr/Downloads/web2008_adhoc (2).xml'),
    # path to relevance xml
    'relevance_xml': os.environ.get(
        'BYWEB_RELEVANCE_XML', '/home/karvozavr/Downloads/or_relevant-minus_table.xml'),
    # folder with contents of all_info.zip
    'collection_dir': os.environ.get(
        'BYWEB_COLLECTION_DIR', '/home/karvozavr/Documents/data/out/all_info/'),
}

In [9]:
# Shared Elasticsearch client used by every cell below. It targets localhost:9200;
# the host entry also carries timeout=360 and maxsize=25.
# NOTE(review): presumably timeout is in seconds and maxsize is the connection-pool
# size — confirm against the installed client version.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [10]:
# Create the 'byweb' index when it is missing. No body is passed to create(),
# so this call applies no custom mappings or analysis settings.
if not es.indices.exists(index='byweb'):
    es.indices.create(index='byweb')

In [11]:
# Body passed to index creation for 'byweb': field mappings plus custom
# analysis components.
index_settings = {
    'mappings': {
        'properties': {
            # Fields mapped with type 'text'.
            **{field: {'type': 'text'} for field in ('content', 'stem_content', 'title')},
            # Fields mapped with type 'keyword'.
            **{field: {'type': 'keyword'} for field in ('id', 'url')},
        }
    },
    'settings': {
        'analysis': {
            # NOTE(review): none of the mapped fields above references
            # 'white_lover' through an 'analyzer' key — confirm whether that is intended.
            'analyzer': {
                'white_lover': {'tokenizer': 'white_20', 'filter': ['lowercase']}
            },
            'tokenizer': {
                'white_20': {'type': 'whitespace'}
            },
        }
    }
}

In [12]:
def recreate_index():
    """Drop the 'byweb' index if it exists and create it again from ``index_settings``.

    The delete is guarded by an existence check so the function also works on
    a cluster where the index has not been created yet, instead of failing on
    the delete call.
    """
    if es.indices.exists(index='byweb'):
        es.indices.delete(index='byweb')
    es.indices.create(index='byweb', body=index_settings)

In [13]:
%time
recreate_index()

CPU times: user 6 µs, sys: 1e+03 ns, total: 7 µs
Wall time: 13.1 µs


In [14]:
def create_es_action(index, doc_id, document):
    """Wrap a document into an action dict for the bulk helpers.

    Args:
        index: value stored under '_index'.
        doc_id: value stored under '_id'.
        document: value stored under '_source' (not copied).

    Returns:
        A dict with the keys '_index', '_id' and '_source'.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

def es_actions_generator(docs, index='byweb'):
    """Yield one bulk action per document found in the JSON files of a directory.

    Args:
        docs: path to a directory. Every entry in it is opened and parsed as a
            JSON file holding an iterable of documents, each with an 'id' key.
        index: name of the target index for the generated actions.

    Yields:
        Action dicts built by ``create_es_action``, using ``document['id']``
        as the action id and the whole document as its source.
    """
    # Sorted so the indexing order is reproducible; os.listdir order is arbitrary.
    for name in sorted(os.listdir(docs)):
        path = os.path.join(docs, name)
        # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
        with open(path, encoding='utf-8') as d:
            data = json.load(d)
        # The file is fully parsed at this point, so it is closed before any
        # action is handed to the (possibly slow) consumer.
        for document in data:
            yield create_es_action(index, document['id'], document)


In [15]:
# Stream every document from the collection directory into Elasticsearch.
generator = es_actions_generator(settings['collection_dir'])
# parallel_bulk is consumed lazily; each iteration yields an (ok, result) pair
# and only the results flagged as not ok are printed.
for ok, result in tqdm(parallel_bulk(es, generator, queue_size=4, thread_count=4, chunk_size=1000)):
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [16]:
# One search task: its id, the query text, and the ids of documents judged relevant.
Query = namedtuple(
    typename='Query',
    field_names=('query_id', 'text', 'relevant'),
)

In [17]:
def parse_query(query):
    """Build a Query from a single task element.

    Args:
        query: an XML element with an 'id' attribute and a
            '{http://www.romip.ru/data/adhoc}querytext' child.

    Returns:
        Query holding the element id, the child's text and a new empty list
        of relevant documents.
    """
    query_text = query.find('{http://www.romip.ru/data/adhoc}querytext').text
    return Query(query.attrib['id'], query_text, [])


    
def extract_queries(filename=None):
    """Parse the queries XML into a mapping from query id to Query.

    Args:
        filename: path to the queries XML file; when omitted,
            ``settings['queries_xml']`` is used.

    Returns:
        dict mapping each task's id to the Query built by ``parse_query``.
        If an id occurs more than once, the last task wins.
    """
    if filename is None:
        filename = settings['queries_xml']
    # ET.parse opens the path itself, so no separate open() is needed.
    root = ET.parse(filename).getroot()
    tasks = root.findall('{http://www.romip.ru/data/adhoc}task')
    return {parsed.query_id: parsed for parsed in map(parse_query, tasks)}
        


In [18]:
def extract_relevance():
    """Load all queries and attach the ids of their 'vital' documents.

    Queries come from ``extract_queries()``; relevance judgments are read from
    ``settings['relevance_xml']``.

    Returns:
        list of Query objects, one per task element of the relevance file, in
        file order, each with its ``relevant`` list extended in place.

    Raises:
        KeyError: if a task id in the relevance file is not among the queries.
    """
    queries_dict = extract_queries()

    def parse_relevance(query):
        """Append the ids of one task's 'vital' documents to its Query and return it."""
        q = queries_dict[query.attrib['id']]
        q.relevant.extend(
            doc.attrib['id']
            for doc in query.findall('./{http://www.romip.ru/common/merged-results}document')
            # Only documents judged 'vital' count as relevant.
            if doc.attrib['relevance'] == 'vital'
        )
        return q

    # ET.parse opens the path itself, so no separate open() is needed.
    root = ET.parse(settings['relevance_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/common/merged-results}task')
    return [parse_relevance(task) for task in tasks]

In [19]:
# Load the queries together with their relevant document ids and report how
# many tasks were read from the relevance file.
queries = extract_relevance()
print(len(queries))

547


In [20]:
def search(query, result_size=20):
    """Run a match query on the 'content' field of the 'byweb' index.

    Args:
        query: object whose ``text`` attribute holds the query string.
        result_size: value passed as ``size`` to the search call.

    Returns:
        list of the '_id' values of the returned hits, in response order.
    """
    match_clause = {'match': {'content': query.text}}
    request_body = {'query': {'bool': {'should': match_clause}}}

    response = es.search(index='byweb', body=request_body, size=result_size)
    return [hit['_id'] for hit in response['hits']['hits']]

def pretty_print_result(search_result):
    """Print the total hit count and one line per hit with its id and score.

    Args:
        search_result: search response dict with 'hits' -> 'total' -> 'value'
            and 'hits' -> 'hits', where every hit has '_id' and '_score'.
    """
    res = search_result['hits']
    print(f'Total documents: {res["total"]["value"]}')
    # Lazy generator: each line is formatted right before it is printed.
    hit_lines = (f'Doc {hit["_id"]}, score is {hit["_score"]}' for hit in res['hits'])
    for line in hit_lines:
        print(line)
                  
def get_doc_by_id(doc_id):
    """Fetch a document from the 'byweb' index by id and return its '_source'."""
    response = es.get(index='byweb', id=doc_id)
    return response['_source']

In [21]:
# Spot check on a single task: run the search and see which of the returned
# document ids are among the task's relevant ones.
q = queries[1] # Query object (second entry of the loaded list)
print(q)
results = search(q, result_size=80)

# Ids that are both returned by the search and listed in q.relevant.
good = set(results).intersection(q.relevant)
print(good, len(good))


Query(query_id='arw53809', text='поздравления с днем рождения', relevant=['856042', '1131416', '1276926', '1401996', '1131523', '1484438', '1412691', '1159068', '1401774', '868223', '1490354', '181562', '1150512', '37290', '819119', '149785', '519872', '33927', '33603', '1449374', '1449370', '254796', '1416262', '698972', '1134971', '1134990', '1167299', '286294', '1167300', '911493', '492431', '409451', '911877', '1059664', '1431402', '32213', '553810', '1001710', '1001523', '1001471', '1217813', '1244486', '6957', '135199', '1415060', '135025', '275284', '626342', '638726', '1403095', '781582', '781520', '870119', '524', '910', '912', '1170791', '1411495', '913379', '428556', '1156', '1169', '276947', '696405', '1030263', '645963', '1402777', '145037', '1489806', '841489', '1472286', '540556', '1472924', '1472920', '1472915', '1472917'])
{'519872', '626342', '819119', '870119', '254796', '1001710', '1490354', '1217813', '1472915', '553810', '33927', '696405', '1411495', '492431', '11