In [2]:
%config IPCompleter.greedy=True
import re
import json
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
from collections import namedtuple
import os
import xml.etree.ElementTree as ET

In [3]:
# Input locations read by the rest of the notebook.
# NOTE(review): these are absolute paths under /home/subject; adjust them before running elsewhere.
settings = {
    'queries_xml': '/home/subject/Downloads/web2008_adhoc.xml', # path to the queries XML (read by extract_queries)
    'relevance_xml': '/home/subject/Downloads/or_relevant-minus_table.xml', # path to the relevance XML (read by extract_relevance)
    'collection_dir': '/home/subject/Documents/informational retrieval/all_info2/', # folder with the contents of all_info.zip
    'pagerank_json' : '/home/subject/Documents/informational retrieval/pagerank.json' # JSON file with pageranks (read by get_rank_jsons)
}

In [4]:
# Elasticsearch client shared by every index and search call in this notebook.
# NOTE(review): 'timeout': 360 is presumably in seconds — confirm against the client version in use.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])

In [5]:
# Make sure an index named 'byweb' exists; no settings body is passed here.
if not es.indices.exists(index='byweb'):
    es.indices.create(index='byweb')

In [6]:
# Body passed to es.indices.create() by recreate_index(): field mappings plus analysis settings.
index_settings = {
    'mappings': {
        'properties': {
            # Full-text fields; search() matches on 'content', l_search()/lp_search() on 'stem_content'.
            'content': {
                'type': 'text'
            },
            'stem_content': {
                'type': 'text'
            },
            'title': {
                'type': 'text'
            },
            # Exact-value fields.
            'id': {
                'type': 'keyword'
            },
            'url': {
                'type': 'keyword'
            },
            # Numeric feature targeted by the rank_feature clause in lp_search().
            'pagerank': {
                'type': 'rank_feature'
            }
        }
    },
    'settings': {
       'analysis': {
            # NOTE(review): 'white_lover' is declared here, but no field in 'mappings' names it
            # through an 'analyzer' key, so as written it is not attached to any field —
            # confirm whether 'stem_content' was meant to use it.
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'white_20',
                    'filter': [
                        'lowercase'
                    ]
                }
            },
            'tokenizer': {
                'white_20': {
                    'type': 'whitespace'
                }
            },
        }
    }
}

In [7]:
def recreate_index():
    """Drop the 'byweb' index if it exists and create it again from `index_settings`.

    The existence check lets the function run on a cluster where the index
    has not been created yet; deleting a missing index is an error.
    """
    if es.indices.exists(index='byweb'):
        es.indices.delete(index='byweb')
    es.indices.create(index='byweb', body=index_settings)

In [8]:
%time
recreate_index()

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs


In [9]:
# Retrieval of pagerank
def get_rank_jsons():
    """Load the pagerank table from the JSON file named by settings['pagerank_json'].

    Returns:
        dict: the parsed JSON content passed through ``dict()``.
    """
    with open(settings['pagerank_json']) as rank_file:
        return dict(json.load(rank_file))

In [11]:
# Load the pagerank table once so it can be passed to get_pagerank_by_url().
pagerank_dict = get_rank_jsons()

In [10]:
from urllib.parse import urlparse
def get_pagerank_by_url(pagerank_dict, page_json):
    """Look up the pagerank stored for the host part of a page's URL.

    Args:
        pagerank_dict: mapping from URL netloc to pagerank.
        page_json: document dict with a 'url' entry.

    Returns:
        The stored pagerank, or 0 when the netloc is not in the mapping.
    """
    host = format(urlparse(page_json['url']).netloc)
    # A host missing from the table is treated as an isolated vertex: pagerank 0.
    return pagerank_dict.get(host, 0)

In [14]:
def create_es_action(index, doc_id, document):
    """Build one action dict for the Elasticsearch bulk helpers.

    Args:
        index: name of the target index.
        doc_id: value stored under '_id'.
        document: object stored under '_source' (not copied).

    Returns:
        dict: {'_index': index, '_id': doc_id, '_source': document}.
    """
    action = dict(_index=index, _id=doc_id, _source=document)
    return action

def es_actions_generator(docs):
    """Yield one bulk action for every document stored under a directory.

    Args:
        docs: path of a directory; every entry in it is opened and parsed as
            JSON, and the parsed value is iterated as a sequence of document
            dicts that carry an 'id' key.

    Yields:
        dict: the action built by create_es_action() for the 'byweb' index,
        with the document's 'id' as the Elasticsearch id.
    """
    for file_name in os.listdir(docs):
        with open(os.path.join(docs, file_name)) as json_file:
            for document in json.load(json_file):
                # Pagerank enrichment is currently switched off:
                #   document['pagerank'] = get_pagerank_by_url(pagerank_dict, document)
                yield create_es_action('byweb', document['id'], document)


In [None]:
# Stream every document of the collection into Elasticsearch with the parallel bulk helper.
generator = es_actions_generator(settings['collection_dir'])
for ok, result in tqdm(parallel_bulk(es, generator, queue_size=4, thread_count=4, chunk_size=1000)):
    # Print the result of any action that was not reported as successful.
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

Время работы процедуры выше: чуть больше 5 минут.

Если посмотрим сюда: "http://localhost:9200/_stats/indexing,store?pretty" и найдем:

`indices.byweb.total.store.size_in_bytes`

увидим, что размер индекса равен ~3 Гб. Размер информации, запихнутой в индекс, ~4.1 Гб.

(Конечно, все это можно было сделать программно, но посмотреть на один показатель быстрее ручками.)

In [None]:
# One evaluation query: its id, its text, and a list of relevant document ids.
# The list starts empty (parse_query) and is appended to in place by extract_relevance().
Query = namedtuple('Query', ['query_id', 'text', 'relevant'])

In [None]:
def parse_query(query):
    """Convert one task XML element into a Query with an empty relevance list.

    Args:
        query: element with an 'id' attribute and a
            '{http://www.romip.ru/data/adhoc}querytext' child.

    Returns:
        Query: id taken from the attribute, text from the child element,
        and a fresh empty `relevant` list.
    """
    text_tag = '{http://www.romip.ru/data/adhoc}querytext'
    return Query(
        text=query.find(text_tag).text,
        query_id=query.attrib['id'],
        relevant=[],
    )


def extract_queries():
    """Parse the queries XML and index the queries by id.

    Reads the file named by settings['queries_xml'] and converts every
    '{http://www.romip.ru/data/adhoc}task' child of the root element with
    parse_query().

    Returns:
        dict: query id -> Query. If an id occurs more than once, the last
        occurrence wins.
    """
    # ET.parse() takes the path directly, so no separate open() is needed.
    root = ET.parse(settings['queries_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/data/adhoc}task')
    return {parsed.query_id: parsed for parsed in map(parse_query, tasks)}

In [None]:
def extract_relevance():
    """Attach relevance judgements to the queries and return them as a list.

    Queries come from extract_queries(); judgements are read from the file
    named by settings['relevance_xml']. For every task element in that file,
    the ids of its documents whose 'relevance' attribute equals 'vital' are
    appended to the matching Query's `relevant` list.

    Returns:
        list: one Query per task element of the relevance file, in file order.

    Raises:
        KeyError: if a task id from the relevance file is missing from the
            queries file.
    """
    queries_dict = extract_queries()

    def parse_relevance(query):
        """Append the ids of all 'vital' documents of one task to its Query."""
        q = queries_dict[query.attrib['id']]
        for doc in query.findall('./{http://www.romip.ru/common/merged-results}document'):
            if doc.attrib['relevance'] == 'vital':
                q.relevant.append(doc.attrib['id'])
        return q

    # ET.parse() takes the path directly, so no separate open() is needed.
    root = ET.parse(settings['relevance_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/common/merged-results}task')
    return [parse_relevance(task) for task in tasks]

In [None]:
# Keep only the queries that have at least one relevant document.
# Important (author's note): this drops 38 queries.
queries = [q for q in extract_relevance() if q.relevant]
print(len(queries))

In [None]:
def search(query, result_size=20):
    """
    Use this function for search.

    Sends a bool/should `match` query for the query text against the
    'content' field of the 'byweb' index.

    Args:
        query: object with a `text` attribute (e.g. a Query).
        result_size: value passed as `size` to the search call.

    Returns:
        list: the '_id' of every returned hit, in response order.
    """
    match_clause = {'match': {'content': query.text}}
    request_body = {'query': {'bool': {'should': match_clause}}}
    response = es.search(index='byweb', body=request_body, size=result_size)
    return [hit['_id'] for hit in response['hits']['hits']]

def pretty_print_result(search_result):
    """Print the total hit count, then the id and score of every hit.

    Args:
        search_result: raw search response with a 'hits' section containing
            'total' -> 'value' and a 'hits' list of dicts with '_id' and '_score'.
    """
    hits_section = search_result['hits']
    total = hits_section["total"]["value"]
    print(f'Total documents: {total}')
    for hit in hits_section['hits']:
        doc_id, score = hit["_id"], hit["_score"]
        print(f'Doc {doc_id}, score is {score}')
                  
def get_doc_by_id(doc_id):
    """Fetch one document from the 'byweb' index and return its '_source'."""
    response = es.get(index='byweb', id=doc_id)
    return response['_source']

Статистические показатели.

In [None]:
def stats_at(k, query, query_result):
    """Compute precision- and recall-style ratios for the top-k results of one query.

    Args:
        k: cut-off rank.
        query: object with a `relevant` collection of relevant document ids.
        query_result: ranked list of returned document ids.

    Returns:
        list: [hits / k, hits / len(relevant), hits / min(k, len(relevant))],
        where hits is the number of distinct relevant ids among the first k
        results.
    """
    top_k = set(query_result[:k])
    hits = len(top_k.intersection(query.relevant))
    total_relevant = len(query.relevant)
    precision = hits / k
    recall = hits / total_relevant
    capped_recall = hits / min(k, total_relevant)
    return [precision, recall, capped_recall]

In [None]:
def get_ap_at(k, query, query_result):
    """Compute three average-precision variants over the top-k results of one query.

    Precision is accumulated at every rank within the first k results that
    holds a relevant document; the sum is then normalised in three ways.

    Args:
        k: cut-off rank; results beyond it are ignored.
        query: object with a `relevant` collection of relevant document ids.
        query_result: ranked list of returned document ids.

    Returns:
        list: [sum / number of relevant hits in the top k (0 when there are none),
               sum / len(query.relevant),
               sum / min(k, len(query.relevant))].

    Raises:
        ZeroDivisionError: if `query.relevant` is empty or k is 0.
    """
    relevant_ids = set(query.relevant)
    found = set()  # distinct relevant ids seen so far
    precision_sum = 0
    hit_count = 0
    # Only the first k results count towards AP@k, even if more were passed in.
    for rank, doc_id in enumerate(query_result[:k], start=1):
        if doc_id in relevant_ids:
            found.add(doc_id)
            # Precision at this rank = distinct relevant documents so far / rank.
            precision_sum += len(found) / rank
            hit_count += 1
    relevant_total = len(query.relevant)
    return [
        0 if hit_count == 0 else precision_sum / hit_count,
        precision_sum / relevant_total,
        precision_sum / min(k, relevant_total),
    ]

In [None]:
def get_map_at(k, queries, q_results):
    """Average the three get_ap_at() values over a set of queries.

    Args:
        k: cut-off rank forwarded to get_ap_at().
        queries: sequence of query objects.
        q_results: sequence of ranked result lists, aligned with `queries`.

    Returns:
        list: three means, one per get_ap_at() variant; each sum is divided
        by len(queries).
    """
    totals = [0, 0, 0]
    for query, query_result in zip(queries, q_results):
        ap = get_ap_at(k, query, query_result)
        for position in range(3):
            totals[position] += ap[position]
    query_count = len(queries)
    return [totals[0] / query_count, totals[1] / query_count, totals[2] / query_count]

In [None]:
def get_queries_results(qs, search_fun=search, k=20):
    """Run a search function over every query and collect the results in order.

    Args:
        qs: iterable of queries.
        search_fun: callable invoked as search_fun(query, k).
        k: second argument forwarded to `search_fun`.

    Returns:
        list: one result per query, in the order of `qs`.
    """
    collected = []
    for q in qs:
        collected.append(search_fun(q, k))
    return collected

In [None]:
def get_queries_stats(search_fun, k=20):
    """Evaluate a search function on the global `queries` and print averaged metrics.

    Runs `search_fun` for every query via get_queries_results(), then prints
    the per-query averages of the three stats_at() ratios at cut-off k, the
    average of stats_at() precision taken at cut-off len(q.relevant), and the
    three get_map_at() values.

    Args:
        search_fun: callable invoked as search_fun(query, k).
        k: cut-off rank used for retrieval and for the metrics.
    """
    results = get_queries_results(queries, search_fun, k)

    precision = 0
    recall = 0
    recall_dash = 0
    r_precision = 0
    for q, r in zip(queries, results):
        stats = stats_at(k, q, r)
        precision += stats[0]
        recall += stats[1]
        recall_dash += stats[2]
        # R-precision: precision at the cut-off equal to the number of relevant documents.
        r_precision += stats_at(len(q.relevant), q, r)[0]

    queries_size = len(queries)
    # The cut-off is interpolated so the labels stay correct when k != 20.
    print(f"Средняя точность на уровне k={k} по всем запросам:", precision / queries_size)
    print(f"Средняя полнота на уровне k={k} по всем запросам:", recall / queries_size)
    print(f"Средняя полнота-дэш (в знаментеле стоит k, если релевантных документов больше, чем k) на уровне k={k} по всем запросам:", recall_dash / queries_size)
    print(f"Средняя R-точность на уровне k={k} по всем запросам:", r_precision / queries_size)
    map_at = get_map_at(k, queries, results)
    print(f"MAP на уровне k={k}:", map_at[0])
    print(f"MAP-дэш (в знаменателе AP - количество всех релевантных документов для запроса) на уровне k={k}:", map_at[1])
    print(f"MAP-дэш-дэш (в знаменателе AP - k, если релевантных документов больше k, иначе - количество всех релевантных документов для запроса) на уровне k={k}:", map_at[2])

In [None]:
# Baseline: evaluate the plain `search` function (match on the 'content' field).
get_queries_stats(search)

In [None]:
%%timeit -n 3 -r 3
# Time a full pass of the default `search` over all queries.
_ = get_queries_results(queries)

Заметки:

Query(query_id='arw53730', text='Ремолан', relevant=['980017']) []
Результат пустой, потому что mystem лемматизирует запрос «Ремолан» как глагол «ремолать».

Зачем нужны дэш-характеристики:

Query(query_id='arw50384', text='гизметео', relevant=['587130', '856298', '239338', '239387', '239377', '687059', '261338', '325277', '72835', '1285231', '223037', '1382550', '37733', '92274', '92282', '1420731', '689932', '1303050', '963503', '602944', '1372162', '256386', '1427662', '1464330', '1357789']) ['1420731', '691030'] 1.0

In [None]:
# Ячейка для лемматизации запроса; совпадает с лемматизацией для коллекции.

# Seed stop words; get_stop_words() extends this set in place.
stop_words = {'г', '©'}
def get_stop_words(files):
    """Add the first whitespace-separated token of every non-blank line to `stop_words`.

    Args:
        files: iterable of paths to stop-word list files.

    Blank and whitespace-only lines are skipped; indexing their empty split
    result would raise IndexError.
    """
    for file in files:
        with open(file) as f:
            for line in f:
                tokens = line.split()
                if tokens:
                    stop_words.add(tokens[0])

# Load the English and Russian stop-word lists (relative paths, resolved against the working directory).
get_stop_words(['../extractor/stopwords/english', '../extractor/stopwords/russian'])

def is_not_stop_word(d):
    """Return True when getLexOrText(d) is not a member of `stop_words`."""
    key = getLexOrText(d)
    return key not in stop_words


# A possibly empty run of ASCII letters, digits and characters in the range U+0400–U+04FF.
normal_word = re.compile('^[A-Za-z0-9Ѐ-ӿ]*$')
def is_normal_word(d):
    """Return True when the token's 'text' value matches the `normal_word` pattern."""
    return bool(normal_word.match(d['text']))


def getText(d):
    """Return the token's 'text' value in lower case."""
    raw_text = d['text']
    return raw_text.lower()

def getLexOrText(d):
    """Return the 'lex' of the token's first analysis, or its lowercased text.

    Falls back to getText(d) when the token has no 'analysis' key, when the
    analysis list is empty, or when the first analysis has no 'lex' key.
    """
    if 'analysis' in d and d['analysis']:
        first_analysis = d['analysis'][0]
        if 'lex' in first_analysis:
            return first_analysis['lex']
    return getText(d)


from pymystem3 import Mystem
# Single shared Mystem instance; lemmatize() calls its analyze() method.
m = Mystem()

def lemmatize(query_text):
    """Lemmatize a query string into a space-separated string of lemmas.

    The text is analysed with the shared Mystem instance; the resulting
    tokens are then filtered in three passes and mapped with getLexOrText():
      1. drop falsy tokens;
      2. keep tokens that have an 'analysis' key or pass is_normal_word();
      3. drop tokens rejected by is_not_stop_word().

    Returns:
        str: the surviving lemmas (or lowercased texts) joined by single spaces.
    """
    tokens = [token for token in m.analyze(query_text) if token]
    tokens = [token for token in tokens if 'analysis' in token or is_normal_word(token)]
    tokens = [token for token in tokens if is_not_stop_word(token)]
    return " ".join([getLexOrText(token) for token in tokens])

In [None]:
def l_search(query, result_size=20):
    """Search the 'stem_content' field with the lemmatized query text.

    Args:
        query: object with a `text` attribute (e.g. a Query).
        result_size: value passed as `size` to the search call.

    Returns:
        list: the '_id' of every returned hit, in response order.
    """
    match_clause = {'match': {'stem_content': lemmatize(query.text)}}
    request_body = {'query': {'bool': {'should': match_clause}}}
    response = es.search(index='byweb', body=request_body, size=result_size)
    return [hit['_id'] for hit in response['hits']['hits']]

In [None]:
# Evaluate the lemmatized search (`l_search`, match on the 'stem_content' field).
get_queries_stats(l_search)

In [None]:
%%timeit -n 3 -r 3
# Time a full pass of `l_search` over all queries.
_ = get_queries_results(queries, l_search)

Pagerank

In [None]:
def lp_search(query, result_size=20):
    """Search 'stem_content' with the lemmatized query text, boosted by pagerank.

    The query text is passed through lemmatize() before matching, as in
    l_search(), because 'stem_content' is the lemmatized field. A second
    `should` clause adds a rank_feature score on the 'pagerank' field.

    NOTE(review): the line that sets 'pagerank' in es_actions_generator() is
    commented out — confirm the indexed documents actually carry this field,
    otherwise the rank_feature clause has nothing to score.

    Args:
        query: object with a `text` attribute (e.g. a Query).
        result_size: value passed as `size` to the search call.

    Returns:
        list: the '_id' of every returned hit, in response order.
    """
    query_text = lemmatize(query.text)
    body = {
        'query': {
            'bool': {
                'should': [
                    {
                        'match': {
                            'stem_content': query_text
                        }
                    },
                    {
                        'rank_feature': {
                            'field': 'pagerank',
                            'saturation': {
                                'pivot': 10
                            }
                        }
                    }
                ]
            }
        }
    }

    result = es.search(index='byweb', body=body, size=result_size)
    return [hit['_id'] for hit in result['hits']['hits']]

In [None]:
# TODO: учитывать title в запросе