In [1]:
from collections import Counter
import os
import json

In [2]:
# Directory whose files are read as JSON lists of documents by the collection helpers.
DATA_DIR = "../../data/all_info2/"
# NOTE(review): STATS_FILE is not referenced in any visible cell — confirm it is still needed.
STATS_FILE = '../../data/stats'
# Output path passed to write_vector_data (2008 queries).
TRAIN_FILE = '../../data/train.txt'
# Output path passed to write_test_vector_data (2009 queries).
TEST_FILE = '../../data/test.txt'

In [3]:
def get_word_stats(text_dir):
    """Collect per-word frequency statistics over every document in ``text_dir``.

    Each file in the directory is parsed as a JSON list of documents, and the
    whitespace-separated tokens of every document's ``stem_content`` are counted.

    Returns:
        dict mapping each word to ``{'occurences_total': n, 'occurences_documents': m}``
        where ``n`` is the number of occurrences over all tokens and ``m`` is the
        number of documents that contain the word at least once.
    """
    total_counts = Counter()
    document_counts = Counter()

    for file_name in os.listdir(text_dir):
        with open(os.path.join(text_dir, file_name)) as handle:
            for document in json.load(handle):
                tokens = [token for token in document["stem_content"].split() if token]
                total_counts.update(tokens)
                # A set counts every word at most once per document.
                document_counts.update(set(tokens))

    return {
        word: {
            'occurences_total': total,
            'occurences_documents': document_counts[word],
        }
        for word, total in total_counts.items()
    }

In [4]:
# Word statistics for the whole collection; looked up later by idf() and bm25().
word_statistics = get_word_stats(DATA_DIR)

In [5]:
def get_document_ids(text_dir):
    """Build a mapping from document URL to document id for the whole collection.

    Each file in ``text_dir`` is parsed as a JSON list of documents that carry
    ``'url'`` and ``'id'`` keys. When the same URL occurs more than once, the
    id read last wins.

    Returns:
        dict ``{url: id}``.
    """
    result = dict()

    for doc in os.listdir(text_dir):
        doc = os.path.join(text_dir, doc)

        with open(doc) as d:
            for document in json.load(d):
                result[document['url']] = document['id']

    return result

In [6]:
# Maps document URL -> document id; its size is also used as the collection size in idf().
document_ids_map = get_document_ids(DATA_DIR)

In [7]:
def get_avglen(text_dir):
    """Return the average ``stem_content`` length, in tokens, over the collection.

    Each file in ``text_dir`` is parsed as a JSON list of documents. The
    documents are counted while they are read, so the result does not depend
    on any other cell having been executed first.

    Returns:
        Total token count divided by the number of documents, or ``0.0`` when
        the directory holds no documents.
    """
    total_words = 0
    total_documents = 0

    for doc in os.listdir(text_dir):
        doc = os.path.join(text_dir, doc)

        with open(doc) as d:
            for document in json.load(d):
                total_words += len(document["stem_content"].split())
                total_documents += 1

    # An empty collection has no meaningful average; avoid dividing by zero.
    if total_documents == 0:
        return 0.0
    return total_words / total_documents
    
# Average stem_content length in tokens; read as a module-level value by bm25().
avg_len = get_avglen(DATA_DIR)
print(avg_len)

701.4231383219044


In [8]:
%config IPCompleter.greedy=True
import re
from collections import defaultdict
from tqdm import tqdm_notebook as tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
from sklearn.feature_extraction.text import CountVectorizer
import requests
from time import time
from collections import namedtuple
import xml.etree.ElementTree as ET

In [9]:
# Input locations used by the cells below.
# NOTE(review): 'collection_dir' repeats the DATA_DIR path defined earlier — consider a single source of truth.
settings = {
    'queries_xml': '../../data/web2008_adhoc.xml', # path to queries xml
    'relevance_2008_xml': '../../data/relevant_table_2008.xml', # path to relevance xml
    'relevance_2009_xml': '../../data/relevant_table_2009.xml', # path to relevance xml
    'collection_dir': '../../data/all_info2/', # folder with contents of all_info.zip
    'pagerank_json' : '../../data/pagerank.json' # json file with pageranks
}

In [10]:
# Client for the local Elasticsearch node; make sure the 'byweb' index exists.
# NOTE(review): the index is created here without index_settings; recreate_index() in the
# next cell deletes it and creates it again with them.
es = Elasticsearch([{'host': 'localhost', 'port': 9200, 'timeout': 360, 'maxsize': 25}])
if not es.indices.exists(index='byweb'):
    es.indices.create(index='byweb')
    
# Field mappings and analysis settings passed to es.indices.create by recreate_index().
index_settings = {
    'mappings': {
        'properties': {
            'content': {
                'type': 'text'
            },
            'stem_content': {
                'type': 'text'
            },
            'title': {
                'type': 'text'
            },
            'id': {
                'type': 'keyword'
            },
            'url': {
                'type': 'keyword'
            },
            'pagerank': {
                'type': 'rank_feature'
            }
        }
    },
    # NOTE(review): the 'white_lover' analyzer is defined below, but no field in 'mappings'
    # references it — confirm whether 'stem_content' was meant to use it.
    'settings': {
       'analysis': {
            'analyzer': {
                'white_lover': {
                    'tokenizer': 'white_20',
                    'filter': [
                        'lowercase'
                    ]
                }
            },
            'tokenizer': {
                'white_20': {
                    'type': 'whitespace'
                }
            },
        }
    }
}

In [11]:
def recreate_index():
    es.indices.delete(index='byweb')
    es.indices.create(index='byweb', body=index_settings)
    
%time
recreate_index()

CPU times: user 8 µs, sys: 1e+03 ns, total: 9 µs
Wall time: 11.7 µs


In [12]:
# Retrieval of pagerank
def get_rank_jsons():
    """Load the JSON file at ``settings['pagerank_json']`` and return its content as a dict."""
    with open(settings['pagerank_json']) as source:
        return dict(json.load(source))

In [13]:
# Pagerank table consulted by get_pagerank_by_url().
pagerank_dict = get_rank_jsons()

In [14]:
from urllib.parse import urlparse
def get_pagerank_by_url(url):
    """Return the pagerank stored for the URL's network location, or ``1e-9`` if there is none."""
    host = urlparse(url).netloc
    if host in pagerank_dict:
        return pagerank_dict[host]
    # A host missing from the table is treated as a lonely vertex with a tiny rank.
    return 1e-9

In [15]:
def get_all_needed_documents(text_dir, docs):
    """Load the documents whose id is listed in ``docs`` and attach their pagerank.

    Args:
        text_dir: directory whose files are JSON lists of documents.
        docs: iterable of document ids to keep.

    Returns:
        dict ``{document id: document}``; every returned document gets a
        ``'pagerank'`` key computed by ``get_pagerank_by_url``.
    """
    wanted = set(docs)
    collection = dict()

    # Nothing requested: skip reading the whole collection just to return {}.
    if not wanted:
        return collection

    for doc in os.listdir(text_dir):
        doc = os.path.join(text_dir, doc)

        with open(doc) as d:
            for document in json.load(d):
                if document['id'] not in wanted:
                    continue
                document['pagerank'] = get_pagerank_by_url(document["url"])
                collection[document['id']] = document

    return collection

# NOTE(review): an empty id list can only yield an empty dict, and all_collection is not read
# in any visible cell — this call looks like a leftover.
all_collection = get_all_needed_documents(DATA_DIR, [])

In [16]:
def create_es_action(index, doc_id, document):
    """Wrap a document into a bulk action dict with ``_index``, ``_id`` and ``_source`` keys."""
    return dict(_index=index, _id=doc_id, _source=document)

def es_actions_generator(docs):
    """Yield one 'byweb' bulk action per document found in the directory ``docs``.

    Each file is parsed as a JSON list of documents; every document gets a
    ``'pagerank'`` key before it is wrapped into an action.
    """
    for file_name in os.listdir(docs):
        with open(os.path.join(docs, file_name)) as handle:
            batch = json.load(handle)
            for document in batch:
                document['pagerank'] = get_pagerank_by_url(document["url"])
                yield create_es_action('byweb', document['id'], document)

In [17]:
# Stream every document of the collection into Elasticsearch through parallel_bulk.
generator = es_actions_generator(settings['collection_dir'])
for ok, result in tqdm(parallel_bulk(es, generator, queue_size=4, thread_count=4, chunk_size=1000)):
    # Each yielded item is unpacked as (ok, result); only items with a falsy ok are printed.
    if not ok:
        print(result)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [18]:
# One evaluation query: its id, its text, and the ids of documents judged relevant / not relevant.
Query = namedtuple('Query', ['query_id', 'text', 'relevant', 'not_relevant'])

In [19]:
def parse_query(query):
    """Turn a ``task`` XML element into a ``Query`` with empty relevance lists.

    The id comes from the element's ``id`` attribute and the text from its
    ``querytext`` child.
    """
    text_node = query.find('{http://www.romip.ru/data/adhoc}querytext')
    return Query(
        query_id=query.attrib['id'],
        text=text_node.text,
        relevant=[],
        not_relevant=[],
    )


def extract_queries():
    """Parse ``settings['queries_xml']`` and return the queries keyed by their id.

    Returns:
        dict ``{query_id: Query}`` with one fresh ``Query`` (empty relevance
        lists) per ``task`` element.
    """
    # ET.parse opens the path itself, so no separate file handle is needed.
    root = ET.parse(settings['queries_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/data/adhoc}task')
    return {parsed.query_id: parsed for parsed in map(parse_query, tasks)}

In [20]:
def extract_relevance_2009():
    """Attach the 2009 relevance judgements to the queries and return them as a list.

    For every ``task`` in ``settings['relevance_2009_xml']`` the judged
    documents are sorted into the query's lists: ``'cantbejudged'`` entries are
    skipped, ``'vital'`` goes to ``relevant`` and every other label goes to
    ``not_relevant``. Document ids are taken verbatim from the XML.

    Raises:
        KeyError: if a task id is missing from the parsed queries.
    """
    queries_dict = extract_queries()

    def parse_relevance(query):
        q = queries_dict[query.attrib['id']]
        for doc in query.findall('./{http://www.romip.ru/common/merged-results}document'):
            relevance = doc.attrib['relevance']
            if relevance == 'cantbejudged':
                continue  # skip
            if relevance == 'vital':
                q.relevant.append(doc.attrib['id'])
            else:
                q.not_relevant.append(doc.attrib['id'])
        return q

    # ET.parse opens the path itself, so no separate file handle is needed.
    root = ET.parse(settings['relevance_2009_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/common/merged-results}task')
    return list(map(parse_relevance, tasks))
    
def extract_relevance_2008():
    """Attach the 2008 relevance judgements to the queries and return them as a list.

    The XML document ids are looked up as keys of ``document_ids_map`` and
    replaced by the mapped ids. ``'cantbejudged'`` entries are skipped,
    ``'notrelevant'`` goes to ``not_relevant`` and every other label goes to
    ``relevant``. Queries whose text is not a string are dropped.

    Side effects:
        Increments the module-level counters ``all_2008_cnt`` (every document
        entry seen) and ``not_found_2008_cnt`` (judged entries missing from
        ``document_ids_map``). Both must be initialised before the call.

    Raises:
        KeyError: if a task id is missing from the parsed queries.
    """
    queries_dict = extract_queries()

    def parse_relevance(query):
        global all_2008_cnt
        global not_found_2008_cnt

        q = queries_dict[query.attrib['id']]
        for doc in query.findall('./{http://www.romip.ru/common/merged-results}document'):
            all_2008_cnt = all_2008_cnt + 1

            relevance = doc.attrib['relevance']
            if relevance == 'cantbejudged':
                continue  # skip

            key = doc.attrib['id']
            if key not in document_ids_map:
                not_found_2008_cnt = not_found_2008_cnt + 1
                continue

            if relevance == 'notrelevant':
                q.not_relevant.append(document_ids_map[key])
            else:
                q.relevant.append(document_ids_map[key])
        return q

    # ET.parse opens the path itself, so no separate file handle is needed.
    root = ET.parse(settings['relevance_2008_xml']).getroot()
    tasks = root.findall('{http://www.romip.ru/common/merged-results}task')
    return [q for q in map(parse_relevance, tasks) if isinstance(q.text, str)]

In [37]:
# Keep only the 2009 queries that have at least one relevant document.
queries_2009 = [q for q in extract_relevance_2009() if q.relevant]
print(len(queries_2009))

# extract_relevance_2008 increments these module-level counters, so reset them first.
not_found_2008_cnt = 0
all_2008_cnt = 0
queries_2008 = [q for q in extract_relevance_2008() if q.relevant]
print(len(queries_2008))
print('percentile of not found documents in relevance table 2008: ' + str(not_found_2008_cnt / all_2008_cnt))

495
419
percentile of not found documents in relevance table 2008: 0.15788306764516774


In [22]:
def search(query, result_size=20):
    """
    Use this function for search.

    Matches the lower-cased query text against the ``content`` field of the
    'byweb' index and returns the ``_id`` of up to ``result_size`` hits, in the
    order Elasticsearch returned them.
    """
    body = {
        'query': {
            'bool': {
                'should': {'match': {'content': query.text.lower()}}
            }
        }
    }

    response = es.search(index='byweb', body=body, size=result_size)
    return [hit['_id'] for hit in response['hits']['hits']]

def pretty_print_result(search_result):
    """Print the total hit count of a raw search response, then one line per hit."""
    hits_section = search_result['hits']
    total = hits_section["total"]["value"]
    print(f'Total documents: {total}')
    for entry in hits_section['hits']:
        doc_id, score = entry["_id"], entry["_score"]
        print(f'Doc {doc_id}, score is {score}')
                  
def get_doc_by_id(doc_id):
    """Fetch a document from the 'byweb' index by id and return its ``_source``."""
    response = es.get(index='byweb', id=doc_id)
    return response['_source']

In [23]:
# returns [precision, recall, recall-]. Last one is for educational purpose only.
def stats_at(k, query, query_result):
    """Compute precision, recall and capped recall of the first ``k`` results.

    The capped recall divides by ``min(k, number of relevant documents)``.
    Duplicate ids within the top ``k`` are counted once.
    """
    relevant_total = len(query.relevant)
    hits = len(set(query_result[:k]) & set(query.relevant))
    precision = hits / k
    recall = hits / relevant_total
    capped_recall = hits / min(k, relevant_total)
    return [precision, recall, capped_recall]

# returns [ap, ap-r, ap-k, ap-]. Last 3 are for educational purpose only
def get_ap_at(k, query, query_result):
    """Compute average precision of the first ``k`` results in four normalisations.

    The precision at every rank that holds a relevant document is summed; the
    sum is then divided by the number of such ranks (ap), by the number of
    relevant documents (ap-r), by ``k`` (ap-k) and by
    ``min(k, number of relevant documents)`` (ap-).

    Raises:
        ZeroDivisionError: if the query has no relevant documents or ``k`` is 0.
    """
    relevant = query.relevant
    # Set lookups instead of scanning the relevance list for every result.
    relevant_lookup = set(relevant)
    # Distinct relevant ids seen so far: a repeated id counts once for precision,
    # which is what a set intersection over the prefix would give.
    found = set()

    s = 0
    n = 0
    for rank, result in enumerate(query_result[:k], start=1):
        if result in relevant_lookup:
            found.add(result)
            s += len(found) / rank
            n += 1

    return [s / n if n != 0 else 0, s / len(relevant), s / k, s / min(k, len(relevant))]

# returns [map, map-r, map-k, map-]. Last 3 are for educational purpose only
def get_map_at(k, queries, q_results):
    """Average the four ``get_ap_at`` values over all (query, result list) pairs.

    Every component is summed over the zipped pairs and divided by
    ``len(queries)``.
    """
    totals = [0, 0, 0, 0]
    for query, query_result in zip(queries, q_results):
        for position, value in enumerate(get_ap_at(k, query, query_result)):
            totals[position] += value
    return [total / len(queries) for total in totals]

def get_queries_results(qs, search_fun=search, k=20):
    """Run ``search_fun`` for every query and return the list of result lists."""
    collected = []
    for q in qs:
        # important: ask for at least len(q.relevant) hits, because r-precision may need more than k.
        wanted = max(k, len(q.relevant))
        collected.append(search_fun(q, wanted))
    return collected

In [25]:
# Cell for query lemmatization; it mirrors the lemmatization applied to the collection.

stop_words = {'г', '©'}
def get_stop_words(files):
    """Add the first whitespace-separated token of every non-blank line to ``stop_words``.

    Args:
        files: iterable of paths to stop-word lists, one entry per line.

    Side effects:
        Mutates the module-level ``stop_words`` set in place.
    """
    # NOTE(review): files are opened with the platform default encoding — confirm the lists are readable that way.
    for file in files:
        with open(file) as f:
            for line in f:
                tokens = line.split()
                # Blank lines have no tokens; indexing [0] on them used to raise IndexError.
                if tokens:
                    stop_words.add(tokens[0])

# Load the English and Russian stop-word lists into stop_words.
get_stop_words(['../../extractor/stopwords/english', '../../extractor/stopwords/russian'])

def is_not_stop_word(d):
    """Return True when the token's lemma (or lower-cased text) is absent from ``stop_words``."""
    candidate = getLexOrText(d)
    return candidate not in stop_words


# Matches strings that consist only of Latin letters, digits and characters in the Ѐ-ӿ range.
normal_word = re.compile('^[A-Za-z0-9Ѐ-ӿ]*$')
def is_normal_word(d):
    """Return True when the token's ``'text'`` matches the ``normal_word`` pattern."""
    matched = normal_word.match(d['text'])
    return bool(matched)


def getText(d):
    """Return the token's ``'text'`` in lower case."""
    raw = d['text']
    return raw.lower()

def getLexOrText(d):
    """Return the ``'lex'`` of the token's first analysis, else its lower-cased text.

    The text is used when the token has no ``'analysis'`` key, an empty
    analysis list, or a first analysis without a ``'lex'`` key.
    """
    if 'analysis' in d and d['analysis']:
        first = d['analysis'][0]
        if 'lex' in first:
            return first['lex']
    return getText(d)


from pymystem3 import Mystem
m = Mystem()

def lemmatize(query_text):
    """Lemmatize ``query_text`` with Mystem and return the kept lemmas joined by spaces.

    A token is kept when it is non-empty, has an ``'analysis'`` key or passes
    ``is_normal_word``, and is not a stop word.
    """
    kept = []
    for token in m.analyze(query_text):
        if not token:
            continue
        if not ('analysis' in token or is_normal_word(token)):
            continue
        if not is_not_stop_word(token):
            continue
        kept.append(getLexOrText(token))
    return " ".join(kept)

In [26]:
def l_search(query, result_size=20):
    """Search the ``stem_content`` field with the lemmatized query text.

    Returns the ``_id`` of up to ``result_size`` hits from the 'byweb' index,
    in the order Elasticsearch returned them.
    """
    body = {
        'query': {
            'bool': {
                'should': {'match': {'stem_content': lemmatize(query.text)}}
            }
        }
    }

    response = es.search(index='byweb', body=body, size=result_size)
    return [hit['_id'] for hit in response['hits']['hits']]

In [27]:
# NOTE(review): get_queries_stats is not defined in any visible cell (execution counts jump from
# In [23] to In [25]); on a fresh kernel this call raises NameError unless that cell is restored.
get_queries_stats(l_search)

Средняя точность на уровне k=20 по всем запросам: 0.39838383838383895
Средняя полнота на уровне k=20 по всем запросам: 0.27127363878837407
Средняя полнота-дэш (в знаментеле стоит k, если релевантных документов больше, чем k) на уровне k=20 по всем запросам: 0.468363161597518
Средняя R-точность по всем запросам: 0.3389855248333655
MAP на уровне k=20: 0.5537058670781846
MAP-r (в знаменателе AP - количество всех релевантных документов для запроса) на уровне k=20: 0.1856531340559127
MAP-k (в знаменателе AP - k) на уровне k=20: 0.3160643670445948
MAP-дэш (в знаменателе AP - k, если релевантных документов больше k, иначе - количество всех релевантных документов для запроса) на уровне k=20: 0.3535702214401734


Будем использовать поиск по леммам, чтобы доставать возможные документы

In [28]:
def queries_results(qs, search_fun, k=20):
    """Call ``search_fun(q, k)`` for every query and return the results in query order."""
    collected = []
    for q in qs:
        collected.append(search_fun(q, k))
    return collected

In [34]:
import numpy as np

def get_lemmas(text): # from already stemmed text
    """Split already-stemmed text on whitespace and return its non-empty tokens as a list."""
    return [piece for piece in text.split() if piece]

def tf(text, word_lemma):
    """Return the share of the text's tokens equal to ``word_lemma`` (0 for empty text)."""
    tokens = get_lemmas(text)
    if len(tokens) == 0:
        return 0
    return tokens.count(word_lemma) / len(tokens)

def idf(word_lemma):
    """Return ``log(N / df)`` for a lemma.

    ``N`` is ``len(document_ids_map)`` and ``df`` is the lemma's
    ``'occurences_documents'`` entry in ``word_statistics``.
    """
    containing = int(word_statistics[word_lemma]['occurences_documents'])
    collection_size = len(document_ids_map)
    return np.log(collection_size / containing)

def bm25(text, word_lemmas, k1, b):
    """Score ``text`` against the query lemmas with a BM25-style formula.

    For every lemma known to ``word_statistics`` the contribution is
    ``idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_len))``,
    where ``doc_len`` is the number of tokens in ``text``.

    Args:
        text: already-stemmed document text.
        word_lemmas: iterable of query lemmas; unknown lemmas are skipped.
        k1: term-frequency saturation parameter.
        b: length-normalisation parameter.

    Returns:
        The sum of the per-lemma contributions (``0`` when nothing matched).
    """
    # Length normalisation needs this document's length, not the collection size.
    doc_len = len(get_lemmas(text))
    length_norm = k1 * (1 - b + b * doc_len / avg_len)

    result = 0
    for word_lemma in word_lemmas:
        if word_lemma not in word_statistics:
            continue # skip unknown words
        # NOTE(review): tf() returns the lemma's share of the tokens, not its raw count;
        # textbook BM25 uses the raw count — confirm which one is intended.
        the_tf = tf(text, word_lemma)
        if the_tf == 0:
            # Absent lemmas contribute nothing; skipping also avoids a zero denominator.
            continue
        result += idf(word_lemma) * the_tf * (k1 + 1) / (the_tf + length_norm)
    return result

In [35]:
def coverage(query_lemmas, text):
    """Return the fraction of query lemmas that occur in ``text``.

    Every query lemma found among the text tokens adds ``1 / len(query_lemmas)``;
    repeated query lemmas are counted each time. An empty query yields ``0.0``.
    """
    # A set makes each membership test constant-time instead of scanning the token list.
    text_lemmas = set(get_lemmas(text))
    result = 0.0
    for lemma in query_lemmas:
        if lemma in text_lemmas:
            result += 1 / len(query_lemmas)
    return result

def span(query_lemmas, text):
    """Return text length divided by the shortest window containing every query lemma.

    Returns ``0`` for an empty query and ``0.1`` for an empty text. When no
    window covers the whole query, the window length stays at its initial
    ``10 * len(tokens)``, so the result is also ``0.1``.
    """
    if len(query_lemmas) == 0:
        return 0

    tokens = get_lemmas(text)
    if len(tokens) == 0:
        return 0.1

    # Occurrences of every query lemma inside the current window [left, right].
    window_counts = dict.fromkeys(query_lemmas, 0)

    def window_covers_query():
        return all(count != 0 for count in window_counts.values())

    left = 0
    best = 10 * len(tokens)
    for right, token in enumerate(tokens):
        if token in window_counts:
            window_counts[token] += 1

        # Shrink from the left for as long as the window still covers the query.
        while window_covers_query():
            best = min(best, right - left + 1)
            leaving = tokens[left]
            left += 1
            if leaving in window_counts:
                window_counts[leaving] -= 1

    return len(tokens) / best

def get_vector(document, query_lemmas):
    """Build the feature vector of a (document, query) pair.

    Features, in order:
        1. BM25 of the text          2. BM25 of the text / query length
        3. BM25 of the title         4. BM25 of the title / query length
        5. pagerank                  6. query length (lemmas)
        7. document length (tokens)  8. URL length (characters)
        9. number of references      10. query coverage by the text
        11. query coverage by the title
        12. min-span feature of the text (see ``span``)

    Args:
        document: dict with ``stem_content``, ``title``, ``pagerank``, ``url``
            and ``references`` keys.
        query_lemmas: list of query lemmas; may be empty.
    """
    text = document["stem_content"]
    title = document["title"]
    # Non-string titles count as empty; real titles are lemmatized before scoring.
    title = lemmatize(title) if isinstance(title, str) else ""

    query_len = len(query_lemmas)
    # A query reduced to nothing (e.g. only stop words) must not divide by zero.
    normalizer = max(query_len, 1)

    # Each BM25 score is needed twice (raw and normalised), so compute it once.
    text_bm25 = bm25(text, query_lemmas, 2.0, 0.75)
    title_bm25 = bm25(title, query_lemmas, 2.0, 0.75)

    return [
        # bm25
        text_bm25,
        # normalized bm25
        text_bm25 / normalizer,
        # bm25 of title
        title_bm25,
        # normalized bm25 of title
        title_bm25 / normalizer,
        # pagerank
        document['pagerank'],
        # query length
        query_len,
        # document length
        len(get_lemmas(text)),
        # url length
        len(document['url']),
        # number of references
        len(document['references']),
        # query coverage by text
        coverage(query_lemmas, text),
        # query coverage by title
        coverage(query_lemmas, title),
        # min span
        span(query_lemmas, text),
    ]

def write_vector_data(queries, filename):
    """Write feature vectors for the judged documents of every query to ``filename``.

    One line per (query, document) pair: ``<label> qid:<n> 1:<f1> 2:<f2> ...``.
    The label is 1 for relevant and 0 for not relevant documents, ``qid``
    numbers the queries from 1, and the relevant documents of a query are
    written before its not relevant ones.
    """
    needed_docs = []
    for query in queries:
        needed_docs += query.relevant
        needed_docs += query.not_relevant
    docs = get_all_needed_documents(DATA_DIR, needed_docs)

    def format_line(relevance, qid, vector):
        parts = [str(relevance), 'qid:' + str(qid)]
        parts.extend(
            str(position) + ":" + np.format_float_positional(value, trim='-')
            for position, value in enumerate(vector, start=1)
        )
        return ' '.join(parts) + '\n'

    with open(filename, 'w') as file:
        for qid, query in enumerate(queries, start=1):
            query_lemmas = get_lemmas(lemmatize(query.text))
            for relevance, doc_ids in ((1, query.relevant), (0, query.not_relevant)):
                for doc in doc_ids:
                    file.write(format_line(relevance, qid, get_vector(docs[doc], query_lemmas)))

In [40]:
# Write the feature vectors of the 2008 queries to TRAIN_FILE.
write_vector_data(queries_2008, TRAIN_FILE)

In [41]:
def write_test_vector_data(queries, filename):
    """Write feature vectors for the top-100 ``l_search`` results of every test query.

    Queries whose text also occurs in ``queries_2008`` are dropped first. One
    line is written per (query, retrieved document) pair:
    ``<label> qid:<n> 1:<f1> 2:<f2> ...``, where the label is 1 when the
    document id is in the query's ``relevant`` list and 0 otherwise, and
    ``qid`` numbers the remaining queries from 1.

    Raises:
        KeyError: if a retrieved id is not found in the collection under ``DATA_DIR``.
    """
    # Set lookups replace a scan of queries_2008 for every candidate query.
    texts_2008 = {q.text for q in queries_2008}
    queries = [query for query in queries if query.text not in texts_2008]

    docids = [list(map(str, found)) for found in queries_results(queries, l_search, 100)]

    needed_docs = []
    for docss in docids:
        needed_docs.extend(docss)
    docs = get_all_needed_documents(DATA_DIR, needed_docs)

    def format_line(relevance, qid, vector):
        line = str(relevance) + ' qid:' + str(qid)
        for i, x in enumerate(vector):
            line += ' ' + str(i + 1) + ":" + np.format_float_positional(x, trim='-')
        return line + '\n'

    with open(filename, 'w') as file:
        for qid, (query, docss) in enumerate(zip(queries, docids), start=1):
            query_lemmas = get_lemmas(lemmatize(query.text))
            # Constant-time membership test instead of scanning the list per document.
            relevant_ids = set(query.relevant)

            for docid in docss:
                relevance = 1 if docid in relevant_ids else 0
                file.write(format_line(relevance, qid, get_vector(docs[docid], query_lemmas)))

In [42]:
# Write the feature vectors of the 2009 queries to TEST_FILE.
write_test_vector_data(queries_2009, TEST_FILE)

In [None]:
# Evaluate a saved RankLib model on the test file with NDCG@20.
# NOTE(review): mart.txt is not produced by any visible cell — confirm where the model is trained.
!java -jar ../../lib/RankLib-2.12.jar -load mart.txt -test ../../data/test.txt -ranker 0 -metric2t NDCG@20 -norm linear

In [48]:
# Print the text of every retained 2008 query.
for query in queries_2008:
    print(query.text)

значение группы грови и резус фактор при беремонности
номос
edo бумага
рустам солнцев
мукоза
Следует однако отметить
СТИХИ ТОЛЬКО ЧТО РОДИВШИМСЯ МАЛЬЧИКАМ
эротика
НОРМЫ РАСХОДА
список всех коммерчиских и государственных
квартирант
Поковки стальные штамповочные ГОСТ 1505-89
Воротынские унитазы отзывы
сурдология
фото голых малолеток
что такое торрент файлы
велком
незаконное проживание мигрантов
что положено вдове военнослужащего пенсионера список документов
автозапчасти на ваз 2105
Бесплатная загрузка фотографий
база серебряное копытце
APE TO MP3
печать свадебных книг
международня логистика и ее особенности
карта москвы
ОАО Отечественные лекарства ,
Honda XR650R XR 650 R SM
административное право как отрасль права
франция румыния евро-2008
обучение английскому языку на мальте
заказ билетов
color night скачать музыка
реферат История Конституцияя РФ
авангард
книга кэттролл дом книги
праздники
цирк ШАПИТО
Автосигнализация Mongoose CYBORG
расхода газовоздушной смеси автомобиля
коды к играм
с

In [47]:
import jamspell

ModuleNotFoundError: No module named 'jamspell'