In [5]:
import math
import argparse
import linecache
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem.porter import *
from collections import defaultdict
#from english_indexer import *
import re
import heapq
import os
import sys
import time
import xml.sax
import time
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

from hindi_search_note_mod import run_hindi_search


# Pattern for a handful of HTML entities; remove_html_tags replaces each match with a space.
html_tags = re.compile('&amp;|&apos;|&gt;|&lt;|&nbsp;|&quot;')
# Porter stemmer used by stem_text when `lang` is falsy.
stemmer = PorterStemmer()
# Placeholder values: search_english / search_hindi rebind stop_words and
# num_pages (and search_hindi also stem_words) through `global` before a query runs.
stop_words=''
stem_words = ''
num_pages = 1
# NOTE(review): nothing in this chunk rebinds the global num_tokens; the
# functions below read the count into a local of the same name instead.
num_tokens = 1
# Language switch: search_english sets 0, search_hindi sets 1; truthy selects
# the 'hindi_wiki_index' paths and the suffix-stripping stemmer.
lang = 0

def remove_stopwords(text_data):
    """Return the items of ``text_data`` that are not contained in the global ``stop_words``.

    Order and duplicates of the surviving items are kept.
    """
    kept = []
    for token in text_data:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def stem_word(word):
    """Strip the first matching suffix from ``word``.

    The global ``stem_words`` is scanned in order; the first entry that
    ``word`` ends with is removed and the shortened word is returned.
    When nothing matches, ``word`` is returned unchanged.

    Args:
        word: the token to stem.

    Returns:
        The token with at most one suffix removed.
    """
    for suffix in stem_words:
        # An empty entry (e.g. a blank line in the suffix file) "matches"
        # every word, and ``word[:-0]`` evaluates to '' -- that would erase
        # the token entirely, so empty suffixes are ignored.
        if suffix and word.endswith(suffix):
            return word[:-len(suffix)]
    return word

def stem_text(text_data):
    """Stem every token in ``text_data`` and return the results as a list.

    When the global ``lang`` is truthy the suffix-stripping ``stem_word`` is
    applied; otherwise the module-level Porter ``stemmer`` is used.
    """
    stem_one = stem_word if lang else stemmer.stem
    return [stem_one(token) for token in text_data]

def remove_non_ascii(text_data):
    """Return ``text_data`` with every character whose code point is >= 128 replaced by a space."""
    pieces = []
    for ch in text_data:
        pieces.append(ch if ord(ch) < 128 else ' ')
    return ''.join(pieces)

def remove_html_tags(text_data):
    """Replace every match of the module-level ``html_tags`` pattern in ``text_data`` with a space."""
    return html_tags.sub(' ', text_data)

def remove_special_chars(text_data):
    """Return ``text_data`` with every non-alphanumeric character replaced by a space."""
    out = []
    for ch in text_data:
        if not ch.isalnum():
            ch = ' '
        out.append(ch)
    return ''.join(out)

def remove_select_keywords(text_data):
    """Blank out newlines, the literal 'File:' and http/https URLs.

    Each of these is replaced by a single space. The two URL schemes are
    handled in separate passes, http first and https second.
    """
    flattened = text_data.replace('\n', ' ')
    flattened = flattened.replace('File:', ' ')
    for url_pattern in ('(http://[^ ]+)', '(https://[^ ]+)'):
        flattened = re.sub(url_pattern, ' ', flattened)
    return flattened

def tokenize_sentence(text_data, flag=False):
    """Split ``text_data`` into a list of alphanumeric ASCII tokens.

    Args:
        text_data: the raw text to tokenize.
        flag: when true, newlines, 'File:' markers and URLs are blanked out
            first (via ``remove_select_keywords``), followed by every
            ``{...}``, ``[...]`` and ``==...==`` span.

    Returns:
        The whitespace-separated tokens that remain after non-ASCII
        characters, HTML entities and non-alphanumeric characters have been
        replaced by spaces.
    """
    if flag:
        text_data = remove_select_keywords(text_data)
        # Raw string: '\{', '\[' and '\=' are not valid *string* escapes and
        # trigger a SyntaxWarning on current Python versions. The compiled
        # regular expression is unchanged.
        text_data = re.sub(r'\{.*?\}|\[.*?\]|\=\=.*?\=\=', ' ', text_data)
    cleaned_text = remove_non_ascii(text_data)
    cleaned_text = remove_html_tags(cleaned_text)
    cleaned_text = remove_special_chars(cleaned_text)

    return cleaned_text.split()

def preprocess_text(text_data, flag=False):
    """Lower-case, tokenize, drop stop words from and stem ``text_data``.

    ``flag`` is forwarded unchanged to ``tokenize_sentence``. The result is
    the list of stemmed tokens.
    """
    tokens = tokenize_sentence(text_data.lower(), flag)
    return stem_text(remove_stopwords(tokens))

In [6]:

def search_binary(h, filename, input1):
    """Binary-search a sorted, '-'-delimited text file for a token.

    Each line is split on '-'; the first part is the key that is compared
    with ``input1``.

    Args:
        h: number of records (lines) to search; lines 1..h are considered.
        filename: path of the file, read through ``linecache``.
        input1: the token to look up.

    Returns:
        For a matching line, the '-'-separated parts after the key and
        before the final part (``parts[1:-1]``); ``None`` when the token is
        not found.
    """
    lo, hi = 0, h
    while lo < hi:
        mid = (lo + hi) // 2
        # linecache line numbers are 1-based, so the zero-based probe index
        # is shifted by one. Probing ``mid`` directly looks at the
        # non-existent line 0 and can never reach line ``h``.
        line = linecache.getline(filename, mid + 1)
        if not line:
            # Past the end of the file (count larger than the file): every
            # real record lies to the left of this position.
            hi = mid
            continue
        parts = line.split('-')
        token = parts[0]
        if input1 == token:
            return parts[1:-1]
        if input1 > token:
            lo = mid + 1
        else:
            hi = mid

    return None
def title_search(page_id):
    """Look up the title stored for ``page_id`` in the id/title map.

    The map of the Hindi index is used when the global ``lang`` is truthy,
    the English one otherwise. Line ``int(page_id) + 1`` is read and
    everything after its first '-' is returned.
    """
    if lang:
        map_path = 'hindi_wiki_index/id_title_map.txt'
    else:
        map_path = 'english_wiki_index/id_title_map.txt'
    entry = linecache.getline(map_path, int(page_id)+1).strip()
    return entry.split('-', 1)[1]


def search_field_file(field, file_num, line_num):
    """Fetch the postings string of one field record.

    Reads line ``int(line_num)`` of ``<index>/<field>_data_<file_num>.txt``
    (Hindi index when the global ``lang`` is truthy, English otherwise) and
    returns the second '-'-separated part. An empty ``line_num`` yields ''.
    """
    # Guard first: an empty line number means there is nothing to read.
    if line_num == '':
        return ''

    if lang:
        data_path = f'hindi_wiki_index/{field}_data_{str(file_num)}.txt'
    else:
        data_path = f'english_wiki_index/{field}_data_{str(file_num)}.txt'

    record = linecache.getline(data_path, int(line_num)).strip()
    return record.split('-')[1]

def get_token_info(token):
    """Locate the token-info record for ``token``.

    Hindi (global ``lang`` truthy): the record count comes from
    'hindi_wiki_index/num_tokens.txt' and the single tokens_info file is
    searched. English: the files are selected by the token's first
    character -- one pair per digit 0-9 and per letter a-z, and an 'others'
    pair for everything else.

    Returns whatever ``search_binary`` returns for the selected file.
    """
    if lang:
        with open('hindi_wiki_index/num_tokens.txt', 'r', encoding='utf8') as f:
            total = int(f.readline().strip())
        return search_binary(total, 'hindi_wiki_index/tokens_info.txt', token)

    first = token[0]
    if first in '0123456789' or first in 'abcdefghijklmnopqrstuvwxyz':
        # Digits and lower-case ASCII letters share the same naming scheme.
        count_path = f'english_wiki_index/tokens_info_{token[0]}_count.txt'
        info_path = f'english_wiki_index/tokens_info_{token[0]}.txt'
    else:
        count_path = f'english_wiki_index/tokens_info_others_count.txt'
        info_path = f'english_wiki_index/tokens_info_others.txt'

    with open(count_path, 'r') as f:
        total = int(f.readline().strip())

    return search_binary(total, info_path, token)



def do_ranking(page_freq, page_postings):
    """Score pages for a query from per-field postings.

    Every posting ``page_id:count`` contributes
    ``weight(field) * (1 + log(count)) * log((num_pages - df) / df)``
    to its page, where ``df`` is ``page_freq[token]`` and ``num_pages`` is
    the module-level page count.

    Args:
        page_freq: mapping token -> document frequency.
        page_postings: mapping token -> {field name -> 'id:count;id:count...'}.

    Returns:
        defaultdict(float) mapping page id (str) -> accumulated score.

    Raises:
        KeyError: for a field name without a configured weight.
    """
    result = defaultdict(float)
    weightage_dict = {'title':1.0, 'body':0.6, 'category':0.4, 'infobox':0.75, 'link':0.20, 'reference':0.25}
    for token, field_post_dict in page_postings.items():
        for field, postings in field_post_dict.items():
            weightage = weightage_dict[field]
            if not postings:
                continue
            doc_freq = int(page_freq[token])
            # The log argument must be strictly positive. A token with
            # doc_freq <= 0 or doc_freq >= num_pages would raise
            # ZeroDivisionError / "math domain error"; such a token cannot
            # discriminate between pages, so it contributes nothing.
            if doc_freq <= 0 or num_pages <= doc_freq:
                continue
            # Constant for all postings of this token, so computed once.
            idf = math.log((num_pages - doc_freq) / doc_freq)
            for post in postings.split(';'):
                page_id, term_freq = post.split(':')
                result[page_id] += weightage*(1+math.log(int(term_freq)))*idf
    return result

def simple_query(preprocessed_query):
    """Collect the postings of every field for each token of a plain query.

    Tokens without a token-info record are skipped. For each remaining
    token, every field that has a line number is read; ``page_freq[token]``
    ends up holding the posting count of the last such field.

    Returns:
        (page_freq, page_postings) where page_postings maps
        token -> {field name -> postings string}.
    """
    field_names = ('title', 'body', 'category', 'infobox', 'link', 'reference')
    page_freq = {}
    page_postings = defaultdict(dict)

    for token in preprocessed_query:
        token_info = get_token_info(token)
        if not token_info:
            continue

        # Exactly eight values are expected in a record.
        (file_num, freq, title_line, body_line, category_line,
         infobox_line, link_line, reference_line) = token_info
        field_lines = (title_line, body_line, category_line,
                       infobox_line, link_line, reference_line)

        for field_name, line_num in zip(field_names, field_lines):
            if line_num == '':
                continue
            posting = search_field_file(field_name, file_num, line_num)
            page_freq[token] = len(posting.split(';'))
            page_postings[token][field_name] = posting

    return page_freq , page_postings


def field_query(preprocessed_query):
    """Collect the postings of the requested field for each (field, token) pair.

    Args:
        preprocessed_query: iterable of ``[field_letter, token]`` pairs where
            the letter is one of t, b, c, i, l, r.

    Returns:
        (page_freq, page_postings): page_freq maps token -> number of
        postings found in the requested field; page_postings maps
        token -> {field name -> postings string}.

    Raises:
        KeyError: when a field letter is not one of the six known ones.
    """
    field_map = {
        't':'title', 'b':'body', 'c':'category', 'i':'infobox', 'l':'link', 'r':'reference'
    }
    page_freq, page_postings = {}, defaultdict(dict)

    for field, token in preprocessed_query:
        token_info = get_token_info(token)
        if not token_info:
            continue

        file_num, freq, title_line, body_line, category_line, infobox_line, link_line, reference_line = token_info
        line_map = {
            'title':title_line, 'body':body_line, 'category':category_line, 'infobox':infobox_line, 'link':link_line, 'reference':reference_line
        }

        field_name = field_map[field]
        line_num = line_map[field_name]

        posting = search_field_file(field_name, file_num, line_num)
        # Count postings, not characters: len(posting) is the length of the
        # 'id:count;id:count' string. An empty postings string means zero
        # postings.
        page_freq[token] = len(posting.split(';')) if posting else 0
        page_postings[token][field_name] = posting

    return page_freq, page_postings

def identify_query_type(query):
    """Split a raw query into its plain part and its field-restricted parts.

    A field part looks like ``t:``, ``b:``, ``c:``, ``i:``, ``l:`` or ``r:``
    followed by text. Three shapes are returned:

    * no field marker anywhere: ``(query, None)``
    * query starts with a field marker: ``(list of parts, None)``
    * field marker later in the query: ``(first part, list of remaining parts)``
    """
    prefixes = ('t:', 'b:', 'c:', 'i:', 'l:', 'r:')

    if not any(prefix in query for prefix in prefixes):
        return query, None

    # Decided on the untouched query, before any separators are inserted.
    leads_with_field = query[0:2] in prefixes

    # Turn the space in front of every field marker into a ';' separator.
    for prefix in prefixes:
        query = query.replace(' ' + prefix, ';' + prefix)

    parts = query.lstrip(';').split(';')

    if leads_with_field:
        return parts, None
    return parts[0], parts[1:]
def return_query_results(query, query_type):
    """Preprocess ``query``, fetch its postings and return the ranked scores.

    With ``query_type == 'field'`` the query is a list of 'letter:text'
    strings; each text is preprocessed and expanded into one
    ``[letter, token]`` pair per token. Any other ``query_type`` treats the
    query as plain text. The result is whatever ``do_ranking`` produces.
    """
    if query_type == 'field':
        per_field = []
        for qry in query:
            pieces = qry.split(':')
            per_field.append([pieces[0], preprocess_text(pieces[1])])

        field_token_pairs = [[field, word] for field, words in per_field for word in words]
        page_freq, page_postings = field_query(field_token_pairs)
    else:
        page_freq, page_postings = simple_query(preprocess_text(query))

    return do_ranking(page_freq, page_postings)

def take_input_from_file(file_name, num_results):
    """Run every query of a file and write the top results to '<name>_op.txt'.

    One query is read per line of ``file_name``. For each query the best
    ``num_results`` pages are written as 'id, title' lines (or
    'No matching Doc found'), followed by the elapsed time and a blank line.

    Args:
        file_name: path of the query file; the output path is the part of
            this name before the first '.txt' plus '_op.txt'.
        num_results: maximum number of results written per query.
    """
    results_file = file_name.split('.txt')[0]

    # Both files are managed by the same ``with`` so the results file is
    # closed (and flushed) even when a query raises half-way through.
    with open(file_name, 'r') as f, open(results_file+'_op.txt', 'w') as fp:
        for i, query in enumerate(f):
            s = time.time()

            query = query.strip()
            query1, query2 = identify_query_type(query)

            if query2:
                # Mixed query: plain part plus field parts, scores summed.
                ranked_results = (Counter(return_query_results(query1, 'simple'))
                                  + Counter(return_query_results(query2, 'field')))
            elif isinstance(query1, list):
                ranked_results = return_query_results(query1, 'field')
            else:
                ranked_results = return_query_results(query1, 'simple')

            results = sorted(ranked_results.items(), key = lambda item : item[1], reverse=True)
            results = results[:num_results]

            if results:
                for page_id, _ in results:
                    title = title_search(page_id)
                    fp.write(page_id + ', ' + title)
                    fp.write('\n')
            else:
                fp.write('No matching Doc found')
                fp.write('\n')

            e = time.time()
            fp.write('Finished in ' + str(e-s) + ' seconds')
            fp.write('\n\n')

            print('Done query', i+1)

    print('Done writing results')

def take_input_from_user(query, num_results):
    """Run one interactive query and print the top ``num_results`` pages.

    The literal query 'close' returns immediately without output. Each
    result is printed as 'id, title'; plain (non-field) queries additionally
    print an en.wikipedia.org link built from the title. The elapsed time
    and a blank line are printed last.

    Args:
        query: the raw query string.
        num_results: maximum number of results printed.
    """
    if query=='close':
        return
    s = time.time()

    query = query.strip()
    query1, query2 = identify_query_type(query)

    # Only the plain-query path prints a link next to the title.
    show_link = False
    if query2:
        # Mixed query: plain part plus field parts, scores summed.
        ranked_results = (Counter(return_query_results(query1, 'simple'))
                          + Counter(return_query_results(query2, 'field')))
    elif isinstance(query1, list):
        ranked_results = return_query_results(query1, 'field')
    else:
        ranked_results = return_query_results(query1, 'simple')
        show_link = True

    results = sorted(ranked_results.items(), key = lambda item : item[1], reverse=True)
    results = results[:num_results]

    for page_id, _ in results:
        title = title_search(page_id)
        if show_link:
            link = ('https://en.wikipedia.org/wiki/'+ title.title()).replace(' ', '_')
            print(page_id+',', title+ '  ' +  link  )
        else:
            print(page_id+',', title)

    e = time.time()
    print('Finished in', e-s, 'seconds')
    print()
     

In [7]:
def search_english(query, num_results):
    """Configure the module for the English index and run ``query``.

    Sets the globals ``lang`` (0), ``stop_words`` (NLTK English stop words)
    and ``num_pages`` (first line of 'english_wiki_index/num_pages.txt'),
    then hands the query to ``take_input_from_user``.
    """
    global stop_words, num_pages, lang

    lang = 0
    stop_words = set(stopwords.words("english"))

    with open('english_wiki_index/num_pages.txt', 'r') as f:
        num_pages = float(f.readline().strip())

    # The returned line is discarded; presumably this call only makes
    # linecache load the title map before the first lookup.
    linecache.getline('english_wiki_index/id_title_map.txt', 0)

    take_input_from_user(query, num_results)

def search_hindi(query, num_results):
    """Configure the module for the Hindi index and run ``query``.

    Sets the globals ``lang`` (1), ``stop_words``, ``stem_words`` and
    ``num_pages`` from their UTF-8 files, then delegates the stripped query
    to ``run_hindi_search``.
    """
    global stop_words, num_pages, lang, stem_words

    def stripped_lines(path):
        # Every line of the file with surrounding whitespace removed.
        with open(path, 'r', encoding='utf8') as f:
            return [word.strip() for word in f]

    def first_line(path):
        # First line of the file with surrounding whitespace removed.
        with open(path, 'r', encoding='utf8') as f:
            return f.readline().strip()

    lang = 1
    query = query.strip()

    stop_words = stripped_lines('hindi_stopwords.txt')
    stem_words = stripped_lines('hindi_stem_words.txt')
    num_pages = float(first_line('hindi_wiki_index/num_pages.txt'))

    # These two names are local to this function and are not read again here.
    num_tokens = int(first_line('hindi_wiki_index/num_tokens.txt'))
    tokens_info_pointer = 'hindi_wiki_index/tokens_info.txt'

    # The returned line is discarded; presumably this call only makes
    # linecache load the title map before the first lookup.
    linecache.getline('hindi_wiki_index/id_title_map.txt', 0)

    run_hindi_search(query,num_results)

def run_query():
    """Interactive search loop.

    Each input has the form '<language>:<query>' where the first character
    selects the language: 'h' for Hindi, 'c' to leave the loop, anything
    else for English. Ten results are requested per query, and the time
    spent on the search is printed afterwards.
    """
    print('Loading search engine ')
    print('Enter language keyword to search (e for English, h for Hindi), c to close')
    num_results = 10
    while True:
        print('Querying: ')
        query = input('Enter Query:- ')
        # Start the clock after the prompt so the reported time covers the
        # search only, not the time the user spent typing.
        start = time.time()

        if not query:
            # Empty input has no language character to inspect; ask again.
            continue

        language = query[0]
        if language=='c':
            return

        # Drop the '<language>:' prefix. input() returns the line without
        # its trailing newline, so nothing is trimmed from the end (a
        # [2:-1] slice would cut off the last character of the query).
        query = query[2:]

        if language=='h':
            search_hindi(query, num_results)
        else:
            # search_english already runs and prints the query once; calling
            # take_input_from_user again here would print every result twice.
            search_english(query, num_results)
        print('Query time:- ', time.time()-start)


In [8]:
# Start the interactive prompt; run_query loops until an input starting with 'c' is entered.
run_query()

Loading search engine 
Enter language keyword to search (e for English, h for Hindi), c to close
Querying: 
Enter Query:- h:Bhaarat
117554, विकिपीडिया:autowikibrowser/typos  https://hi.wikipedia.org/wiki/विकिपीडिया:autowikibrowser/typos
169725, विकिपीडिया:चौपाल/पुरालेख 42  https://hi.wikipedia.org/wiki/विकिपीडिया:चौपाल/पुरालेख_42
Query time:-  14.665740966796875
Querying: 
Enter Query:- h:mahaajanpad
5944, मगध महाजनपद  https://hi.wikipedia.org/wiki/मगध_महाजनपद
8585, मैरी १ (स्कॉटलैंड की रानी)  https://hi.wikipedia.org/wiki/मैरी_१_(स्कॉटलैंड_की_रानी)
82784, मैनहटन  https://hi.wikipedia.org/wiki/मैनहटन
86604, मैसाचुसेट्स प्रौद्योगिकी संस्थान  https://hi.wikipedia.org/wiki/मैसाचुसेट्स_प्रौद्योगिकी_संस्थान
6528, फ़्लोरिडा  https://hi.wikipedia.org/wiki/फ़्लोरिडा
76639, फेडरल ब्यूरो ऑफ इन्वेस्टिगेशन  https://hi.wikipedia.org/wiki/फेडरल_ब्यूरो_ऑफ_इन्वेस्टिगेशन
2445, भीमराव आम्बेडकर  https://hi.wikipedia.org/wiki/भीमराव_आम्बेडकर
211828, निरंजन नागराजन  https://hi.wikipedia.org/wiki/निरंजन_नाग