In [114]:
import pickle
import numpy as np
from collections import Counter

Load Hindi-Stopwords

In [85]:
f = open("final_stopwords.txt", encoding = "UTF-8")
stopWords = f.read().split("\n")

Load All Data

In [86]:
paras_h = open("paras_hindi.txt", encoding = "UTF-8").read().split("\n\n")
paras_g = open("paras_gujrati.txt", encoding = "UTF-8").read().split("\n\n")
paras_e = open("paras_english.txt", encoding = "UTF-8").read().split("\n\n")
slokas = open("slokas.txt", encoding = "UTF-8").read().split("\n\n")
with open('synonyms.pkl', 'rb') as f:
    syns = pickle.load(f)

Punctuations

In [87]:
punctuations=["।",";",",",":","!",'"',"?",":-","-","{","(","}",")","_","०","S","―","=","[","]","......",":-",".","॥",'”',"|","“","'"]

## Stemming Hindi Words

In [88]:
suffixes = {
    1: ["ो", "े", "ू", "ु", "ी", "ि", "ा"],
    2: ["कर", "ाओ", "िए", "ाई", "ाए", "ने", "नी", "ना", "ते", "ीं", "ती", "ता", "ाँ", "ां", "ों", "ें"],
    3: ["ाकर", "ाइए", "ाईं", "ाया", "ेगी", "ेगा", "ोगी", "ोगे", "ाने", "ाना", "ाते", "ाती", "ाता", "तीं", "ाओं", "ाएं", "ुओं", "ुएं", "ुआं"],
    4: ["ाएगी", "ाएगा", "ाओगी", "ाओगे", "एंगी", "ेंगी", "एंगे", "ेंगे", "ूंगी", "ूंगा", "ातीं", "नाओं", "नाएं", "ताओं", "ताएं", "ियाँ", "ियों", "ियां"],
    5: ["ाएंगी", "ाएंगे", "ाऊंगी", "ाऊंगा", "ाइयाँ", "ाइयों", "ाइयां"],
}

def hi_stem(word):
    for L in 5, 4, 3, 2, 1:
        if len(word) > L + 1:
            for suf in suffixes[L]:
                if word.endswith(suf):
                    return word[:-L]
    return word

## Tokenize

In [89]:
def token_stem(string, stopWords):
    string = "".join([w if w not in punctuations else " " for w in string])  #To remove punctuations
    tokens = string.split()
    tokens = [hi_stem(word) for word in tokens if word not in stopWords]
    tokens = [w for w in tokens if w not in stopWords]
    return tokens

## Creating Posting list

In [90]:
w_col = {}
for idx, para in enumerate(paras_h):
    words = token_stem(para, stopWords)
    
    for word in words:
        if word in w_col.keys():
            if idx in w_col[word].keys():
                w_col[word][idx] += 1
            else:
                w_col[word][idx] = 1
        else:
            temp = {idx : 1}
            w_col[word] = temp.copy()

In [91]:
with open('data.pkl', 'wb') as file:
    pickle.dump(w_col, file, protocol=pickle.HIGHEST_PROTOCOL)

## BM25

In [92]:
def BM25(query, w_coll, l = 5, b = 0.75, k = 2):
    q_tokens = token_stem(query, stopWords)
    lengths = {}
    N = len(paras_h)
    avg_len = 0
    for idx, para in enumerate(paras_h):
        lengths[idx] = len(para)             #cal no of words of each file
        avg_len += lengths[idx]
    avg_len /= N
    #Calculate idf of each token
    idf = {}
    for word in np.unique(q_tokens):
        if word in w_coll.keys():
            df = len(w_coll[word].keys())
        else:
            df = 0
        idf[word] = np.log((N - df + 0.5) / (df + 0.5))
    score = {}
    for idx, para in enumerate(paras_h):
        s = 0
        for word in np.unique(q_tokens):
            tf = 0
            if word in w_coll.keys() and idx in w_coll[word].keys():
                tf = w_coll[word][idx]
            s += idf[word] * (tf * (k + 1)) / (k*(1 - b + b*lengths[idx]/avg_len) + tf)
        score[idx] = s
    return sorted(score, key = score.get, reverse=True)[:l]

## Translator

In [93]:
from googletrans import Translator

def get_translation(data, dest):
    translator = Translator()
    text = translator.translate(data, dest).text
    return text

## Final Search Function

In [221]:
def search(query, w_col, lang, l = 5):
    query = get_translation(query, "hi")
    tokens = token_stem(query, stopWords)
    for idx, token in enumerate(tokens):
#         if token not in w_col:
        if token in syns.keys():
            tokens[idx] += " " + " ".join([w for w in syns[token]])
    query = " ".join(tokens)
#     print(query)
    indxs = BM25(query, w_col, l)
    print(indxs)
    if lang == 'g':
        return [f'{slokas[idx]}\n\n{paras_g[idx]}' for idx in indxs], indxs
    elif lang == 'e':
        return [f'{slokas[idx]}\n\n{paras_e[idx]}' for idx in indxs], indxs
    else:
        return [f'{slokas[idx]}\n\n{paras_h[idx]}' for idx in indxs], indxs

In [250]:
query = "studying religious scriptures like Vedas"
search(query, w_col, "e")

[1, 282, 238, 98, 280]


(['अधीत्येदं यथाशास्त्रं नरो जानाति सत्तम:।\nधर्मोपदेशविख्यातं कार्याकार्य शुभाशु भम्\u200c।।\n\nThrough the study of this manuscript, after thought and\nreflection, even an ordinary person will gain knowledge to\ndistinguish between capability-incompetence and\nrightwrong. Through this manuscript, I wish to create\nconsciousness in human beings towards good deeds versus\nsins, morality versus immorality and duty versus\nirresponsibility. By observing this moral behaviour, human\nbeings should enlighten their lives. Then, the purpose of this\nmanuscript will be fulfilled.',
  '|| पठन्ति चतुरो वेदान्\u200c धर्मशास्त्राण्यनेकश:।\nआत्मानं नैव जानन्ति दर्वी पाकरसं यथा।।\n\nPeople, who even after studying religious scriptures\nlike Vedas, etc. remain ignorant of its essential elements,\nwho do not have knowledge of Soul-God, remain devoid of\nknowledge of their own spiritual self. Chanakya has\ncompared such persons with a ladle, which stirs the juicy\ncurry but which remains unaware of its

## Tf-idf

In [118]:
lengths = {}
for idx, para in enumerate(paras_h):
    lengths[idx] = len(para)
idf = {}
N = len(slokas)
for word in w_col.keys():
    df = len(w_col[word].keys())
    idf[word] = np.log((N) / (df))
#Calculate tf-idf score vector of each file
norm_list = {}
for idx, para in enumerate(paras_h):
    for word in np.unique(para.split()):
        tf = 0
        try:
            if idx in w_col[word].keys():
                tf = w_col[word][idx] / lengths[idx]
        except:
            pass
        if idx not in norm_list.keys():
            norm_list[idx] = []
        try:
            norm_list[idx].append(tf * idf[word])
        except:
            pass
for idx in range(N-1):
    norm_list[idx] = np.linalg.norm(norm_list[idx])

In [266]:
# this function calculates cosine similarities between document vectors and query vector
def find_cosine_sim(n_docs, query_vector, doc_vectors):
    cosine_sims = []
    for i in range(n_docs-1):
        dot = np.dot(query_vector, doc_vectors[i])
        query_norm = np.linalg.norm(query_vector)
        doc_norm = norm_list[i]
        cosine_sims.append(dot/((query_norm + 0.5)*(doc_norm + 0.5)))
    return cosine_sims

In [267]:
# cleaned query must be sent to this model
def tfidf(query, main_dict):
    n_slokas = len(slokas)
    # finding document vectors
    tfidf_dict = {} # dictionary which stores tfidf of each document
    doc_vectors = [] 
    for i in range(n_slokas):
        n_words = len(paras_h[i].split())
        vector = []
        for word in np.unique(query):
            if word in main_dict.keys() and i in main_dict[word].keys():
#                 tf = main_dict[word][i]/n_words
                tf = main_dict[word][i]
                idf = np.log((n_slokas+1)/(len(main_dict[word].keys())+1))
                tf_idf = tf*idf
                vector.append(tf_idf)
            else:
                vector.append(0)
        doc_vectors.append(vector)
    word_freq_query_dict = Counter(query)
    # finding query vector
    query_vector = []
    # remember np.unique return result in alphabetical order
    for word in np.unique(query):
        if word in main_dict.keys():
#             tf = word_freq_query_dict[word]/len(query)
            tf = word_freq_query_dict[word]
            idf = np.log((n_slokas+1)/(len(main_dict[word].keys())+1))
            tf_idf = tf*idf
            query_vector.append(tf_idf)
        else:
            query_vector.append(0)
#     print(query_vector)
    scores = find_cosine_sim(n_slokas, query_vector, doc_vectors)
    return sorted(np.argsort(np.array(scores))[-5:], reverse=False)

In [274]:
def search_tfidf(query, w_col, lang, l = 5):
    query = get_translation(query, "hi")
    tokens = token_stem(query, stopWords)
    for idx, token in enumerate(tokens):
#         if token not in w_col:
        if token in syns.keys():
            tokens[idx] += " " + " ".join([w for w in syns[token]])
    query = " ".join(tokens)
    tokens = token_stem(query, stopWords)
    indxs = tfidf(tokens, w_col)
    print(indxs)
    if lang == 'g':
        return [f'{slokas[idx]}\n\n{paras_g[idx]}' for idx in indxs], indxs
    elif lang == 'e':
        return [f'{slokas[idx]}\n\n{paras_e[idx]}' for idx in indxs], indxs
    else:
        return [f'{slokas[idx]}\n\n{paras_h[idx]}' for idx in indxs], indxs

In [275]:
query = "काल: सुप्तेषु जागर्ति कालो हि दुरतिक्रम:"
search_tfidf(query, w_col, "e")

[43, 50, 103, 171, 186]


(["कोकिलानां स्वरो रूपं स्त्रीणां रूपं पतिव्रतम्\u200c।\nविद्या रूपं कुरूपाणां क्षमा रूपं तपस्विनाम्\u200c।।\n\nOn the importance of attributes, Chanakya says that like\nthe black colour of the koel bird (cuckoo) becomes\ninsignificant on account of its melodious voice; in other\nwords, its melodious voice depicts its character. Similarly,\nloyalty towards her husband depicts a woman's image and\nbeauty. An educated and intelligent ugly woman is much\nmore beautiful than a gorgeous but a characterless woman.\nThe attributes give her respect and status. The greatness of\nthe hermit lies in his ability to forgive. Not losing their\nbalance and patience, under any circumstances is the proof\nof their true devotion. These attributes enable assessment\nof the depth of their devotion.",
  'एकेना<पि सुपुत्रेण विद्यायुक्तेन साधुना।\nआहलादितं कुलं सर्व यथा चन्द्रेण शर्वरी।।\n\nAs a single moon can dispel darkness-a job which\ncannot be done, even by hundreds of stars, similarly, a\ntalented sch