In [218]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words
from typing import NamedTuple

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math

# %load_ext autotime
#nltk.download('stopwords')
#nltk.download('punkt')


## PreProcessing

In [219]:
def convert_lower_case(data):
    return np.char.lower(data)

In [220]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [221]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [222]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [223]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [224]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [225]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = stemming(data) #needed again as we need to stem the words
    data = remove_punctuation(data) #needed again as num2word is giving few hypens and commas fourty-one
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

## Build Index

In [226]:
def getDataset(title):
    dataset = []
    c = False
    folders = [x[0] for x in os.walk(str(os.getcwd())+'/'+title+'/')]
    folders[0] = folders[0][:len(folders[0])-1]
    for i in folders:
        file = open(i+"/index.html", 'r')
        text = file.read().strip()
        file.close()

        file_name = re.findall('><A HREF="(.*)">', text)
        file_title = re.findall('<BR><TD> (.*)\n', text)

        if c == False:
            file_name = file_name[2:]
            c = True
        
        #print(len(file_name), len(file_title))

        for j in range(len(file_name)):
            dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

    return dataset

In [245]:
def extractData(title):
    processed_text = []
    processed_title = []
    dataset = []
    dataset = getDataset(title)
    N = len(dataset)
    for i in dataset[:N]:
        file = open(i[0], 'r', encoding="utf8", errors='ignore') # i[0] contains file location
        text = file.read().strip()
        file.close()

        processed_text.append(word_tokenize(str(preprocess(text))))
        processed_title.append(word_tokenize(str(preprocess(i[1]))))
    return processed_text, processed_title

In [228]:
def getPostingList(processed_text, processed_title):
    PL = {}
    N = len(processed_text)
    for i in range(N):
        tokens = processed_text[i]
        for w in tokens:
            try:
                PL[w].add(i)
            except:
                PL[w] = {i}

        tokens = processed_title[i]
        for w in tokens:
            try:
                PL[w].add(i)
            except:
                PL[w] = {i}
    return PL

In [229]:
def getDocumentMetadata(processed_text, processed_title):
    DMText = {}
    DMTitle = {}
    N = len(processed_text)
    for i in range(N):
        tokens = processed_text[i]
        counter = Counter(tokens)
        for token in np.unique(tokens):
            try:
                DMText[i].add(token, counter[token])
            except:
                DMText[i] = {token, counter[token]}
    for i in range(N):
        tokens = processed_title[1]
        counter = Counter(tokens)
        for token in np.unique(tokens):
            try:
                DMTitle[i].add(token, counter[token])
            except:
                DMTitle[i] = {token, counter[token]}

    return DMText, DMTitle

In [230]:
def printIndex(PL, DMText, DMTitle):
    N = len(PL) # doc list for each word in dataset vocab
    for w in PL:
        print ("word=",w[0]," list=", w[1])
    for m in DMText:
        print ("doc_id=",m[0]," term= ",m[1][0]," freq= ",m[1][1])
    for m in  DMTitle:
        print ("doc_id=",m[0]," term= ", m[1][0]," freq= ",m[1][1])

In [231]:
def buildIndex(title):
    processed_text = []
    processed_title = []
    processed_text, processed_title = extractData(title)
    PL = {}
    PL = getPostingList(processed_text, processed_title)
    DMText = {}
    DMTitle = {}
    DMText, DMTitle = getDocumentMetadata(processed_text, processed_title)
    #printIndex(PL, DMText, DMTitle)
    #print(len(PL))
    return PL, DMText, DMTitle
    

In [232]:
def docFreq(PL, word):
    c = 0
    try:
        c = len(PL[word])
    except:
        pass
    return c

In [233]:
def termFreq(DM, term, doc_id):
    tf = 0
    try:
        tf = DM[doc_id][term]
    except:
        pass
    return tf

In [243]:
def tfidf(PL, DMTitle, DMText, term, doc_id):
    alpha = 0.3
    N = len(DMTitle)
    tf = termFreq(DMTitle, term, doc_id) + alpha * termFreq(DMText, term, doc_id)
    df = docFreq(PL, term)
    idf = np.log((N+1)/(df+1))
    return tf*idf

In [235]:
def plMerge(pl1, pl2):
    #Merge dictionaries and keep values of common keys in list
    pl = {**pl1, **pl2}
    for key, value in pl2.items():
        if key in pl1 and key in pl2:
            pl[key] = [value , pl1[key]]
    return pl

In [236]:
def mergeIndices(pl1, pl2, dmtext1, dmtext2, dmtitle1, dmtitle2): #pl - posting list, dm - document metadata
    dmtext = {}
    dmtitle = {}
    pl = {}
    #dmtext = dm_merge(dmtext1, dmtext2)
    dmtext = {**dmtext1, **dmtext2}
    #dmtitle = dm_merge(dmtext1, dmtext2)
    dmtitle = {**dmtit1e1, **dmtitle2}
    pl = plMerge(pl1, pl2)
    return pl, dmtext, dmtitle
    

In [237]:
def queryfunc(title, k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("My Query Function")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    PL, DMText, DMTitle = buildIndex(title)
    query_weights = {}
    for w in tokens:
        for d in PL[w]:
            try:
                query_weights[d] += tfidf(PL, DMText, DMTitle, w, d)
            except:
                query_weights[d] = tfidf(PL, DMText, DMTitle, w, d)
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)

    print("")
    
    l = []
    
    for i in query_weights[:k]:
        l.append(i[0])
    
    print(l)
    return l

In [244]:
queryfunc("stories", 3, "Polymetals are alloys of metals that have polymer composites")

My Query Function

Query: Polymetals are alloys of metals that have polymer composites

['polymet', 'alloy', 'metal', 'polym', 'composit']

[118, 119, 356]


[118, 119, 356]