In [506]:
from nltk.corpus import stopwords 
import re
import unidecode
import os
import json
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
# `from nltk.corpus import stopwords` does not bind the name `nltk`, so the
# package itself must be imported before its downloader can be called.
import nltk
# Fetch the stopword corpus used by QueryProcessor.cleaning (no-op when cached).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jvsn1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [668]:
class QueryProcessor():
    '''
        * Ranks documents against a free-text query using an inverted index
        * loaded from a JSON file. Scores are term frequencies (optionally
        * weighted by IDF) normalized by the length of each document.
        *
        * invIndexPath: path of the inverted index file
        * documentsPath: directory with one file per document; file names are
        * parsed with int() and used as document ids
    '''
    def __init__(self, invIndexPath, documentsPath):
        self.descr = None
        self.queryTerms = None # cleaned terms of the last query
        self.invIndex = None # inverted index file
        self.posting = None # posting file (document at a time)
        self.documentTermFrequency = None # same dict as self.posting (legacy name)
        self.termIDF = None
        self.lengthDoc = None
        self.documentsPath = documentsPath
        self.setInvIndex(invIndexPath)
        self.wordsDocuments = list(self.invIndex.keys()) # all words of all documents
        self.numDocs = len(os.listdir(documentsPath)) # number of documents in database
        self.setLengthDocs() # set length of documents
        self.runIDF()

    def cleaning(self, words, useStopwords = False):
        '''
            * Receives a list of words (or a string, which is split on single
            * spaces) and cleans it, removing punctuation and accentuation,
            * empty tokens and, if useStopwords == True, english stopwords.
            * Returns the distinct cleaned words as a list (order not defined).
            * The list passed by the caller is not modified.
        '''
        if isinstance(words, str):
            words = words.strip().split(' ')
        toRemove = r'[.*,;\(\)\'\"\?\!%\$]'
        # Build a new list instead of overwriting the caller's list in place
        cleaned = [unidecode.unidecode(re.sub(toRemove, '', word)) for word in words]
        # Double spaces and punctuation-only words leave empty tokens behind
        cleaned = [word for word in cleaned if word]
        if useStopwords:
            set_sw = set(stopwords.words('english'))
            cleaned = [word for word in cleaned if word not in set_sw]
        return list(set(cleaned))

    def setInvIndex(self, path):
        '''
            * Set the self.invIndex to a dict where key is a term and value is a
            * list of pairs (freq, doc), where freq is the frequency of that key
            * term in the document doc
        '''
        with open(path, encoding='utf-8') as file:
            # The first two characters and the last one are discarded before
            # parsing -- presumably the file holds the repr of a bytes literal
            # (b'...'); TODO confirm against the code that writes the index
            file_text = file.read()[2:-1]
            json_dict = json.loads(file_text)
        self.invIndex = json_dict

    def readPosting(self, queryWords):
        '''
            * Runs the term at a time algorithm and set self.posting with a
            * dictionary with scores of all documents by frequency of terms.
            * Terms that are not in the inverted index are ignored.
        '''
        localReadPosting = {}
        for term in queryWords:
            for freq, doc in self.invIndex.get(term) or []:
                localReadPosting[doc] = localReadPosting.get(doc, 0) + freq
        # getPosting() reads self.posting; documentTermFrequency is kept for
        # code that still reads the old attribute name
        self.posting = localReadPosting
        self.documentTermFrequency = localReadPosting

    def getPosting(self):
        '''
            * Returns the dictionary built by the last readPosting call
            * (None if no query was set yet)
        '''
        return self.posting

    def runTF(self, term, document):
        '''
            * TF algorithm returns the term frequency in a document, or 0 when
            * the term is unknown or does not occur in the document. A different
            * approach is to divide this frequency by the number of words in a
            * document
        '''
        for freq, doc in self.invIndex.get(term) or []:
            if doc == document:
                return freq
        return 0

    def runIDF(self):
        '''
            * IDF algorithm computes the importance of a term calculating the
            * log of the division between the number of documents by the number
            * of documents that contains a term. Result is stored in self.termIDF
        '''
        termIDF = {}
        for term, postings in self.invIndex.items():
            numDocsByTerm = len(postings)
            # An empty posting list would divide by zero; such a term matches
            # no document, so its weight is irrelevant and is set to 0
            termIDF[term] = log(self.numDocs/numDocsByTerm) if numDocsByTerm else 0.0
        self.termIDF = termIDF

    def setQuery(self, queryString):
        '''
            * Cleans the query (stopwords removed), stores its terms in
            * self.queryTerms and refreshes the posting scores
        '''
        cleanQuery = self.cleaning(queryString, useStopwords=True)
        self.readPosting(cleanQuery)
        self.queryTerms = cleanQuery

    def performQuery(self, useTfIdf, K = 10):
        '''
            * Scores every document that contains at least one query term and
            * returns the K best as a list of (doc, score), best first.
            * The score is the sum of the term frequencies (each multiplied by
            * the term IDF if useTfIdf) divided by the document length.
            * Query terms that are not in the inverted index are ignored.
        '''
        score = {}
        for term in self.queryTerms:
            weight = self.termIDF.get(term, 0) if useTfIdf else 1
            for freq, doc in self.invIndex.get(term) or []:
                score[doc] = score.get(doc, 0) + freq*weight
        for doc in score:
            score[doc] = score[doc]/self.lengthDoc[doc]
        # sorted() is stable, so tied documents keep their insertion order
        topK = sorted(score.items(), key=lambda x:x[1], reverse=True)[:K]
        return topK

    def query(self, queryString, useTfIdf = False, K = 10):
        '''
            * Sets the query and returns the K best documents for it
            * (see performQuery)
        '''
        self.setQuery(queryString)
        return self.performQuery(useTfIdf, K)

    def setLengthDocs(self):
        '''
            * Set self.lengthDoc to a dict where key is the document id (file
            * name converted to int) and value is the number of space separated
            * tokens of that file
        '''
        lengthDoc = {}
        for fileName in os.listdir(self.documentsPath):
            # os.path.join keeps the path valid outside Windows
            filePath = os.path.join(self.documentsPath, fileName)
            # Only spaces are counted, so undecodable bytes are replaced
            # instead of aborting the whole scan
            with open(filePath, 'r', encoding='utf-8', errors='replace') as fp:
                lengthDoc[int(fileName)] = len(fp.read().split(' '))
        self.lengthDoc = lengthDoc

In [669]:
# Both locations are relative paths, resolved against the notebook's current
# working directory by os.path.abspath.
# JSON file with the inverted index, read by QueryProcessor.setInvIndex.
invIndexPath = os.path.abspath('../inverted_index/frequency.json')
# Directory that QueryProcessor lists to count the documents and measure them.
documents = os.path.abspath('../inverted_index/db/')

In [670]:
# Building the processor loads the inverted index, reads every file of the
# documents directory to get its length and computes the IDF of every term.
qp = QueryProcessor(invIndexPath=invIndexPath, documentsPath=documents)

In [689]:
# Top 10 (document id, score) pairs, with term frequencies weighted by IDF
# and divided by the document length.
qp.query('challenging game', useTfIdf=True)

[(2702, 0.13633943695539996),
 (2097, 0.12517536493104287),
 (219, 0.12099644289164554),
 (1631, 0.09307418683972735),
 (328, 0.08490978448536529),
 (1050, 0.08344582268389349),
 (1674, 0.08344582268389349),
 (2549, 0.07723177205849716),
 (1152, 0.07562277680727847),
 (2406, 0.07445934947178187)]

In [690]:
# Same query scored with raw term frequencies divided by the document length
# (no IDF weighting), for comparison with the TF-IDF ranking.
qp.query('challenging game', useTfIdf=False)

[(2122, 0.04878048780487805),
 (1541, 0.041666666666666664),
 (1542, 0.038461538461538464),
 (1548, 0.038461538461538464),
 (1549, 0.038461538461538464),
 (1556, 0.038461538461538464),
 (1840, 0.037037037037037035),
 (1841, 0.037037037037037035),
 (2096, 0.03571428571428571),
 (2377, 0.030303030303030304)]