In [310]:
# Standard library
import json
import os
import re
from math import log

# Third-party
import nltk  # needed for nltk.download below; `from nltk.corpus import ...` does not bind `nltk`
import unidecode
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Make sure the stopword corpus is available locally (reports "already up-to-date" when it is).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jvsn1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [326]:
class QueryProcessor():
    """Scores documents against a free-text query using an inverted index.

    The inverted index is read from a JSON file and maps each term to a list
    of ``(freq, doc)`` pairs, where ``freq`` is the frequency of that term in
    document ``doc``.
    """

    def __init__(self, invIndexPath, documentsPath):
        """Load the inverted index and count the documents in the database.

        Args:
            invIndexPath: path of the inverted-index JSON file.
            documentsPath: directory whose entries are counted to obtain the
                number of documents.
        """
        self.descr = None
        self.queryVector = None
        self.invIndex = None # inverted index file
        self.posting = None # posting file (document at a time)
        self.termIDF = None
        self.query = None # cleaned words of the last query given to setQuery
        self.setInvIndex(invIndexPath)
        self.wordsDocuments = list(self.invIndex.keys()) # all words of all documents
        self.numDocs = len(os.listdir(documentsPath)) # number of documents in database

    def cleaning(self, words, useStopwords = False):
        """Clean a list of words (or a whitespace-separated string).

        Punctuation and accentuation are removed from every word, words that
        end up empty are dropped and, if ``useStopwords`` is True, English
        stopwords are removed as well. The input list is not modified.

        Args:
            words: a list of words, or a string that is split on whitespace.
            useStopwords: when True, remove English stopwords.

        Returns:
            A list of the distinct cleaned words, in order of first occurrence.
        """
        if isinstance(words, str):
            # split() without arguments collapses repeated whitespace, so no
            # empty tokens are produced between words.
            words = words.split()
        toRemove = r'[.*,;\(\)\'\"\?\!%\$]'
        cleaned = []
        for word in words:
            token = unidecode.unidecode(re.sub(toRemove, '', word))
            if token:  # a word made only of punctuation becomes '' - skip it
                cleaned.append(token)
        if useStopwords:
            set_sw = set(stopwords.words('english'))
            # NLTK's English stopword list is lowercase, so compare on the
            # lowercased token to also catch e.g. "The".
            cleaned = [token for token in cleaned if token.lower() not in set_sw]
        # dict.fromkeys removes duplicates while keeping a deterministic order.
        return list(dict.fromkeys(cleaned))

    def setInvIndex(self, path):
        """Set ``self.invIndex`` from the JSON file at ``path``.

        The result is a dict where each key is a term and each value is a
        list of ``(freq, doc)`` pairs, ``freq`` being the frequency of the
        term in document ``doc``.
        """
        with open(path, encoding='utf-8') as file:
            # NOTE(review): the first two characters and the last character of
            # the file are discarded before parsing - presumably the file holds
            # the repr of a bytes object (b'...'); confirm against the writer.
            file_text = file.read()[2:-1]
        self.invIndex = json.loads(file_text)

    def runPosting(self, queryWords):
        """Run the term-at-a-time algorithm for ``queryWords``.

        Sets ``self.posting`` to a dict mapping each document to the sum of
        the frequencies of the query words it contains. Words that are not in
        the index are ignored.
        """
        localPosting = {}
        for word in queryWords:
            for freq, doc in self.invIndex.get(word) or ():
                localPosting[doc] = localPosting.get(doc, 0) + freq
        self.posting = localPosting

    def getPosting(self):
        """Return the scores computed by the last call to ``runPosting``."""
        return self.posting

    def buildDocumentVectors(self):
        """Not implemented yet."""
        pass

    def buildQueryVector(self):
        """Not implemented yet."""
        pass

    def runTF(self, term, document):
        """Return the frequency of ``term`` in ``document``.

        Returns -1 when the document is not listed for the term, including
        when the term is not in the index at all.
        """
        for freq, doc in self.invIndex.get(term, ()):
            if doc == document:
                return freq
        return -1

    def runIDF(self):
        """Compute ``log(numDocs / document frequency)`` for every indexed term.

        The result is stored in ``self.termIDF`` as a dict keyed by term. A
        term with an empty posting list gets 0.0 instead of raising
        ZeroDivisionError.

        Raises:
            ValueError: from ``math.log`` when ``self.numDocs`` is 0.
        """
        termIDF = {}
        for term, freqDocList in self.invIndex.items():
            numDocsByTerm = len(freqDocList)
            if numDocsByTerm == 0:
                termIDF[term] = 0.0
                continue
            termIDF[term] = log(self.numDocs/numDocsByTerm)
        self.termIDF = termIDF

    def setQuery(self, query):
        """Clean ``query`` (stopwords removed), score the documents and keep the words.

        The cleaned words are stored in ``self.query`` and the scores in
        ``self.posting``.
        """
        cleanQuery = self.cleaning(query, useStopwords=True)
        self.runPosting(cleanQuery)
        self.query = cleanQuery
        

In [327]:
# Resolve the inverted-index JSON file and the documents directory to absolute paths.
invIndexPath = os.path.abspath('../inverted_index/frequency.json')
documents = os.path.abspath('../inverted_index/db/')

In [328]:
# Build the processor: loads the inverted index and counts the entries of the documents directory.
qp = QueryProcessor(invIndexPath=invIndexPath, documentsPath=documents)

In [329]:
# Clean the query (stopword removal enabled) and score documents by summed term frequency.
qp.setQuery('dark demon souls, creed')

In [330]:
# Compute an IDF value for every term in the index; the result is stored on qp.termIDF.
qp.runIDF()

In [334]:
# Stored frequency of 'assassins' in document 1 (runTF returns -1 when the document is not listed for the term).
qp.runTF('assassins', 1)

3