In [16]:
from nltk.corpus import stopwords 
import re
import unidecode
import os
import json
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
#nltk.download('stopwords')

In [101]:
class QueryProcessor():
    def __init__(self, invIndexPath, documentsPath):
        self.descr = None
        self.queryTerms = None
        self.invIndex = None # inverted index file
        self.posting = None # posting file (document at a time)
        self.termIDF = None
        self.lengthDoc = None
        self.documentsPath = documentsPath
        self.setInvIndex(invIndexPath)
        self.wordsDocuments = list(self.invIndex.keys()) # all words of all documents
        self.numDocs = len(os.listdir(documentsPath)) # number of documents in database
        self.setLengthDocs() # set length of documents
        self.runIDF()
        
    '''
        * Method where receive an list of words and cleans it, removing
        * stopwords (if useStopwords == True), accentuation and punctuation
    '''
    def cleaning(self, words, field, useStopwords = False):
        if type(words) == str:
            words = words.strip().split(' ')
        toRemove = r'[.*,;\(\)\'\"\?\!%\$]'
        for i in range(len(words)):
            words[i] = re.sub(toRemove, '', words[i])
            words[i] = unidecode.unidecode(words[i])
        if useStopwords:
            set_sw = set(stopwords.words('english'))
            newWords = []
            for word in words:
                if not word in set_sw:
                    if(field):
                        newWords.append(word + '.' + field)
                    else:
                        newWords.append(word)
            return list(set(newWords))
        if(field):
            for i in range(len(words)):
                words[i] = words[i] + '.' + field
        return list(set(words))
            
    '''
        * Set the self.invIndex to a dict where key is a term and values is a tuple
        * (freq, doc), where freq is the frequency of that key term in the document
        * doc
    '''
    def setInvIndex(self, path):
        with open(path, encoding='unicode_escape') as file:
            file_text = file.read()[2:-1]
            json_dict = json.loads(file_text)
        self.invIndex = json_dict
    
    '''
        *Runs the term at a time algorithm and set self.posting with a dictionary 
        *with scores of all documents by frequency of terms
    '''
    def readPosting(self, queryWords):
        localReadPosting = {}
        for term in queryWords:
            freqDocList = self.invIndex.get(term, None)
            if freqDocList:
                for freq, doc in freqDocList:
                    localReadPosting[doc] = localReadPosting.get(doc, 0) + freq
        self.documentTermFrequency = localReadPosting
        
    def getPosting(self):
        return self.posting
            
    '''
        * TF algorithm returns the term frequency in a document. An different approach is divide
        * this frequency by the number of words in a document
    '''
    def runTF(self, term, document):
        for freq, doc in self.invIndex[term]:
            if doc == document:
                return freq
        return 0
    
    
    '''
        * IDF algorithm computes the importance of a term calculating the log of the division between the number
        * of documents by the number of documents that contains a term. 
    '''
    def runIDF(self):
        termIDF = {}
        for term in self.invIndex.keys():
            numDocsByTerm = len(self.invIndex[term])
            idf = log(self.numDocs/numDocsByTerm)
            termIDF[term] = idf
        self.termIDF = termIDF
                
            
    def setQuery(self, queryString, field):
        cleanQuery = self.cleaning(queryString, field, useStopwords=True)
        self.readPosting(cleanQuery)
        self.queryTerms = cleanQuery
        
    def performQuery(self, useTfIdf):
        score = {i: 0 for i in range(1,self.numDocs+1)}
        for term in self.queryTerms:
            for freq, doc in self.invIndex[term]:
                auxScore = freq
                if useTfIdf:
                    auxScore = auxScore*self.termIDF.get(term, 0)
                score[doc] += auxScore
        for doc in score:
            score[doc] = score[doc]/self.lengthDoc[doc]
        score = sorted(score.items(), key = lambda x : x[1], reverse=True)
        return score[:11]
    
    
    def query(self, queryString, useTfIdf = False, field = None):
        self.setQuery(queryString, field)
        return self.performQuery(useTfIdf)
    
            
    def setLengthDocs(self):
        lengthDoc = {}
        for fileName in os.listdir(self.documentsPath):
            filePath = self.documentsPath + '/' + fileName
            print(filePath)
            with open(filePath, 'r') as fp:
                lengthDoc[int(fileName)] = len(fp.read().split(' '))
        self.lengthDoc = lengthDoc

In [102]:
invIndexPath = os.path.abspath('../inverted_index/frequency.json')
twoTermsPath = os.path.abspath('../inverted_index/twoTerms.json')
documents = os.path.abspath('../inverted_index/db/')

In [105]:
qp = QueryProcessor(invIndexPath=invIndexPath, documentsPath=documents)

/home/rodrigo/Documents/ri/inverted_index/db/608
/home/rodrigo/Documents/ri/inverted_index/db/907
/home/rodrigo/Documents/ri/inverted_index/db/2044
/home/rodrigo/Documents/ri/inverted_index/db/2485
/home/rodrigo/Documents/ri/inverted_index/db/2039
/home/rodrigo/Documents/ri/inverted_index/db/1815
/home/rodrigo/Documents/ri/inverted_index/db/673
/home/rodrigo/Documents/ri/inverted_index/db/100
/home/rodrigo/Documents/ri/inverted_index/db/1346
/home/rodrigo/Documents/ri/inverted_index/db/936
/home/rodrigo/Documents/ri/inverted_index/db/325
/home/rodrigo/Documents/ri/inverted_index/db/1622
/home/rodrigo/Documents/ri/inverted_index/db/1966
/home/rodrigo/Documents/ri/inverted_index/db/2372
/home/rodrigo/Documents/ri/inverted_index/db/2793
/home/rodrigo/Documents/ri/inverted_index/db/667
/home/rodrigo/Documents/ri/inverted_index/db/343
/home/rodrigo/Documents/ri/inverted_index/db/2506
/home/rodrigo/Documents/ri/inverted_index/db/2574
/home/rodrigo/Documents/ri/inverted_index/db/1249
/home/ro

/home/rodrigo/Documents/ri/inverted_index/db/1288
/home/rodrigo/Documents/ri/inverted_index/db/606
/home/rodrigo/Documents/ri/inverted_index/db/746
/home/rodrigo/Documents/ri/inverted_index/db/2125
/home/rodrigo/Documents/ri/inverted_index/db/2100
/home/rodrigo/Documents/ri/inverted_index/db/2548
/home/rodrigo/Documents/ri/inverted_index/db/878
/home/rodrigo/Documents/ri/inverted_index/db/2177
/home/rodrigo/Documents/ri/inverted_index/db/1292
/home/rodrigo/Documents/ri/inverted_index/db/2217
/home/rodrigo/Documents/ri/inverted_index/db/316
/home/rodrigo/Documents/ri/inverted_index/db/1818
/home/rodrigo/Documents/ri/inverted_index/db/1420
/home/rodrigo/Documents/ri/inverted_index/db/2547
/home/rodrigo/Documents/ri/inverted_index/db/150
/home/rodrigo/Documents/ri/inverted_index/db/961
/home/rodrigo/Documents/ri/inverted_index/db/359
/home/rodrigo/Documents/ri/inverted_index/db/1905
/home/rodrigo/Documents/ri/inverted_index/db/1215
/home/rodrigo/Documents/ri/inverted_index/db/1094
/home/r

/home/rodrigo/Documents/ri/inverted_index/db/286
/home/rodrigo/Documents/ri/inverted_index/db/666
/home/rodrigo/Documents/ri/inverted_index/db/2242
/home/rodrigo/Documents/ri/inverted_index/db/1348
/home/rodrigo/Documents/ri/inverted_index/db/876
/home/rodrigo/Documents/ri/inverted_index/db/530
/home/rodrigo/Documents/ri/inverted_index/db/2728
/home/rodrigo/Documents/ri/inverted_index/db/76
/home/rodrigo/Documents/ri/inverted_index/db/2320
/home/rodrigo/Documents/ri/inverted_index/db/1139
/home/rodrigo/Documents/ri/inverted_index/db/1517
/home/rodrigo/Documents/ri/inverted_index/db/1415
/home/rodrigo/Documents/ri/inverted_index/db/714
/home/rodrigo/Documents/ri/inverted_index/db/15
/home/rodrigo/Documents/ri/inverted_index/db/1863
/home/rodrigo/Documents/ri/inverted_index/db/873
/home/rodrigo/Documents/ri/inverted_index/db/1034
/home/rodrigo/Documents/ri/inverted_index/db/1858
/home/rodrigo/Documents/ri/inverted_index/db/1740
/home/rodrigo/Documents/ri/inverted_index/db/472
/home/rodri

/home/rodrigo/Documents/ri/inverted_index/db/1469
/home/rodrigo/Documents/ri/inverted_index/db/731
/home/rodrigo/Documents/ri/inverted_index/db/1970
/home/rodrigo/Documents/ri/inverted_index/db/324
/home/rodrigo/Documents/ri/inverted_index/db/2643
/home/rodrigo/Documents/ri/inverted_index/db/794
/home/rodrigo/Documents/ri/inverted_index/db/1370
/home/rodrigo/Documents/ri/inverted_index/db/603
/home/rodrigo/Documents/ri/inverted_index/db/640
/home/rodrigo/Documents/ri/inverted_index/db/1562
/home/rodrigo/Documents/ri/inverted_index/db/1650
/home/rodrigo/Documents/ri/inverted_index/db/1875
/home/rodrigo/Documents/ri/inverted_index/db/567
/home/rodrigo/Documents/ri/inverted_index/db/352
/home/rodrigo/Documents/ri/inverted_index/db/2255
/home/rodrigo/Documents/ri/inverted_index/db/2503
/home/rodrigo/Documents/ri/inverted_index/db/2811
/home/rodrigo/Documents/ri/inverted_index/db/57
/home/rodrigo/Documents/ri/inverted_index/db/980
/home/rodrigo/Documents/ri/inverted_index/db/2024
/home/rodr

In [107]:
rankingTFIDF = dict(qp.query('dark souls', useTfIdf=True))
rankingTFIDF

{419: 0.1143449893708287,
 637: 0.1559249855056755,
 771: 0.1787407647407879,
 772: 0.1696522512793919,
 773: 0.17257729009455383,
 774: 0.25665340578164414,
 849: 0.18204267265167023,
 2005: 0.08343622496534886,
 2014: 0.14042367115715804,
 2048: 0.12127498872663649,
 2193: 0.09307150297625591}

In [90]:
ranking = dict(qp.query('dark souls', useTfIdf=False))
ranking

{419: 0.02857142857142857,
 637: 0.03896103896103896,
 771: 0.03571428571428571,
 772: 0.03389830508474576,
 773: 0.034482758620689655,
 774: 0.05128205128205128,
 849: 0.030303030303030304,
 1807: 0.02,
 2014: 0.03508771929824561,
 2048: 0.030303030303030304,
 2193: 0.023255813953488372}

In [91]:
numDocs = len(list(ranking.keys())+list(rankingTFIDF.keys()))

In [66]:
def getSumSquareDist(r1, r2):
    result = 0
    docs = list(ranking.keys())+list(rankingTFIDF.keys())
    for doc in docs:
        squareDistance = (r1.get(doc, 0)-r2.get(doc,0))**2
        result += squareDistance
    return result

In [67]:
sumSquareDistance = getSumSquareDist(rankingTFIDF, ranking)

In [68]:
def spearmanCorrelation(sumSquareDist, k):
    num = 6*sumSquareDist
    den = k*(k**2-1)
    return 1-(num/den)

In [69]:
spearmanCorrelation(sumSquareDistance, k = numDocs)

0.9998056212532003