In [60]:
from nltk.corpus import stopwords 
import re
import unidecode
import os
import json
from math import log
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngram
#nltk.download('stopwords')

In [95]:
class QueryProcessor():
    def __init__(self, invIndexPath, documentsPath):
        self.descr = None
        self.queryTerms = None
        self.invIndex = None # inverted index file
        self.posting = None # posting file (document at a time)
        self.termIDF = None
        self.lengthDoc = None
        self.documentsPath = documentsPath
        self.setInvIndex(invIndexPath)
        self.wordsDocuments = list(self.invIndex.keys()) # all words of all documents
        self.numDocs = len(os.listdir(documentsPath)) # number of documents in database
        self.setLengthDocs() # set length of documents
        self.runIDF()
        
    '''
        * Method where receive an list of words and cleans it, removing
        * stopwords (if useStopwords == True), accentuation and punctuation
    '''
    def cleaning(self, words, field, useStopwords = False):
        if type(words) == str:
            substr = re.findall(r'"(.*?)"', words)
            words = re.sub(r'"(.*?)"', '', words)
            words = words.strip().split(' ')
            words = words + substr
        toRemove = r'[.*,;\(\)\'\"\?\!%\$]'
        for i in range(len(words)):
            words[i] = re.sub(toRemove, '', words[i])
            words[i] = unidecode.unidecode(words[i])
        if useStopwords:
            set_sw = set(stopwords.words('english'))
            newWords = []
            for word in words:
                if not word in set_sw:
                    if(field):
                        newWords.append(word + '.' + field)
                    else:
                        newWords.append(word)
            return list(set(newWords))
        if(field):
            for i in range(len(words)):
                words[i] = words[i] + '.' + field
        return list(set(words))
            
    '''
        * Set the self.invIndex to a dict where key is a term and values is a tuple
        * (freq, doc), where freq is the frequency of that key term in the document
        * doc
    '''
    def setInvIndex(self, path):
        with open(path, encoding='unicode_escape') as file:
            file_text = file.read()[2:-1]
            json_dict = json.loads(file_text)
        self.invIndex = json_dict
    
    '''
        *Runs the term at a time algorithm and set self.posting with a dictionary 
        *with scores of all documents by frequency of terms
    '''
    def readPosting(self, queryWords):
        localReadPosting = {}
        for term in queryWords:
            freqDocList = self.invIndex.get(term, None)
            if freqDocList:
                for freq, doc in freqDocList:
                    localReadPosting[doc] = localReadPosting.get(doc, 0) + freq
        self.documentTermFrequency = localReadPosting
        
    def getPosting(self):
        return self.posting
            
    '''
        * TF algorithm returns the term frequency in a document. An different approach is divide
        * this frequency by the number of words in a document
    '''
    def runTF(self, term, document):
        for freq, doc in self.invIndex[term]:
            if doc == document:
                return freq
        return 0
    
    
    '''
        * IDF algorithm computes the importance of a term calculating the log of the division between the number
        * of documents by the number of documents that contains a term. 
    '''
    def runIDF(self):
        termIDF = {}
        for term in self.invIndex.keys():
            numDocsByTerm = len(self.invIndex[term])
            idf = log(self.numDocs/numDocsByTerm)
            termIDF[term] = idf
        self.termIDF = termIDF
                
            
    def setQuery(self, queryString, field):
        cleanQuery = self.cleaning(queryString, field, useStopwords=True)
        self.readPosting(cleanQuery)
        self.queryTerms = cleanQuery
        
    def performQuery(self, useTfIdf):
        score = {i: 0 for i in range(1,self.numDocs+1)}
        for term in self.queryTerms:
            invIndexValues = self.invIndex.get(term, False)
            if not invIndexValues:
                continue
            for freq, doc in self.invIndex[term]:
                auxScore = freq
                if useTfIdf:
                    auxScore = auxScore*self.termIDF.get(term, 0)
                score[doc] += auxScore
        for doc in score:
            score[doc] = score[doc]/self.lengthDoc[doc]
        score = sorted(score.items(), key = lambda x : x[1], reverse=True)
        return score[:11]
    
    
    def query(self, queryString, useTfIdf = False, field = None):
        self.setQuery(queryString, field)
        return self.performQuery(useTfIdf)
    
            
    def setLengthDocs(self):
        lengthDoc = {}
        for fileName in os.listdir(self.documentsPath):
            filePath = self.documentsPath + '/' + fileName
            print(filePath)
            with open(filePath, 'r') as fp:
                lengthDoc[int(fileName)] = len(fp.read().split(' '))
        self.lengthDoc = lengthDoc

In [96]:
invIndexPath = os.path.abspath('../inverted_index/frequency.json')
twoTermsPath = os.path.abspath('../inverted_index/twoTerms.json')
documents = os.path.abspath('../inverted_index/db/')

In [97]:
qp = QueryProcessor(invIndexPath=invIndexPath, documentsPath=documents)

C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/10
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/100
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1000
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1001
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1002
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1003
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1004
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1005
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1006
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1007
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1008
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1009
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverte

C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1790
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1791
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1792
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1793
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1794
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1795
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1796
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1797
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1798
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1799
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/18
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/180
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/1800
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inve

C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/23
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/230
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2300
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2301
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2302
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2303
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2304
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2305
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2306
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2307
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2308
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2309
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/231
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inver

C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2828
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2829
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/283
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2830
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2831
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2832
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2833
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2834
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2835
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2836
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2837
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2838
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/2839
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\in

C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/800
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/801
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/802
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/803
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/804
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/805
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/806
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/807
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/808
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/809
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/81
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/810
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\db/811
C:\Users\jvsn1\OneDrive\Documents\Github Repos\ri\inverted_index\

In [103]:
rankingTFIDF = dict(qp.query('dark souls', useTfIdf=True))
rankingTFIDF

{419: 0.1143449893708287,
 637: 0.1559249855056755,
 771: 0.1787407647407879,
 772: 0.1696522512793919,
 773: 0.17257729009455383,
 774: 0.25665340578164414,
 849: 0.18204267265167023,
 2005: 0.08343622496534886,
 2014: 0.14042367115715804,
 2048: 0.12127498872663649,
 2193: 0.09307150297625591}

In [104]:
ranking = dict(qp.query('dark souls', useTfIdf=False))
ranking

{419: 0.02857142857142857,
 637: 0.03896103896103896,
 771: 0.03571428571428571,
 772: 0.03389830508474576,
 773: 0.034482758620689655,
 774: 0.05128205128205128,
 849: 0.030303030303030304,
 1807: 0.02,
 2014: 0.03508771929824561,
 2048: 0.030303030303030304,
 2193: 0.023255813953488372}

In [105]:
numDocs = len(list(ranking.keys())+list(rankingTFIDF.keys()))

In [106]:
def getSumSquareDist(r1, r2):
    result = 0
    docs = list(ranking.keys())+list(rankingTFIDF.keys())
    for doc in docs:
        squareDistance = (r1.get(doc, 0)-r2.get(doc,0))**2
        result += squareDistance
    return result

In [107]:
sumSquareDistance = getSumSquareDist(rankingTFIDF, ranking)

In [108]:
def spearmanCorrelation(sumSquareDist, k):
    num = 6*sumSquareDist
    den = k*(k**2-1)
    return 1-(num/den)

In [109]:
spearmanCorrelation(sumSquareDistance, k = numDocs)

0.9998056212532003

In [110]:
qp.query('"dark souls" dark souls', useTfIdf=True)

[(774, 0.42503863617755544),
 (771, 0.2960090501950833),
 (773, 0.2858018415676666),
 (772, 0.2809577425580451),
 (849, 0.18204267265167023),
 (637, 0.1559249855056755),
 (2014, 0.14042367115715804),
 (2048, 0.12127498872663649),
 (419, 0.1143449893708287),
 (2193, 0.09307150297625591),
 (2005, 0.08343622496534886)]

In [99]:
qp.queryTerms

['', 'dark souls']