In [17]:
import pickle
import pandas as pd
from math import log
import re
import nltk
nltk.download(['punkt','stopwords','wordnet','words'])
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import metapy

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\julia\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [18]:
# Load the structures produced by the indexing step.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open .sav files that were generated by this project.
with open('estructuraDatos.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
idexFiles = loaded_model['idexFiles']    # row of `matrix` -> file name
vectorizer = loaded_model['vectorizer']  # exposes vocabulary_ (term -> column of `matrix`)
matrix = loaded_model['matriz']          # sparse matrix indexed as [document row, term column]
indexMeta = loaded_model['metapyIndex']  # metapy document id -> file name (see rankerMeta)

In [19]:

def encontrarDoc(palabra):
    """Return the documents that contain ``palabra`` and its frequency in each.

    Parameters
    ----------
    palabra : str
        Term exactly as it is stored in ``vectorizer.vocabulary_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``NombreArchivo`` and ``Frecuencia``, sorted by ``Frecuencia``
        in descending order. The frame is empty (but keeps both columns) when
        the term is in the vocabulary and occurs in no document.

    Raises
    ------
    KeyError
        If ``palabra`` is not in the vocabulary.
    """
    col = vectorizer.vocabulary_[palabra]
    # The COO form keeps every row index next to its own value, so rows and
    # frequencies cannot get misaligned even if explicit zeros are stored.
    columna = matrix[:, col].tocoo()
    registros = [
        {'NombreArchivo': idexFiles[fila], 'Frecuencia': frecuencia}
        for fila, frecuencia in zip(columna.row.tolist(), columna.data.tolist())
        if frecuencia != 0
    ]
    # Build the frame once; naming the columns keeps sort_values valid when
    # there are no rows at all.
    dfresult = pd.DataFrame(registros, columns=['NombreArchivo', 'Frecuencia'])
    return dfresult.sort_values('Frecuencia', ascending=False).reset_index(drop=True)

In [20]:
def indice_invertido(dic):
    """Build the inverted index used by the ranking functions.

    Parameters
    ----------
    dic : dict or None
        Maps every term to its column in the global ``matrix`` (the notebook
        passes ``vectorizer.vocabulary_``). ``None`` falls back to
        ``vectorizer.vocabulary_``.

    Returns
    -------
    dict
        ``{term: {'IDF': float, 'Documentos': {file_name: [tf, doc_length]}}}``
        where ``tf`` is the value stored in ``matrix`` for that term/document
        and ``doc_length`` is the sum of the values stored in the document row.
        A term that occurs in no document is printed, gets an empty
        ``'Documentos'`` dict and has no ``'IDF'`` entry.
    """
    vocabulario = vectorizer.vocabulary_ if dic is None else dic
    inv = {}
    N = matrix.shape[0]

    # Row -> document length. Each row is sliced at most once instead of once
    # per (term, document) pair.
    longitudes = {}

    def longitud_doc(fila):
        if fila not in longitudes:
            longitudes[fila] = matrix[fila, :].data.sum()
        return longitudes[fila]

    for termino, col in vocabulario.items():
        # COO keeps each row index paired with its own value.
        columna = matrix[:, col].tocoo()
        postings = [
            (fila, frec)
            for fila, frec in zip(columna.row.tolist(), columna.data)
            if frec != 0
        ]
        entrada = inv.setdefault(termino, {})
        docs = {}
        if not postings:
            # Vocabulary term that is absent from every document.
            print(termino)
        else:
            # IDF; len(postings) is the number of documents containing the term.
            entrada['IDF'] = log((N + 1) / len(postings))
            for fila, frec in postings:
                docs.setdefault(idexFiles[fila], []).extend((frec, longitud_doc(fila)))
        entrada['Documentos'] = docs
    return inv

In [21]:
# Build the inverted index once for the whole vocabulary. The words printed
# below this cell are vocabulary terms that occur in no document.
ind_inv = indice_invertido(vectorizer.vocabulary_)

across
all
almost
along
also
amount
an
anyhow
anyway
around
as
at
b
back
be
beforehand
behind
beyond
bill
bottom
c
call
co
con
d
de
detail
do
down
due
e
eight
either
eleven
enough
even
ever
except
f
few
fifteen
fill
find
fire
first
five
former
four
front
full
further
g
get
give
go
h
he
herein
i
ie
in
inc
interest
j
k
keep
l
last
latter
least
m
may
might
mill
mine
move
much
must
n
name
neither
never
nevertheless
next
nine
none
o
off
often
on
one
or
out
own
p
part
put
q
r
rather
re
s
same
see
seem
serious
show
side
six
so
somehow
still
system
t
take
ten
there
therein
these
thick
thin
third
though
three
throughout
top
two
u
un
up
us
v
w
well
whole
will
with
within
x
y
yet
z


In [22]:

stopWords = stopwords.words('english')
# Plain-letter replacement for the accented vowels handled by queryClean.
_ACCENT_TABLE = str.maketrans('áàäéèëíìïóòöúùü', 'aaaeeeiiiooouuu')


def queryClean(texto):
    """Normalise a free-text query into a list of stemmed tokens.

    Steps: lowercase, replace the accented vowels listed in ``_ACCENT_TABLE``,
    turn every character that is not an ASCII letter (digits included) into a
    space, split on whitespace, drop one-letter tokens and NLTK English stop
    words, lemmatise each token as a verb and finally apply the Porter stemmer.

    Parameters
    ----------
    texto : str
        Raw query text.

    Returns
    -------
    list of str
        Cleaned tokens in their original order; repeated words are kept.
    """
    texto = texto.lower().translate(_ACCENT_TABLE)
    # Everything except ASCII letters becomes a separator; split() then takes
    # care of repeated blanks.
    texto = re.sub('[^a-zA-Z]', ' ', texto)

    # Set built per call so it always reflects the current stopWords list
    # while giving O(1) membership tests.
    stop_set = set(stopWords)
    tokens = [w for w in texto.split() if len(w) > 1 and w not in stop_set]

    # Lemmatise (as verbs)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w, pos="v") for w in tokens]

    # Stem
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in tokens]


Ranking por Term Frequency

In [23]:
def queryTF(word,top):
    """Return the ``top`` documents for ``word`` ranked by term frequency.

    Each result is a ``(file_name, values)`` pair taken from
    ``ind_inv[word]['Documentos']``. Pairs are ordered from highest to lowest
    by ``values`` and, on ties, by file name.
    """
    def clave(par):
        nombre, valores = par
        return (valores, nombre)

    postings = list(ind_inv[word]['Documentos'].items())
    postings.sort(key=clave, reverse=True)
    return postings[:top]

Ranking por Term Frequency / Doc Length

In [24]:
def queryTFDL(word,top):
    """Return the ``top`` documents for ``word`` ranked by tf / document length.

    Each result is a ``(file_name, [ratio])`` pair, where ``ratio`` is the
    first value stored for the document in ``ind_inv`` divided by the second.
    Pairs are ordered from highest to lowest ratio and, on ties, by file name.
    """
    postings = ind_inv[word]['Documentos']
    ratios = {
        nombre: [valores[0] / valores[1]]
        for nombre, valores in postings.items()
    }
    ordenados = sorted(ratios.items(), key=lambda par: (par[1], par[0]), reverse=True)
    return ordenados[:top]

Ranking usando BM25

In [25]:
def cal_bm25(idf,frec,k,b,length,avgdl):
    """Return the BM25 contribution of one term in one document.

    Computes ``idf * frec*(k+1) / (frec + k*(1 - b + b*length/avgdl))``.

    Parameters
    ----------
    idf : float
        Inverse document frequency of the term.
    frec : number
        Frequency of the term in the document.
    k, b : float
        BM25 saturation and length-normalisation parameters.
    length : number
        Length of the document.
    avgdl : number
        Average document length used for the normalisation.
    """
    numerador = frec * (k + 1)
    normalizacion = 1 - b + b * length / avgdl
    denominador = frec + k * normalizacion
    return idf * (numerador / denominador)

In [26]:
def queryBM25(query, vocabulary, prom, k1, b, top):
    """Rank documents for a free-text query with the notebook's BM25.

    The query is normalised with ``queryClean``; every resulting term that is
    in ``vocabulary`` contributes ``cal_bm25`` scores for the documents listed
    in ``ind_inv``; scores are summed per file.

    Parameters
    ----------
    query : str
        Raw query text.
    vocabulary : collection
        Known terms (membership is tested with ``in``).
    prom : float
        Value passed to ``cal_bm25`` as ``avgdl``.
    k1, b : float
        BM25 parameters.
    top : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``Ranking`` (0-based position), ``NombreArchivo`` (last
        component of the stored Windows-style path) and ``BM25``, sorted by
        ``BM25`` in descending order. Empty, with the same columns, when no
        query term matches any document.
    """
    registros = []
    for word in queryClean(query):
        if word not in vocabulary:
            print(f'{word} is not in the vocabulary')
            continue
        entrada = ind_inv[word]
        postings = entrada['Documentos']
        if not postings:
            # Vocabulary terms that occur in no document carry no 'IDF' key.
            print(f'{word} does not appear in any document')
            continue
        idf = entrada['IDF']
        for ruta, valores in postings.items():
            registros.append({
                'NombreArchivo': ruta.split('\\')[-1],
                'Word': word,
                'BM25': cal_bm25(idf, valores[0], k1, b, valores[1], prom),
            })

    if not registros:
        return pd.DataFrame(columns=['Ranking', 'NombreArchivo', 'BM25'])

    # Aggregate once, after every term has been scored.
    resultadoBm25 = (
        pd.DataFrame(registros)
        .groupby('NombreArchivo')
        .agg({'BM25': 'sum'})
        .sort_values('BM25', ascending=False)
        .reset_index()
        .reset_index()
        .rename(columns={'index': 'Ranking'})
    )
    return resultadoBm25.head(top)

Definición de parámetros para el BM25

In [27]:
# Parameters used by queryBM25, rankerMeta and calculate_sens.
top = 20  # number of results kept per query
# Passed to cal_bm25 as avgdl. NOTE(review): hard-coded; presumably the average
# document length of the indexed corpus -- confirm it still matches `matrix`.
prom = 27544.226762002043
k1 = 1.2  # BM25 k1
b = 0.75  # BM25 b
vocabulary = vectorizer.vocabulary_  # term -> column of `matrix`

In [69]:
# Query text used for the BM25 example.
query = "machine learning"

In [66]:
# NOTE(review): repeats the assignment already made in the parameter cell.
vocabulary = vectorizer.vocabulary_

In [67]:
# NOTE(review): same values as the parameter cell defined earlier in the notebook.
top = 20
prom = 27544.226762002043
k1 = 1.2
b = 0.75

In [70]:
# Rank the corpus for `query` with the notebook's own BM25 implementation.
resultados = queryBM25(query, vocabulary, prom, k1, b,top)

In [71]:
# Display the ranking returned by queryBM25.
resultados

Unnamed: 0,Ranking,NombreArchivo,BM25
0,0,1502.05767.txt,4.289243
1,1,1306.3726.txt,4.289187
2,2,1511.01258.txt,4.282112
3,3,1206.4656.txt,4.281008
4,4,1312.0049.txt,4.27983
5,5,1503.01239.txt,4.270601
6,6,1401.1061.txt,4.269371
7,7,1506.01110.txt,4.266588
8,8,1508.01993.txt,4.261827
9,9,1506.03410.txt,4.257082


Metapy

In [28]:
# Inverted index managed by metapy, configured by ./build/config.toml.
inv_idx = metapy.index.make_inverted_index("./build/config.toml")

In [52]:
# Number of documents in the metapy index.
inv_idx.num_docs()

980

In [53]:
# Number of distinct terms in the metapy index.
inv_idx.unique_terms()

78549

In [54]:
# Average document length as measured by metapy.
# NOTE(review): the value shown here is far from the hard-coded `prom` passed to
# queryBM25 as the average length -- confirm both are measured the same way.
inv_idx.avg_doc_length()

4215.095703125

In [55]:
# Total number of term occurrences in the metapy index.
inv_idx.total_corpus_terms()

4130794

In [56]:
# BM25 ranker built without arguments, i.e. with metapy's own defaults
# (rankerMeta passes k1 and b explicitly instead).
ranker = metapy.index.OkapiBM25()

In [59]:
# Wrap the raw query text in a metapy Document so the ranker can score it.
# NOTE(review): this rebinds the global `query`, which other cells use as a
# plain string for queryBM25.
query = metapy.index.Document()
query.content("machine learning")

In [61]:
top_docs = ranker.score(inv_idx, query, num_results=20)
# Each result unpacks into a document id and its score. The loop names avoid
# `id`, which would shadow the builtin for the rest of the notebook.
for doc_id, score in top_docs:
    print(inv_idx.label(doc_id), score)

../data/papers_own_impl/1206.4656.txt 4.202898979187012
../data/papers_own_impl/1306.3726.txt 4.180741310119629
../data/papers_own_impl/1502.05767.txt 4.1764373779296875
../data/papers_own_impl/1201.0490.txt 4.150365352630615
../data/papers_own_impl/1508.01993.txt 4.1466217041015625
../data/papers_own_impl/1506.01110.txt 4.142436504364014
../data/papers_own_impl/1312.0049.txt 4.140744209289551
../data/papers_own_impl/1307.7050.txt 4.118325233459473
../data/papers_own_impl/1401.1061.txt 4.1002044677734375
../data/papers_own_impl/1503.08381.txt 4.0904083251953125
../data/papers_own_impl/1506.03410.txt 4.069889545440674
../data/papers_own_impl/1505.05451.txt 4.065674781799316
../data/papers_own_impl/1507.01239.txt 4.05823278427124
../data/papers_own_impl/1408.0848.txt 4.054011821746826
../data/papers_own_impl/1006.1029.txt 4.051542282104492
../data/papers_own_impl/1502.07209.txt 4.050348281860352
../data/papers_own_impl/1511.06382.txt 4.043100833892822
../data/papers_own_impl/1502.05472.t

In [29]:
def rankerMeta(top, querywords):
    """Rank documents for ``querywords`` with metapy's Okapi BM25.

    Uses the globals ``k1`` and ``b`` as ranker parameters, ``inv_idx`` as the
    index and ``indexMeta`` to translate metapy document ids into file names.

    Parameters
    ----------
    top : int
        Number of results requested from the ranker.
    querywords : str
        Query text, handed to metapy as given.

    Returns
    -------
    pandas.DataFrame
        Columns ``RankingMeta`` (0-based position in metapy's result list),
        ``NombreArchivo`` and ``BM25_Meta``. Empty, with the same columns, when
        metapy returns no document.
    """
    ranker = metapy.index.OkapiBM25(k1 = k1, b = b)
    query = metapy.index.Document()
    query.content(querywords)
    top_docs = ranker.score(inv_idx, query, num_results=top)
    # Collect the rows first and build the frame once.
    registros = [
        {'NombreArchivo': indexMeta[doc_id], 'BM25_Meta': score}
        for doc_id, score in top_docs
    ]
    metaresult = pd.DataFrame(registros, columns=['NombreArchivo', 'BM25_Meta'])
    return metaresult.reset_index().rename(columns={'index': 'RankingMeta'})

In [30]:
def calculate_sens(queries,top):
    """Measure how much of the own BM25 ranking metapy also returns.

    For every query, both rankers are asked for ``top`` results; the value
    reported is the fraction of documents in the own ranking (``queryBM25``)
    that are also present in metapy's ranking (``rankerMeta``).

    Parameters
    ----------
    queries : iterable of str
        Query texts to evaluate.
    top : int
        Number of results requested from each ranker.

    Returns
    -------
    pandas.DataFrame
        One row per query, in input order, with columns ``Query`` and
        ``Sensibilidad``. ``Sensibilidad`` is NaN when the own ranking is empty
        and 0.0 when only metapy's ranking is empty.
    """
    registros = []
    for query in queries:
        resultados = queryBM25(query, vocabulary, prom, k1, b,top)
        metares = rankerMeta(top, query)
        if len(resultados) == 0:
            # Nothing to compare against: the ratio is undefined.
            sensibilidad = float('nan')
        elif len(metares) == 0:
            sensibilidad = 0.0
        else:
            merget = resultados.merge(metares, how = 'left', on = 'NombreArchivo')
            # After the left join RankingMeta is NaN exactly for the documents
            # that metapy did not return.
            sensibilidad = merget['RankingMeta'].notna().mean()
        registros.append({'Query': query, 'Sensibilidad': sensibilidad})
    # Built once so every query gets its own index label.
    return pd.DataFrame(registros, columns=['Query', 'Sensibilidad'])

In [46]:
# Queries used to compare both rankers; `sensibilidad` gets one row per query.
queries = ["Data Science","Machine Learning","Computer Science","Algorithms in dynamic networks", "triangle free process","biology"]
sensibilidad = calculate_sens(queries,top)



In [51]:
# Display the per-query overlap computed by calculate_sens.
sensibilidad

Unnamed: 0,Query,Sensibilidad
0,Data Science,0.65
0,Machine Learning,0.7
0,Computer Science,0.4
0,Algorithms in dynamic networks,0.75
0,triangle free process,0.5
0,biology,0.7


In [39]:
# Query inspected in detail in the following cells.
query = "Data Science"

In [40]:
# Run the same query through the own BM25 and through metapy.
resultados = queryBM25(query, vocabulary, prom, k1, b,top)
metares = rankerMeta(top, query)

In [41]:
# Left join on the file name: every row of the own ranking is kept, and
# RankingMeta / BM25_Meta are NaN for documents missing from metapy's results.
resultados.merge(metares, how = 'left', on = 'NombreArchivo')

Unnamed: 0,Ranking,NombreArchivo,BM25,RankingMeta,BM25_Meta
0,0,1509.02900.txt,1.881402,1.0,1.595062
1,1,1506.00768.txt,1.875594,3.0,1.583507
2,2,1503.06483.txt,1.873385,0.0,1.59827
3,3,1503.01239.txt,1.872566,,
4,4,1408.0135.txt,1.87071,4.0,1.5827
5,5,1412.5902.txt,1.870023,7.0,1.565338
6,6,1308.0776.txt,1.862415,,
7,7,1504.05895.txt,1.859011,9.0,1.557693
8,8,1407.5117.txt,1.857507,2.0,1.584339
9,9,1502.05767.txt,1.851488,,
