# INF-483 - Tarea 1: Neural Information Retrieval

##### Juan Pablo Muñoz

En esta tarea se implementa un flujo de procesos para la recuperación de información en texto. El corpus utilizado corresponde a Wall Street Journal 92, y se implementan Averaged Word Embeddings de skip-grams para la fase de recuperación. Finalmente, para rankear, se usa BM25.

En un intento por mejorar la calidad de la recuperación, se propone evaluar las consultas sobre los títulos y los cuerpos de los documentos por separado, y combinar los resultados como se explica más adelante.

Finalmente, se evalúa la calidad de la recuperación calculando las métricas de precision y recall en base a los datos provistos por el dataset y se concluye al respecto

### Definición del Índice Invertido

In [1]:
class InvertedIndex:

    def __init__(self):
        self.index = dict()

    def __contains__(self, item):
        return item in self.index

    def __getitem__(self, item):
        return self.index[item]

    def add(self, word, docid):
        if word in self.index:
            if docid in self.index[word]:
                self.index[word][docid] += 1
            else:
                self.index[word][docid] = 1
        else:
            d = dict()
            d[docid] = 1
            self.index[word] = d

    #frequency of word in document
    def get_document_frequency(self, word, docid):
        if word in self.index:
            if docid in self.index[word]:
                return self.index[word][docid]
            else:
                raise LookupError('%s not in document %s' % (str(word), str(docid)))
        else:
            raise LookupError('%s not in index' % str(word))

    #frequency of word in index, i.e. number of documents that contain word
    def get_index_frequency(self, word):
        if word in self.index:
            return len(self.index[word])
        else:
            raise LookupError('%s not in index' % word)


class DocumentLengthTable:

    def __init__(self):
        self.table = dict()

    def __len__(self):
        return len(self.table)

    def add(self, docid, length):
        self.table[docid] = length

    def get_length(self, docid):
        if docid in self.table:
            return self.table[docid]
        else:
            raise LookupError('%s not found in table' % str(docid))

    def get_average_length(self):
        sum = 0
        for length in self.table.values():
            sum += length
        return float(sum) / float(len(self.table))


def build_data_structures(corpus):
    idx = InvertedIndex()
    dlt = DocumentLengthTable()
    for docid in corpus:

        #build inverted index
        for word in corpus[docid]:
            idx.add(str(word), str(docid))

        #build document length table
        length = len(corpus[str(docid)])
        dlt.add(docid, length)
    return idx, dlt

### Definir generador de batches de entrenamiento

Esta función lee un archivo desde el disco y va generando pares (skipgram, label) para entrenar la red neuronal de embedding de más adelante. Este método se descartó por no dar resultados satisfactorios.

In [2]:
from sklearn.utils import shuffle
import keras.preprocessing.sequence as seq
import numpy as np
import os
import random

def read_large_file(file_object):
    """
    Uses a generator to read a large file lazily
    """
    while True:
        data = file_object.readline()
        if not data:
            break
        yield data

def batch_generator(batch_size):
    while True:
        couple_batch_word1 = []
        couple_batch_word2 = []
        label_batch = []
        try:
            with open(os.path.join('data', 'skipgrams_body.dataset'), 'r') as f:

                    for line in read_large_file(f):
                        if random.random() < 0.5: pass # tomar la mitad de las líneas al azar
                        word1, word2, label = line.split(',')
                        if int(label) == 0 and random.random() < 0.5: pass # 50% chance de no tomar un ejemplo negativo
                        couple_batch_word1.append(int(word1))
                        couple_batch_word2.append(int(word2))
                        label_batch.append(int(label))
                        if len(label_batch) >= batch_size:
                            couple_batch_word1, couple_batch_word2, label_batch = shuffle(couple_batch_word1, couple_batch_word2, label_batch, random_state=0)
                            couple_batch_word1_return = couple_batch_word1
                            couple_batch_word2_return = couple_batch_word2
                            label_batch_return = label_batch
                            couple_batch_word1 = []
                            couple_batch_word2 = []
                            label_batch = []
                            yield ([np.array(couple_batch_word1_return), np.array(couple_batch_word2_return)], np.array(label_batch_return))
        except Exception as e:
            pass

Using TensorFlow backend.


### Definir constructor de skip-grams

In [3]:
import keras.preprocessing.sequence as seq
import numpy as np

def skip_grams(sentences, window, vocab_size, nb_negative_samples=5.):

    def sg(sentence):
        return seq.skipgrams(sentence, vocab_size, window_size=window, negative_samples=nb_negative_samples)

    couples = []
    labels = []

    for cpl, lbl in map(sg, sentences):
        couples.extend(cpl)
        labels.extend(lbl)
        
    return np.asarray(couples), np.asarray(labels)
    

### Leer y parsear corpus Wall Street Journal '92

In [19]:
from bs4 import BeautifulSoup
import os
import json

infile = open(os.path.join("data","wsj2.xml"),"r")
contents = infile.read()
soup = BeautifulSoup(contents,'lxml-xml')
docs = soup.find_all('DOC')
corpus = {}

p = True
for doc in docs:
    docid = doc.DOCNO.string.strip()
    head = doc.HL
    head = head.string.strip()
    body = doc.TEXT
    body = body.get_text().strip()
    corpus[docid] = {'head': head, 'body': body}
    
with open(os.path.join("data","corpus.json"), 'w') as outfile:
    json.dump(corpus, outfile)

### Cargar corpus

In [42]:
import os
import json

with open(os.path.join("data","corpus.json")) as f:
    corpus = json.load(f)
#corpus_head: {doc_id: doc_headline}
corpus_head = dict([(doc[0], doc[1]['head']) for doc in corpus.items()])
#corpus_body: {doc_id: doc_body}
corpus_body = dict([(doc[0], doc[1]['body']) for doc in corpus.items()])

### Normalizar corpus

En este punto se cargan por los títulos y los cuerpos en corpus separados. La idea es generar un embedding para cada corpus y luego evaluar las consultas por separado en cada uno.

Para cada corpus:
- Transformar tokens a minúsculas
- Quitar tokens correspondientes a signos de puntuación, números y stopwords
- Quitar tokens de longitud < 2 luego de los pasos anteriores
- Quitar tokens con frecuencia = 1

In [43]:
from string import punctuation
import nltk
from nltk.tokenize import RegexpTokenizer
import collections

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
stop_words = nltk.corpus.stopwords.words('english')
    
remove_terms = punctuation + '0123456789'

# Normalizar corpus_head
for doc_id, doc_head in corpus_head.items():
    clean_head = [word.lower() for word in tokenizer.tokenize(doc_head) if word.lower() not in remove_terms]
    clean_head = [tok_sent for tok_sent in clean_head if tok_sent not in stop_words]
    clean_head = [tok_sent for tok_sent in clean_head if len(' '.join(tok_sent).split()) >= 2]
    corpus_head[doc_id] = clean_head

# Obtener frecuencias de palabras post-limpieza
freq_head_pre = collections.Counter([token for doc_id, doc_head in corpus_head.items() for token in doc_head])   
print('Tamaño vocabulario corpus_head antes:', len(freq_head_pre))

# última pasada para eliminar palabras poco frecuentes
min_freq_head = 0
for doc_id, doc_head in corpus_head.items():
    clean_head = [token for token in doc_head if freq_head_pre[token] > min_freq_head]
    corpus_head[doc_id] = clean_head   

freq_head_post = collections.Counter([token for doc_id, doc_head in corpus_head.items() for token in doc_head])   
print('Tamaño vocabulario corpus_head después:', len(freq_head_post))
    
# Normalizar corpus_body
for doc_id, doc_body in corpus_body.items():
    clean_body = [word.lower() for word in tokenizer.tokenize(doc_body) if word.lower() not in remove_terms]
    clean_body = [tok_sent for tok_sent in clean_body if tok_sent not in stop_words]
    clean_body = [tok_sent for tok_sent in clean_body if len(' '.join(tok_sent).split()) > 2]
    corpus_body[doc_id] = clean_body
    
# Obtener frecuencias de palabras post-limpieza
freq_body_pre = collections.Counter([token for doc_id, doc_body in corpus_body.items() for token in doc_body])   
print('Tamaño vocabulario corpus_body antes:', len(freq_body_pre))

# última pasada para eliminar palabras poco frecuentes
min_freq_body = 0
for doc_id, doc_body in corpus_body.items():
    clean_body = [token for token in doc_body if freq_body_pre[token] > min_freq_body]
    corpus_body[doc_id] = clean_body   

freq_body_post = collections.Counter([token for doc_id, doc_body in corpus_body.items() for token in doc_body])      
print('Tamaño vocabulario corpus_body después:', len(freq_body_post))

Tamaño vocabulario corpus_head antes: 11990
Tamaño vocabulario corpus_head después: 11990
Tamaño vocabulario corpus_body antes: 59349
Tamaño vocabulario corpus_body después: 59349


Comprobar que los corpus se cargaron y normalizaron correctamente

In [33]:
count = 0
for id, doc in corpus_head.items():
    print(doc)
    count +=1
    if count > 4:
        break

['notice', 'readers']
['concerning', 'move', 'nice', 'yeltsin', 'better', 'judith', 'valente', 'staff', 'reporter', 'wall', 'street', 'journal']
['workplace', 'firms', 'train', 'bridge', 'still', 'laura', 'castro', 'staff', 'reporter', 'wall', 'street', 'journal']
['tucson', 'electric', 'power', 'creditors', 'fail', 'bid', 'force', 'utility', 'chapter', 'frederick', 'rose', 'staff', 'reporter', 'wall', 'street', 'journal']
['technology', 'healthcare', 'new', 'york', 'request', 'rates', 'rejected']


In [7]:
count = 0
for id, doc in corpus_body.items():
    print(doc)
    count +=1
    if count > 1:
        break

[]
['early', 'last', 'year', 'costa', 'wrote', 'gorbachev', 'seeking', 'donation', 'wartime', 'artillery', 'fledgling', 'wisconsin', 'military', 'history', 'museum', 'costa', 'promised', 'whatever', 'soviet', 'leader', 'sent', 'would', 'displayed', 'elements', 'away', 'vandals', 'exchange', 'offered', 'send', 'gorbachev', 'wisconsin', 'cheese', 'sausage', 'beer', 'cherry', 'wine', 'costa', 'great', 'surprise', 'gorbachev', 'responded', 'enthusiastically', 'ordered', 'defense', 'aides', 'scour', 'soviet', 'arsenal', 'world', 'war', 'tank', 'could', 'sent', 'museum', 'settled', 'model', 'russian', 'diplomat', 'calls', 'sacred', 'machine', 'saved', 'many', 'lives', 'war', 'gorbachev', 'ordered', 'tank', 'shipped', 'plant', 'ukraine', 'seven', 'mechanics', 'worked', 'three', 'months', 'restoring', 'folks', 'wisconsin', 'waited', 'tank', 'gorbachev', 'running', 'trouble', 'home', 'including', 'august', 'coup', 'thought', 'would', 'end', 'costa', 'says', 'still', 'plans', 'send', 'tank', 'so

### Crear Índices Invertidos

In [8]:
idx_head, dlt_head = build_data_structures(corpus_head)
idx_body, dlt_body = build_data_structures(corpus_body)

In [9]:
idx_head['man']

{'WSJ920102-0016': 2,
 'WSJ920107-0148': 1,
 'WSJ920109-0168': 1,
 'WSJ920116-0165': 1,
 'WSJ920122-0092': 1,
 'WSJ920211-0055': 1,
 'WSJ920212-0133': 1,
 'WSJ920218-0033': 1,
 'WSJ920219-0141': 1,
 'WSJ920219-0071': 1,
 'WSJ920220-0137': 1,
 'WSJ920220-0115': 1,
 'WSJ920227-0117': 1,
 'WSJ920302-0142': 1,
 'WSJ920303-0101': 1,
 'WSJ920313-0027': 1,
 'WSJ920320-0104': 2}

In [10]:
dlt_head.get_length('WSJ920102-0016')

14

In [11]:
idx_body['completes']

{'WSJ920106-0058': 1,
 'WSJ920122-0020': 1,
 'WSJ920127-0006': 1,
 'WSJ920129-0119': 1,
 'WSJ920203-0061': 1,
 'WSJ920203-0021': 1,
 'WSJ920203-0175': 1,
 'WSJ920204-0104': 1,
 'WSJ920205-0021': 1,
 'WSJ920206-0179': 1,
 'WSJ920206-0005': 1,
 'WSJ920207-0051': 1,
 'WSJ920211-0018': 1,
 'WSJ920212-0152': 1,
 'WSJ920228-0036': 1,
 'WSJ920302-0040': 1,
 'WSJ920303-0052': 1,
 'WSJ920319-0072': 1,
 'WSJ920320-0006': 1}

In [12]:
dlt_body.get_length('WSJ920304-0010')

102

### Crear embedder de términos
Para cada corpus:
- Crear skip-grams para entrenamiento
- Entrenar embedder
- Obtener y almacenar Averaged Word Embeddings de documentos en el corpus

#### Hacer encoding del corpus

In [16]:
import time

# Label encoding
from sklearn import preprocessing

start = time.time()

# flat_list = [item for sublist in l for item in sublist]

le_head = preprocessing.LabelEncoder()
le_head.fit([word for doc in corpus_head.values() for word in doc])

le_body = preprocessing.LabelEncoder()
le_body.fit([word for doc in corpus_body.values() for word in doc])

encoded_heads = [le_head.transform(head) for head in corpus_head.values() if len(head) > 0]
encoded_bodies = [le_body.transform(body) for body in corpus_body.values() if len(body) > 0]

print('label encodings: demoró {} [s]'.format(round(time.time()-start)))

label encodings: demoró 81 [s]


#### Generar skip-grams a partir del corpus_head

In [None]:
import time

# Head

start = time.time()

# Tamaño de vocabulario
# idx_corpus.index: {word1: [doc1,...]}

corpus_head_vocab_size = len(idx_head.index.keys())

# Tamaño de la ventana
corpus_head_window_size = 3. # <- Configurado arbitariamente

# Cantidad de ejemplos negativos versus positivos
corpus_head_negative_samples = 3 # <- Configurado arbitariamente

# Crear skip-grams
# skip_grams(sentences, window, vocab_size, nb_negative_samples=5.)

couples_head, labels_head = skip_grams(
    sentences=encoded_heads,
    window=corpus_head_window_size,
    vocab_size=corpus_head_vocab_size,
    nb_negative_samples=corpus_head_negative_samples,
)

print('skip-grams de corpus_head: demoró {} [s]'.format(round(time.time()-start)))

Using TensorFlow backend.


#### Generar skip-grams a partir del corpus_body

In [19]:
import time
import pickle

# Body

start = time.time()

# función auxiliar para particionar conjunto de documentos
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(n))

# Tamaño de vocabulario
# idx_corpus.index: {word1: [doc1,...]}

corpus_body_vocab_size = len(idx_body.index.keys())

# Tamaño de la ventana
corpus_body_window_size = 5 # <- Configurado arbitariamente

# Cantidad de ejemplos negativos versus positivos
corpus_body_negative_samples = 3. # <- Configurado arbitariamente

In [28]:
# Crear skip-grams
# skip_grams(sentences, window, vocab_size, nb_negative_samples=5.)

n_partitions = 10 # <- Generar skip-grams en particiones, ojalá resulte
couples_body = []
labels_body = []
count = 0

for encoded_bodies_partition in list(split(encoded_bodies, n_partitions)):
    couples_body_part, labels_body_part = skip_grams(
        sentences=encoded_bodies_partition,
        window=corpus_body_window_size,
        vocab_size=corpus_body_vocab_size,
        nb_negative_samples=corpus_body_negative_samples,
    )
    #couples_body.extend(couples_body_part)
    #labels_body.extend(labels_body_part)
    count += 1
    with open(os.path.join('data', 'skipgrams_body.dataset'), 'a+') as f:
        #cada linea será: couple[0],couple[1],label
        for couple, label in zip(couples_body_part, labels_body_part):
            f.write(str(couple[0])+','+str(couple[1])+','+str(label)+'\n')
    
    print('{}% de skip-grams construidos'.format(int(100/n_partitions)*count))

print('skip-grams de corpus_body: demoró {} [s]'.format(round(time.time()-start)))

10% de skip-grams construidos
20% de skip-grams construidos
30% de skip-grams construidos
40% de skip-grams construidos
50% de skip-grams construidos
60% de skip-grams construidos
70% de skip-grams construidos
80% de skip-grams construidos
90% de skip-grams construidos
100% de skip-grams construidos
skip-grams de corpus_body: demoró 599 [s]


### Definir y entrenar embedder

In [85]:
import keras
from keras.layers import Input, Activation, Flatten, Reshape, Dense
from keras.layers.merge import Concatenate
from keras.models import Model
from keras.layers.embeddings import Embedding

In [117]:
nb_epoch = 10
body_datapoints_per_word = int(2*corpus_body_window_size*(corpus_body_negative_samples + 1))
body_batch_size = int(body_datapoints_per_word*100)
vec_dim = 256
body_samples_per_epoch = int(corpus_body_vocab_size*body_datapoints_per_word/body_batch_size)*50

In [118]:
import tensorflow as tf

keras.backend.clear_session()

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
session = tf.Session(config=config)

input_pvt = Input(batch_shape=(body_batch_size, 1), dtype='int32')
input_ctx = Input(batch_shape=(body_batch_size, 1), dtype='int32')
embedded_pvt = Embedding(input_dim=corpus_body_vocab_size, output_dim=vec_dim, input_length=1)(input_pvt)
flattened_embedded_pvt = Flatten()(embedded_pvt)
embedded_ctx = Embedding(input_dim=corpus_body_vocab_size, output_dim=vec_dim, input_length=1)(input_ctx)
flattened_embedded_ctx = Flatten()(embedded_ctx)
#merged = Concatenate([embedded_pvt, embedded_ctx], mode='concat', output_shape=(body_batch_size, 1))
merged = Concatenate()([flattened_embedded_pvt, flattened_embedded_ctx])
#flattened = Flatten()(merged)
dense = Dense(1)(merged)
predictions = Activation('sigmoid')(dense)
model_body = Model(inputs=[input_pvt, input_ctx], outputs=predictions)
model_body.compile(optimizer='rmsprop', loss='mse', metrics=['accuracy'])
model_body.summary()
model_body.fit_generator(generator=batch_generator(body_batch_size), steps_per_epoch=body_samples_per_epoch, epochs=nb_epoch, verbose=1)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (4000, 1)            0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (4000, 1)            0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (4000, 1, 256)       6539264     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (4000, 1, 256)       6539264     input_2[0][0]                    
__________________________________________________________________________________________________
flatten_1 

KeyboardInterrupt: 

### Conclusión sobre el método manual de creación de embeddings

Crear embeddings de términos con el enfoque y arquitectura anterior es infructuoso. El accuracy alcanzado no supera al de realizar elecciones al azar, a pesar de haber probado muchas variaciones en arquitectura y en el proceso de entrenamiento: tamaño de vector de embedding iguales a 64, 128 y 256; haciendo subsampling negativo, cantidad de pasos por epoch, tamaños de batch y cantidad de epochs totales.

Se sospecha que el modelo anterior no logra aprender de los ejemplos, ya que el ni loss no disminuye en ningún momento durante el entrenamiento. Esto podría resolverse ajustando el learning rate del modelo, y disminuyendo la proporción de ejemplos negativos versus positivos.

Crear y gestionar el gran dataset de skip-grams fue complicado debido a su gran tamaño. Se tuvo que escribir a un archivo en disco a medida que se generaban los ejemplos de entrenamiento, y luego, se tuvo que crear una función generadora que entregara batches de ejemplos desde ese archivo para el entrenamiento. Estas tareas tomaron bastante tiempo y esfuerzo para programar de manera de no sobrepasar la capacidad de memoria principal del computador con el que se trabaja.

### Cambio de metodología: usar Gensim

Gensim ofrece implementaciones optimizadas para la creación de skip-gram term embeddings y otras utilidades relevantes para esta tarea.

#### Crear term embeddings para el corpus body

Parámetros:
- Embedding size: 256
- Window size: 5
- Negative sampling (ejemplos negativos por cada ejemplo positivo): 5 (default)

In [44]:
from gensim.models import Word2Vec
import os

model_body = Word2Vec(corpus_body.values(), size=256, window=5, workers=4, sg=1, compute_loss=True)
model_body.train(corpus_body.values(), total_examples=len(corpus_body.values()), epochs=10)

(19482320, 20645420)

Chequear correctitud de creación del modelo

In [67]:
model_body.wv.most_similar(positive=['money'])

  if np.issubdtype(vec.dtype, np.int):


[('funds', 0.4545094668865204),
 ('ibc', 0.41579580307006836),
 ('laundering', 0.4112887680530548),
 ('launderers', 0.40672963857650757),
 ('wittbrodt', 0.4031805694103241),
 ('firas', 0.4000621438026428),
 ('inflow', 0.39657074213027954),
 ('deposited', 0.39371025562286377),
 ('birinyi', 0.3909085690975189),
 ('accrue', 0.39081132411956787)]

#### Crear term embeddings para el corpus head

Parámetros:
- Embedding size: 64 (menor vocabulario y menor tamaño de corpus -> menor tamaño de embedding)
- Window size: 2
- Negative sampling (ejemplos negativos por cada ejemplo positivo): 5 (default)

In [46]:
model_head = Word2Vec(corpus_head.values(), size=64, window=3, workers=4, sg=1, compute_loss=True)
model_head.train(corpus_head.values(), total_examples=len(corpus_head.values()), epochs=10)

(673093, 1011060)

Chequear correctitud de creación del modelo

In [66]:
model_head.wv.most_similar(positive=['money'])

  if np.issubdtype(vec.dtype, np.int):


[('profiting', 0.850208580493927),
 ('key', 0.7676757574081421),
 ('changing', 0.7649471163749695),
 ('raising', 0.7578659057617188),
 ('interest', 0.7559293508529663),
 ('low', 0.7543566226959229),
 ('lindley', 0.7482011318206787),
 ('matters', 0.740726888179779),
 ('bet', 0.7368081212043762),
 ('mortgages', 0.7363840341567993)]

In [48]:
model_head.wv['crisis']

array([-0.18567146, -0.02153717,  0.06885374,  0.03297263,  0.3131142 ,
        0.08975619, -0.11052741,  0.19241533,  0.11680879,  0.06733435,
        0.26860726,  0.05023226, -0.1991347 ,  0.18392667,  0.13218778,
       -0.08467516, -0.02753326,  0.03503089,  0.03599699, -0.4228239 ,
       -0.26320893, -0.11794525,  0.11193211, -0.1082921 ,  0.0407163 ,
       -0.0542684 , -0.11404806, -0.06996501,  0.2504337 ,  0.18256629,
       -0.06491038, -0.072845  ,  0.20264314,  0.41007587, -0.04655646,
        0.01319878,  0.16538344, -0.24558426,  0.2575402 ,  0.20509824,
       -0.26845592,  0.14231493, -0.06436489, -0.21730053,  0.45237666,
       -0.21092172, -0.12708688, -0.36677432,  0.05865473, -0.1540093 ,
       -0.09489243,  0.12610103,  0.06323103, -0.03468376,  0.01689659,
        0.25000334, -0.11046371,  0.18169317,  0.18925476, -0.08350138,
       -0.18891333, -0.08228879, -0.05616997,  0.08851402], dtype=float32)

### Cargar consultas

In [122]:
import os

with open(os.path.join('data', 'topics.51-100_'), 'r') as f:
    queries = {}
    for line in f.readlines():
        if '<num> Number:' in line:
            query_id = int(line.strip('<num> Number: ').strip('\n'))
        if '<title> Topic:' in line:
            q = line.strip('<title> Topic: ').strip('\n')
            queries[query_id] = q
    

Chequear correctitud en la lectura de las consultas

In [123]:
queries[51]

'Airbus Subsidies'

#### Realizar sobre las consultas mismo proceso de limpieza que a los documentos

In [126]:
from string import punctuation
import nltk
from nltk.tokenize import RegexpTokenizer
import collections

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
stop_words = nltk.corpus.stopwords.words('english')
    
remove_terms = punctuation + '0123456789'

normalized_queries = dict()

# Normalizar queries
token_min_len = 1
for q_id, q in queries.items():
    clean_q = [word.lower() for word in tokenizer.tokenize(q) if word.lower() not in remove_terms]
    clean_q = [tok_sent for tok_sent in clean_q if tok_sent not in stop_words]
    clean_q = [tok_sent for tok_sent in clean_q if len(tok_sent) > query_min_freq]
    normalized_queries[q_id] = clean_q

Chequear correctitud de normalización de las consultas

In [127]:
normalized_queries[51]

['airbus', 'subsidies']

#### Obtener lista de documentos candidatos a relevantes dada una consulta y un índice invertido

In [128]:
#idx_head, dlt_head
#idx_body, dlt_body

def get_doc_list(q, idx):
    doc_id_list = []
    for word in q:
        doc_id_list.extend(list(idx[word].keys()))
    return list(set(doc_id_list))

#### Obtener Averaged Word Embedding dada una consulta y un embedder
- Palabras de la consulta que no estén en el vocabulario del embedder, se quitan
- Si ninguna de las palabras en la consulta están en el vocabulario, retornar vector de ceros (nota: esta decisión se tomó de forma arbitraria)

In [129]:
from numpy.linalg import norm

def awe(doc, w2v):
    d_post = [term for term in doc if term in w2v.wv.vocab]
    #if len(list(set(doc) - set(d_post))) != 0: print('Palabras fuera del vocabulario:', list(set(doc) - set(d_post)))
    # si ninguna palabra de la consulta estaba en el vocabulario, retornar vector nulo
    if len(d_post) == 0: return np.zeros(len(w2v.wv[list(w2v.wv.vocab.keys())[0]]))
    # vector de cada término dividido en su módulo
    term_vectors = np.array([w2v.wv[term]/norm(w2v.wv[term]) for term in d_post])
    # se suman en un solo vector
    v = sum(term_vectors)
    # y se divide por la cantidad de palabras en la consulta (post eliminación de aquellas ausentes en el vocabulario)
    v = v/len(d_post)
    return v

#### Crear función de similitud coseno entre term embeddings

In [58]:
def sim(v1, v2):
    assert v1.size == v2.size
    return np.dot(v1/norm(v1),v2/norm(v2))

Chequear funcionamiento de función de similitud

In [65]:
sim(model_body.wv['canada'], model_body.wv['australia'])

0.3394552

In [60]:
sim(model_head.wv['money'], model_head.wv['poor'])

0.5398009

Chequear correctitud del constructor de AWEs

In [68]:
awe(normalized_queries[-1], model_head)

array([-0.03421513,  0.00891455, -0.02225829, -0.07041164,  0.20219119,
       -0.08714882, -0.02772672, -0.02434807, -0.04656893,  0.11163279,
        0.1304579 ,  0.06697559, -0.09622457, -0.00749928,  0.03979254,
        0.04682301, -0.08996513,  0.04634939,  0.12648556, -0.12847096,
       -0.03478785, -0.02540919,  0.06086544,  0.01533053, -0.02580533,
       -0.0261116 , -0.02289065,  0.10325227,  0.10931162, -0.0808276 ,
       -0.10290595,  0.06468564,  0.03423164,  0.24201924,  0.04245242,
       -0.13983841,  0.13245203, -0.00992088,  0.0516552 ,  0.1057655 ,
       -0.00293422,  0.06102055, -0.02226912, -0.1438515 ,  0.2279933 ,
       -0.05950605, -0.10459995, -0.25676426,  0.01612249, -0.11389931,
        0.01852683,  0.06698634,  0.07983918, -0.110659  ,  0.04674634,
        0.08703631,  0.01364428,  0.0887432 , -0.02945945,  0.00878621,
       -0.0267818 , -0.1130468 , -0.00357049,  0.08234187], dtype=float32)

### Evaluar consultas en corpus

- Para cada consulta:
    - recuperar doc_ids de los candidatos a relevantes desde el Índice invertido del corpus
    - calcular similitud entre AWE(consulta) y AWE(documento) para cada documento en los candidatos a relevantes
    - retornar los doc_ids de los 50 documentos más similares a la consulta

In [130]:
def eval_queries(q_dict, corpus, idx, w2v):
    candidates_to_be_rel = []
    for q in q_dict.values():
        for term in q:
            if term in idx:
                candidates_to_be_rel.extend([doc_id for doc_id, term_freq in idx[term].items()])
    candidates_to_be_rel = list(set(candidates_to_be_rel))
    
    # computar AWE de documentos candidatos a relevantes
    awe_docs = []
    for doc_id in candidates_to_be_rel:
            awe_docs.append(awe(corpus[doc_id], w2v))
    
    # Comparar AWE de cada query con los AWE de los documentos candidatos
    query_rels = dict()
    for q_id, q in q_dict.items():
        query_rels[q_id] = dict()
        for awe_doc, doc_id in zip(awe_docs, candidates_to_be_rel):
            awe_q = awe(q, w2v)
            query_rels[q_id][doc_id] = sim(awe_q, awe_doc)

    return query_rels

In [131]:
eval_queries_head = eval_queries(normalized_queries, corpus_head, idx_head, model_head)

  This is separate from the ipykernel package so we can avoid doing imports until


In [132]:
eval_queries_body = eval_queries(normalized_queries, corpus_body, idx_body, model_body)

  This is separate from the ipykernel package so we can avoid doing imports until


Chequear correctitud de cálculo de medidas de similitud

In [133]:
dict(list(eval_queries_head[51].items())[:10])

{'WSJ920310-0144': 0.6183162,
 'WSJ920128-0124': 0.7699244,
 'WSJ920204-0151': 0.7595831,
 'WSJ920226-0129': 0.782719,
 'WSJ920323-0110': 0.83635914,
 'WSJ920212-0065': 0.8541875,
 'WSJ920203-0053': 0.8955887,
 'WSJ920113-0031': 0.8684391,
 'WSJ920219-0094': 0.76710755,
 'WSJ920106-0064': 0.5371995}

In [134]:
dict(list(eval_queries_body[51].items())[:10])

{'WSJ920204-0151': 0.40141845,
 'WSJ920128-0124': 0.4434378,
 'WSJ920220-0054': 0.40581065,
 'WSJ920304-0083': 0.4648237,
 'WSJ920316-0040': 0.3737756,
 'WSJ920219-0094': 0.43492016,
 'WSJ920210-0101': 0.49381793,
 'WSJ920310-0049': 0.4232899,
 'WSJ920313-0142': 0.4130813,
 'WSJ920303-0064': 0.36430573}

#### Función para obtener los n documentos más similares a cada consulta, en el espacio de los embeddings

In [135]:
def get_top_n(qrels, n=50):
    top_n_qrels = {}
    for q_id, rels in qrels.items():
        top_n_qrels[q_id] = dict(sorted(rels.items(), key=lambda x: -x[1])[:n])
    return top_n_qrels

Obtener top-50 documentos más similares a cada consulta, para ambos corpus_head y corpus_body

In [136]:
eval_queries_head_top_50 = get_top_n(eval_queries_head, n=50)

In [137]:
eval_queries_body_top_50 = get_top_n(eval_queries_body, n=50)

Chequear correctitud del paso anterior

In [138]:
dict(list(eval_queries_head_top_50[51].items())[:10])

{'WSJ920124-0156': 0.9633757,
 'WSJ920204-0124': 0.94442385,
 'WSJ920323-0142': 0.93776816,
 'WSJ920218-0041': 0.9376886,
 'WSJ920316-0121': 0.9365615,
 'WSJ920115-0023': 0.93163246,
 'WSJ920130-0145': 0.9306904,
 'WSJ920306-0079': 0.9289214,
 'WSJ920302-0055': 0.92858034,
 'WSJ920108-0042': 0.92368937}

In [139]:
dict(list(eval_queries_body_top_50[51].items())[:10])

{'WSJ920110-0094': 0.64772415,
 'WSJ920228-0191': 0.61990196,
 'WSJ920116-0130': 0.606834,
 'WSJ920227-0147': 0.6032207,
 'WSJ920306-0058': 0.58715224,
 'WSJ920218-0155': 0.56357586,
 'WSJ920317-0149': 0.56099534,
 'WSJ920227-0058': 0.5549741,
 'WSJ920108-0164': 0.5498039,
 'WSJ920302-0119': 0.5485358}

### Rankear con BM25 sobre los top-50 documentos

In [140]:
import operator
from math import log

k1 = 1.2
k2 = 100
b = 0.75
R = 0.0


def score_BM25(n, f, qf, r, N, dl, avdl):
    K = compute_K(dl, avdl)
    first = log( ( (r + 0.5) / (R - r + 0.5) ) / ( (n - r + 0.5) / (N - n - R + r + 0.5)) )
    second = ((k1 + 1) * f) / (K + f)
    third = ((k2+1) * qf) / (k2 + qf)
    return first * second * third

def compute_K(dl, avdl):
    return k1 * ((1-b) + b * (float(dl)/float(avdl)) )


class QueryProcessor:
    def __init__(self, queries, corpus, candidate_docs):
        self.query_dict = queries
        self.index, self.dlt = build_data_structures(corpus)
        self.candidates = candidate_docs

    def run(self):
        results = dict()
        for q_id, query in self.query_dict.items():
            results[q_id] = self.run_query(query, q_id, self.candidates)
        return results

    def run_query(self, query, query_id, candidates_to_be_rel):
        query_result = dict()
        for term in query:
            if term in self.index:
                # retrieve index entry
                doc_dict = self.index[term]
                # for each document and its word frequency
                for docid, freq in doc_dict.items():
                    # Calcular y agregar score sólo si el documento está en el top-n de los
                    # recuperados con AWE
                    if docid in candidates_to_be_rel[query_id]:
                        score = score_BM25(
                            n=len(doc_dict),
                            f=freq,
                            qf=1,
                            r=0,
                            N=len(self.dlt),
                            dl=self.dlt.get_length(docid),
                            avdl=self.dlt.get_average_length()
                        ) # calculate score
                        if docid in query_result: #this document has already been scored once
                            query_result[docid] += score
                        else:
                            query_result[docid] = score
        return query_result
    

### Propuesta: procesar consultas sobre los títulos y los cuerpos de los documentos

Procesar las consultas de `q_list` sobre los corpus de los títulos `corpus_h` y los cuerpos `corpus_b` y combinar los resultados ponderando los scores usando el parámetro `p`:

\begin{align}
score(q) = p \cdot score_{head}(q) + (1 - p) \cdot score_{body}(q)
\end{align}

#### Definición de procesador de consultas especial
- evalúa sobre corpus_head y corpus_body automáticamente 
- es capaz de re-ponderar los puntajes al cambiar el parámetro p

In [170]:
class QueryProcessorHeadBody:
    def __init__(self, q_dict, corpus_h, corpus_b, candidates_h, candidates_b, p=0.1):
        try:
            assert p <= 1 and p >= 0
        except Exception as e:
            print('p debe estar en el rango [0, 1].')
            return
        self.results_head = QueryProcessor(
            normalized_queries, 
            corpus_head, 
            candidates_h
        ).run()
        self.results_body = QueryProcessor(
            normalized_queries, 
            corpus_body, 
            candidates_b
        ).run()
        self.p = p
        self.scores = self.combine_results()

    def change_weight(self, new_p=0.1):
        try:
            assert new_p <= 1 and new_p >= 0
        except Exception as e:
            print('p debe estar en el rango [0, 1].')
            return
        self.p = new_p
        return self.combine_results()
    
    def combine_results(self):
        results = dict()
        for q_id, doc_scores in self.results_head.items():
            results[q_id] = dict()
            for doc_id, score_head in doc_scores.items():
                results[q_id][doc_id] = self.p*score_head
        for q_id, doc_scores in self.results_body.items():
            for doc_id, score_body in doc_scores.items():
                if doc_id in results:
                    results[q_id][doc_id] += (1-self.p)*score_body
                else:
                    results[q_id][doc_id] = (1-self.p)*score_body
        return results

Chequear correctitud de creación de la clase

In [171]:
p=0.1

bm25_ranking = QueryProcessorHeadBody(
    normalized_queries,
    corpus_head,
    corpus_body,
    eval_queries_head_top_50,
    eval_queries_body_top_50,
    p   
)

In [172]:
bm25_ranking.scores[51]

{'WSJ920124-0156': 8.69706552569599,
 'WSJ920108-0037': 7.28163328402616,
 'WSJ920110-0094': 10.480623217742544,
 'WSJ920114-0073': 3.841290222900916,
 'WSJ920116-0130': 15.174131580075326,
 'WSJ920211-0134': 5.9071985775913065,
 'WSJ920213-0099': 4.238742985258622,
 'WSJ920213-0011': 6.1039044005424055,
 'WSJ920218-0155': 7.52804212173111,
 'WSJ920221-0165': 5.4943545082425596,
 'WSJ920225-0046': 7.923162483835888,
 'WSJ920227-0147': 12.22135459482911,
 'WSJ920227-0058': 7.302353369159894,
 'WSJ920228-0191': 13.483855154974844,
 'WSJ920302-0119': 10.953137060588778,
 'WSJ920306-0058': 12.886097423355427,
 'WSJ920316-0081': 7.743012408230292,
 'WSJ920317-0149': 4.109767311410256,
 'WSJ920103-0067': 2.721146218206719,
 'WSJ920110-0069': 5.696390136671308,
 'WSJ920113-0060': 5.230556072139496,
 'WSJ920114-0012': 0.6615115841714074,
 'WSJ920203-0042': 7.147498844460261,
 'WSJ920204-0079': 2.7853049664372818,
 'WSJ920210-0090': 6.656232330083361,
 'WSJ920306-0152': 3.8421963187743975,
 'WS

### Algunas consultas de ejemplo

- Se muestran los top-10 resultados de las tres primeras consultas del dataset, junto al puntaje BM25 evaluado para cada uno

In [173]:
top_10_bm25_ranking = get_top_n(bm25_ranking.scores, 10)

In [193]:
from IPython.display import display, Markdown
table = '`query_id` | top-10 docs. | BM25 Score\n'
table += '--- | --- | ---\n'
table += '51 | '+'<br>'.join([doc_id for doc_id in top_10_bm25_ranking[51].keys()])+' | '+'<br>'.join([str(round(doc_id, 2)) for doc_id in top_10_bm25_ranking[51].values()])+'\n'
table += '52 | '+'<br>'.join([doc_id for doc_id in top_10_bm25_ranking[52].keys()])+' | '+'<br>'.join([str(round(doc_id, 2)) for doc_id in top_10_bm25_ranking[52].values()])+'\n'
table += '53 | '+'<br>'.join([doc_id for doc_id in top_10_bm25_ranking[53].keys()])+' | '+'<br>'.join([str(round(doc_id, 2)) for doc_id in top_10_bm25_ranking[53].values()])+'\n'
display(Markdown(table))

`query_id` | top-10 docs. | BM25 Score
--- | --- | ---
51 | WSJ920116-0130<br>WSJ920228-0191<br>WSJ920306-0058<br>WSJ920227-0147<br>WSJ920302-0119<br>WSJ920110-0094<br>WSJ920124-0156<br>WSJ920225-0046<br>WSJ920316-0081<br>WSJ920218-0155 | 15.17<br>13.48<br>12.89<br>12.22<br>10.95<br>10.48<br>8.7<br>7.92<br>7.74<br>7.53
52 | WSJ920220-0099<br>WSJ920313-0015<br>WSJ920319-0062<br>WSJ920225-0080<br>WSJ920320-0115<br>WSJ920129-0169<br>WSJ920114-0114<br>WSJ920203-0038<br>WSJ920221-0125<br>WSJ920210-0070 | 18.39<br>17.43<br>16.74<br>14.1<br>13.36<br>10.85<br>10.4<br>10.33<br>9.25<br>8.44
53 | WSJ920122-0108<br>WSJ920129-0014<br>WSJ920221-0042<br>WSJ920220-0012<br>WSJ920131-0083<br>WSJ920225-0027<br>WSJ920312-0009<br>WSJ920316-0086<br>WSJ920205-0055<br>WSJ920318-0157 | 8.94<br>5.9<br>5.81<br>5.8<br>5.34<br>5.28<br>5.25<br>4.2<br>4.04<br>4.03


### A continuación se obtienen las métricas de precision y recall para los top-10 documentos resultantes de cada consulta

#### Cargar relevancias

In [84]:
import os

qrels = dict()

for i in range(1, 6):
    with open(os.path.join('data', 'qrels.51-100.disk1.disk2.part'+str(i)), 'r') as f:
        for line in f.readlines():
            if 'WSJ92' in line:
                splitted = line.split(' ')
                query_id = int(splitted[0])
                doc_id = splitted[2]
                is_rel = int(splitted[3])
                if query_id not in qrels: qrels[query_id] = dict()
                qrels[query_id][doc_id] = is_rel
        print('Se ha cargado qrels.51-100.disk1.disk2.part'+str(i))

Se ha cargado qrels.51-100.disk1.disk2.part1
Se ha cargado qrels.51-100.disk1.disk2.part2
Se ha cargado qrels.51-100.disk1.disk2.part3
Se ha cargado qrels.51-100.disk1.disk2.part4
Se ha cargado qrels.51-100.disk1.disk2.part5


Chequear correctitud de carga de relevancias

In [144]:
qrels[51]

{'WSJ920108-0037': 0,
 'WSJ920109-0101': 0,
 'WSJ920110-0067': 0,
 'WSJ920110-0069': 0,
 'WSJ920110-0094': 1,
 'WSJ920113-0060': 0,
 'WSJ920114-0054': 0,
 'WSJ920114-0073': 1,
 'WSJ920115-0015': 0,
 'WSJ920116-0018': 1,
 'WSJ920116-0047': 0,
 'WSJ920116-0080': 0,
 'WSJ920116-0130': 1,
 'WSJ920121-0016': 0,
 'WSJ920121-0151': 0,
 'WSJ920122-0161': 0,
 'WSJ920124-0156': 0,
 'WSJ920128-0088': 1,
 'WSJ920129-0082': 0,
 'WSJ920203-0175': 0,
 'WSJ920211-0134': 0,
 'WSJ920211-0165': 0,
 'WSJ920212-0034': 0,
 'WSJ920213-0011': 0,
 'WSJ920213-0030': 0,
 'WSJ920213-0099': 0,
 'WSJ920218-0155': 0,
 'WSJ920225-0046': 0,
 'WSJ920226-0072': 0,
 'WSJ920227-0058': 0,
 'WSJ920227-0147': 1,
 'WSJ920228-0191': 1,
 'WSJ920302-0119': 1,
 'WSJ920303-0065': 0,
 'WSJ920303-0073': 0,
 'WSJ920304-0144': 0,
 'WSJ920306-0058': 1,
 'WSJ920306-0080': 0,
 'WSJ920312-0029': 0,
 'WSJ920313-0135': 0,
 'WSJ920317-0149': 0,
 'WSJ920319-0151': 0,
 'WSJ920320-0188': 0}

### Definir función que obtiene precision, recall para cada consulta

In [184]:
def get_final_results(rankings, q_rels):
    top_10_bm25_ranking = rankings
    qrels = q_rels
    final_results = dict()
    sum_precision = 0
    non_null_precisions = 0
    sum_recall = 0
    non_null_recalls = 0

    for query_id, ranking in top_10_bm25_ranking.items():
        final_results[query_id] = dict()
        final_results[query_id]['ranking'] = ranking
        number_of_true_relevants = sum(qrels[query_id].values())
        final_results[query_id]['number_relevants'] = number_of_true_relevants
        number_of_selected_elements = len(ranking)
        #print('Query {}: {} docs in top-10'.format(query_id, number_of_selected_elements))
        correctly_selected = 0
        wrongly_selected = 0
        no_relevance_info = 0
        for doc_id, doc_rank in ranking.items():
            if doc_id in qrels[query_id]:
                if qrels[query_id][doc_id] == 1:
                    correctly_selected += 1
                elif qrels[query_id][doc_id] == 0:
                    wrongly_selected += 1
            else:
                no_relevance_info += 1
        final_results[query_id]['correct'] = correctly_selected
        final_results[query_id]['incorrect'] = wrongly_selected
        final_results[query_id]['no_info'] = no_relevance_info

        # Calcular precision y recall
        # precision = # of correctly selected elements / # of all selected elements
        # recall = # of correctly selected elements / # of positive elements

        if number_of_selected_elements != 0: 
            final_results[query_id]['precision'] = correctly_selected/number_of_selected_elements
            sum_precision += correctly_selected/number_of_selected_elements
            non_null_precisions += 1
        else: final_results[query_id]['precision'] = None
        if number_of_true_relevants != 0: 
            final_results[query_id]['recall'] = correctly_selected/number_of_true_relevants
            sum_recall += correctly_selected/number_of_true_relevants
            non_null_recalls += 1
        else: final_results[query_id]['recall'] = None
    avg_precision = sum_precision/non_null_precisions
    avg_recall = sum_recall/non_null_recalls
    #print('avg_precision = sum_precision/non_null_precisions: {} = {}/{}'.format(avg_precision,sum_precision,non_null_precisions))
    #print('avg_recall = sum_recall/non_null_recalls: {} = {}/{}'.format(avg_recall,sum_recall,non_null_recalls))
    return final_results, avg_precision, avg_recall

### Definición de función que genera una tabla de resumen para cada consulta evaluada

In [185]:
def print_table(results, p):

    from IPython.display import display, Markdown

    # precision & recall @ top-10

    table = '`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10\n'
    table += '--- | --- | --- | --- | --- | --- | ---\n'
    for q_id, result_dict in final_results.items():
        precision = result_dict['precision']
        if precision is None: precision = 'N/A'
        else: precision = round(precision, 2)
        recall = result_dict['recall']
        if recall is None: recall = 'N/A'
        else: recall = round(recall, 2)
        table += '{} | {} | {} | {} | {} | {} | {}\n'.format(q_id, precision, recall, result_dict['number_relevants'], result_dict['correct'], result_dict['incorrect'], result_dict['no_info'])

    display(Markdown('## p='+str(p)+': Precision & recall @ top-10'))
    display(Markdown(table))

### Observar cómo varía el precision y recall al cambiar el parámetro p

- p in [0.1, 0.3, 0.5, 0.7, 0.9]

In [186]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

average_presicion_recall_for_p = []
for p in [0.1, 0.3, 0.5, 0.7, 0.9]:
    bm25_ranking.change_weight(p)
    top_10_bm25_ranking = get_top_n(bm25_ranking.scores, 10)
    #pp.pprint(top_10_bm25_ranking[51])
    final_results, avg_precision, avg_recall = get_final_results(top_10_bm25_ranking, qrels)
    average_presicion_recall_for_p.append({'p': p, 'avg_precision': avg_precision, 'avg_recall': avg_recall})
    print_table(final_results, p)

## p=0.1: Precision & recall @ top-10

`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10
--- | --- | --- | --- | --- | --- | ---
51 | 0.6 | 0.67 | 9 | 6 | 3 | 1
52 | 0.7 | 0.64 | 11 | 7 | 1 | 2
53 | 0.0 | 0.0 | 9 | 0 | 2 | 8
54 | 0.0 | N/A | 0 | 0 | 3 | 7
55 | 0.2 | 0.29 | 7 | 2 | 2 | 6
56 | 0.4 | 0.22 | 18 | 4 | 2 | 4
57 | 0.8 | 0.67 | 12 | 8 | 0 | 2
58 | 0.0 | N/A | 0 | 0 | 2 | 8
59 | 0.0 | 0.0 | 3 | 0 | 2 | 8
60 | 0.1 | 0.25 | 4 | 1 | 3 | 6
61 | 0.1 | 1.0 | 1 | 1 | 6 | 3
62 | 0.0 | 0.0 | 2 | 0 | 3 | 7
63 | 0.0 | N/A | 0 | 0 | 4 | 6
64 | 0.0 | N/A | 0 | 0 | 3 | 7
65 | 0.1 | 0.33 | 3 | 1 | 2 | 7
66 | 0.0 | 0.0 | 1 | 0 | 1 | 9
67 | 0.0 | 0.0 | 1 | 0 | 2 | 8
68 | 0.0 | 0.0 | 1 | 0 | 2 | 8
69 | 0.0 | N/A | 0 | 0 | 0 | 10
70 | 0.0 | N/A | 0 | 0 | 1 | 0
73 | 0.1 | 0.11 | 9 | 1 | 3 | 6
74 | 0.0 | 0.0 | 18 | 0 | 2 | 8
75 | 0.0 | 0.0 | 6 | 0 | 0 | 5
76 | 0.5 | 0.56 | 9 | 5 | 3 | 2
77 | N/A | N/A | 0 | 0 | 0 | 0
78 | N/A | 0.0 | 2 | 0 | 0 | 0
79 | 0.0 | 0.0 | 4 | 0 | 2 | 8
80 | 0.0 | N/A | 0 | 0 | 8 | 2
81 | 0.0 | N/A | 0 | 0 | 1 | 9
82 | 0.3 | 0.09 | 32 | 3 | 1 | 6
83 | 0.1 | 0.11 | 9 | 1 | 1 | 8
84 | 0.0 | 0.0 | 3 | 0 | 1 | 9
85 | 0.3 | 0.21 | 14 | 3 | 2 | 5
86 | 0.0 | 0.0 | 5 | 0 | 8 | 2
87 | 0.1 | 0.09 | 11 | 1 | 0 | 9
88 | 0.0 | 0.0 | 4 | 0 | 3 | 7
89 | 0.0 | 0.0 | 3 | 0 | 9 | 1
90 | 0.6 | 0.35 | 17 | 6 | 4 | 0
91 | 0.0 | N/A | 0 | 0 | 5 | 5
92 | 0.0 | 0.0 | 1 | 0 | 2 | 8
93 | 0.0 | N/A | 0 | 0 | 1 | 9
94 | 0.0 | 0.0 | 2 | 0 | 3 | 7
95 | 0.0 | 0.0 | 3 | 0 | 1 | 9
96 | 0.1 | 0.2 | 5 | 1 | 2 | 7
97 | 0.3 | 0.3 | 10 | 3 | 4 | 3
98 | 0.1 | 0.06 | 18 | 1 | 3 | 6
99 | 0.0 | N/A | 0 | 0 | 2 | 8
100 | 0.1 | 0.2 | 5 | 1 | 2 | 7


## p=0.3: Precision & recall @ top-10

`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10
--- | --- | --- | --- | --- | --- | ---
51 | 0.6 | 0.67 | 9 | 6 | 3 | 1
52 | 0.7 | 0.64 | 11 | 7 | 1 | 2
53 | 0.0 | 0.0 | 9 | 0 | 2 | 8
54 | 0.0 | N/A | 0 | 0 | 3 | 7
55 | 0.2 | 0.29 | 7 | 2 | 2 | 6
56 | 0.4 | 0.22 | 18 | 4 | 2 | 4
57 | 0.8 | 0.67 | 12 | 8 | 0 | 2
58 | 0.0 | N/A | 0 | 0 | 2 | 8
59 | 0.0 | 0.0 | 3 | 0 | 2 | 8
60 | 0.1 | 0.25 | 4 | 1 | 3 | 6
61 | 0.1 | 1.0 | 1 | 1 | 6 | 3
62 | 0.0 | 0.0 | 2 | 0 | 3 | 7
63 | 0.0 | N/A | 0 | 0 | 4 | 6
64 | 0.0 | N/A | 0 | 0 | 3 | 7
65 | 0.1 | 0.33 | 3 | 1 | 2 | 7
66 | 0.0 | 0.0 | 1 | 0 | 1 | 9
67 | 0.0 | 0.0 | 1 | 0 | 2 | 8
68 | 0.0 | 0.0 | 1 | 0 | 2 | 8
69 | 0.0 | N/A | 0 | 0 | 0 | 10
70 | 0.0 | N/A | 0 | 0 | 1 | 0
73 | 0.1 | 0.11 | 9 | 1 | 3 | 6
74 | 0.0 | 0.0 | 18 | 0 | 2 | 8
75 | 0.0 | 0.0 | 6 | 0 | 0 | 5
76 | 0.5 | 0.56 | 9 | 5 | 3 | 2
77 | N/A | N/A | 0 | 0 | 0 | 0
78 | N/A | 0.0 | 2 | 0 | 0 | 0
79 | 0.0 | 0.0 | 4 | 0 | 2 | 8
80 | 0.0 | N/A | 0 | 0 | 8 | 2
81 | 0.0 | N/A | 0 | 0 | 1 | 9
82 | 0.3 | 0.09 | 32 | 3 | 1 | 6
83 | 0.1 | 0.11 | 9 | 1 | 1 | 8
84 | 0.0 | 0.0 | 3 | 0 | 1 | 9
85 | 0.3 | 0.21 | 14 | 3 | 2 | 5
86 | 0.0 | 0.0 | 5 | 0 | 8 | 2
87 | 0.1 | 0.09 | 11 | 1 | 0 | 9
88 | 0.0 | 0.0 | 4 | 0 | 3 | 7
89 | 0.0 | 0.0 | 3 | 0 | 9 | 1
90 | 0.6 | 0.35 | 17 | 6 | 4 | 0
91 | 0.0 | N/A | 0 | 0 | 5 | 5
92 | 0.0 | 0.0 | 1 | 0 | 2 | 8
93 | 0.0 | N/A | 0 | 0 | 1 | 9
94 | 0.0 | 0.0 | 2 | 0 | 3 | 7
95 | 0.0 | 0.0 | 3 | 0 | 1 | 9
96 | 0.1 | 0.2 | 5 | 1 | 2 | 7
97 | 0.3 | 0.3 | 10 | 3 | 4 | 3
98 | 0.1 | 0.06 | 18 | 1 | 3 | 6
99 | 0.0 | N/A | 0 | 0 | 2 | 8
100 | 0.1 | 0.2 | 5 | 1 | 2 | 7


## p=0.5: Precision & recall @ top-10

`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10
--- | --- | --- | --- | --- | --- | ---
51 | 0.6 | 0.67 | 9 | 6 | 3 | 1
52 | 0.7 | 0.64 | 11 | 7 | 1 | 2
53 | 0.0 | 0.0 | 9 | 0 | 2 | 8
54 | 0.0 | N/A | 0 | 0 | 3 | 7
55 | 0.2 | 0.29 | 7 | 2 | 2 | 6
56 | 0.4 | 0.22 | 18 | 4 | 2 | 4
57 | 0.8 | 0.67 | 12 | 8 | 0 | 2
58 | 0.0 | N/A | 0 | 0 | 2 | 8
59 | 0.0 | 0.0 | 3 | 0 | 2 | 8
60 | 0.1 | 0.25 | 4 | 1 | 3 | 6
61 | 0.1 | 1.0 | 1 | 1 | 6 | 3
62 | 0.0 | 0.0 | 2 | 0 | 3 | 7
63 | 0.0 | N/A | 0 | 0 | 4 | 6
64 | 0.0 | N/A | 0 | 0 | 3 | 7
65 | 0.1 | 0.33 | 3 | 1 | 2 | 7
66 | 0.0 | 0.0 | 1 | 0 | 1 | 9
67 | 0.0 | 0.0 | 1 | 0 | 2 | 8
68 | 0.0 | 0.0 | 1 | 0 | 2 | 8
69 | 0.0 | N/A | 0 | 0 | 0 | 10
70 | 0.0 | N/A | 0 | 0 | 1 | 0
73 | 0.1 | 0.11 | 9 | 1 | 3 | 6
74 | 0.0 | 0.0 | 18 | 0 | 2 | 8
75 | 0.0 | 0.0 | 6 | 0 | 0 | 5
76 | 0.5 | 0.56 | 9 | 5 | 3 | 2
77 | N/A | N/A | 0 | 0 | 0 | 0
78 | N/A | 0.0 | 2 | 0 | 0 | 0
79 | 0.0 | 0.0 | 4 | 0 | 2 | 8
80 | 0.0 | N/A | 0 | 0 | 8 | 2
81 | 0.0 | N/A | 0 | 0 | 1 | 9
82 | 0.3 | 0.09 | 32 | 3 | 1 | 6
83 | 0.1 | 0.11 | 9 | 1 | 1 | 8
84 | 0.0 | 0.0 | 3 | 0 | 1 | 9
85 | 0.3 | 0.21 | 14 | 3 | 2 | 5
86 | 0.0 | 0.0 | 5 | 0 | 8 | 2
87 | 0.1 | 0.09 | 11 | 1 | 0 | 9
88 | 0.0 | 0.0 | 4 | 0 | 3 | 7
89 | 0.0 | 0.0 | 3 | 0 | 9 | 1
90 | 0.6 | 0.35 | 17 | 6 | 4 | 0
91 | 0.0 | N/A | 0 | 0 | 5 | 5
92 | 0.0 | 0.0 | 1 | 0 | 2 | 8
93 | 0.0 | N/A | 0 | 0 | 1 | 9
94 | 0.0 | 0.0 | 2 | 0 | 3 | 7
95 | 0.0 | 0.0 | 3 | 0 | 1 | 9
96 | 0.1 | 0.2 | 5 | 1 | 2 | 7
97 | 0.3 | 0.3 | 10 | 3 | 4 | 3
98 | 0.1 | 0.06 | 18 | 1 | 3 | 6
99 | 0.0 | N/A | 0 | 0 | 2 | 8
100 | 0.1 | 0.2 | 5 | 1 | 2 | 7


## p=0.7: Precision & recall @ top-10

`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10
--- | --- | --- | --- | --- | --- | ---
51 | 0.6 | 0.67 | 9 | 6 | 3 | 1
52 | 0.7 | 0.64 | 11 | 7 | 1 | 2
53 | 0.0 | 0.0 | 9 | 0 | 2 | 8
54 | 0.0 | N/A | 0 | 0 | 3 | 7
55 | 0.2 | 0.29 | 7 | 2 | 2 | 6
56 | 0.4 | 0.22 | 18 | 4 | 2 | 4
57 | 0.8 | 0.67 | 12 | 8 | 0 | 2
58 | 0.0 | N/A | 0 | 0 | 2 | 8
59 | 0.0 | 0.0 | 3 | 0 | 2 | 8
60 | 0.1 | 0.25 | 4 | 1 | 3 | 6
61 | 0.1 | 1.0 | 1 | 1 | 6 | 3
62 | 0.0 | 0.0 | 2 | 0 | 3 | 7
63 | 0.0 | N/A | 0 | 0 | 4 | 6
64 | 0.0 | N/A | 0 | 0 | 3 | 7
65 | 0.1 | 0.33 | 3 | 1 | 2 | 7
66 | 0.0 | 0.0 | 1 | 0 | 1 | 9
67 | 0.0 | 0.0 | 1 | 0 | 2 | 8
68 | 0.0 | 0.0 | 1 | 0 | 2 | 8
69 | 0.0 | N/A | 0 | 0 | 0 | 10
70 | 0.0 | N/A | 0 | 0 | 1 | 0
73 | 0.1 | 0.11 | 9 | 1 | 3 | 6
74 | 0.0 | 0.0 | 18 | 0 | 2 | 8
75 | 0.0 | 0.0 | 6 | 0 | 0 | 5
76 | 0.5 | 0.56 | 9 | 5 | 3 | 2
77 | N/A | N/A | 0 | 0 | 0 | 0
78 | N/A | 0.0 | 2 | 0 | 0 | 0
79 | 0.0 | 0.0 | 4 | 0 | 2 | 8
80 | 0.0 | N/A | 0 | 0 | 8 | 2
81 | 0.0 | N/A | 0 | 0 | 1 | 9
82 | 0.3 | 0.09 | 32 | 3 | 1 | 6
83 | 0.1 | 0.11 | 9 | 1 | 1 | 8
84 | 0.0 | 0.0 | 3 | 0 | 1 | 9
85 | 0.3 | 0.21 | 14 | 3 | 2 | 5
86 | 0.0 | 0.0 | 5 | 0 | 8 | 2
87 | 0.1 | 0.09 | 11 | 1 | 0 | 9
88 | 0.0 | 0.0 | 4 | 0 | 3 | 7
89 | 0.0 | 0.0 | 3 | 0 | 9 | 1
90 | 0.6 | 0.35 | 17 | 6 | 4 | 0
91 | 0.0 | N/A | 0 | 0 | 5 | 5
92 | 0.0 | 0.0 | 1 | 0 | 2 | 8
93 | 0.0 | N/A | 0 | 0 | 1 | 9
94 | 0.0 | 0.0 | 2 | 0 | 3 | 7
95 | 0.0 | 0.0 | 3 | 0 | 1 | 9
96 | 0.1 | 0.2 | 5 | 1 | 2 | 7
97 | 0.3 | 0.3 | 10 | 3 | 4 | 3
98 | 0.1 | 0.06 | 18 | 1 | 3 | 6
99 | 0.0 | N/A | 0 | 0 | 2 | 8
100 | 0.1 | 0.2 | 5 | 1 | 2 | 7


## p=0.9: Precision & recall @ top-10

`query_id` | *Precision*@top-10 | *Recall*@top-10 | # of rel. Docs | Correct@top-10 | Incorrect@top-10 | No rel. Info@top-10
--- | --- | --- | --- | --- | --- | ---
51 | 0.6 | 0.67 | 9 | 6 | 3 | 1
52 | 0.7 | 0.64 | 11 | 7 | 1 | 2
53 | 0.0 | 0.0 | 9 | 0 | 2 | 8
54 | 0.0 | N/A | 0 | 0 | 3 | 7
55 | 0.2 | 0.29 | 7 | 2 | 2 | 6
56 | 0.4 | 0.22 | 18 | 4 | 2 | 4
57 | 0.8 | 0.67 | 12 | 8 | 0 | 2
58 | 0.0 | N/A | 0 | 0 | 2 | 8
59 | 0.0 | 0.0 | 3 | 0 | 2 | 8
60 | 0.1 | 0.25 | 4 | 1 | 3 | 6
61 | 0.1 | 1.0 | 1 | 1 | 6 | 3
62 | 0.0 | 0.0 | 2 | 0 | 3 | 7
63 | 0.0 | N/A | 0 | 0 | 4 | 6
64 | 0.0 | N/A | 0 | 0 | 3 | 7
65 | 0.1 | 0.33 | 3 | 1 | 2 | 7
66 | 0.0 | 0.0 | 1 | 0 | 1 | 9
67 | 0.0 | 0.0 | 1 | 0 | 2 | 8
68 | 0.0 | 0.0 | 1 | 0 | 2 | 8
69 | 0.0 | N/A | 0 | 0 | 0 | 10
70 | 0.0 | N/A | 0 | 0 | 1 | 0
73 | 0.1 | 0.11 | 9 | 1 | 3 | 6
74 | 0.0 | 0.0 | 18 | 0 | 2 | 8
75 | 0.0 | 0.0 | 6 | 0 | 0 | 5
76 | 0.5 | 0.56 | 9 | 5 | 3 | 2
77 | N/A | N/A | 0 | 0 | 0 | 0
78 | N/A | 0.0 | 2 | 0 | 0 | 0
79 | 0.0 | 0.0 | 4 | 0 | 2 | 8
80 | 0.0 | N/A | 0 | 0 | 8 | 2
81 | 0.0 | N/A | 0 | 0 | 1 | 9
82 | 0.3 | 0.09 | 32 | 3 | 1 | 6
83 | 0.1 | 0.11 | 9 | 1 | 1 | 8
84 | 0.0 | 0.0 | 3 | 0 | 1 | 9
85 | 0.3 | 0.21 | 14 | 3 | 2 | 5
86 | 0.0 | 0.0 | 5 | 0 | 8 | 2
87 | 0.1 | 0.09 | 11 | 1 | 0 | 9
88 | 0.0 | 0.0 | 4 | 0 | 3 | 7
89 | 0.0 | 0.0 | 3 | 0 | 9 | 1
90 | 0.6 | 0.35 | 17 | 6 | 4 | 0
91 | 0.0 | N/A | 0 | 0 | 5 | 5
92 | 0.0 | 0.0 | 1 | 0 | 2 | 8
93 | 0.0 | N/A | 0 | 0 | 1 | 9
94 | 0.0 | 0.0 | 2 | 0 | 3 | 7
95 | 0.0 | 0.0 | 3 | 0 | 1 | 9
96 | 0.1 | 0.2 | 5 | 1 | 2 | 7
97 | 0.3 | 0.3 | 10 | 3 | 4 | 3
98 | 0.1 | 0.06 | 18 | 1 | 3 | 6
99 | 0.0 | N/A | 0 | 0 | 2 | 8
100 | 0.1 | 0.2 | 5 | 1 | 2 | 7


Se observa que, sin importar el valor de `p`, los resultados son bastante deficientes en términos de precision y recall. Este resultado no es de extrañar ya que, para la recuperación de información en texto, la cantidad de documentos relevantes es mucho menor a la cantidad de no relevantes. Una rápida mirada sobre la columna `# of rel. Docs` de las tablas generadas confirma que los documetnos relevantes para cada consulta no suelen superar los 10. Es necesario aplicar técnicas  adicionales al proceso que se ha realizado para mitigar este desbalance.

A continuación se observa cómo el valor del ponderador `p` afectó la calidad de los resultados:

In [187]:
from IPython.display import display, Markdown

# averaged precision & recall for each p

table = '`p` | *Avg. Precision* | *Avg. Recall*\n'
table += '--- | --- | --- \n'
for result in average_presicion_recall_for_p:
    table += '{} | {} | {} \n'.format(result['p'],round(result['avg_precision'], 2),round(result['avg_recall'], 2))

display(Markdown('## Avg. Precision & recall para cada valor de ponderación p'))
display(Markdown(table))

## Avg. Precision & recall para cada valor de ponderación p

`p` | *Avg. Precision* | *Avg. Recall*
--- | --- | --- 
0.1 | 0.12 | 0.18 
0.3 | 0.12 | 0.18 
0.5 | 0.12 | 0.18 
0.7 | 0.12 | 0.18 
0.9 | 0.12 | 0.18 


La ponderación por `p` parece no afectar al precision y recall en absoluto. Se chequea cuáles son los documentos en el top-10 en común que retornan las evaluaciones sobre `corpus_body` y `corpus_head`.

In [194]:
for q_id, doc_scores in bm25_ranking.results_head.items():
    print('query_id={}\t{} docs. en común'.format(q_id, len(set(doc_scores.keys()).intersection(set( bm25_ranking.results_body[q_id].keys())))))

query_id=51	1 docs. en común
query_id=52	4 docs. en común
query_id=53	0 docs. en común
query_id=54	3 docs. en común
query_id=55	17 docs. en común
query_id=56	4 docs. en común
query_id=57	4 docs. en común
query_id=58	0 docs. en común
query_id=59	0 docs. en común
query_id=60	2 docs. en común
query_id=61	0 docs. en común
query_id=62	0 docs. en común
query_id=63	0 docs. en común
query_id=64	0 docs. en común
query_id=65	3 docs. en común
query_id=66	2 docs. en común
query_id=67	1 docs. en común
query_id=68	1 docs. en común
query_id=69	0 docs. en común
query_id=70	0 docs. en común
query_id=73	1 docs. en común
query_id=74	2 docs. en común
query_id=75	0 docs. en común
query_id=76	0 docs. en común
query_id=77	0 docs. en común
query_id=78	0 docs. en común
query_id=79	1 docs. en común
query_id=80	3 docs. en común
query_id=81	1 docs. en común
query_id=82	0 docs. en común
query_id=83	0 docs. en común
query_id=84	0 docs. en común
query_id=85	0 docs. en común
query_id=86	6 docs. en común
query_id=87	0

Por lo tanto, la razón de que el ponderador `p` no tiene efectos en precision y recall, es que las evaluaciones de las consultas sobre los títulos y los cuerpos por separados entregan conjuntos de documentos demasiado distintos, causando poca probabilidad de realizar cambios significativos al momento de combinar los resultados. Se asumió incorrectamente que los documentos retornados por la consulta evaluada sobre los títulos estarían probablemente también en los retornados al evaluar sobre los cuerpos.

Se concluye que es posible reimplementar la ponderación, pero cambiando la metodología: por ejemplo, asignar score BM25 adicional por cada palabra que esté en el título del documento siendo rankeado.