In [1]:
%%capture
import math
import numpy as np
import pandas as pd
import nltk 
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw

# Fetch the 'punkt' and 'stopwords' NLTK resources before they are used.
nltk.download('punkt')
nltk.download('stopwords')
# Portuguese stopword list; parse() discards any token found in it.
stopwords = sw.words('portuguese')

# Load the corpus CSV and replace missing values (NaN) with empty strings.
data_url="https://raw.githubusercontent.com/liraop/recinfo_lab2/master/data/results.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
# Number of non-null entries in the `text` column; the vector builders below
# treat document ids as 1..documents.
documents = data.text.count()
# Alias for the collection size.
N = documents 

def parse(text):
    """Tokenize ``text`` and return the tokens kept for indexing.

    Two regex tokenizers are applied in sequence: first word characters
    (``\\w+``), then runs of four digits (``\\d{4}``). The tokens of the second
    pass are appended after those of the first. A token is kept only when it
    is longer than three characters and is not in the module-level
    ``stopwords`` list.
    """
    tokenizers = (rpt(r'\w+'), rpt(r'\d{4}'))
    return [
        token
        for tokenizer in tokenizers
        for token in tokenizer.tokenize(text)
        if len(token) > 3 and token not in stopwords
    ]


def get_idf(index, n_docs=None):
    """Store an inverse document frequency under the ``'idf'`` key of every term.

    For each term the value written is ``log((n_docs + 1) / k)`` (natural
    log), where ``k`` is the number of documents in the term's postings.

    Parameters
    ----------
    index : dict
        Maps term -> postings dict (document id -> count). Modified in place.
    n_docs : int, optional
        Collection size. Defaults to the module-level ``documents``.

    Notes
    -----
    The ``'idf'`` entry lives in the same dict as the postings, so it is
    excluded when counting documents. Without that exclusion a second call
    would count the stored ``'idf'`` key as one more document and silently
    change every value.
    """
    if n_docs is None:
        n_docs = documents
    m = n_docs + 1
    for postings in index.values():
        # Count only real document entries, never a previously stored 'idf'.
        k = sum(1 for key in postings if key != 'idf')
        postings['idf'] = math.log(m / k)
         

def build_index(dataset):
    """Build the inverted index for ``dataset.text``.

    Documents are numbered from 1 in iteration order. The result maps each
    token returned by ``parse`` to a dict of ``{document id: occurrence count}``.
    After all documents are processed, ``get_idf`` adds an ``'idf'`` entry to
    every term's dict.
    """
    index = {}

    for document_index, entry in enumerate(dataset.text, start=1):
        for ngram in parse(entry):
            # Create the postings dict on first sight, then bump this
            # document's counter.
            postings = index.setdefault(ngram, {})
            postings[document_index] = postings.get(document_index, 0) + 1

    get_idf(index)
    return index
  
def get_top10rank(doc_score):
    """Return the ten best-scoring documents as a DataFrame.

    ``doc_score`` maps document id -> score. The returned frame has the
    columns ``document``, ``score`` and ``r``, where ``r`` is the descending
    rank of the score (ties broken by order of appearance). Rows are sorted
    by ``r`` and truncated to the first ten.
    """
    scores = pd.DataFrame(doc_score.items(), columns=["document", "score"])
    scores['r'] = scores.score.rank(ascending=False, method="first")
    return scores.sort_values("r").head(10)
    
# Build the inverted index over the loaded corpus; the query cells below pass it to the *_vsm functions.
index = build_index(data)

def bin_query_vector(index, query):
    """Return the binary query vector over the index vocabulary.

    The vector has one position per term of ``index`` (in iteration order):
    1 when the term is one of the whitespace-separated words of ``query``,
    0 otherwise.
    """
    terms = query.split()
    return [1 if word in terms else 0 for word in index]

def bin_document_vector(index):
    """Return one binary vector per document id in ``1..documents``.

    Each vector has one position per term of ``index`` (in iteration order):
    1 when the document id is present in that term's postings, 0 otherwise.
    The collection size is read from the module-level ``documents``.
    """
    return [
        [1 if doc_id in postings else 0 for postings in index.values()]
        for doc_id in range(1, documents + 1)
    ]
                

def f_bin(query_vector, doc_vector):
    """Score every document by the dot product with the query vector.

    ``doc_vector`` is a list of per-document vectors. The result maps the
    1-based position of each document in that list to its score.
    """
    rec = {}

    for doc_id, vector in enumerate(doc_vector, start=1):
        rec[doc_id] = sum(
            query_vector[i] * weight for i, weight in enumerate(vector)
        )

    return rec

def binary_vsm(index, query):
    """Score all documents for ``query`` with the binary vector space model.

    Builds the binary query and document vectors from ``index`` and returns
    the ``{document id: score}`` dict produced by ``f_bin``.
    """
    return f_bin(bin_query_vector(index, query), bin_document_vector(index))

def tf_document_vector(index):
    """Return one term-frequency vector per document id in ``1..documents``.

    Each position holds the count stored for that document in the term's
    postings, or 0 when the document is absent. The collection size is read
    from the module-level ``documents``.
    """
    return [
        [postings.get(doc_id, 0) for postings in index.values()]
        for doc_id in range(1, documents + 1)
    ]

def tf_query_vector(index, query):
    """Return the term-frequency query vector over the index vocabulary.

    Each position holds how many times the corresponding index term appears
    among the whitespace-separated words of ``query``.
    """
    terms = query.split()
    return [terms.count(ngram) for ngram in index]

def f_tf(query_vector, doc_vector):
    """Return ``{document id: dot product of query and document vector}``.

    Document ids are the 1-based positions of the vectors in ``doc_vector``.
    """
    def _dot(vector):
        # Accumulate in index order, starting from the integer 0.
        total = 0
        for i, weight in enumerate(vector):
            total += query_vector[i] * weight
        return total

    return {
        position + 1: _dot(vector)
        for position, vector in enumerate(doc_vector)
    }

def tf_vsm(index, query):
    """Score all documents for ``query`` with the term-frequency model.

    Builds the TF query and document vectors from ``index`` and returns the
    ``{document id: score}`` dict produced by ``f_tf``.
    """
    return f_tf(tf_query_vector(index, query), tf_document_vector(index))

def tfidf_document_vector(index):
    """Return one TF-IDF vector per document id in ``1..documents``.

    Each position holds the count stored for the document multiplied by the
    term's ``'idf'`` entry, or 0 when the document is absent from the term's
    postings. The collection size is read from the module-level ``documents``.
    """
    rows = []

    for doc_id in range(1, documents + 1):
        rows.append([
            postings[doc_id] * postings['idf'] if doc_id in postings else 0
            for postings in index.values()
        ])

    return rows


def tfidf_vsm(index, query):
    """Score all documents for ``query`` with the TF-IDF model.

    Uses the TF query vector and the TF-IDF document vectors, combined by
    ``f_tf``; returns ``{document id: score}``.
    """
    return f_tf(tf_query_vector(index, query), tfidf_document_vector(index))


def f_bm25(query_vector, doc_vector, k):
    """Score every document with the BM25 term-frequency transformation.

    For each term present in both the query and the document the score adds

        c(w, q) * ((k + 1) * c(w, d)) / (c(w, d) + k) * log10((M + 1) / df(w))

    where ``c(w, q)`` is the query count, ``c(w, d)`` the document count,
    ``M`` the number of documents and ``df(w)`` the number of documents that
    contain the term.

    Parameters
    ----------
    query_vector : list of int
        Query term counts, aligned with the positions of the document vectors.
    doc_vector : list of list of int
        Term-count vector of every document in the collection. ``M`` and
        ``df`` are derived from it, so it must cover the whole collection.
    k : number
        Saturation parameter of the TF transformation.

    Returns
    -------
    dict
        Maps the 1-based position of each document in ``doc_vector`` to its
        score (0.0 when the document shares no term with the query).

    Notes
    -----
    The previous version saturated the query count instead of the document
    count, and used the document count where the document frequency belongs
    in the IDF. Documents with fewer occurrences of a query term were
    therefore ranked higher.
    """
    n_docs = len(doc_vector)

    # Only positions the query actually uses can contribute to the score.
    query_terms = [
        (i, query_tf) for i, query_tf in enumerate(query_vector) if query_tf != 0
    ]
    # Document frequency of each query term, counted from the vectors.
    doc_freq = {
        i: sum(1 for vector in doc_vector if vector[i] != 0)
        for i, _ in query_terms
    }

    rec = {}
    for doc_id, vector in enumerate(doc_vector, start=1):
        sim = 0.0
        for i, query_tf in query_terms:
            doc_tf = vector[i]
            if doc_tf == 0:
                continue
            saturated_tf = ((k + 1) * doc_tf) / (doc_tf + k)
            idf = math.log10((n_docs + 1) / doc_freq[i])
            sim += query_tf * saturated_tf * idf
        rec[doc_id] = sim
    return rec
    

def bm25_vsm(index, query, k):
    """Score all documents for ``query`` with ``f_bm25`` and parameter ``k``.

    Uses the TF query vector and the TF document vectors built from ``index``;
    returns ``{document id: score}``.
    """
    return f_bm25(tf_query_vector(index, query), tf_document_vector(index), k)


### Q1: Escolha um documento dentre aqueles da base do aluno Bernardi e crie uma consulta que você acha que tem boas chances de recuperar este documento. Em seguida, avalie os resultados de tal consulta usando a métrica de avaliação Reciprocal Rank ( 10 pts)

In [2]:
# Id of the document the query below is expected to retrieve.
chosen_one = 13
# Presumably the source URL of the chosen document; not used by the code in this notebook section.
doc_url = "https://brasil.elpais.com/brasil/2019/03/15/cultura/1552681746_926411.html"
# Free-text query evaluated with each retrieval model in the cells below.
query = "Gabo colombiano solidão"

def reciprocal_rank(rank, selected_doc):
    """Return the reciprocal rank of ``selected_doc`` in ``rank``.

    Parameters
    ----------
    rank : pandas.DataFrame
        Ranking with a ``document`` column, best result first.
    selected_doc :
        Id of the document to look for.

    Returns
    -------
    float
        ``1 / position`` (1-based) of the first row whose ``document`` equals
        ``selected_doc``, or ``0.0`` when it is not in the ranking.

    Notes
    -----
    The lookup uses the ``selected_doc`` argument. The previous version
    ignored it and compared against the global ``chosen_one``, and returned
    ``None`` when the document was missing.
    """
    for position, document in enumerate(rank["document"], start=1):
        if document == selected_doc:
            return 1.0 / position
    return 0.0
     
# Top-10 ranking of the binary model for `query`, followed by the reciprocal rank of the chosen document.
rank_binario = get_top10rank(binary_vsm(index, query))
print(rank_binario.to_string(index=False))
print("Reciprocal rank:", reciprocal_rank(rank_binario, chosen_one))

UsageError: Line magic function `%%capture` not found.


#### Vemos acima que o reciprocal rank indica o documento escolhido como o de maior score retornado pelo VSM. Isso está correto, pois esse documento, além de ser o escolhido, é o que contém o maior número de termos da consulta.

In [None]:
# Top-10 ranking of the TF model for `query`, followed by the reciprocal rank of the chosen document.
rank_tf = get_top10rank(tf_vsm(index, query))
print(rank_tf.to_string(index=False))
print("Reciprocal rank:", reciprocal_rank(rank_tf, chosen_one))

In [None]:
# Top-10 ranking of the TF-IDF model for `query`, followed by the reciprocal rank of the chosen document.
rank_tfidf = get_top10rank(tfidf_vsm(index, query))
print(rank_tfidf.to_string(index=False))
print("Reciprocal rank:", reciprocal_rank(rank_tfidf, chosen_one))

In [None]:
# Top-10 ranking of the BM25 model (k = 10) for `query`, followed by the reciprocal rank of the chosen document.
rank_bm25 = get_top10rank(bm25_vsm(index, query, 10))
print(rank_bm25.to_string(index=False))
print("Reciprocal rank:", reciprocal_rank(rank_bm25, chosen_one))

#### O mesmo ocorre com os outros modelos. 

### Q2: A partir do gabarito fornecido em OBS1, calcule o MAP para cada algoritmo abaixo e aponte qual obteve o melhor resultado. Para os cálculos do MAP, considere que um documento é relevante para uma dada consulta se este documento estiver entre os documentos do gabarito para essa consulta, senão ele deve ser considerado irrelevante. (10 pts)

In [None]:
# Query evaluated in Q2; overwrites the `query` defined for Q1.
query = "golpe militar"
### ids of the documents in the answer key for this query
gabarito = [1, 120, 208]

def MAP(rank, relevant_docs):
    """Return the average precision of ``rank`` for one query.

    Average precision is the sum of precision@k taken at every position ``k``
    that holds a relevant document, divided by the total number of relevant
    documents. Relevant documents missing from the ranking therefore
    contribute a precision of 0. Averaging this value over several queries
    gives the mean average precision.

    Parameters
    ----------
    rank : pandas.DataFrame
        Ranking with a ``document`` column, best result first.
    relevant_docs : collection
        Ids of the relevant documents (anything supporting ``in`` and ``len``).

    Returns
    -------
    float
        Average precision in ``[0, 1]``; ``0.0`` when ``relevant_docs`` is
        empty or none of its documents is ranked.

    Notes
    -----
    The previous version added ``1 / position`` for each hit instead of the
    precision at that position, and divided by the number of hits, which
    raised ``ZeroDivisionError`` when the ranking had no relevant document.
    """
    total_relevant = len(relevant_docs)
    if total_relevant == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    for position, document in enumerate(rank["document"], start=1):
        if document in relevant_docs:
            hits += 1
            # Precision at this cut-off: relevant documents seen so far / position.
            precision_sum += hits / position

    return precision_sum / total_relevant

# Top-10 ranking of the binary model for `query`, scored against the answer key.
rank_binario = get_top10rank(binary_vsm(index, query))
print(rank_binario.to_string(index=False))
print("MAP metric:", MAP(rank_binario, gabarito))

In [None]:
# Top-10 ranking of the TF model for `query`, scored against the answer key.
rank_tf = get_top10rank(tf_vsm(index, query))
print(rank_tf.to_string(index=False))
print("MAP metric:", MAP(rank_tf, gabarito))

In [None]:
# Top-10 ranking of the TF-IDF model for `query`, scored against the answer key.
rank_tfidf = get_top10rank(tfidf_vsm(index, query))
print(rank_tfidf.to_string(index=False))
print("MAP metric:", MAP(rank_tfidf, gabarito))

In [None]:
# Top-10 ranking of the BM25 model (k = 10) for `query`, scored against the answer key.
rank_bm25 = get_top10rank(bm25_vsm(index, query, 10))
print(rank_bm25.to_string(index=False))
print("MAP metric:", MAP(rank_bm25, gabarito))

### Q3: Repita Q2 usando a avaliação multi-nível DCG. Utilize o campo "level" do gabarito para o cálculo do DCG e do idealDCG. (10 pts)

In [None]:
# Query evaluated in Q3.
query = "golpe militar"
### answer key for this query: document id -> relevance level
gabarito = {1:6, 120:9, 208:5}

def DCG(rank, relevant_docs):
    """Return the discounted cumulative gain at every position of ``rank``.

    The gain of a document is its relevance level in ``relevant_docs`` (0 when
    absent). The gain at position 1 is taken as is; the gain at position
    ``i >= 2`` is divided by ``log2(i)``. Entry ``i`` of the result is the sum
    of the discounted gains of positions ``1..i``.

    Parameters
    ----------
    rank : pandas.DataFrame
        Ranking with a ``document`` column, best result first.
    relevant_docs : dict
        Maps document id -> relevance level.

    Returns
    -------
    list of float
        One cumulative DCG value per row of ``rank``, in row order.

    Notes
    -----
    The previous version read the global ``gabarito`` instead of
    ``relevant_docs``, left the first relevant hit undiscounted whatever its
    position, used ``log10`` and returned per-position gains, not the
    cumulative sum.
    """
    dcg_rank = []
    running_total = 0.0

    for position, document in enumerate(rank["document"], start=1):
        gain = relevant_docs.get(document, 0)
        # log2(1) is 0, so the first position carries no discount.
        discount = 1.0 if position == 1 else math.log2(position)
        running_total += gain / discount
        dcg_rank.append(running_total)

    return dcg_rank

In [None]:
# Top-10 ranking of the binary model for `query`, with the DCG values added as a column.
rank_binario = get_top10rank(binary_vsm(index, query))
rank_binario["DCG"] = DCG(rank_binario, gabarito)
print(rank_binario.to_string(index=False))

In [None]:
# Top-10 ranking of the TF model for `query`, with the DCG values added as a column.
rank_tf = get_top10rank(tf_vsm(index, query))
rank_tf["DCG"] = DCG(rank_tf, gabarito)
print(rank_tf.to_string(index=False))

In [None]:
# Top-10 ranking of the TF-IDF model for `query`, with the DCG values added as a column.
rank_tfidf = get_top10rank(tfidf_vsm(index, query))
rank_tfidf["DCG"] = DCG(rank_tfidf, gabarito)
print(rank_tfidf.to_string(index=False))

In [None]:
# Top-10 ranking of the BM25 model (k = 10) for `query`, with the DCG values added as a column.
rank_bm25 = get_top10rank(bm25_vsm(index, query, 10))
rank_bm25["DCG"] = DCG(rank_bm25, gabarito)
print(rank_bm25.to_string(index=False))