In [10]:
%%capture
import math
import numpy as np
import pandas as pd
import nltk 
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw

nltk.download('punkt')
nltk.download('stopwords')
stopwords = sw.words('portuguese')

data_url="https://raw.githubusercontent.com/liraop/recinfo_lab2/master/data/results.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
documents = data.text.count()
N = documents 

def parse(text):
    words = []
    word_pattern = rpt(r'\w+')
    year_pattern = rpt(r'\d{4}')
    
    patterns = [word_pattern, year_pattern]
    
    for pattern in patterns:
        tokens = []
        for token in pattern.tokenize(text):
            if token not in stopwords and len(token) > 3:
                tokens.append(token)
        words.extend(tokens)
    return words


def get_idf(index):
    for ngram in index:
        k = len(index[ngram])
        m = documents + 1
        index[ngram]['idf'] = math.log(m/k)
         

def build_index(dataset):
    document_index = 0
    index = {}
    
    for entry in dataset.text:
        document_index = document_index + 1
            
        for ngram in parse(entry):
                if ngram in index: #is ngram already on index?
                    if document_index in index[ngram]: # is it in the same document?
                        index[ngram][document_index] = index[ngram][document_index] + 1                        
                    else: # nope
                        index[ngram][document_index] = 1 
                else: # no, sir
                    index[ngram] = {document_index: 1}
    get_idf(index)           
    
    return index
  
def get_top5rank(doc_score):
   
    df_tmp = pd.DataFrame(doc_score.items(), columns=["document", "score"])
    df_tmp['r']= df_tmp.score.rank(ascending=False, method="first")
    df_tmp.sort_values("r", inplace = True)
    df_tmp = df_tmp[:5]
        
    return df_tmp
    
index = build_index(data)

queries = ["juíza","governo","seleção"]

[nltk_data] Downloading package punkt to /Users/liraop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liraop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Representação binária

In [30]:
def bin_query_vector(index, query):
    query_vector = []
    
    for word in index:
        if word in query.split():
            query_vector.append(1)
        else:
            query_vector.append(0)
            
    return query_vector

def bin_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                doc_vec.append(1)
            else:
                doc_vec.append(0)
                
        document_vector.append(doc_vec)
    
    return document_vector
                

def f_bin(query_vector, doc_vector):
    rec = {}

    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            sim += (query_vector[i] * vector[i])
        rec[doc_id+1] = sim
    
    return rec

def binary_vsm(index, query):
    query_vector = bin_query_vector(index, query)   
    doc_vector = bin_document_vector(index)
    
    return f_bin(query_vector, doc_vector)

pd.concat([get_top5rank(binary_vsm(index, "juíza")),
          get_top5rank(binary_vsm(index, "governo")),
          get_top5rank(binary_vsm(index, "seleção"))]
          ,keys=["juíza","governo","seleção"])

Unnamed: 0,Unnamed: 1,document,score,r
juíza,0,1,1,1.0
juíza,1,2,1,2.0
juíza,2,3,0,3.0
juíza,3,4,0,4.0
juíza,4,5,0,5.0
governo,2,3,1,1.0
governo,20,21,1,2.0
governo,24,25,1,3.0
governo,41,42,1,4.0
governo,72,73,1,5.0


### TF

In [32]:
def tf_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                y = index[ngram][doc_id]
                doc_vec.append(y)
            else:
                doc_vec.append(0)
                
        document_vector.append(doc_vec)
        
    return document_vector

def tf_query_vector(index, query):
    query_vector = []
    
    for ngram in index:
        w = 0
        for term in query.split():
            if ngram == term:
                w += 1
        query_vector.append(w)
        
    return query_vector

def f_tf(query_vector, doc_vector):
    rec = {}
    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            sim += (query_vector[i] * vector[i])
        rec[doc_id+1] = sim
    return rec

def tf_vsm(index, query):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tf_document_vector(index)
    
    return f_tf(query_vector, doc_vector)

pd.concat([get_top5rank(tf_vsm(index, "juíza")),
          get_top5rank(tf_vsm(index, "governo")),
          get_top5rank(tf_vsm(index, "seleção"))]
          ,keys=["juíza","governo","seleção"])

Unnamed: 0,Unnamed: 1,document,score,r
juíza,0,1,2,1.0
juíza,1,2,1,2.0
juíza,2,3,0,3.0
juíza,3,4,0,4.0
juíza,4,5,0,5.0
governo,172,173,7,1.0
governo,165,166,6,2.0
governo,83,84,5,3.0
governo,209,210,5,4.0
governo,24,25,4,5.0


### TF-IDF

In [31]:
def tfidf_document_vector(index):
    document_vector = []
    
    for doc_id in range(1,documents+1):
        doc_vec = []
        for ngram in index:
            if doc_id in index[ngram].keys():
                y = index[ngram][doc_id] * index[ngram]['idf']
                doc_vec.append(y)
            else:
                doc_vec.append(0)
                
        document_vector.append(doc_vec)
        
    return document_vector


def tfidf_vsm(index, query):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tfidf_document_vector(index)
    
    return f_tf(query_vector, doc_vector)

pd.concat([get_top5rank(tfidf_vsm(index, "juíza")),
          get_top5rank(tfidf_vsm(index, "governo")),
          get_top5rank(tfidf_vsm(index, "seleção"))]
          ,keys=["juíza","governo","seleção"])

Unnamed: 0,Unnamed: 1,document,score,r
juíza,0,1,9.656627,1.0
juíza,1,2,4.828314,2.0
juíza,2,3,0.0,3.0
juíza,3,4,0.0,4.0
juíza,4,5,0.0,5.0
governo,172,173,15.079156,1.0
governo,165,166,12.924991,2.0
governo,83,84,10.770825,3.0
governo,209,210,10.770825,4.0
governo,24,25,8.61666,5.0


### BM25

In [36]:
def f_bm25(query_vector, doc_vector, k):
    rec = {}
    for doc_id in range(len(doc_vector)):
        sim = 0
        vector = doc_vector[doc_id]
        for i in range(len(vector)):
            if vector[i] != 0:
                y = (k+1) * query_vector[i]
                dom = (query_vector[i] * y)/(query_vector[i]+k)
                sim += (dom * math.log10((documents + 1)/vector[i]))
        rec[doc_id+1] = sim
    return rec
    

def bm25_vsm(index, query, k):
    query_vector = tf_query_vector(index, query)   
    doc_vector = tf_document_vector(index)
    
    return f_bm25(query_vector, doc_vector, k)

pd.concat([get_top5rank(bm25_vsm(index, "juíza", 100)),
          get_top5rank(bm25_vsm(index, "governo", 100)),
          get_top5rank(bm25_vsm(index, "seleção", 100))]
          ,keys=["juíza","governo","seleção"])

Unnamed: 0,Unnamed: 1,document,score,r
juíza,1,2,2.39794,1.0
juíza,0,1,2.09691,2.0
juíza,2,3,0.0,3.0
juíza,3,4,0.0,4.0
juíza,4,5,0.0,5.0
governo,2,3,2.39794,1.0
governo,20,21,2.39794,2.0
governo,41,42,2.39794,3.0
governo,72,73,2.39794,4.0
governo,94,95,2.39794,5.0


#### Observando os resultados, o modelo binário foi o mais fraco, naturalmente. TF e TF-IDF retornaram documentos similares, tendo o TF-IDF uma precisão um tanto maior, exatamente pelo fato de usar o IDF. 
#### O resultado mais diferente que podemos observar é o bm25. Todos os outros modelos retornaram o documento 1 em primeiro e o 2 em segundo. Investigando os dois documentos, vemos que o documento 2 é um tanto menor que o 1. Logo, pelo cálculo de relevância do bm25, faz sentido que este seja realmente o primeiro colocado. Tendo em vista esta análise, o modelo bm25 é de fato o que retornou melhores recomendações.