In [None]:
%%capture
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import nltk 
import heapq
from nltk import RegexpTokenizer as rpt
from nltk.corpus import stopwords as sw
from string import punctuation 

# Download the NLTK 'punkt' and 'stopwords' resources. Only the stopword
# corpus is referenced by the code visible in this section.
nltk.download('punkt')
nltk.download('stopwords')
# Portuguese stopword list; parse() drops any token found in it.
stopwords = sw.words('portuguese')

# Load the document collection from a remote CSV and replace missing values
# (NaN) with empty strings -- presumably so every `text` entry can be handed
# to the tokenizers as a string.
data_url="https://raw.githubusercontent.com/liraop/recinfo_lab2/master/data/results.csv"
data = pd.read_csv(data_url).replace(np.nan, '', regex=True)
# Number of non-missing entries in the `text` column.
documents = data.text.count()

def parse(text):
    """Tokenize *text* and return the tokens kept for indexing.

    Two regular-expression tokenizers are applied one after the other: the
    first extracts runs of word characters (``\\w+``), the second extracts
    runs of exactly four digits (``\\d{4}``). A token is kept only when it
    is longer than three characters and is not in the module-level
    ``stopwords`` list. No case normalisation is performed.

    NOTE: ``\\w`` also matches digits, so a standalone four-digit token
    (e.g. a year) is produced by both tokenizers and appears twice in the
    result.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the kept tokens of the first tokenizer followed by the
        kept tokens of the second, each in order of appearance.
    """
    tokenizers = (rpt(r'\w+'), rpt(r'\d{4}'))

    kept = []
    for tokenizer in tokenizers:
        kept += [
            candidate
            for candidate in tokenizer.tokenize(text)
            if len(candidate) > 3 and candidate not in stopwords
        ]
    return kept


def build_index(dataset):
    """Build an inverted index from the ``text`` column of *dataset*.

    Documents are numbered from 1 in iteration order. Each token returned by
    ``parse`` maps to a dict of ``{document_number: occurrences}``. The
    reserved key ``"doc_row"`` holds the list of all document numbers.

    Args:
        dataset: Object exposing an iterable ``text`` attribute whose items
            are passed to ``parse``.

    Returns:
        A dict mapping ``"doc_row"`` to the list of document numbers and
        every token to its per-document occurrence counts.
    """
    index = {"doc_row": []}

    for doc_number, entry in enumerate(dataset.text, start=1):
        index["doc_row"].append(doc_number)

        for term in parse(entry):
            postings = index.get(term)
            if postings is None:
                # First time this term is seen anywhere in the collection.
                index[term] = {doc_number: 1}
            elif doc_number in postings:
                # Term already seen in the current document.
                postings[doc_number] += 1
            else:
                # Known term, first occurrence in the current document.
                postings[doc_number] = 1

    return index
                        
# Build the inverted index over the whole collection once; it is the `index`
# argument passed to the association metrics defined below.
index = build_index(data)

# Query terms to analyse. Lookups are exact and case-sensitive, since parse()
# does not normalise case before indexing.
queries = ["juíza","federal","governo","Brasil","presidente"]

#### Calcule as top-10 palavras mais associadas a cada uma dessas 5 palavras de acordo com as 4 métricas que vimos na aula. Você deve produzir uma tabela similar à tabela 6.3 do capítulo 6 do livro-texto (pág. 204). Qual métrica você acha que obteve os melhores resultados? Por quê? (20 pts.)

In [None]:
def mim(index, word1, word2):
    """Return the mutual-information association score of two words.

    The score is ``n_ab / (n_a * n_b)``, where ``n_a`` and ``n_b`` are the
    numbers of documents containing *word1* and *word2* respectively, and
    ``n_ab`` is the number of documents containing both.

    Args:
        index: Inverted index mapping each word to a dict keyed by the
            documents that contain it.
        word1: First word.
        word2: Second word.

    Returns:
        The score as a float, or ``0.0`` when either word is absent from
        the index (instead of dividing by zero).
    """
    docs_word1 = index.get(word1)
    docs_word2 = index.get(word2)

    # A word that never occurs cannot co-occur with anything.
    if not docs_word1 or not docs_word2:
        return 0.0

    n_w1w2 = sum(1 for document in docs_word1 if document in docs_word2)
    # Each document frequency comes from that word's own posting list.
    n_w1xw2 = len(docs_word1) * len(docs_word2)
    return float(n_w1w2) / n_w1xw2


def dice(index, word1, word2):
    """Return the Dice-style association score of two words.

    The score is ``n_ab / (n_a + n_b)``, where ``n_a`` and ``n_b`` are the
    numbers of documents containing *word1* and *word2* respectively, and
    ``n_ab`` is the number of documents containing both. This is the
    rank-equivalent form of Dice's coefficient (the constant factor 2 is
    omitted, as in the original implementation).

    Args:
        index: Inverted index mapping each word to a dict keyed by the
            documents that contain it.
        word1: First word.
        word2: Second word.

    Returns:
        The score as a float, or ``0.0`` when either word is absent from
        the index.
    """
    docs_word1 = index.get(word1)
    docs_word2 = index.get(word2)

    # A word that never occurs cannot co-occur with anything; returning
    # early also avoids dividing by zero when both words are unknown.
    if not docs_word1 or not docs_word2:
        return 0.0

    n_w1w2 = sum(1 for document in docs_word1 if document in docs_word2)
    # Each document frequency comes from that word's own posting list.
    n_w1pw2 = len(docs_word1) + len(docs_word2)
    return float(n_w1w2) / n_w1pw2



def get_query_top10rank(index, query, metric):
    """Return the 10 indexed words most associated with *query*.

    Every word in *index* except the reserved ``'doc_row'`` key and the
    query itself is scored with ``metric(index, query, word)``. Words are
    ranked by descending score; ties keep the order in which the words
    appear in *index*.

    Args:
        index: Inverted index whose keys are the candidate words.
        query: The word to find associations for.
        metric: Callable ``(index, word1, word2) -> number`` giving the
            association score of two words.

    Returns:
        A list of exactly 10 entries: the best-scoring words, best first,
        padded with ``None`` when fewer than 10 candidates exist.
    """
    scored = [
        (word, metric(index, query, word))
        for word in index
        if word != 'doc_row' and word != query
    ]

    # sorted() is stable, also with reverse=True, so equal scores keep their
    # index order. The result is read positionally rather than by label.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_words = [word for word, _ in scored[:10]]
    # Keep the fixed-length contract of the original implementation.
    top_words.extend([None] * (10 - len(top_words)))
    return top_words

#### De acordo com a métrica que deu os melhores resultados na sua opinião, execute agora cada consulta (usando a abordagem documento- ou termo-por-vez) expandindo-a com: os top-3, top-5 e top-10 documentos. O que acontece com a qualidade dos resultados em cada caso? Aumenta ou diminui? Justifique bem sua resposta. (25 pts.)