In [None]:
import pandas as pd
import numpy as np
import nltk
from textblob import TextBlob
from math import log
import ast

In [None]:
dados = pd.read_csv('../data/estadao_noticias_eleicao.csv')
dados = dados.replace(np.nan, '', regex=True)

### Visão geral dos dados

In [None]:
dados.head()

In [None]:
print("Os dados tem %d observações e %d colunas" % dados.shape)

In [None]:
def calculaTF(documento, termo):
    return len(list(filter((lambda x: x.lower() == termo), documento)))

In [None]:
def calcula_indices_invertidos(data):
    indices = {}
    for index, row in data.iterrows():
        print(index, ",", row)
        titulo = TextBlob(row["titulo"]).words
        subTitulo = TextBlob(row["subTitulo"]).words
        conteudo = TextBlob(row["conteudo"]).words
        
        texto = titulo + subTitulo + conteudo
        
        for palavra in texto:
            palavra_low = palavra.lower()
            if(palavra_low in indices):
                temp = indices[palavra_low]
                if(row["idNoticia"] not in temp):
                    temp[row["idNoticia"]] = calculaTF(texto, palavra_low)
                    indices[palavra_low] = temp
            else:
                indices[palavra_low] = {row["idNoticia"]: calculaTF(texto, palavra_low)}
    return(indices)

indices_invertidos = calcula_indices_invertidos(dados)

In [None]:
n_documentos = dados.shape[0]

def idf(palavra):
    palavra = palavra.lower()
    k = len(indices_invertidos[palavra])
    idf = log((n_documentos+1)/k)
    
    return(idf)

In [None]:
#representação binária;
#TF;
#TF-IDF; 
# BM25

In [None]:
def calcula_binario(query, documento):
    total = 0
    for palavra in query:
        if(palavra in indices_invertidos):
            temp = indices_invertidos[palavra]
            if(documento in temp):
                total += 1
    return(total) 

def calcula_tf(query, documento):
    total = 0
    for palavra in query:
        if(palavra in indices_invertidos):
            temp = indices_invertidos[palavra]
            if(documento in temp):
                total += temp[documento]
    return(total)

def calcula_tf_idf(query, documento):
    total = 0
    for palavra in query:
        if(palavra in indices_invertidos):
            temp = indices_invertidos[palavra]
            if(documento in temp):
                total += (temp[documento] * idf(palavra))
    return(total)

def calcula_bm25(query, documento):
    total = 0
    k = 1.5
    for palavra in query:
        if(palavra in indices_invertidos):
            temp = indices_invertidos[palavra]
            if(documento in temp):
                total += ( ((temp[documento]*(k+1)) / (temp[documento]*k)) * idf(palavra) )
    return(total)

In [None]:
def busca_documentos(query):
    documentos = set()
    for palavra in query:
        if(palavra in indices_invertidos):
            documentos = documentos | set(indices_invertidos[palavra])
    return(list(documentos))

In [None]:
def ranking_binario(query):
    docs = busca_documentos(query)
    bin_docs = []
    for doc in docs:
        bin_docs.append(calcula_binario(query, doc))
    docs_ordenado = [docs[i] for i in np.argsort(bin_docs)[::-1]]
    return(docs_ordenado[0:5])

ranking_binario(["lava", "jato"])

In [None]:
def ranking_tf(query):
    docs = busca_documentos(query)
    tf_docs = []
    for doc in docs:
        tf_docs.append(calcula_tf(query, doc))
    docs_ordenado = [docs[i] for i in np.argsort(tf_docs)[::-1]]
    return(docs_ordenado[0:5])

ranking_tf(["segundo", "turno"])

In [None]:
def ranking_tf_idf(query):
    docs = busca_documentos(query)
    idf_docs = []
    for doc in docs:
        idf_docs.append(calcula_tf_idf(query, doc))
    docs_ordenado = [docs[i] for i in np.argsort(idf_docs)[::-1]]
    return(docs_ordenado[0:5])

ranking_tf_idf(["segundo", "turno"])   

In [None]:
def ranking_BM25(query):
    docs = busca_documentos(query)
    bm25_docs = []
    for doc in docs:
        bm25_docs.append(calcula_bm25(query, doc))
    docs_ordenado = [docs[i] for i in np.argsort(bm25_docs)[::-1]]
    return(docs_ordenado[0:5])

ranking_BM25(["compra", "de", "voto"])


In [172]:
gabarito = pd.read_csv('../data/gabarito.csv')
gabarito.head()

Unnamed: 0,str_busca,google,busca_binaria,tf,tfidf,bm25
0,segundo turno,"[1062, 1942, 2161, 2078, 2073]","[2048, 1, 2049, 2050, 4096]","[2744, 7, 2112, 7672, 2388]","[2744, 2112, 7672, 1235, 2388]","[2744, 2112, 7672, 2388, 2178]"
1,lava jato,"[616, 164, 1734, 163, 6716]","[3, 13, 15, 27, 6177]","[163, 353, 2807, 127, 359]","[163, 353, 2807, 127, 359]","[163, 353, 2807, 127, 359]"
2,projeto de lei,"[2853, 275, 978, 7092, 3171]","[3584, 6145, 8194, 8706, 6660]","[7, 3942, 7017, 1250, 6942]","[2232, 6461, 2853, 3171, 3942]","[2232, 6461, 3171, 2853, 3170]"
3,compra de voto,"[2200, 8615, 2265, 7746, 82]","[7424, 2178, 6531, 5122, 2311]","[3942, 7017, 5129, 2047, 748]","[7343, 7293, 6791, 3942, 2047]","[7343, 7293, 6791, 7329, 8615]"
4,ministério público,"[64, 6652, 164, 6550, 8615]","[8194, 7, 4104, 8201, 4109]","[6798, 8018, 6244, 6965, 6550]","[6798, 8018, 6244, 6965, 6550]","[6798, 8018, 6244, 6965, 6550]"


In [None]:
def apk(actual, predicted, k=10):
    if (len(predicted)>k):
        predicted = predicted[:k]
        
    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if (not actual):
        return 0.0

    return (score / min(len(actual), k))

def mapk(actual, predicted, k=10):
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [162]:
consultas = [
    ["segundo", "turno"],
    ["lava", "jato"],
    ["projeto", "de", "lei"],
    ["compra", "de", "voto"],
    ["ministério", "público"]
]

In [173]:
result_rep_binaria = []
result_tf = []
result_tf_idf = []
result_bm25 = []
for query in consultas:
    result_rep_binaria.append(ranking_binario(query))
    result_tf.append(ranking_tf(query))
    result_tf_idf.append(ranking_tf_idf(query))
    result_bm25.append(ranking_BM25(query))

In [183]:
expected_rep_binaria = gabarito["busca_binaria"].tolist()
expected_tf = gabarito["tf"].tolist()
expected_tf_idf = gabarito["tfidf"].tolist()
expected_bm25 = gabarito["bm25"].tolist()
expected_google = gabarito["google"].tolist()

for l in range(len(expected_rep_binaria)):
    expected_rep_binaria[l] = ast.literal_eval(expected_rep_binaria[l])
    expected_tf[l] = ast.literal_eval(expected_tf[l])
    expected_tf_idf[l] = ast.literal_eval(expected_tf_idf[l])
    expected_bm25[l] = ast.literal_eval(expected_bm25[l])
    expected_google[l] = ast.literal_eval(expected_google[l])

In [187]:
print(mapk(expected_rep_binaria, result_rep_binaria, k=5))
print(mapk(expected_tf, result_tf, k=5))
print(mapk(expected_tf_idf, result_tf_idf, k=5))
print(mapk(expected_bm25, result_bm25, k=5))


0.04
0.63
0.5940000000000001
0.013333333333333332
