In [1]:
import numpy as np
import pandas as pd
import collections
import nltk
from time import time
from abc import ABC, abstractmethod
import re
from inverted_index import get_inverted_index
from search_engine import search
from tokenizer import tokenize

### Creating constants that will be used over this report

In [2]:
COLUMN_AXIS = 1
FULL_REPORT_COLNAME = 'noticia'
CONTENT_COLNAME = 'conteudo'
SUBTITLE_COLNAME = 'subTitulo'
TITLE_COLNAME = 'titulo'
TOKENS_COLNAME = 'tokens'
TERM_COLNAME = 'term'
REPORT_ID_COLNAME = 'idNoticia'

In [3]:
df = pd.read_csv('../../data/estadao_noticias_eleicao.csv')

### Preprocessing data

Concatenating alls reports' title and content in just one column.

In [4]:
def concatenate_report(row):
    """Concatenate report title and content in just one column.
        
    Args:
        row (:obj: pandas.Series): one row observation from a pandas.DataFrame.            

    Return:
        str: full report (content with title) in lowercase.
    """
    title = str(row[TITLE_COLNAME])
    subtitle = str(row[SUBTITLE_COLNAME])
    content = str(row[CONTENT_COLNAME])
    full_report = title + " " + subtitle + " " + content
    return full_report.lower()

Replacing content values that are NaN (not a number) for an empty string.

In [5]:
empty_str = ""
df[TITLE_COLNAME].fillna(empty_str, inplace=True)
df[SUBTITLE_COLNAME].fillna(empty_str, inplace=True)
df[CONTENT_COLNAME].fillna(empty_str, inplace=True)

In [6]:
df[FULL_REPORT_COLNAME] = df.apply(
    lambda row: concatenate_report(row), axis=COLUMN_AXIS)

Selecting just report's id and full content columns:

In [7]:
df = df[[REPORT_ID_COLNAME, FULL_REPORT_COLNAME]]

### Tokenizing report's text and saving tokens in another column in dataframe

In [9]:
df = tokenize(df, FULL_REPORT_COLNAME, TOKENS_COLNAME)

Dataframe now looks like:

In [10]:
df.head()

Unnamed: 0,idNoticia,noticia,tokens
0,1,pt espera 30 mil pessoas em festa na esplanada...,"[pt, espera, 30, mil, pessoas, em, festa, na, ..."
1,2,alckmin toma posse de olho no planalto governa...,"[alckmin, toma, posse, de, olho, no, planalto,..."
2,3,seis obstáculos e desafios do segundo mandato ...,"[seis, obstáculos, e, desafios, do, segundo, m..."
3,4,veja as principais fotos do dia e dos eventos...,"[veja, as, principais, fotos, do, dia, e, dos,..."
4,5,veja as principais fotos do dia e dos eventos...,"[veja, as, principais, fotos, do, dia, e, dos,..."


### Creating inverted index

In [11]:
inverted_index = get_inverted_index(df)

# PARTE DOIS

### Anotações do lab

A decisão de salvar a frequência do documennto em uma estrutura fora do inverted_index é que por definição o inverted_index só contem a chave termo e o valor docs onde ele aparece e a frequência do termo em todos os docs

In [12]:
class TermEstimator:
    
    def __init__(self, df, inverted_index):
        self.df = df
        self.big_term_freq_dict = dict()
        self.inverted_index = inverted_index
        self._calc_terms_frequency()
    
    def get_tf(self, term, doc_id):
        return self.big_term_freq_dict[doc_id][term]
    
    def calc_idf(self, term):
        n_documents = self.get_number_of_docs()
        n_containing_term = self.get_number_of_docs_containing(term)
        idf = np.log((n_documents + 1) / (n_containing_term))
        return idf
    
    def calc_tfidf(self, term, doc_id):
        tf = self.get_tf(term, doc_id)
        idf = self.calc_idf(term)
        return tf * idf     
    
    def get_number_of_docs(self):
        NUMBER_OF_ROWS_INDEX = 0
        n_documents = self.df.shape[NUMBER_OF_ROWS_INDEX] 
        return n_documents
    
    def get_number_of_docs_containing(self, term):
        return len(self.inverted_index[term].get_docs_ids())
    
    def _calc_doc_terms_frequency(self, doc_id, doc_terms):
        terms_frequencies = collections.Counter(doc_terms)
        self.big_term_freq_dict[doc_id] = terms_frequencies
    
    def _calc_terms_frequency(self):
        self.df.apply(
            lambda row: self._calc_doc_terms_frequency(
                row[REPORT_ID_COLNAME], row[TOKENS_COLNAME]), 
            axis=COLUMN_AXIS)

In [13]:
class Scorer(ABC):
    
    def __init__(self, term_estimator):
        self.term_estimator = term_estimator

    @abstractmethod
    def sim(self, query_terms, doc_id):
        pass
        
    def ranking_search(self, query, search_result, k):
        ranking = []
        query_terms = query.split(" AND ")
        for doc_id in search_result:
            score = self.sim(query_terms, doc_id)
            ranking.append((doc_id, score))
        ranking = sorted(ranking, key=lambda t: t[1], reverse=True)
        top_k = ranking[:k]
        top_k = list(map(lambda x: x[0], top_k))
        return top_k

In [14]:
class BinaryScorer(Scorer):
    #TO DO: para esse falta retornar o ranking de modo não ordenado, para bater com o gabarito
    def __init__(self, term_estimator):
        Scorer.__init__(self, term_estimator)
        
    # essa função poderia ser simplificada retornando a quantidade de termos na query, visto que todos os docs
    # retornados num resultado de uma busca (que é uma conjuntiva múltipla) vão ter todos os termos da query
    def sim(self, query_terms, doc_id):
        n_matches_with_query = 0
        for query_term in query_terms:
            if doc_id in self.term_estimator.inverted_index[query_term].get_docs_ids():
                n_matches_with_query += 1
        return n_matches_with_query

In [15]:
class FrequencyScorer(Scorer):

    def __init__(self, term_estimator):
        Scorer.__init__(self, term_estimator)
        
    def sim(self, query_terms, doc_id):
        tf_accumulated = 0
        for query_term in query_terms:
            tf = self.term_estimator.get_tf(query_term, doc_id)
            tf_accumulated += tf
        return tf_accumulated

In [16]:
class FrequencyIDFScorer(Scorer):
    
    def __init__(self, term_estimator):
        Scorer.__init__(self, term_estimator)
        
    def sim(self, query_terms, doc_id):
        tfidf_accumulated = 0
        for query_term in query_terms:
            tfidf = self.term_estimator.calc_tfidf(query_term, doc_id)
            tfidf_accumulated += tfidf
        return tfidf_accumulated

In [17]:
class BM25Scorer(Scorer):
    
    def __init__(self, term_estimator):
        Scorer.__init__(self, term_estimator)
        
    def sim(self, query_terms, doc_id):
        score_accumulated = 0
        for query_term in query_terms:
            k = np.random.uniform(low=1.2, high=2)
            idf = self.term_estimator.calc_idf(query_term)
            tf = self.term_estimator.get_tf(query_term, doc_id)
            score = idf * (tf * (k+1) / (tf + k))
            score_accumulated += score
        return score_accumulated

## Consultas

In [18]:
querys = [
    "segundo turno",
    "lava jato",
    "projeto de lei",
    "compra de voto",
    "ministério público"
]

In [19]:
term_estimator = TermEstimator(df, inverted_index)

In [20]:
def search_ranked(query, scorer, k=5):
    boolean_query = " AND ".join(query.split(" "))
    search_result = search(boolean_query, inverted_index)
    ranked_result = scorer.ranking_search(boolean_query, search_result, k)
    return ranked_result

In [21]:
scorer = BinaryScorer(term_estimator)
binary_scorer_results = []
for query in querys:
    result = search_ranked(query, scorer)
    binary_scorer_results.append(result)

In [22]:
scorer = FrequencyScorer(term_estimator)
frequency_scorer_results = []
for query in querys:
    result = search_ranked(query, scorer)
    frequency_scorer_results.append(result)

In [23]:
scorer = FrequencyIDFScorer(term_estimator)
frequency_idf_scorer_results = []
for query in querys:
    result = search_ranked(query, scorer)
    frequency_idf_scorer_results.append(result)

In [24]:
scorer = BM25Scorer(term_estimator)
bm25_scorer_results = []
for query in querys:
    result = search_ranked(query, scorer)
    bm25_scorer_results.append(result)

## Comparando com Gabarito

In [25]:
gabarito = pd.read_csv('gabarito.csv')

In [26]:
from average_precision import mapk

In [27]:
GOOGLE_COL_NAME = 'google'
BINARY_SEARCH_COL_NAME = 'busca_binaria'
TF_COL_NAME = 'tf'
TFIDF_COL_NAME = 'tfidf'
BM25_COL_NAME = 'bm25'

In [28]:
def get_expected_results(expected_result_type):
    expected_answers = []
    from_df = gabarito[expected_result_type]
    for query_result in from_df:        
        as_str = re.sub('[,\[\]]', '', query_result)
        as_list = as_str.split(" ")
        list_of_int = list(map(int, as_list))
        expected_answers.append(list_of_int)
    return expected_answers

## Comparando minhas repostas com os modelos do gabarito

In [29]:
expected_binary_scorer_results = get_expected_results(BINARY_SEARCH_COL_NAME)
expected_frequency_scorer_results = get_expected_results(TF_COL_NAME)
expected_frequency_idf_scorer_results = get_expected_results(TFIDF_COL_NAME)
expected_bm25_scorer_results = get_expected_results(BM25_COL_NAME)

In [30]:
print(mapk(expected_binary_scorer_results, binary_scorer_results, k=5))
print(mapk(expected_frequency_scorer_results, frequency_scorer_results, k=5))
print(mapk(expected_frequency_idf_scorer_results, frequency_idf_scorer_results, k=5))
print(mapk(expected_bm25_scorer_results, bm25_scorer_results, k=5))

0.9199999999999999
1.0
0.7606666666666666
0.368


In [31]:
google_results = get_expected_results(GOOGLE_COL_NAME)

In [32]:
print(mapk(google_results, binary_scorer_results, k=5))
print(mapk(google_results, frequency_scorer_results, k=5))
print(mapk(google_results, frequency_idf_scorer_results, k=5))
print(mapk(google_results, bm25_scorer_results, k=5))

0.0
0.048
0.08399999999999999
0.13999999999999999


TODO: 
- documentar métodos