In [179]:
import numpy as np
import pandas as pd
from textblob import TextBlob as tb
from time import time

In [180]:
# Read the news CSV stored under the sibling data/ directory.
df = pd.read_csv(filepath_or_buffer='../data/noticias_estadao.csv')

# Getting to know the data

In [181]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,titulo,conteudo,idNoticia
0,11 dos eleitores do País são filiados a legendas,Há porém variações regionais nesse fenômeno En...,7617
1,11 executivos integram 1º pedido de condenação...,CURITIBA A força-tarefa da Operação Lava Jato ...,412
2,11 executivos integram 1º pedido de condenação...,CURITIBA A força-tarefa da Operação Lava Jato ...,415
3,13 de deputados do PMDB quer romper com PT,O Estado ouviu 54 dos 74 deputados do PMDB em ...,6736
4,2014 começou em 2007,O estudo do Estadão Dados publicado ontem sobr...,7611


In [182]:
# df.shape is a (rows, columns) tuple; unpack it to fill the two placeholders.
row_count, column_count = df.shape
print("Data has %d rows and %d columns" % (row_count, column_count))

Data has 7643 rows and 3 columns


# Creating constants that will be used throughout this notebook

In [183]:
# Axis value that makes DataFrame.apply call the function once per row.
COLUMN_AXIS = 1

# Names of the DataFrame columns handled in this notebook.
TITLE_COLNAME, CONTENT_COLNAME = 'titulo', 'conteudo'
REPORT_ID_COLNAME = 'idNoticia'
FULL_REPORT_COLNAME = 'noticia'
TOKENS_COLNAME = 'tokens'
TERM_COLNAME = 'term'

# Boolean operators accepted in search queries.
AND, OR = 'AND', 'OR'

# Concatenating each report's title and content into a single column

In [184]:
def concatenate_report(row):
    """Build the full text of a report by joining its title and its content.

    Args:
        row (:obj: pandas.Series): one row of the reports DataFrame; its
            title and content entries are read.

    Return:
        str: title and content separated by one space, all in lowercase.
    """

    title = row[TITLE_COLNAME]
    content = row[CONTENT_COLNAME]
    return (title + " " + content).lower()

In [185]:
# Each row is handed to concatenate_report; the returned text fills a new column.
df[FULL_REPORT_COLNAME] = df.apply(concatenate_report, axis=COLUMN_AXIS)

Selecting just report's id and full content columns:

In [186]:
# Keep only the id and full-text columns. The explicit copy makes `df` an
# independent DataFrame, so adding a column to it later on does not trigger
# pandas' SettingWithCopyWarning.
df = df[[REPORT_ID_COLNAME, FULL_REPORT_COLNAME]].copy()

Dataframe now looks like:

In [187]:
# Preview the first five rows after the column selection.
df.head(n=5)

Unnamed: 0,idNoticia,noticia
0,7617,11 dos eleitores do país são filiados a legend...
1,412,11 executivos integram 1º pedido de condenação...
2,415,11 executivos integram 1º pedido de condenação...
3,6736,13 de deputados do pmdb quer romper com pt o e...
4,7611,2014 começou em 2007 o estudo do estadão dados...


# Tokenizing report's text and saving tokens in another column in dataframe

In [188]:
def tokenize_text(row):
    """Split the full text of a report into its distinct tokens.

    Args:
        row (:obj: pandas.Series): one row of the reports DataFrame; its
            full-report entry is read.

    Return:
        set: the words TextBlob extracts from the text, without repetitions.
    """

    return set(tb(row[FULL_REPORT_COLNAME]).words)

In [189]:
# Each row is handed to tokenize_text; the returned set fills the tokens column.
df[TOKENS_COLNAME] = df.apply(tokenize_text, axis=COLUMN_AXIS)

# Creating inverted index

First, we will create an intermediate structure called unnested_tokens. This structure stores each token of a report together with that report's id. After this step, we will group the unnested_tokens structure by token, obtaining the ids of all reports in which a given token appears.

In [190]:
def unnest_tokens_report(unnested_tokens_list, row):
    """Emit one dict (term value and report id) for every token of a report.

    The dicts are added to the end of unnested_tokens_list, which is modified
    in place; nothing is returned.

    Args:
        unnested_tokens_list (list): accumulator that receives the dicts.
        row (:obj: pandas.Series): one row of the reports DataFrame; its
            tokens and report id entries are read.
    """

    unnested_tokens_list.extend(
        {
            # Drop surrounding single quotes first, then surrounding whitespace.
            TERM_COLNAME: raw_token.strip('\'').strip(),
            REPORT_ID_COLNAME: row[REPORT_ID_COLNAME],
        }
        for raw_token in row[TOKENS_COLNAME]
    )

In [191]:
# apply is used only for its side effect here: every call appends the dicts
# of one report to unnested_tokens_list.
unnested_tokens_list = list()
df.apply(
    lambda report_row: unnest_tokens_report(unnested_tokens_list, report_row),
    axis=COLUMN_AXIS,
)

print("The unnested_tokens_list looks like: \n")
print(unnested_tokens_list[:9])
print("\nThe 'list of dicts' format will be used to create a pandas.DataFrame:")

The unnested_tokens_list looks like: 

[{'idNoticia': 7617, 'term': 'maneira'}, {'idNoticia': 7617, 'term': 'vereador'}, {'idNoticia': 7617, 'term': 'filiação'}, {'idNoticia': 7617, 'term': 'médios'}, {'idNoticia': 7617, 'term': 'eleitores'}, {'idNoticia': 7617, 'term': 'dinâmica'}, {'idNoticia': 7617, 'term': 'dados'}, {'idNoticia': 7617, 'term': 'semelhante'}, {'idNoticia': 7617, 'term': 'santa'}]

The 'list of dicts' format will be used to create a pandas.DataFrame:


In [192]:
# One DataFrame row per dict of the list: a term and the id of a report containing it.
unnested_tokens_df = pd.DataFrame(data=unnested_tokens_list)
unnested_tokens_df.head(n=10)

Unnamed: 0,idNoticia,term
0,7617,maneira
1,7617,vereador
2,7617,filiação
3,7617,médios
4,7617,eleitores
5,7617,dinâmica
6,7617,dados
7,7617,semelhante
8,7617,santa
9,7617,ranking


### Grouping by term to create inverted index

In [193]:
class InvertedIndexTerm:
    """Entry of an inverted index: a term, its frequency and the docs holding it.

    Attributes:
        term: the indexed term.
        freq (int): quantity of docs in which the term appears, as given
            by the caller.
        docs_ids: ids of the docs in which the term appears.
    """

    def __init__(self, term, freq, docs_ids):
        self.term, self.freq, self.docs_ids = term, freq, docs_ids

    def get_docs_ids(self):
        """Return the ids of the docs in which the term appears."""
        return self.docs_ids

    def get_freq(self):
        """Return the frequency stored for the term."""
        return self.freq

    def get_term(self):
        """Return the indexed term."""
        return self.term

In [194]:
def create_inverted_index_structure(unnested_tokens_df):
    """Build the inverted index from the unnested (term, report id) table.

    Args:
        unnested_tokens_df (:obj: pandas.DataFrame): one row per occurrence of
            a term in a report, with a term column and a report id column.

    Return:
        dict: maps each term to an InvertedIndexTerm holding the term, the
            number of distinct reports containing it and the set of their ids.
    """

    inverted_index = dict()

    # Group by the column name itself rather than by a one-element list, so
    # that every group key is the plain term (list keys come back as 1-tuples
    # in recent pandas versions, which would break lookups by term).
    ids_per_term = unnested_tokens_df.groupby(TERM_COLNAME)[REPORT_ID_COLNAME]

    for term, report_ids in ids_per_term:
        docs_ids = set(report_ids)
        # Count distinct reports: the same (term, id) pair can occur more than
        # once in the table, and DataFrame.get_values() no longer exists in
        # current pandas.
        inverted_index[term] = InvertedIndexTerm(term, len(docs_ids), docs_ids)

    return inverted_index

In [195]:
# Build the index that every search below queries.
m_inverted_index = create_inverted_index_structure(
    unnested_tokens_df=unnested_tokens_df
)

## Processing Queries

In [196]:
def is_one_term_query(query):
    """Tell whether a query is made of a single term.

    Args:
        query (str): the raw query text.

    Return:
        bool: True when the query holds exactly one whitespace-separated word.

    Raises:
        ValueError: if the query is empty or contains only whitespace.
    """

    # split() without arguments ignores leading/trailing whitespace and
    # collapses runs of it, so "  word " still counts as a one term query
    # and "   " is recognised as empty.
    words = query.split()

    if not words:
        raise ValueError('You should search for a non empty string.')

    return len(words) == 1

In [197]:
def get_query_operator(query):
    """Pick the boolean operator used in a query.

    Args:
        query (str): the raw query, e.g. "debate AND presidencial".

    Return:
        str: AND when the query contains AND as a standalone word,
            otherwise OR.
    """

    # Compare whole words: a plain substring test would also fire on terms
    # that merely contain the operator's letters, such as "BRAND".
    return AND if AND in query.split() else OR

In [198]:
def lowercase_iterable_itens(iterable):
    """Return a new list with the lowercase version of every item of iterable."""
    return [item.lower() for item in iterable]

In [199]:
def sort_terms_list_per_freq(terms, inverted_index):
    """Look terms up in the index and order the entries by ascending frequency.

    Args:
        terms (iterable): terms to look up; each must be a key of inverted_index.
        inverted_index (dict): maps a term to an object exposing get_freq().

    Return:
        list: the looked-up index entries, least frequent first.
    """

    index_entries = (inverted_index[term] for term in terms)
    return sorted(index_entries, key=lambda entry: entry.get_freq())

In [200]:
def _boolean_search(terms_to_search, operator, inverted_index):
    """Combine the doc ids of several terms with a boolean operator.

    Args:
        terms_to_search (list): terms of the query, spelled as in the index.
        operator (str): AND to intersect the doc ids of the terms, OR to
            unite them.
        inverted_index (dict): maps a term to its InvertedIndexTerm.

    Return:
        list: ids of the docs matching the query; empty when nothing matches.
    """

    known_terms = [term for term in terms_to_search if term in inverted_index]

    # A term missing from the index occurs in no doc: it empties an AND query
    # and adds nothing to an OR query (instead of raising KeyError).
    if not known_terms:
        return []
    if operator == AND and len(known_terms) < len(terms_to_search):
        return []

    sorted_terms_per_freq = sort_terms_list_per_freq(known_terms, inverted_index)

    # Start from the least frequent term.
    result = sorted_terms_per_freq[0].get_docs_ids()

    # The first term is already in result; only the remaining ones are combined.
    for another_term in sorted_terms_per_freq[1:]:

        docs_ids = another_term.get_docs_ids()

        if operator == AND:
            result = result & docs_ids
        elif operator == OR:
            result = result | docs_ids

    # list() also copies, so the caller never receives the index's own set.
    return list(result)

In [221]:
def search(query, inverted_index):
    """Run a one term or boolean (AND / OR) query against the inverted index.

    Args:
        query (str): either a single term or terms joined by one operator,
            e.g. "Campina AND Grande". Terms are lowercased before lookup.
        inverted_index (dict): maps a term to its InvertedIndexTerm.

    Return:
        The ids of the matching docs: for a one term query, the collection
        stored in the index for that term (an empty set if the term is not
        indexed); for a boolean query, the list built by _boolean_search.

    Raises:
        ValueError: if the query is empty (raised by is_one_term_query).
    """

    # Surrounding whitespace is not part of any term.
    query = query.strip()

    if is_one_term_query(query):
        term = query.lower()
        # An unknown term matches no doc; do not leak a KeyError to the caller.
        if term not in inverted_index:
            return set()
        return inverted_index[term].get_docs_ids()

    operator = get_query_operator(query)
    terms_to_search = [
        term.strip() for term in query.split(" " + operator + " ")
    ]
    terms_to_search = lowercase_iterable_itens(terms_to_search)

    return _boolean_search(terms_to_search, operator, inverted_index)

# Sanity checks

In [222]:
# Expected ids, sorted so that the comparison ignores the order of the result.
correct_answer = sorted([
    1952, 4802, 1987, 6694, 5382, 1770,
    2763, 1068, 5870, 2777, 1370, 2779,
])
search_result = sorted(search("Campina AND Grande", m_inverted_index))
assert search_result == correct_answer

# Tests

### 1. debate, presidencial (AND and OR)

In [223]:
# Each pair holds a query and the number of reports it must match.
for boolean_query, expected_count in (
    ("debate OR presidencial", 1770),
    ("debate AND presidencial", 201),
):
    assert len(search(boolean_query, m_inverted_index)) == expected_count

### 2. presidenciáveis, corruptos (AND and OR)

In [224]:
# Each pair holds a query and the number of reports it must match.
for boolean_query, expected_count in (
    ("presidenciáveis OR corruptos", 164),
    ("presidenciáveis AND corruptos", 0),
):
    assert len(search(boolean_query, m_inverted_index)) == expected_count

### 3. Belo, Horizonte (AND and OR)

In [225]:
# Each pair holds a query and the number of reports it must match.
for boolean_query, expected_count in (
    ("Belo OR Horizonte", 331),
    ("Belo AND Horizonte", 242),
):
    assert len(search(boolean_query, m_inverted_index)) == expected_count

### 4. candidatos (one term query)

In [226]:
# Count the reports first, then fill the message template with the count.
n_reports = len(search("candidatos", m_inverted_index))
msg_out = "Searching for word 'candidatos' results in %d reports containing this word."
print(msg_out % n_reports)

Searching for word 'candidatos' results in 1395 reports containing this word.


# Bonus

In [None]:
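# Alternative (not executed): build the inverted index with plain loops,
# mapping each token to the list of ids of the reports that contain it.

# inverted_index = dict()

# ROW = 0
# for row_index in range(df.shape[ROW]):
#     for token in df[TOKENS_COLNAME][row_index]:
        
#         report_id = df.at[row_index, REPORT_ID_COLNAME]
        
#         if token in inverted_index:
#             inverted_index[token].append(report_id)
#         else:
#             inverted_index[token] = [report_id]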