In [1]:
import numpy as np
import pandas as pd
from textblob import TextBlob as tb
from time import time

In [2]:
# Load the news reports CSV; ',' is already the default separator of read_csv.
df = pd.read_csv('../data/noticias_estadao.csv')

# Getting to know the data

In [3]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,titulo,conteudo,idNoticia
0,PT espera 30 mil pessoas em festa na Esplanada,BRASÍLIA - Após o desgaste provocado com o lan...,1
1,Alckmin toma posse de olho no Planalto,"Reeleito em outubro, o governador tucano Geral...",2
2,Seis obstáculos e desafios do segundo mandato ...,1. Rearranjo das contas A nova equipe econôm...,3
3,Veja os desafios dos governadores que assumem ...,"No Acre, governador reeleito quer erradicar an...",4
4,PT impulsiona cerimônia de posse da Dilma nas ...,"Os perfis da presidente Dilma Rousseff, nas re...",5


In [4]:
# df.shape is a (rows, columns) pair; unpack it to feed the message.
row_count, column_count = df.shape
print("Data has %d rows and %d columns" % (row_count, column_count))

Data has 7643 rows and 3 columns


# Creating constants that will be used throughout this notebook

In [5]:
# Axis value handed to DataFrame.apply in this notebook so that the applied
# function receives one row at a time.
COLUMN_AXIS = 1

# Names of the columns read from the CSV file.
TITLE_COLNAME = 'titulo'
CONTENT_COLNAME = 'conteudo'
REPORT_ID_COLNAME = 'idNoticia'

# Names of the columns created along the way.
FULL_REPORT_COLNAME = 'noticia'
TOKENS_COLNAME = 'tokens'
TERM_COLNAME = 'term'

# Boolean operators recognised in queries.
AND = 'AND'
OR = 'OR'

# Concatenating each report's title and content into a single column

In [6]:
def concatenate_report(df_row):
    """Build the full text of a report from its title and its content.

        Args:
            df_row (:obj: pandas.Series): a single row of the reports DataFrame,
                holding the title and content columns.

        Return:
            str: title and content joined by one space, in lowercase.
    """

    title = df_row[TITLE_COLNAME]
    content = df_row[CONTENT_COLNAME]
    return (title + " " + content).lower()

In [7]:
# Add the lowercased "title + content" text of every report as a new column.
df[FULL_REPORT_COLNAME] = df.apply(concatenate_report, axis=COLUMN_AXIS)

Selecting only the report id and full-content columns:

In [8]:
# Keep only the report id and the full text. The explicit .copy() makes `df`
# an independent frame, so adding columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning.
df = df[[REPORT_ID_COLNAME, FULL_REPORT_COLNAME]].copy()

The DataFrame now looks like this:

In [9]:
# Preview the first five rows after the column selection.
df.head(n=5)

Unnamed: 0,idNoticia,noticia
0,1,pt espera 30 mil pessoas em festa na esplanada...
1,2,alckmin toma posse de olho no planalto reeleit...
2,3,seis obstáculos e desafios do segundo mandato ...
3,4,veja os desafios dos governadores que assumem ...
4,5,pt impulsiona cerimônia de posse da dilma nas ...


# Tokenizing each report's text and saving the tokens in another DataFrame column

In [10]:
def tokenize_text(df_row):
    """Split the full text of a report into its distinct tokens.

        Args:
            df_row (:obj: pandas.Series): a single DataFrame row holding the
                full report text.

        Return:
            set: the words TextBlob extracts from the text, without repetition.
    """

    report_text = df_row[FULL_REPORT_COLNAME]
    return set(tb(report_text).words)

In [11]:
# Store, for every report, the set of tokens found in its full text.
df[TOKENS_COLNAME] = df.apply(tokenize_text, axis=COLUMN_AXIS)

# Creating inverted index

First, we create an intermediate structure called unnested_tokens. It stores each token of a report together with that report's id. We then group this structure by token to obtain, for each token, the ids of all reports in which it appears.

In [12]:
def unnest_tokens_report(unnested_tokens_list, df_row):
    """Append one {term, report id} dict per token of a report to a list.

    Each token of the row is stripped of surrounding single quotes and then
    of surrounding whitespace before being stored as the term. The list is
    modified in place and nothing is returned.

        Args:
            unnested_tokens_list (list): list of dicts that receives the new
                entries; each dict holds a term and the id of the report
                where it occurred.
            df_row (:obj: pandas.Series): a single DataFrame row holding the
                report id and its tokens.
    """

    unnested_tokens_list.extend(
        {
            TERM_COLNAME: raw_token.strip('\'').strip(),
            REPORT_ID_COLNAME: df_row[REPORT_ID_COLNAME],
        }
        for raw_token in df_row[TOKENS_COLNAME]
    )

In [13]:
unnested_tokens_list = []

# Walk the rows with an explicit loop instead of DataFrame.apply:
# unnest_tokens_report works purely by appending to unnested_tokens_list, and
# apply is not meant for functions with side effects (older pandas releases
# may call the function twice on the first row, which would duplicate that
# report's tokens in the list).
for _row_label, report_row in df.iterrows():
    unnest_tokens_report(unnested_tokens_list, report_row)

print("The unnested_tokens_list looks like: \n")
print(unnested_tokens_list[:9])
print("\nThe 'list of dicts' format will be used to create a pandas.DataFrame:")

The unnested_tokens_list looks like: 

[{'term': 'edna', 'idNoticia': 1}, {'term': 'mandato', 'idNoticia': 1}, {'term': 'nos', 'idNoticia': 1}, {'term': 'viagem', 'idNoticia': 1}, {'term': 'entre', 'idNoticia': 1}, {'term': 'mulher', 'idNoticia': 1}, {'term': 'alguém', 'idNoticia': 1}, {'term': 'e', 'idNoticia': 1}, {'term': 'poderes', 'idNoticia': 1}]

The 'list of dicts' format will be used to create a pandas.DataFrame:


In [14]:
# Turn the list of {term, report id} dicts into a DataFrame, one row per dict.
unnested_tokens_df = pd.DataFrame(data=unnested_tokens_list)
unnested_tokens_df.head(n=10)

Unnamed: 0,idNoticia,term
0,1,edna
1,1,mandato
2,1,nos
3,1,viagem
4,1,entre
5,1,mulher
6,1,alguém
7,1,e
8,1,poderes
9,1,rolls-royce


### Grouping by term to create inverted index

In [15]:
class InvertedIndexTermOccurrence(object):
    """Occurrence record kept in the inverted index for a single term.

    Attributes:
        term_freq (int): number of docs in which the term appears.
        docs_ids (collection): ids of the docs in which the term appears,
            kept exactly as handed to the constructor.
    """

    def __init__(self, term_freq, docs_ids):
        """Store both values as given; nothing is copied or validated."""
        self.term_freq, self.docs_ids = term_freq, docs_ids

    def get_docs_ids(self):
        """Return the stored doc ids (the same object, not a copy)."""
        return self.docs_ids

    def get_term_freq(self):
        """Return the stored term frequency."""
        return self.term_freq

In [16]:
def create_inverted_index_structure(unnested_tokens_df):
    """Build the inverted index: a dict mapping each term to its occurrences.

        Args:
            unnested_tokens_df (:obj: pandas.DataFrame): one row per
                (term, report id) pair, in the columns named by TERM_COLNAME
                and REPORT_ID_COLNAME.

        Return:
            dict: term -> InvertedIndexTermOccurrence holding the set of ids
                of the reports that contain the term and the size of that set.
    """

    inverted_index = dict()

    # Group by the column name itself (not a one-element list) so every key is
    # the plain term value; with a list, recent pandas versions yield 1-tuples
    # such as ('term',), which would never match a looked-up query term.
    ids_by_term = unnested_tokens_df.groupby(TERM_COLNAME)[REPORT_ID_COLNAME]

    for term, report_ids in ids_by_term:
        docs_ids = set(report_ids)
        # Count distinct reports rather than rows, so a (term, report) pair
        # that shows up more than once in the input is counted once. This
        # also avoids DataFrame.get_values(), removed in pandas 1.0.
        inverted_index[term] = InvertedIndexTermOccurrence(len(docs_ids), docs_ids)

    return inverted_index

In [17]:
# Build the term -> occurrences index from the unnested (term, report id) table.
inverted_index = create_inverted_index_structure(unnested_tokens_df=unnested_tokens_df)

## Processing Queries

In [18]:
def is_one_term_query(query):
    """Tell whether a query consists of a single term.

    A query counts as single-term when splitting it on the space character
    yields exactly one piece; the one-space query " " is also reported as
    single-term.

        Args:
            query (str): the raw query text.

        Return:
            bool: True for a single-term query, False otherwise.

        Raises:
            ValueError: if query is the empty string.
    """

    # Guard clause: there is nothing to search for in an empty query.
    if query == "":
        raise ValueError('You should search for a non empty string.')

    separator = " "
    return query == separator or len(query.split(separator)) == 1

In [19]:
def get_query_operator(query):
    """Return the boolean operator used by a query.

        Args:
            query (str): the raw query text.

        Return:
            str: AND when the query contains AND as a separate word,
                otherwise OR.
    """
    # Match AND as a whole whitespace-separated word: a plain substring test
    # would also fire on terms that merely contain it (e.g. "ANDRADE OR silva").
    return AND if AND in query.split() else OR

In [20]:
def search_two_terms(terms_to_search, operator):
    """Combine the reports of two terms with a boolean operator.

    Both terms are looked up in the module-level ``inverted_index``.

        Args:
            terms_to_search (list): the two terms to look up; only the first
                two positions are read.
            operator (str): AND to keep the reports containing both terms,
                OR to keep the reports containing at least one of them.

        Return:
            list: ids of the matching reports.

        Raises:
            KeyError: if one of the terms is not in the index.
            ValueError: if operator is neither AND nor OR.
    """

    term1 = terms_to_search[0]
    term2 = terms_to_search[1]
    docs_ids_term1 = inverted_index[term1].get_docs_ids()
    docs_ids_term2 = inverted_index[term2].get_docs_ids()

    if operator == AND:
        return list(docs_ids_term1 & docs_ids_term2)

    if operator == OR:
        return list(docs_ids_term1 | docs_ids_term2)

    # Fail with a clear message instead of reaching the return statement with
    # no result ever computed.
    raise ValueError('Unsupported operator: %r' % (operator,))

In [21]:
def search(query, inverted_index):
    """Run a one-term or a two-term boolean query against the inverted index.

    Queries are matched case-insensitively: every term is lowercased before
    the lookup. Blanks around the query and around each term are ignored.

        Args:
            query (str): a single term, or two terms joined by AND or OR
                (e.g. "Campina AND Grande").
            inverted_index (dict): term -> InvertedIndexTermOccurrence.

        Return:
            The ids of the matching reports: the index's own collection of
            ids for a one-term query, a list for a two-term query.

        Raises:
            ValueError: if the query is empty or is not a single term or two
                terms joined by an operator.
            KeyError: if a searched term is not in the index.
    """

    # Surrounding blanks carry no meaning and would otherwise make a single
    # term look like a multi-term query.
    query = query.strip()

    if is_one_term_query(query):
        # Lowercase here as well, exactly as the two-term branch does.
        return inverted_index[query.lower()].get_docs_ids()

    operator = get_query_operator(query)
    terms_to_search = [
        term.strip().lower()  # lowercase all terms to search
        for term in query.split(" " + operator + " ")
    ]

    if len(terms_to_search) < 2:
        raise ValueError(
            'A query must be a single term or two terms joined by AND or OR.'
        )

    # NOTE(review): search_two_terms is not handed the inverted_index argument;
    # it resolves the terms on its own.
    return search_two_terms(terms_to_search, operator)

# Sanity checks

In [22]:
# "Campina AND Grande" must return exactly the expected report ids.
query = "Campina AND Grande"
search_result = list(search(query, inverted_index))
search_result.sort()
correct_answer = [1068, 1370, 1770, 1952, 1987, 2763, 2777, 2779, 4802, 5382, 5870, 6694]
assert search_result == correct_answer

In [24]:
# NOTE(review): commented-out, loop-based draft that maps each token to a list
# of report ids. It is not runnable as written: it refers to TOKENS_COL and
# `data`, which are not defined in the cells shown here.
# inverted_index = dict()

# ROW = 0
# for row_index in range(df.shape[ROW]):
#     for token in df[TOKENS_COL][row_index]:
        
#         report_id = data.at[row_index, REPORT_ID_COLNAME]
        
#         if token in inverted_index:
#             inverted_index[token].append(report_id)
#         else:
#             inverted_index[token] = [report_id]

TODO (open question): should queries be case-sensitive?
TODO (open question): what about stop words?
TODO (open question): is it allowed to use set functions?