In [None]:
import numpy as np
import pandas as pd
from textblob import TextBlob as tb
from time import time

In [None]:
# Load the CSV the rest of the notebook works on. The path is relative, so it is
# resolved against the kernel's working directory.
df = pd.read_csv('../data/noticias_estadao.csv')

# Getting to know the data

In [None]:
# Preview the first rows of the data as loaded from the CSV.
df.head()

In [None]:
# df.shape is a (rows, columns) tuple, so it fills both %d placeholders directly.
print("Data has %d rows and %d columns" % df.shape)

# Creating constants that will be used throughout this notebook

In [None]:
# Passed as `axis=` to DataFrame.apply; with axis=1 the applied function receives
# one row at a time (it is applied along the column axis).
COLUMN_AXIS = 1
# Column added by this notebook: title and content joined and lowercased.
FULL_REPORT_COLNAME = 'noticia'
# Columns read from the loaded data.
CONTENT_COLNAME = 'conteudo'
TITLE_COLNAME = 'titulo'
# Column added by this notebook: the set of tokens of each report.
TOKENS_COLNAME = 'tokens'
# Column of the unnested (term, report id) table.
TERM_COLNAME = 'term'
# Column holding the report identifier.
REPORT_ID_COLNAME = 'idNoticia'
# Boolean operators accepted in queries (matched in uppercase).
AND = 'AND'
OR = 'OR'

# Concatenating each report's title and content into a single column

In [None]:
def concatenate_report(row):
    """Concatenate a report's title and content into one lowercase string.

    Missing values (``None`` or ``NaN``, e.g. an empty cell in the loaded CSV)
    are skipped instead of raising ``TypeError`` on string concatenation, and
    non-string values are converted with ``str``. When both fields are strings
    the result is exactly ``title + " " + content`` in lowercase.

    Args:
        row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
            it must provide the TITLE_COLNAME and CONTENT_COLNAME entries.

    Return:
        str: full report (title followed by content) in lowercase; an empty
            string when both fields are missing.
    """

    present_parts = [
        str(value)
        for value in (row[TITLE_COLNAME], row[CONTENT_COLNAME])
        if not pd.isna(value)  # drops None/NaN, keeps every real value
    ]
    return " ".join(present_parts).lower()

In [None]:
# concatenate_report already takes a single row, so it is handed to apply as is.
df[FULL_REPORT_COLNAME] = df.apply(concatenate_report, axis=COLUMN_AXIS)

Selecting just report's id and full content columns:

In [None]:
# Keep only the id and the full text. The explicit copy makes the result an
# independent frame, so adding the tokens column to it later does not raise
# pandas' SettingWithCopyWarning.
df = df[[REPORT_ID_COLNAME, FULL_REPORT_COLNAME]].copy()

Dataframe now looks like:

In [None]:
# Preview the frame after the column selection.
df.head()

# Tokenizing each report's text and saving the tokens in a new dataframe column

In [None]:
def tokenize_text(row):
    """Split a report's full text into its set of distinct tokens.

    Args:
        row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
            the text is read from its FULL_REPORT_COLNAME entry.

    Return:
        set: the words TextBlob extracts from the text, without repetitions.
    """
    report_text = row[FULL_REPORT_COLNAME]
    # Building a set discards repeated words inside the same report.
    return set(tb(report_text).words)

In [None]:
# tokenize_text already takes a single row, so it is handed to apply as is.
df[TOKENS_COLNAME] = df.apply(tokenize_text, axis=COLUMN_AXIS)

# Creating inverted index

First, we will create an intermediate structure called unnested_tokens. It stores each token of a report together with that report's id. After this step, we will group unnested_tokens by token to get the ids of all reports in which a given token appears.

In [None]:
def unnest_tokens_report(unnested_tokens_list, row):
    """Append one ``{term, report id}`` dict per token of a report to a list.

    The list is modified in place and nothing is returned. Each token has its
    surrounding single quotes removed first and surrounding whitespace removed
    afterwards before being stored as the term.

    Args:
        unnested_tokens_list (list): accumulator of dicts, each one holding a
            term (TERM_COLNAME) and the id of the report where it occurred
            (REPORT_ID_COLNAME).
        row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
            its TOKENS_COLNAME and REPORT_ID_COLNAME entries are read.
    """
    unnested_tokens_list.extend(
        {
            TERM_COLNAME: raw_token.strip('\'').strip(),
            REPORT_ID_COLNAME: row[REPORT_ID_COLNAME],
        }
        for raw_token in row[TOKENS_COLNAME]
    )

In [None]:
unnested_tokens_list = []
# Iterate explicitly instead of using df.apply: the function is called only for
# its side effect (appending to the list), and pandas releases before 1.1 call an
# applied function twice on the first row, which would duplicate its tokens.
for _report_label, report_row in df.iterrows():
    unnest_tokens_report(unnested_tokens_list, report_row)

print("The unnested_tokens_list looks like: \n")
print(unnested_tokens_list[:9])
print("\nThe 'list of dicts' format will be used to create a pandas.DataFrame:")

In [None]:
# Each dict in the list becomes one row; the dict keys become the column names.
unnested_tokens_df = pd.DataFrame(unnested_tokens_list)
unnested_tokens_df.head(10)

### Grouping by term to create inverted index

In [None]:
class InvertedIndexTermOccurrence(object):
    """Entry of an inverted index: what is recorded for a single term.

    Both values are stored exactly as received; no copy or validation is made.

    Attributes:
        term_freq (int): Quantity of docs in which the term appears.
        docs_ids: collection with the ids of the docs in which the term appears.
    """

    def __init__(self, term_freq, docs_ids):
        self.term_freq, self.docs_ids = term_freq, docs_ids

    def get_docs_ids(self):
        """Return the stored collection of document ids."""
        return self.docs_ids

    def get_term_freq(self):
        """Return the stored term frequency."""
        return self.term_freq

In [None]:
def create_inverted_index_structure(unnested_tokens_df):
    """Build the inverted index from the unnested (term, report id) table.

    Args:
        unnested_tokens_df (:obj: pandas.DataFrame): one row per occurrence of a
            term in a report, with the TERM_COLNAME and REPORT_ID_COLNAME columns.

    Return:
        dict: maps each term to an InvertedIndexTermOccurrence holding the set of
            report ids where the term appears and the size of that set.
    """

    inverted_index = dict()

    # Grouping by the column name itself (not a one-element list) keeps each
    # group key a plain term string on every pandas version.
    ids_by_term = unnested_tokens_df.groupby(TERM_COLNAME)[REPORT_ID_COLNAME]

    for term, report_ids in ids_by_term:
        docs_ids = set(report_ids)
        # Count distinct reports rather than rows, so a (term, report) pair that
        # shows up more than once is not counted twice.
        inverted_index[term] = InvertedIndexTermOccurrence(len(docs_ids), docs_ids)

    return inverted_index

In [None]:
# Module-level index; search_two_terms reads this name directly.
inverted_index = create_inverted_index_structure(unnested_tokens_df)

## Processing Queries

In [None]:
def is_one_term_query(query):
    """Tell whether a query is made of a single term.

    The query is split on any run of whitespace, so leading, trailing or
    repeated spaces do not change the answer.

    Args:
        query (str): text typed by the user.

    Return:
        bool: True when the query has exactly one term, False when it has more.

    Raises:
        ValueError: the query is empty or contains only whitespace.
    """

    terms = query.split()

    if not terms:
        raise ValueError('You should search for a non empty string.')

    return len(terms) == 1

In [None]:
def get_query_operator(query):
    """Return the boolean operator used in a query.

    The operator is only recognised as a whole word, so a term that merely
    contains the letters of AND (e.g. "ANDROID") is not mistaken for it.

    Args:
        query (str): text typed by the user.

    Return:
        str: AND when the query has AND as a standalone word, OR otherwise.
    """
    return AND if AND in query.split() else OR

In [None]:
def search_two_terms(terms_to_search, operator, index=None):
    """Combine the reports of two terms with a boolean operator.

    A term that is not in the index matches no report (instead of raising
    KeyError), so AND with an unknown term gives an empty result and OR gives
    the reports of the other term.

    Args:
        terms_to_search (list): terms to look up; the first two are used.
        operator (str): AND (intersection) or OR (union).
        index (dict, optional): inverted index mapping each term to an object
            exposing ``get_docs_ids()``. When omitted, the module-level
            ``inverted_index`` is used.

    Return:
        list: ids of the matching reports, in no particular order.

    Raises:
        ValueError: operator is neither AND nor OR.
    """

    if operator not in (AND, OR):
        raise ValueError('Unknown operator %r: expected %r or %r.' % (operator, AND, OR))

    if index is None:
        index = inverted_index

    def docs_for(term):
        occurrence = index.get(term)
        return set() if occurrence is None else set(occurrence.get_docs_ids())

    docs_ids_term1 = docs_for(terms_to_search[0])
    docs_ids_term2 = docs_for(terms_to_search[1])

    if operator == AND:
        result = docs_ids_term1 & docs_ids_term2
    else:
        result = docs_ids_term1 | docs_ids_term2

    return list(result)

In [None]:
def search(query, inverted_index):
    """Run a one-term or a boolean AND/OR query against an inverted index.

    Terms are lowercased before the lookup because the indexed text was
    lowercased; operators must be written in uppercase. A term that is not in
    the index matches no report. Queries with more than two terms are accepted
    as long as every term is joined by the same operator.

    Args:
        query (str): a single term, or terms separated by " AND " / " OR ".
        inverted_index (dict): maps each term to an object exposing
            ``get_docs_ids()``; this is the only index consulted.

    Return:
        set: for a one-term query, a copy of the ids of the reports with the term.
        list: for an AND/OR query, the ids of the matching reports (unordered).

    Raises:
        ValueError: the query is blank, or it has several words that are not
            joined by a single AND/OR operator.
    """

    def docs_for(term):
        occurrence = inverted_index.get(term.lower())
        # Always hand back a fresh set so callers cannot alter the index.
        return set() if occurrence is None else set(occurrence.get_docs_ids())

    normalized_query = query.strip()

    if is_one_term_query(normalized_query):  # raises ValueError on a blank query
        return docs_for(normalized_query)

    operator = get_query_operator(normalized_query)
    terms_to_search = [
        term.strip() for term in normalized_query.split(" " + operator + " ")
    ]

    # Every piece between operators must be exactly one word; anything else
    # means a missing or a mixed operator.
    if len(terms_to_search) < 2 or any(len(term.split()) != 1 for term in terms_to_search):
        raise ValueError(
            'Could not parse query %r: expected single terms joined by %r or %r.'
            % (query, AND, OR)
        )

    matches = [docs_for(term) for term in terms_to_search]
    combine = set.intersection if operator == AND else set.union

    return list(combine(*matches))

# Sanity checks

In [None]:
# Known-good report ids for this query, already in ascending order.
query = "Campina AND Grande"
correct_answer = [1068, 1370, 1770, 1952, 1987, 2763, 2777, 2779, 4802, 5382, 5870, 6694]
# search gives no ordering guarantee, so the result is sorted before comparing.
search_result = sorted(search(query, inverted_index))
assert search_result == correct_answer

In [None]:
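# NOTE: superseded loop-based draft of the index construction, kept only for reference.
# It refers to `TOKENS_COL` and `data`, which are not defined in the cells above
# (`TOKENS_COLNAME` and `df` are).
# inverted_index = dict()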

# ROW = 0
# for row_index in range(df.shape[ROW]):
#     for token in df[TOKENS_COL][row_index]:
        
#         report_id = data.at[row_index, REPORT_ID_COLNAME]
        
#         if token in inverted_index:
#             inverted_index[token].append(report_id)
#         else:
#             inverted_index[token] = [report_id]

**Open questions (TODO):**
- Should queries be case sensitive?
- What about stopwords?
- Is it OK to use set functions?