In [31]:
import numpy as np
import pandas as pd
from textblob import TextBlob as tb
from nltk import word_tokenize
from time import time

In [2]:
# Load the news CSV; read_csv already uses ',' as its default separator.
df = pd.read_csv('../data/noticias_estadao.csv')

## Getting to know the data

In [3]:
# Preview the first rows of the raw data (title, content and report id columns).
df.head()

Unnamed: 0,titulo,conteudo,idNoticia
0,PT espera 30 mil pessoas em festa na Esplanada,BRASÍLIA - Após o desgaste provocado com o lan...,1
1,Alckmin toma posse de olho no Planalto,"Reeleito em outubro, o governador tucano Geral...",2
2,Seis obstáculos e desafios do segundo mandato ...,1. Rearranjo das contas A nova equipe econôm...,3
3,Veja os desafios dos governadores que assumem ...,"No Acre, governador reeleito quer erradicar an...",4
4,PT impulsiona cerimônia de posse da Dilma nas ...,"Os perfis da presidente Dilma Rousseff, nas re...",5


In [4]:
# Report the size of the loaded frame.
n_rows, n_cols = df.shape
print("Data has %d rows and %d columns" % (n_rows, n_cols))

Data has 7643 rows and 3 columns


## Defining constants that will be used throughout this notebook

In [5]:
# Value passed as `axis` to DataFrame.apply: with 1 the function receives one row at a time.
COLUMN_AXIS = 1
# Column holding the lowercased title + content of each report.
FULL_REPORT_COLNAME = 'noticia'
# Column of the source data holding the report body.
CONTENT_COLNAME = 'conteudo'
# Column of the source data holding the report title.
TITLE_COLNAME = 'titulo'
# Column holding the set of tokens extracted from each report.
TOKENS_COLNAME = 'tokens'
# Key/column holding a single term in the unnested (term, report id) records.
TERM_COLNAME = 'term'
# Column holding the report identifier.
REPORT_ID_COLNAME = 'idNoticia'

## Concatenating each report's title and content into a single column

In [6]:
def concatenate_report(df_row):
    """Build the full lowercase text of a report from its title and content.

        Args:
            df_row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
                it must provide the TITLE_COLNAME and CONTENT_COLNAME entries.

        Return:
            str: title and content joined by a single space, in lowercase.
    """

    title = df_row[TITLE_COLNAME]
    content = df_row[CONTENT_COLNAME]
    return (title + " " + content).lower()

In [8]:
# Build the full-text column by running concatenate_report on every row.
df[FULL_REPORT_COLNAME] = df.apply(concatenate_report, axis=COLUMN_AXIS)

## Selecting only the report id and full-content columns

In [9]:
# Keep only the id and full-text columns. The explicit copy makes the result an
# independent frame, so assigning new columns to it later cannot emit pandas'
# SettingWithCopyWarning.
df = df[[REPORT_ID_COLNAME, FULL_REPORT_COLNAME]].copy()

## The DataFrame now looks like this:

In [10]:
# Preview the reduced frame: report id plus the lowercased full text.
df.head()

Unnamed: 0,idNoticia,noticia
0,1,pt espera 30 mil pessoas em festa na esplanada...
1,2,alckmin toma posse de olho no planalto reeleit...
2,3,seis obstáculos e desafios do segundo mandato ...
3,4,veja os desafios dos governadores que assumem ...
4,5,pt impulsiona cerimônia de posse da dilma nas ...


## Tokenizing each report's text and saving the tokens in a new DataFrame column

In [48]:
def tokenize_text(df_row):
    """Split the full text of a report into its distinct tokens.

        Args:
            df_row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
                it must provide the FULL_REPORT_COLNAME entry.

        Return:
            set: the words TextBlob extracts from the report text, without repetition.
    """

    # Alternative tokenizer (not used): word_tokenize(df_row[FULL_REPORT_COLNAME])
    report_text = df_row[FULL_REPORT_COLNAME]
    return set(tb(report_text).words)

In [49]:
# Store each report's token set in its own column.
df[TOKENS_COLNAME] = df.apply(tokenize_text, axis=COLUMN_AXIS)

## Creating the inverted index

### First, we will create an intermediate structure called unnested_tokens. This structure stores each token of a report together with that report's id. After this step, we will group the unnested_tokens structure by token to obtain all the reports in which a specific token appears.

In [50]:
def unnest_tokens_report(unnested_tokens_list, df_row):
    """Append one {term, report id} dict per distinct term of a report.

    Each token of the row is normalized by stripping leading and trailing
    single quotes. Every distinct normalized term is appended once to
    unnested_tokens_list, in the iteration order of the row's tokens.

        Args:
            unnested_tokens_list (list): list of dicts, each dict containing a term value
                and the report id where it occurred. It is mutated in place.
            df_row (:obj: pandas.Series): one row observation from a pandas.DataFrame;
                it must provide the TOKENS_COLNAME and REPORT_ID_COLNAME entries.

        Return:
            None: the result is delivered through unnested_tokens_list.
    """

    tokens = df_row[TOKENS_COLNAME]
    # The report id is the same for every token, so look it up only once.
    report_id = df_row[REPORT_ID_COLNAME]

    seen_terms = set()
    for token in tokens:
        term = token.strip('\'')
        # Stripping quotes can map two different tokens to the same term
        # (e.g. "'s" and "s"); emit each term once per report so that it is not
        # counted twice for the same report.
        if term in seen_terms:
            continue
        seen_terms.add(term)
        unnested_tokens_list.append({
            TERM_COLNAME: term,
            REPORT_ID_COLNAME: report_id
        })

In [51]:
# Start from an empty accumulator so re-running this cell does not duplicate records.
unnested_tokens_list = []
# apply is used only for its side effect here: unnest_tokens_report appends the
# records of each row to unnested_tokens_list.
df.apply(
    lambda row: unnest_tokens_report(unnested_tokens_list, row), 
    axis=COLUMN_AXIS
)

# Show the first few records to illustrate the structure.
print("The unnested_tokens_list looks like: \n")
print(unnested_tokens_list[:9])
print("\nThe 'list of dicts' format will be used to create a pandas.DataFrame:")

The unnested_tokens_list looks like: 

[{'term': 'pouca', 'idNoticia': 1}, {'term': 'ex-presidente', 'idNoticia': 1}, {'term': 'botarmos', 'idNoticia': 1}, {'term': 'os', 'idNoticia': 1}, {'term': 'aqui', 'idNoticia': 1}, {'term': 'pessoal', 'idNoticia': 1}, {'term': '1.º', 'idNoticia': 1}, {'term': 'segundo', 'idNoticia': 1}, {'term': 'eu', 'idNoticia': 1}]

The 'list of dicts' format will be used to create a pandas.DataFrame:


In [52]:
# Turn the list of {term, report id} dicts into a frame with one row per record.
unnested_tokens_df = pd.DataFrame(unnested_tokens_list)
# Preview the first 10 records.
unnested_tokens_df.head(10)

Unnamed: 0,idNoticia,term
0,1,pouca
1,1,ex-presidente
2,1,botarmos
3,1,os
4,1,aqui
5,1,pessoal
6,1,1.º
7,1,segundo
8,1,eu
9,1,entre


### Grouping by term to create inverted index

In [58]:
class InvertedIndexTermOccurrence(object):
    """Entry of the inverted index describing where a single term occurs.

    Attributes:
        term_freq (int): Quantity of docs in which term appears.
        docs_ids (list): ids of docs in which term appears.
    """

    def __init__(self, term_freq, docs_ids):
        """Store the given frequency and doc ids as they are received."""
        self.term_freq, self.docs_ids = term_freq, docs_ids

    def get_term_freq(self):
        """Return the stored term_freq value."""
        return self.term_freq

    def get_docs_ids(self):
        """Return the stored docs_ids object (the same object, not a copy)."""
        return self.docs_ids

In [59]:
def create_inverted_index_structure(unnested_tokens_df):
    """Build the inverted index that maps each term to its occurrences.

        Args:
            unnested_tokens_df (:obj: pandas.DataFrame): one row per (term, report id)
                record, with the TERM_COLNAME and REPORT_ID_COLNAME columns.

        Return:
            dict: each term mapped to an InvertedIndexTermOccurrence holding the
                number of rows found for that term and the list of report ids
                taken from those rows.
    """

    inverted_index = dict()

    # Group by the column label itself rather than a one-element list: with a
    # list, recent pandas versions yield 1-tuples such as ('word',) as group keys,
    # which would silently change the keys of the index.
    ids_by_term = unnested_tokens_df.groupby(TERM_COLNAME)[REPORT_ID_COLNAME]

    for term, report_ids in ids_by_term:
        docs_ids = list(report_ids)
        # len() replaces the former get_values() call, which pandas removed in 1.0.
        term_freq = len(docs_ids)

        inverted_index[term] = InvertedIndexTermOccurrence(term_freq, docs_ids)

    return inverted_index

In [60]:
# Build the term -> occurrence mapping from the unnested (term, report id) records.
inverted_index = create_inverted_index_structure(unnested_tokens_df)

In [None]:
# NOTE(review): unexecuted, commented-out draft that builds the inverted index with
# explicit loops (term -> list of report ids). It references TOKENS_COL and data,
# which are not defined in the visible cells (the constants cell defines
# TOKENS_COLNAME and the frame is named df), so it would not run as written.
# inverted_index = dict()

# ROW = 0
# for row_index in range(df.shape[ROW]):
#     for token in df[TOKENS_COL][row_index]:
        
#         report_id = data.at[row_index, REPORT_ID_COLNAME]
        
#         if token in inverted_index:
#             inverted_index[token].append(report_id)
#         else:
#             inverted_index[token] = [report_id]