# Carregando bibliotecas necessárias

In [1]:
#manupulação de dados em tabelas
import pandas as pd 
#plots de gráficos
import matplotlib.pyplot as plt
#manupalação de vetores
import numpy as np 

#biblioteca spacy
import spacy
#OBS: Caso não tenha o pacote em português, execute no terminal o comando: python3 -m spacy download pt

# biblioteca string - Nativa do python
import string 

#Os stop words são oriundo da biblioteca spacy
from spacy.lang.pt.stop_words import STOP_WORDS

# Função de pre-processamento de texto

In [2]:
pln=spacy.load("pt_core_news_sm")
stop_words=STOP_WORDS
pontuacoes=string.punctuation
pontuacoes=pontuacoes+"..."+' '

# remove da lista de stop words alguns elementos importantes
stop_words.remove('bom')
stop_words.remove('muito')
stop_words.remove('não')
stop_words.remove('nem')

In [3]:
def processamento(texto):
    # texto em minuscula
    texto=texto.lower()
    documento=pln(texto)
    
    #removendo stop words
    lista_tokens_1=[]
    for p in documento:
        if (p.text in stop_words)==False:
            lista_tokens_1.append(p)
    #removendo pontuações      
    lista_tokens_2=[]
    for p in lista_tokens_1:
        if (p.text in pontuacoes)==False:
            lista_tokens_2.append(p)
    #lematização de tokens        
    lista_tokens_3=[]
    for p in lista_tokens_2:
        lista_tokens_3.append(p.lemma_)

    return lista_tokens_3

# ANALISE DE DADOS

In [4]:
df=pd.read_csv('olist_order_reviews_dataset.csv')
# Amostrado da tabela
df.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [5]:
df['review_creation_date']=pd.to_datetime(df['review_creation_date'])
# encontrando o ano de publicação
ano=[]
for i in range(len(df)):
    ano.append(df['review_creation_date'].iloc[i].year)
df['ANO']=ano

In [6]:
# Frequencia de comentário por ano
df['ANO'].value_counts()

2018    56164
2017    42735
2016      325
Name: ANO, dtype: int64

In [7]:
# Seleção de comentários de 2018
df=df[df['ANO']==2018].reset_index(drop=True)

In [8]:
#remover linhas duplicadas
df.drop_duplicates(subset='review_id',inplace=True)
# selecionar apenas algumas colunas releantes
df=df[['review_comment_title','review_comment_message','review_score']].reset_index(drop=True)
# preencher campos vazios
df.fillna('',inplace=True)
# reestruturação dos comentários
df['review']=df['review_comment_title']+ ' '+df['review_comment_message']
# remoção de comentário vazios


# Quantidade de Tokens por texto

In [11]:
listatoken=[]
for i in range(len(df)):
    listatoken.append(processamento(df['review'].iloc[i]))

In [12]:
df['LISTA_TOKEN']=listatoken

In [13]:
# calcula a quantidade de tokens em comentario
df['LEN_TOKEN']=df['LISTA_TOKEN'].apply(len)

# Estatistica básica

In [14]:
df.describe()

Unnamed: 0,review_score,LEN_TOKEN
count,55812.0,55812.0
mean,4.07493,3.091826
std,1.36186,4.9292
min,1.0,0.0
25%,4.0,0.0
50%,5.0,0.0
75%,5.0,5.0
max,5.0,45.0


## Observação: 

1. Há alguns comentários que não produziram textos depois do processamento

2. Há alguns comentários com tamanhos exagerados. 

## Filtro

In [16]:
df=df[(df['LEN_TOKEN']>0)&(df['LEN_TOKEN']<30)].reset_index(drop=True)

In [24]:
import warnings
warnings.filterwarnings("ignore")

# Bag of Words (BoW)

In [19]:
#importando biblioteca
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [20]:
#Criando o corpus textual
corpus=list(df['review'].values)

In [25]:
#vetorizando
vectorizer = CountVectorizer(tokenizer=processamento,max_features=300,stop_words=None,token_pattern=None)
vectorizer.fit(corpus)
#gerando o vocabulario
vocabulario=vectorizer.get_feature_names()
bow=vectorizer.transform(corpus)
#transformando em uma tabela
d_bow=pd.DataFrame(data=bow.toarray(),columns=vocabulario)
d_bow

Unnamed: 0,\r\n,\r\n\r\n,....,1,10,100,2,20,3,4,...,vcs,vendedor,vender,ver,vir,voltar,ótima,ótimo,único,👏
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24327,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


# TFidf

In [26]:
tfidf_transforme=TfidfTransformer()
# obter tfidf a partir do BOW
tfidf=tfidf_transforme.fit_transform(bow)
d_tfidf=pd.DataFrame(data=tfidf.toarray(),columns=vocabulario)
#transformando em uma tabela
d_tfidf

Unnamed: 0,\r\n,\r\n\r\n,....,1,10,100,2,20,3,4,...,vcs,vendedor,vender,ver,vir,voltar,ótima,ótimo,único,👏
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.463727,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.361058,0.0,0.0,0.0,0.0,0.0,0.0,0.0
