# Introdução

Aqui neste notebook eu mostro como é feito o processo de obtenção dos termos e das sentenças do dataset, que possui muitos aspectos e sentenças. É mostrado como limpo o dataset, retirando linhas duplicadas, retirando palavras incorretas ou irrelevantes, e como seleciono elas para então poder trabalhar com elas.

In [5]:
! pip install textblob

Collecting textblob
  Using cached textblob-0.15.3-py2.py3-none-any.whl (636 kB)
Installing collected packages: textblob
Successfully installed textblob-0.15.3


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jailsonpereira/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/jailsonpereira/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [54]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jailsonpereira/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Bibliotecas utilizadas

In [7]:
#Bibliotecas utilizadas no notebook
import pandas as pd
import numpy as np
from textblob import TextBlob
import ast
from nltk.corpus import stopwords
import os

# Métodos e funções

In [8]:
# Função que retorna se um termo está escrito correto
# str.isalpha: verifica se na sentenca ocorre somente caracteres alfabeticos
def isCorrect(sentenca):
    for c in sentenca:
        if(c != ' '):
            if(not c.isalpha()): 
                return False
    return True

In [9]:
# Função que retorna a frequência de uma palavra única
def checagem_frequencia(word, sentenca):
    tokens = TextBlob(sentenca).words #Quebra a sentenca em palavras
    count = 0 #Frequencia do termo
    for palavra in tokens:
        if(word.lower() in palavra.lower()): #Verifica se a palavra corresponde
            count += 1
    return count

In [10]:
# Função que retorna a frequência de um termo em uma sentença
def get_frequencia(word,sentenca):
    count = 0 #Frequencia da palavra
    if(' ' in word): #Se for uma palavra composta
        word = word.lower()
        sentenca = sentenca.lower()
        count = sentenca.count(word)
    else: #Se for uma palavra única
        count = checagem_frequencia(word,sentenca)
    return count            

In [12]:
def isNoun(word):
    if(' ' in word):
        return True
    tags = TextBlob(word).pos_tags
    if('NN' in tags[0][1]):
        return True
    return False

# Variavéis e constantes

In [70]:
#Diretorio dos datasets
save_mapeamentos = ['../../datasets_processed/mapeamentos/mapeamentos-camera-others.txt','../../datasets_processed/mapeamentos/mapeamentos-cell-others.txt']

save_palavras = ['../../datasets_processed/palavras-camera-others.txt','../../datasets_processed/palavras-cell-others.txt']

save_frequencias = ['../../datasets_processed/frequencias/frequencias-camera-others.txt','../../datasets_processed/frequencias/frequencias-cell-others.txt']

save_similaridades = ['../../datasets_processed/similaridades/similaridades-camera-others.txt','../../datasets_processed/similaridades/similaridades-cell-others.txt']

save_contextos = ['../../datasets_processed/contextos/contexto-camera-others.txt','../../datasets_processed/contextos/contexto-cell-others.txt']

save_sentencas = ['../../datasets_processed/sentencas/sentencas-camera-others.txt','../../datasets_processed/sentencas/sentencas-cell-others.txt']

save_contextualizacao = ['../../datasets_processed/contextualizacao/contextualizacao-camera-others.txt','../../datasets_processed/contextualizacao/contextualizacao-cell-others.txt']

save_csv = ['../../dataset_outros/dataset-camera-others.csv','../../dataset_outros/dataset-cell-others.csv']

save_datasets = ['../../datasets/dados-camera-others','../../datasets/dados-cell-others']

In [38]:
# inseri os caminhos dos arquivos dos direórios em uma lista
caminhos = ['../../dataset_outros/{}'.format(file) for file in os.listdir('../../dataset_outros/') if '.tsv' in file]
dir_gabaritos = ['../../datasets/gabaritos/{}'.format(file) for file in os.listdir('../../datasets/gabaritos/') if '.txt' in file]

In [39]:
# ordena as listas em ordem alfabética
dir_gabaritos = sorted(dir_gabaritos)
caminhos = sorted(caminhos)

In [40]:
#dominio = 1 #Variavel que indica o indice do arquivo que vai ser utilizado

dominio = {} # Dicionário que indica o indice para os arquivos de dados de cada produto
lista_produtos = [file.replace('.txt', '') for file in os.listdir('../../datasets/gabaritos/') if '.txt' in file] # lista produtos

for i, value in enumerate(dir_gabaritos):
    temp = value.split('/')[-1].replace('.txt', '')
    dominio[temp] = i
    
lista_produtos = sorted(lista_produtos)

In [41]:
print(lista_produtos)
print(dir_gabaritos)
print(caminhos)

['DadosCamera', 'DadosCells', 'DadosDvds', 'DadosLaptops', 'DadosRouters']
['../../datasets/gabaritos/DadosCamera.txt', '../../datasets/gabaritos/DadosCells.txt', '../../datasets/gabaritos/DadosDvds.txt', '../../datasets/gabaritos/DadosLaptops.txt', '../../datasets/gabaritos/DadosRouters.txt']
['../../dataset_outros/dataset-aspectos-other-cam.tsv', '../../dataset_outros/dataset-aspectos-other-cel.tsv']


In [42]:
dominio

{'DadosCamera': 0,
 'DadosCells': 1,
 'DadosDvds': 2,
 'DadosLaptops': 3,
 'DadosRouters': 4}

In [43]:
dados = {}
stop_words = stopwords.words('english')
frequencias = {}
documentos = set()
gabarito = set()

In [44]:
#for produto in lista_produtos:  # iterar sobre todos os produtos
arq = open(dir_gabaritos[dominio["DadosCamera"]],'r') # gabarito de um determinado produto
for linha in arq.readlines():
    valores = linha.split(': ')
    classe = valores[0]
    if(classe != "Others"): # classes diferentes de outros, verifica a gramatica lexical e adiciona no conjunto
        atributes = ast.literal_eval(valores[1])
        gabarito = gabarito.union(atributes)
arq.close()

# Explorando o dataset

In [45]:
df = pd.read_table(caminhos[dominio["DadosCamera"]],names=['aspecto','sentenca','sla'])
df

Unnamed: 0,aspecto,sentenca,sla
0,paper,comes with two viewers made of thick paper and...,
1,viewer,it will become a strip.- weave the strip into ...,
2,shots,i got back to my house and took a couple of qu...,
3,batteries,bought the kd-220z at frys for $120 with a $20...,
4,quality,the camera worked ok and the pictures were of ...,
...,...,...,...
191050,time,i went to my nephew's wrestling tournement i ...,
191051,ideas?i,any ideas?i love cyber-shot because my 5-year-...,
191052,loves,"my [...] also loves to take pictures with it, ...",
191053,looks,makes our skin looks flawless.,


In [46]:
df = df.drop_duplicates() #removendo linhas duplicadas do dataset
df

Unnamed: 0,aspecto,sentenca,sla
0,paper,comes with two viewers made of thick paper and...,
1,viewer,it will become a strip.- weave the strip into ...,
2,shots,i got back to my house and took a couple of qu...,
3,batteries,bought the kd-220z at frys for $120 with a $20...,
4,quality,the camera worked ok and the pictures were of ...,
...,...,...,...
191050,time,i went to my nephew's wrestling tournement i ...,
191051,ideas?i,any ideas?i love cyber-shot because my 5-year-...,
191052,loves,"my [...] also loves to take pictures with it, ...",
191053,looks,makes our skin looks flawless.,


In [47]:
df = df.drop("sla",axis=1) #removendo a última coluna do dataset
df

Unnamed: 0,aspecto,sentenca
0,paper,comes with two viewers made of thick paper and...
1,viewer,it will become a strip.- weave the strip into ...
2,shots,i got back to my house and took a couple of qu...
3,batteries,bought the kd-220z at frys for $120 with a $20...
4,quality,the camera worked ok and the pictures were of ...
...,...,...
191050,time,i went to my nephew's wrestling tournement i ...
191051,ideas?i,any ideas?i love cyber-shot because my 5-year-...
191052,loves,"my [...] also loves to take pictures with it, ..."
191053,looks,makes our skin looks flawless.


In [48]:
df['is_correct'] = df['aspecto'].apply(lambda x: isCorrect(x))
df

Unnamed: 0,aspecto,sentenca,is_correct
0,paper,comes with two viewers made of thick paper and...,True
1,viewer,it will become a strip.- weave the strip into ...,True
2,shots,i got back to my house and took a couple of qu...,True
3,batteries,bought the kd-220z at frys for $120 with a $20...,True
4,quality,the camera worked ok and the pictures were of ...,True
...,...,...,...
191050,time,i went to my nephew's wrestling tournement i ...,True
191051,ideas?i,any ideas?i love cyber-shot because my 5-year-...,False
191052,loves,"my [...] also loves to take pictures with it, ...",True
191053,looks,makes our skin looks flawless.,True


In [49]:
palavras_corretas = df[df['is_correct'] == True]
palavras_corretas

Unnamed: 0,aspecto,sentenca,is_correct
0,paper,comes with two viewers made of thick paper and...,True
1,viewer,it will become a strip.- weave the strip into ...,True
2,shots,i got back to my house and took a couple of qu...,True
3,batteries,bought the kd-220z at frys for $120 with a $20...,True
4,quality,the camera worked ok and the pictures were of ...,True
...,...,...,...
191049,havent,i still havent had time to read the manual to ...,True
191050,time,i went to my nephew's wrestling tournement i ...,True
191052,loves,"my [...] also loves to take pictures with it, ...",True
191053,looks,makes our skin looks flawless.,True


In [50]:
palavras_corretas['is_stopword'] = palavras_corretas['aspecto'].apply(lambda x: x in stop_words)
palavras_corretas['is_other'] = palavras_corretas['aspecto'].apply(lambda x: x not in gabarito)
palavras_corretas = palavras_corretas.query('is_stopword == False & is_other == True')
palavras_corretas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other
0,paper,comes with two viewers made of thick paper and...,True,False,True
1,viewer,it will become a strip.- weave the strip into ...,True,False,True
2,shots,i got back to my house and took a couple of qu...,True,False,True
3,batteries,bought the kd-220z at frys for $120 with a $20...,True,False,True
4,quality,the camera worked ok and the pictures were of ...,True,False,True
...,...,...,...,...,...
191049,havent,i still havent had time to read the manual to ...,True,False,True
191050,time,i went to my nephew's wrestling tournement i ...,True,False,True
191052,loves,"my [...] also loves to take pictures with it, ...",True,False,True
191053,looks,makes our skin looks flawless.,True,False,True


In [55]:
palavras_corretas['isNN'] = palavras_corretas['aspecto'].apply(lambda x: isNoun(x))
palavras_corretas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other,isNN
0,paper,comes with two viewers made of thick paper and...,True,False,True,True
1,viewer,it will become a strip.- weave the strip into ...,True,False,True,True
2,shots,i got back to my house and took a couple of qu...,True,False,True,True
3,batteries,bought the kd-220z at frys for $120 with a $20...,True,False,True,True
4,quality,the camera worked ok and the pictures were of ...,True,False,True,True
...,...,...,...,...,...,...
191049,havent,i still havent had time to read the manual to ...,True,False,True,True
191050,time,i went to my nephew's wrestling tournement i ...,True,False,True,True
191052,loves,"my [...] also loves to take pictures with it, ...",True,False,True,True
191053,looks,makes our skin looks flawless.,True,False,True,True


In [56]:
palavras_corretas = palavras_corretas.query("isNN == True")
palavras_corretas

Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other,isNN
0,paper,comes with two viewers made of thick paper and...,True,False,True,True
1,viewer,it will become a strip.- weave the strip into ...,True,False,True,True
2,shots,i got back to my house and took a couple of qu...,True,False,True,True
3,batteries,bought the kd-220z at frys for $120 with a $20...,True,False,True,True
4,quality,the camera worked ok and the pictures were of ...,True,False,True,True
...,...,...,...,...,...,...
191049,havent,i still havent had time to read the manual to ...,True,False,True,True
191050,time,i went to my nephew's wrestling tournement i ...,True,False,True,True
191052,loves,"my [...] also loves to take pictures with it, ...",True,False,True,True
191053,looks,makes our skin looks flawless.,True,False,True,True


In [57]:
database = palavras_corretas[['aspecto','sentenca']]#.sample(n=6000,random_state=42)
database

Unnamed: 0,aspecto,sentenca
0,paper,comes with two viewers made of thick paper and...
1,viewer,it will become a strip.- weave the strip into ...
2,shots,i got back to my house and took a couple of qu...
3,batteries,bought the kd-220z at frys for $120 with a $20...
4,quality,the camera worked ok and the pictures were of ...
...,...,...
191049,havent,i still havent had time to read the manual to ...
191050,time,i went to my nephew's wrestling tournement i ...
191052,loves,"my [...] also loves to take pictures with it, ..."
191053,looks,makes our skin looks flawless.


In [58]:
database_sample = database.sample(n=5000,random_state=42)
database_sample

Unnamed: 0,aspecto,sentenca
185255,reviewers,"like the other reviewers said, you have to pri..."
171555,shots,"so, don't expect it to be able to take those a..."
26800,upgrade,the new 50d was a worthwhile upgrade for me.
54742,works,the photos are quality and the burst feature w...
150567,junkie,just a gadget junkie here shooting for fun and...
...,...,...
116798,thing,"but this thing is really smart, about the size..."
121737,tripod,it is heavier than i expected but my new tripo...
129355,gripe,my main gripe is how slow the processor is and...
6470,subjects,"even at its lowest iso settings, subjects look..."


In [59]:
dados_teste_camera_others = database.sample(n=5000,random_state=42)
dados_teste_camera_others = dados_teste_camera_others.rename(columns={"sentenca": 'reviews'})
dados_teste_camera_others

Unnamed: 0,aspecto,reviews
185255,reviewers,"like the other reviewers said, you have to pri..."
171555,shots,"so, don't expect it to be able to take those a..."
26800,upgrade,the new 50d was a worthwhile upgrade for me.
54742,works,the photos are quality and the burst feature w...
150567,junkie,just a gadget junkie here shooting for fun and...
...,...,...
116798,thing,"but this thing is really smart, about the size..."
121737,tripod,it is heavier than i expected but my new tripo...
129355,gripe,my main gripe is how slow the processor is and...
6470,subjects,"even at its lowest iso settings, subjects look..."


In [60]:
dados_teste_camera_others.to_csv(save_datasets[dominio[produto]]+str(len(dados_teste_camera_others))+".csv",index=False)

In [61]:
database_sample['aspecto'].value_counts()[:200]

quality       133
time          106
thing          76
features       76
shots          73
             ... 
focuses         5
ups             5
month           5
technology      5
action          5
Name: aspecto, Length: 200, dtype: int64

In [62]:
aspectos_selecionados = database_sample['aspecto'].value_counts().index[:200]
aspectos_selecionados

Index(['quality', 'time', 'thing', 'features', 'shots', 'feature', 'works',
       'photographer', 'picture', 'batteries',
       ...
       'seller', 'rate', 'step', 'objects', 'information', 'focuses', 'ups',
       'month', 'technology', 'action'],
      dtype='object', length=200)

In [67]:
aspectos_escolhidos = list(aspectos_selecionados)
print(aspectos_escolhidos)
print(len(aspectos_escolhidos))

['quality', 'time', 'thing', 'features', 'shots', 'feature', 'works', 'photographer', 'picture', 'batteries', 'problem', 'love', 'video', 'dslr', 'days', 'images', 'color', 'life', 'years', 'shot', 'lot', 'times', 'software', 'battery', 'something', 'modes', 'recommend', 'viewfinder', 'way', 'control', 'year', 'complaint', 'day', 'need', 'button', 'problems', 'fits', 'results', 'feel', 'photography', 'experience', 'tripod', 'want', 'user', 'piece', 'issue', 'ones', 'card', 'lcd', 'people', 'menu', 'photo', 'amount', 'nikon', 'colors', 'canon', 'weeks', 'review', 'buttons', 'case', 'looks', 'bit', 'look', 'functions', 'trip', 'hours', 'controls', 'body', 'reason', 'videos', 'things', 'money', 'version', 'set', 'reviewers', 'package', 'service', 'stuff', 'hand', 'daughter', 'complaints', 'kodak', 'research', 'wife', 'fact', 'job', 'side', 'deal', 'resolution', 'idea', 'hands', 'sony', 'situations', 'display', 'design', 'fit', 'week', 'products', 'computer', 'upgrade', 'function', 'eye', 

# aspectos_escolhidos = ['phones', 'works', 'time','thing','features','service','keyboard','quality',
 'bought',
 'things',
 'times',
 'use',
 'reviews',
 'everything',
 'life',
 'months',
 'condition',
 'problems',
 'experience',
 'days',
 'cell phone',
 'people',
 'day',
 'years',
 'button',
 'user',
 'reception',
 'issues',
 'volume',
 'issue',
 'minutes',
 'person',
 'feature',
 'way',
 'end',
 'option',
 'sim card',
 'version',
 'case',
 'son',
 'side',
 'design',
 'plan',
 'sound',
 'problem',
 'weeks',
 'piece',
 'seller',
 'recommend',
 'buttons',
 'cell',
 'reason',
 'touch',
 'work',
 'bit',
 'talk',
 'text',
 'hours',
 'deal',
 'stuff',
 'customer service',
 'something',
 'gps',
 'need',
 'hands',
 'year',
 'found',
 'blackberry',
 'signal',
 'store',
 'cover',
 'mobile',
 'network',
 'wifi',
 'functions',
 'keypad',
 'review',
 'line',
 'tracfone',
 'app',
 'bluetooth',
 'connection',
 'complaints',
 'owner',
 'waste',
 'amazon',
 'users',
 'words',
 'brand',
 'money',
 'reviewers',
 'devices',
 'daughter',
 'difference',
 'want',
 'calls',
 'charger',
 'number',
 'sim',
 'amount',
 'hand',
 'purchase',
 'box',
 'call',
 'model',
 'month',
 'menu',
 'looks',
 'touch phone',
 'browser',
 'mode',
 'customer',
 'verizon',
 'luck',
 'music',
 'smartphones',
 'support',
 'news',
 'improvement',
 'internet',
 'hardware',
 'data',
 'player',
 'cell phones',
 'nokia',
 'process',
 'messages',
 'options',
 'package',
 'speaker',
 'usage',
 'touch screen',
 'ability',
 'space',
 'versions',
 'models',
 'upgrade',
 'networks',
 'carriers',
 'touchscreen',
 'texts',
 'card',
 'fan',
 'stars',
 'lot',
 'radio',
 'pocket',
 'love',
 'carrier',
 'questions',
 'contract',
 'set',
 'pain',
 'look',
 'note',
 'try',
 'trouble',
 'freezes',
 'unit',
 'flash',
 'idea',
 'cable',
 'fits',
 'lg',
 'week',
 'function',
 'tiles',
 'coverage',
 'point',
 'saw',
 'sim cards',
 'headset',
 'car charger',
 'system',
 'sites',
 'move',
 'ways',
 'functionality',
 'complaint',
 'capability',
 'job',
 'put',
 'husband',
 'slide',
 'brand phone',
 'information',
 'areas',
 'plans',
 'email',
 'cards',
 'company',
 'game',
 'hope',
 'ringtones',
 'claims']

In [65]:
len(aspectos_escolhidos)

200

In [68]:
dataset_refinado = database_sample[database_sample['aspecto'].isin(aspectos_escolhidos)]
dataset_refinado

Unnamed: 0,aspecto,sentenca
185255,reviewers,"like the other reviewers said, you have to pri..."
171555,shots,"so, don't expect it to be able to take those a..."
26800,upgrade,the new 50d was a worthwhile upgrade for me.
54742,works,the photos are quality and the burst feature w...
5502,number,my advice is to charge up the battery before a...
...,...,...
187458,video,would definitely recommend for families who lo...
116798,thing,"but this thing is really smart, about the size..."
121737,tripod,it is heavier than i expected but my new tripo...
129355,gripe,my main gripe is how slow the processor is and...


In [71]:
dataset_refinado.to_csv(save_csv[dominio['DadosCamera']],index=False)

# Pré-processamento

In [72]:
dataframe = pd.read_csv(save_csv[dominio['DadosCamera']])
dataframe

Unnamed: 0,aspecto,sentenca
0,reviewers,"like the other reviewers said, you have to pri..."
1,shots,"so, don't expect it to be able to take those a..."
2,upgrade,the new 50d was a worthwhile upgrade for me.
3,works,the photos are quality and the burst feature w...
4,number,my advice is to charge up the battery before a...
...,...,...
2887,video,would definitely recommend for families who lo...
2888,thing,"but this thing is really smart, about the size..."
2889,tripod,it is heavier than i expected but my new tripo...
2890,gripe,my main gripe is how slow the processor is and...


In [73]:
dataframe['aspecto'].value_counts()

quality        133
time           106
thing           76
features        76
shots           73
              ... 
charge           5
noise            5
anyone           5
information      5
objects          5
Name: aspecto, Length: 200, dtype: int64

In [74]:
datasample = dataframe
datasample

Unnamed: 0,aspecto,sentenca
0,reviewers,"like the other reviewers said, you have to pri..."
1,shots,"so, don't expect it to be able to take those a..."
2,upgrade,the new 50d was a worthwhile upgrade for me.
3,works,the photos are quality and the burst feature w...
4,number,my advice is to charge up the battery before a...
...,...,...
2887,video,would definitely recommend for families who lo...
2888,thing,"but this thing is really smart, about the size..."
2889,tripod,it is heavier than i expected but my new tripo...
2890,gripe,my main gripe is how slow the processor is and...


In [76]:
datasample['aspecto'].value_counts()

quality        133
time           106
thing           76
features        76
shots           73
              ... 
charge           5
noise            5
anyone           5
information      5
objects          5
Name: aspecto, Length: 200, dtype: int64

In [77]:
#Lendo o dataset - obtendo os aspectos corretos e suas sentenças
tam = len(datasample)
for ind in range(tam):
    word = datasample.aspecto[ind].strip()
    sentenca = datasample.sentenca[ind].strip()
    count_frequencia = get_frequencia(word,sentenca)
    if(count_frequencia > 0):
        if(word not in dados.keys()):
            dados[word] = set()
            frequencias[word] = 0
        #Caso tudo esteja correto eu pego a palavra e o documento que ela aparece
        dados[word].add(sentenca)            
        documentos = documentos.union(set([sentenca]))
        frequencias[word] += count_frequencia

In [78]:
len(dados.keys()) #Número de aspectos corretos

200

In [79]:
len(documentos) #Número de documentos

2891

In [80]:
# Calculando as frequencias dos aspectos
valor_frequencias = {}
for key in sorted(frequencias.keys()):
    freq = frequencias[key]
    if(freq not in valor_frequencias.keys()):
        valor_frequencias[freq] = []
    valor_frequencias[freq].append(key)

In [81]:
valor_frequencias.keys() #Frequencias dos termos

dict_keys([6, 8, 5, 7, 15, 42, 26, 19, 9, 16, 18, 17, 30, 11, 10, 21, 14, 12, 32, 36, 66, 77, 28, 24, 43, 13, 47, 46, 44, 20, 136, 27, 75, 22, 81, 109, 23, 39, 57, 29])

In [82]:
len(valor_frequencias.keys()) #Total de frequencias encontrado

40

In [83]:
# Selecionando as 100 palavras de maior frequência
cont = 0
palavras_utilizadas = []
documentos_utilizados = set()
for valor in sorted(valor_frequencias.keys(),reverse=True):
    if(cont < 150):
        for palavra in valor_frequencias[valor]:
            print(palavra,valor)
            palavras_utilizadas.append(palavra)
            documentos_utilizados = documentos_utilizados.union(dados[palavra])
            cont += 1

quality 136
time 109
thing 81
features 77
shots 75
feature 66
works 57
photographer 47
picture 46
problem 44
love 43
batteries 42
video 39
dslr 36
days 32
color 30
images 30
years 29
life 28
shot 27
battery 26
lot 24
software 24
times 23
something 22
control 21
day 21
modes 21
way 21
problems 20
recommend 20
viewfinder 20
year 20
button 19
complaint 19
feel 19
issue 19
need 19
ones 19
card 18
fits 18
photo 18
photography 18
results 18
tripod 18
want 18
case 17
experience 17
lcd 17
people 17
piece 17
review 17
user 17
canon 16
menu 16
nikon 16
amount 15
bit 15
body 15
buttons 15
colors 15
functions 15
look 15
looks 15
weeks 15
controls 14
hand 14
hours 14
trip 14
money 13
reason 13
things 13
version 13
videos 13
daughter 12
display 12
hands 12
kodak 12
package 12
reviewers 12
service 12
set 12
side 12
stuff 12
complaints 11
deal 11
fact 11
idea 11
job 11
products 11
research 11
resolution 11
sony 11
week 11
wife 11
computer 10
design 10
everything 10
eye 10
fit 10
function 10
situations

In [84]:
cont

155

In [85]:
print(len(palavras_utilizadas)) #Número de palavras selecionadas
print(len(documentos_utilizados)) #Número de documentos em que os termos aparecem

155
2647


In [86]:
# Palavras selecionadas
for c in palavras_utilizadas:
    print(c)

quality
time
thing
features
shots
feature
works
photographer
picture
problem
love
batteries
video
dslr
days
color
images
years
life
shot
battery
lot
software
times
something
control
day
modes
way
problems
recommend
viewfinder
year
button
complaint
feel
issue
need
ones
card
fits
photo
photography
results
tripod
want
case
experience
lcd
people
piece
review
user
canon
menu
nikon
amount
bit
body
buttons
colors
functions
look
looks
weeks
controls
hand
hours
trip
money
reason
things
version
videos
daughter
display
hands
kodak
package
reviewers
service
set
side
stuff
complaints
deal
fact
idea
job
products
research
resolution
sony
week
wife
computer
design
everything
eye
fit
function
situations
upgrade
cable
kit
option
part
quality photos
seconds
use
view
accessories
bag
bought
cards
choice
con
conditions
detail
dial
digital
found
guy
items
minutes
models
photographers
pixels
slr
support
turn
wish
amazon
angle
area
comments
concern
cons
cost
cover
door
drawback
gripe
impression
interface
movie

In [87]:
documentos_utilizados = sorted(documentos_utilizados) #Ordenando os documentos
len(documentos_utilizados) #Número de documentos

2647

In [88]:
palavras_utilizadas.sort() #Ordenando as termos
palavras_utilizadas #Aspectos utilizados

['accessories',
 'amazon',
 'amount',
 'angle',
 'area',
 'bag',
 'batteries',
 'battery',
 'bit',
 'body',
 'bought',
 'button',
 'buttons',
 'cable',
 'canon',
 'card',
 'cards',
 'case',
 'choice',
 'color',
 'colors',
 'comments',
 'complaint',
 'complaints',
 'computer',
 'con',
 'concern',
 'conditions',
 'cons',
 'control',
 'controls',
 'cost',
 'cover',
 'daughter',
 'day',
 'days',
 'deal',
 'design',
 'detail',
 'dial',
 'digital',
 'display',
 'door',
 'drawback',
 'dslr',
 'everything',
 'experience',
 'eye',
 'fact',
 'feature',
 'features',
 'feel',
 'fit',
 'fits',
 'found',
 'function',
 'functions',
 'gripe',
 'guy',
 'hand',
 'hands',
 'hours',
 'idea',
 'images',
 'impression',
 'interface',
 'issue',
 'items',
 'job',
 'kit',
 'kodak',
 'lcd',
 'life',
 'look',
 'looks',
 'lot',
 'love',
 'menu',
 'minutes',
 'models',
 'modes',
 'money',
 'movies',
 'need',
 'nikon',
 'ones',
 'option',
 'package',
 'part',
 'people',
 'photo',
 'photographer',
 'photographers',
 

In [89]:
documents = documentos_utilizados#[:2000] #Selecionando os primeiros 2000 documentos do dataset
len(documents) #Número de documentos a ser utilizada

2647

In [90]:
mapeamentos = {} #Variável que mapeia os domentos para os termos
frequencias = {} #Variável que mapeia as frequências para os termos
for doc in documents:
    for p in palavras_utilizadas:
        count = get_frequencia(p,doc)
        if(count > 0):
            if(p not in mapeamentos.keys()):
                mapeamentos[p] = set()
            if(p not in frequencias.keys()):
                frequencias[p] = 0
            mapeamentos[p].add(doc) #Mapeando as sentenças para os termos
            frequencias[p] += count #Obtendo as frequências dos termos nos documentos

## Salvando os dados em arquivos

In [92]:
# Salvando as palavras selecionadas em um arquivo
arq = open(save_palavras[dominio['DadosCamera']],'w')
for p in palavras_utilizadas:
    arq.write(p + ' ->> ' + str(mapeamentos[p]) + '\n')
arq.close()

In [93]:
# Salvando as frequências em um arquivo
arq = open(save_frequencias[dominio['DadosCamera']],'w')
for p in palavras_utilizadas:
    arq.write(p + ': ' + str(frequencias[p]) + '\n')
arq.close()

In [94]:
# Salvando os documentos em um arquivo
arq = open(save_sentencas[dominio['DadosCamera']],'w')
for doc in documents:
    arq.write(doc + '\n\n')
arq.close()

In [96]:
# Salvando os mapeamentos em um arquivo
arq = open(save_mapeamentos[dominio['DadosCamera']],'w')
for p in palavras_utilizadas:
    arq.write(p + ' ->> ' + str(mapeamentos[p]) + '\n')
arq.close()

## Pré-processamento do contexto das palavras

In [120]:
n = len(documents) #Número de documentos utilizados
n_palavras = len(palavras_utilizadas) #Número de palavras utilizadas no processo
contextualizacao = {} #Variável que salva os documentos que cada termo aparece

In [121]:
# Inicializando a variável - uso esta variável para calcular o contexto entre as palavras
for p in palavras_utilizadas:
    contextualizacao[p] = []

In [122]:
# Fazendo o mapeamento dos documentos para os termos
for p in palavras_utilizadas:
    for doc in documents:
        count = get_frequencia(p,doc)
        if(count > 0):
            contextualizacao[p].append([doc])
        else:
            contextualizacao[p].append([])

In [123]:
# Salvando a variável de contextualização
arq = open(save_contextualizacao[dominio],'w')
for p in sorted(contextualizacao.keys()):
    arq.write(p + ' ->> ' + str(contextualizacao[p]) + '\n')
arq.close()

# Obtendo os dados dos arquivos

In [124]:
# Variáveis utilizadas
palavras_utilizadas = []
documentos_utilizados = []
contextualizacao = {}

In [125]:
# Obtendo a variável responsável pela contextualização das palavras
arq = open(save_contextualizacao[dominio],'r')
for linha in arq.readlines():
    valores = linha.split(' ->> ')
    classe = valores[0]
    conjunto = valores[1]
    palavras_utilizadas.append(classe)
    contextualizacao[classe] = ast.literal_eval(conjunto)
arq.close()

In [126]:
# Obtendo os documentos
arq = open(save_sentencas[dominio],'r')
for linha in arq.readlines():
    if(linha != '\n'):
        documentos_utilizados.append(linha.strip())
arq.close()

In [127]:
len(palavras_utilizadas) #Quantidade de palavras a serem utilizadas

153

In [128]:
len(documentos_utilizados) #Quantidade de documentos a ser utilizado

2665

In [129]:
n_palavras = len(palavras_utilizadas) #Número de palavras(aspectos)
n = len(documentos_utilizados) #Número de documentos

# Calculando a similaridade entre as palavras

In [130]:
matrizG = [[1.0]*n_palavras for i in range(n_palavras)] #Matriz de similaridade
# Locais onde salvar as similaridades
caminhoG = ['../similaridades/similaridades-camera-others.txt','../similaridades/similaridades-cell-others.txt']

In [62]:
import spacy #Importando a biblioteca do spacy

In [63]:
nlp = spacy.load("en_vectors_web_lg") #Carregando o modelo do spacy

In [131]:
print("Processando...")
#Salvando as similaridades num arquivo
arq = open(caminhoG[dominio],'w')
for ind in range(len(palavras_utilizadas)):
    token1 = nlp(palavras_utilizadas[ind]) #Palavra 1
    for j in range(ind+1,len(palavras_utilizadas)):
        token2 = nlp(palavras_utilizadas[j]) #Palavra 2
        try:
            similaridade = token1.similarity(token2) #Similaridade entre a palavra 1 e palavra 2
            if(similaridade < 0):
                similaridade = 0.0
            if(similaridade > 1.0):
                similaridade = 1.0
        except:
            similaridade = 0.0
        #Salvando a similaridade no arquivo
        arq.write(palavras_utilizadas[ind]+'->>'+palavras_utilizadas[j]+'->>'+str(similaridade)+'\n')
        matrizG[ind][j] = similaridade
        matrizG[j][ind] = similaridade
arq.close()
print("Concluido!\n")

Processando...
Concluido!



## Word2vec

In [20]:
import gensim
import ast

In [23]:
caminho_word2vec = ['../similaridades/similaridades-camera-others-word2vec.txt','../similaridades/similaridades-cell-others-word2vec.txt']
matriz_word2vec = [[0.0]*n_palavras for i in range(n_palavras)]
caminho = '../../model/GoogleNews-vectors-negative300.bin'

In [None]:
#Carregando o modelo do Word2Vec
model = gensim.models.KeyedVectors.load_word2vec_format(caminho,binary=True)

In [None]:
#Salvando as similaridades do Word2Vec num arquivo
arq = open(caminho_word2vec[dominio], 'w')
for ind in range(len(palavras_utilizadas)):
    for j in range(ind+1, len(palavras_utilizadas)):
        try:
            #Obtendo a similaridade usando o modelo
            similaridade = model.similarity(palavras_utilizadas[ind], palavras_utilizadas[j])
            if(similaridade < 0):
                similaridade = 0.0
            if(similaridade > 1.0):
                similaridade = 1.0
        except:
            similaridade = 0.0
        matriz_word2vec[i][j] = similaridade
        matriz_word2vec[j][i] = similaridade
        #Escrevendo o resultado no arquivo
        arq.write(palavras_utilizadas[ind].replace('_', ' ')+'->>'+palavras_utilizadas[j].replace('_', ' ')+'->>'+str(similaridade)+'\n')
arq.close()
print("Concluido!\n")

# Calculando o Contexto entre as palavras

In [65]:
import math #Importando a biblioteca utilizada para cálculos matemáticos

In [132]:
# Obtendo a contextualização das palavras
mapeamento = {}
for p in sorted(contextualizacao.keys()):
    vetor = contextualizacao[p]
    vetor = [set(v) for v in vetor]
    mapeamento[p] = vetor

In [133]:
#Função que retorna o NPMI das palavras
def contexto(p1,p2,mapeamento):
    fx = 0 #Variavel que guarda o numero de documentos que p1 aparece
    fy = 0 #Variavel que guarda o numero de documentos que p2 aparece
    fxy = 0 #VAriavel que guarda o numero de documentos que p1 e p2 aparecem em uma sentenca
    n = len(mapeamento[p1]) #Numero de documentos
    #Percorrendo os documentos
    for i in list(range(n)):
        if(len(mapeamento[p1][i].intersection(mapeamento[p2][i])) > 0):
            fxy += 1
        if(len(mapeamento[p1][i]) > 0):
            fx += 1
        if(len(mapeamento[p2][i]) > 0):
            fy += 1
    #Obtendo o NPMI
    npmi = 0
    fator1 = (n*fxy)/(fx*fy)
    fator2 = (fxy/n)
    try:
        valor = math.log10(fator1)/-math.log10(fator2)
        npmi = (valor+1)/2 #Normalizando para o intervalo [0,1]
    except:
        npmi = 0.0
    return npmi

In [134]:
matrizT = [[1.0]*n_palavras for i in range(n_palavras)] #Matriz de contextualização

In [135]:
tam = n_palavras #Quantidade de palavras no gabarito
total = (1+tam)*tam/2 #Total de iteracoes que serão feitas
cont = 0 #Variavel contadora
percent = 0 #Valor da porcentagem do processamento
aux = 0 #Variavel auxiliar
#Salvando o valor do NPMI entre as palavras
arq = open(save_contextos[dominio],'w')
print('Calculando o contexto entre as palavras...')
for i in range(n_palavras):
    for j in range(i+1,n_palavras):
        cont += 1
        #valor = NPMI(words[i],words[j],documentos)
        valor = contexto(palavras_utilizadas[i],palavras_utilizadas[j],mapeamento) #Valor do NPMI
        matrizT[i][j] = valor
        matrizT[j][i] = valor
        resp = str(palavras_utilizadas[i]) + '->>' + str(palavras_utilizadas[j]) + '->>' + str(valor) + '\n'
        arq.write(resp) #Salvando as palavras num arquivo
        #Printando o valor da porcentagem na tela
        aux = int(cont*100/total)
        if(aux > percent):
            print(str(aux) + "% concluido")
            percent = aux
arq.close()
print("Concluido\n")

Calculando o contexto entre as palavras...
1% concluido
2% concluido
3% concluido
4% concluido
5% concluido
6% concluido
7% concluido
8% concluido
9% concluido
10% concluido
11% concluido
12% concluido
13% concluido
14% concluido
15% concluido
16% concluido
17% concluido
18% concluido
19% concluido
20% concluido
21% concluido
22% concluido
23% concluido
24% concluido
25% concluido
26% concluido
27% concluido
28% concluido
29% concluido
30% concluido
31% concluido
32% concluido
33% concluido
34% concluido
35% concluido
36% concluido
37% concluido
38% concluido
39% concluido
40% concluido
41% concluido
42% concluido
43% concluido
44% concluido
45% concluido
46% concluido
47% concluido
48% concluido
49% concluido
50% concluido
51% concluido
52% concluido
53% concluido
54% concluido
55% concluido
56% concluido
57% concluido
58% concluido
59% concluido
60% concluido
61% concluido
62% concluido
63% concluido
64% concluido
65% concluido
66% concluido
67% concluido
68% concluido
69% concluido


# Gerando e salvando as matrizes

In [136]:
indices = {} #Variável que mapeia os termos para os seus indices nas matrizes
ind = 0
for i in range(n_palavras):
    word = palavras_utilizadas[i]
    indices[word] = ind
    ind += 1

In [137]:
# Obtendo as similaridades entre os termos
arq = open(caminhoG[dominio],'r')
for linha in arq.readlines():
    valores = linha.split('->>')
    p1 = valores[0]
    p2 = valores[1]
    valor = float(valores[2])
    x = indices[p1]
    y = indices[p2]
    #Colocando os valores das similaridades nas matrizes
    matrizG[x][y] = valor
    matrizG[y][x] = valor
arq.close()

In [138]:
# Obtendo o NPMI entre as palavras
arq = open(save_contextos[dominio],'r')
for linha in arq.readlines():
    valores = linha.split('->>')
    p1 = valores[0]
    p2 = valores[1]
    valor = float(valores[2])
    x = indices[p1]
    y = indices[p2]
    #Salvando o NPMI na matriz de contextualização
    matrizT[x][y] = valor
    matrizT[y][x] = valor
arq.close()

In [139]:
# Diretórios dos locais para salvar as matrizes de similaridade e contexto
save_matrizG = ['../matrizes/similaridades-camera-others.txt','../matrizes/similaridades-cell-others.txt']
save_matrizT = ['../matrizes/contextos-camera-others.txt','../matrizes/contextos-cell-others.txt']

In [140]:
# Salvando a matriz de similaridade
arq = open(save_matrizG[dominio],'w')
for vetor in matrizG:
    arq.write(str(vetor) + '\n')
arq.close()

In [141]:
# Salvando a matriz de contexto
arq = open(save_matrizT[dominio],'w')
for vetor in matrizT:
    arq.write(str(vetor) + '\n')
arq.close()