# Introdução

Aqui neste notebook eu mostro como é feito o processo de obtenção dos termos e das sentenças do dataset, que possui muitos aspectos e sentenças. É mostrado como limpo o dataset, retirando linhas duplicadas, retirando palavras incorretas ou irrelevantes, e como seleciono elas para então poder trabalhar com elas.

# Bibliotecas utilizadas

In [7]:
#Bibliotecas utilizadas no notebook
import pandas as pd
import numpy as np
from textblob import TextBlob
import ast
from nltk.corpus import stopwords
import os

# Métodos e funções

In [2]:
# Função que retorna se um termo está escrito correto
# str.isalpha: verifica se na sentenca ocorre somente caracteres alfabeticos
def isCorrect(sentenca):
    for c in sentenca:
        if(c != ' '):
            if(not c.isalpha()): 
                return False
    return True

In [3]:
# Função que retorna a frequência de uma palavra única
def checagem_frequencia(word, sentenca):
    tokens = TextBlob(sentenca).words #Quebra a sentenca em palavras
    count = 0 #Frequencia do termo
    for palavra in tokens:
        if(word.lower() in palavra.lower()): #Verifica se a palavra corresponde
            count += 1
    return count

In [4]:
# Função que retorna a frequência de um termo em uma sentença
def get_frequencia(word,sentenca):
    count = 0 #Frequencia da palavra
    if(' ' in word): #Se for uma palavra composta
        word = word.lower()
        sentenca = sentenca.lower()
        count = sentenca.count(word)
    else: #Se for uma palavra única
        count = checagem_frequencia(word,sentenca)
    return count            

In [5]:
def isNoun(word):
    if(' ' in word):
        return True
    tags = TextBlob(word).pos_tags
    if('NN' in tags[0][1]):
        return True
    return False

# Variavéis e constantes

In [6]:
#Diretorio dos datasets
#caminhos = ['../../dataset-outros/dataset-aspectos-other-cam.tsv','../../dataset-outros/dataset-aspectos-other-cel.tsv']

save_mapeamentos = ['../../datasets_processed/mapeamentos/mapeamentos-camera-others.txt','../../datasets_processed/mapeamentos/mapeamentos-cell-others.txt']

save_palavras = ['../../datasets_processed/palavras-camera-others.txt','../../datasets_processed/palavras-cell-others.txt']

save_frequencias = ['../../datasets_processed/frequencias/frequencias-camera-others.txt','../../datasets_processed/frequencias/frequencias-cell-others.txt']

save_similaridades = ['../../datasets_processed/similaridades/similaridades-camera-others.txt','../../datasets_processed/similaridades/similaridades-cell-others.txt']

save_contextos = ['../../datasets_processed/contextos/contexto-camera-others.txt','../../datasets_processed/contextos/contexto-cell-others.txt']

save_sentencas = ['../../datasets_processed/sentencas/sentencas-camera-others.txt','../../datasets_processed/sentencas/sentencas-cell-others.txt']

save_contextualizacao = ['../../datasets_processed/contextualizacao/contextualizacao-camera-others.txt','../../datasets_processed/contextualizacao/contextualizacao-cell-others.txt']

save_csv = ['../../dataset-outros/dataset-camera-others.csv','../../dataset-outros/dataset-cell-others.csv']

#dir_gabaritos = ['../../datasets/gabaritos/DadosCamera.txt','../../datasets/gabaritos/DadosCells.txt']

save_datasets = ['../../datasets/dados-camera-others','../../datasets/dados-cell-others']

In [19]:
caminhos = ['../../dataset_outros/{}'.format(file) for file in os.listdir('../../dataset_outros/')]
dir_gabaritos = ['../../datasets/gabaritos/{}'.format(file) for file in os.listdir('../../datasets/gabaritos/')]

['../../datasets/gabaritos/DadosCamera.txt', '../../datasets/gabaritos/DadosCells.txt', '../../datasets/gabaritos/DadosDvds.txt', '../../datasets/gabaritos/DadosLaptops.txt', '../../datasets/gabaritos/DadosRouters.txt']
['../../dataset_outros/dataset-aspectos-other-cam.tsv', '../../dataset_outros/dataset-aspectos-other-cel.tsv']


In [24]:
#dominio = 1 #Variavel que indica o indice do arquivo que vai ser utilizado

dominio = {} # Dicionário que indica o indice para os arquivos de dados de cada produto
lista_produtos = [file.replace('.txt', '') for file in os.listdir('../../datasets/gabaritos/')] # lista produtos

for i, value in enumerate(dir_gabaritos):
    dominio[value.replace('.txt', '')] = i

['DadosCamera', 'DadosCells', 'DadosDvds', 'DadosLaptops', 'DadosRouters']


In [10]:
dados = {}
stop_words = stopwords.words('english')
frequencias = {}
documentos = set()
gabarito = set()

In [25]:
arq = open(dir_gabaritos[dominio],'r')
for linha in arq.readlines():
    valores = linha.split(': ')
    classe = valores[0]
    if(classe != "Others"):
        atributes = ast.literal_eval(valores[1])
        gabarito = gabarito.union(atributes)
arq.close()

# Explorando o dataset

In [79]:
df = pd.read_table(caminhos[dominio],names=['aspecto','sentenca','sla'])
df

Unnamed: 0,aspecto,sentenca,sla
0,works,as far as the features go everything works fine.,
1,have,itsounds great but i have yet to go anywhere i...,
2,text,i don't do much on my phone outside of calling...,
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",
4,function,"also, the menu function for texting is a littl...",
...,...,...,...
133056,issue,but it's a minor issue considering the price o...,
133057,features,"i was a little bummed about that but oh well, ...",
133058,wanted,i really really wanted to like this phone.,
133059,person,i could hear the other person on the line clea...,


In [80]:
df = df.drop_duplicates() #removendo linhas duplicadas do dataset
df

Unnamed: 0,aspecto,sentenca,sla
0,works,as far as the features go everything works fine.,
1,have,itsounds great but i have yet to go anywhere i...,
2,text,i don't do much on my phone outside of calling...,
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",
4,function,"also, the menu function for texting is a littl...",
...,...,...,...
133056,issue,but it's a minor issue considering the price o...,
133057,features,"i was a little bummed about that but oh well, ...",
133058,wanted,i really really wanted to like this phone.,
133059,person,i could hear the other person on the line clea...,


In [81]:
df = df.drop("sla",axis=1) #removendo a última coluna do dataset
df

Unnamed: 0,aspecto,sentenca
0,works,as far as the features go everything works fine.
1,have,itsounds great but i have yet to go anywhere i...
2,text,i don't do much on my phone outside of calling...
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,..."
4,function,"also, the menu function for texting is a littl..."
...,...,...
133056,issue,but it's a minor issue considering the price o...
133057,features,"i was a little bummed about that but oh well, ..."
133058,wanted,i really really wanted to like this phone.
133059,person,i could hear the other person on the line clea...


In [82]:
df['is_correct'] = df['aspecto'].apply(lambda x: isCorrect(x))
df

Unnamed: 0,aspecto,sentenca,is_correct
0,works,as far as the features go everything works fine.,True
1,have,itsounds great but i have yet to go anywhere i...,True
2,text,i don't do much on my phone outside of calling...,True
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",True
4,function,"also, the menu function for texting is a littl...",True
...,...,...,...
133056,issue,but it's a minor issue considering the price o...,True
133057,features,"i was a little bummed about that but oh well, ...",True
133058,wanted,i really really wanted to like this phone.,True
133059,person,i could hear the other person on the line clea...,True


In [83]:
palavras_corretas = df[df['is_correct'] == True]
palavras_corretas

Unnamed: 0,aspecto,sentenca,is_correct
0,works,as far as the features go everything works fine.,True
1,have,itsounds great but i have yet to go anywhere i...,True
2,text,i don't do much on my phone outside of calling...,True
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",True
4,function,"also, the menu function for texting is a littl...",True
...,...,...,...
133056,issue,but it's a minor issue considering the price o...,True
133057,features,"i was a little bummed about that but oh well, ...",True
133058,wanted,i really really wanted to like this phone.,True
133059,person,i could hear the other person on the line clea...,True


In [84]:
palavras_corretas['is_stopword'] = palavras_corretas['aspecto'].apply(lambda x: x in stop_words)
palavras_corretas['is_other'] = palavras_corretas['aspecto'].apply(lambda x: x not in gabarito)
palavras_corretas = palavras_corretas.query('is_stopword == False & is_other == True')
palavras_corretas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other
0,works,as far as the features go everything works fine.,True,False,True
2,text,i don't do much on my phone outside of calling...,True,False,True
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",True,False,True
4,function,"also, the menu function for texting is a littl...",True,False,True
5,steps,"its easy to figure out, just seems to have som...",True,False,True
...,...,...,...,...,...
133056,issue,but it's a minor issue considering the price o...,True,False,True
133057,features,"i was a little bummed about that but oh well, ...",True,False,True
133058,wanted,i really really wanted to like this phone.,True,False,True
133059,person,i could hear the other person on the line clea...,True,False,True


In [85]:
palavras_corretas['isNN'] = palavras_corretas['aspecto'].apply(lambda x: isNoun(x))
palavras_corretas

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other,isNN
0,works,as far as the features go everything works fine.,True,False,True,True
2,text,i don't do much on my phone outside of calling...,True,False,True,True
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",True,False,True,True
4,function,"also, the menu function for texting is a littl...",True,False,True,True
5,steps,"its easy to figure out, just seems to have som...",True,False,True,True
...,...,...,...,...,...,...
133056,issue,but it's a minor issue considering the price o...,True,False,True,True
133057,features,"i was a little bummed about that but oh well, ...",True,False,True,True
133058,wanted,i really really wanted to like this phone.,True,False,True,False
133059,person,i could hear the other person on the line clea...,True,False,True,True


In [86]:
palavras_corretas = palavras_corretas.query("isNN == True")
palavras_corretas

Unnamed: 0,aspecto,sentenca,is_correct,is_stopword,is_other,isNN
0,works,as far as the features go everything works fine.,True,False,True,True
2,text,i don't do much on my phone outside of calling...,True,False,True,True
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,...",True,False,True,True
4,function,"also, the menu function for texting is a littl...",True,False,True,True
5,steps,"its easy to figure out, just seems to have som...",True,False,True,True
...,...,...,...,...,...,...
133055,instruction booklet,i didn't know that until i read the instructio...,True,False,True,True
133056,issue,but it's a minor issue considering the price o...,True,False,True,True
133057,features,"i was a little bummed about that but oh well, ...",True,False,True,True
133059,person,i could hear the other person on the line clea...,True,False,True,True


In [87]:
database = palavras_corretas[['aspecto','sentenca']]#.sample(n=6000,random_state=42)
database

Unnamed: 0,aspecto,sentenca
0,works,as far as the features go everything works fine.
2,text,i don't do much on my phone outside of calling...
3,bit awkwardly,"the keyboard is set up a little bit awkwardly,..."
4,function,"also, the menu function for texting is a littl..."
5,steps,"its easy to figure out, just seems to have som..."
...,...,...
133055,instruction booklet,i didn't know that until i read the instructio...
133056,issue,but it's a minor issue considering the price o...
133057,features,"i was a little bummed about that but oh well, ..."
133059,person,i could hear the other person on the line clea...


In [88]:
database_sample = database.sample(n=5000,random_state=42)
database_sample

Unnamed: 0,aspecto,sentenca
20699,video,add to it easy video out (your phone on any tv...
18956,works,"after 5 months or so of using, it works very w..."
98598,luck,may you have better luck than i.
118056,lid,i loved the features on the phone and the flip...
25974,situation,it's perfectly good for my light-use situation...
...,...,...
30387,day,my first day was spent just putting in settings.
55244,case,you may need to get a hard case cover so that ...
103035,quality,the call quality is astoundingly bad.
38071,things,the tech i spoke to was very informative and c...


In [89]:
dados_teste_camera_others = database.sample(n=5000,random_state=42)
dados_teste_camera_others = dados_teste_camera_others.rename(columns={"sentenca": 'reviews'})
dados_teste_camera_others

Unnamed: 0,aspecto,reviews
20699,video,add to it easy video out (your phone on any tv...
18956,works,"after 5 months or so of using, it works very w..."
98598,luck,may you have better luck than i.
118056,lid,i loved the features on the phone and the flip...
25974,situation,it's perfectly good for my light-use situation...
...,...,...
30387,day,my first day was spent just putting in settings.
55244,case,you may need to get a hard case cover so that ...
103035,quality,the call quality is astoundingly bad.
38071,things,the tech i spoke to was very informative and c...


In [90]:
dados_teste_camera_others.to_csv(save_datasets[dominio]+str(len(dados_teste_camera_others))+".csv",index=False)

In [91]:
database_sample['aspecto'].value_counts()[:200]

phones       137
works        104
time          94
thing         91
features      72
            ... 
company        5
game           5
hope           4
ringtones      4
claims         4
Name: aspecto, Length: 200, dtype: int64

In [92]:
aspectos_selecionados = database_sample['aspecto'].value_counts().index[:200]
aspectos_selecionados

Index(['phones', 'works', 'time', 'thing', 'features', 'service', 'keyboard',
       'quality', 'bought', 'things',
       ...
       'information', 'areas', 'plans', 'email', 'cards', 'company', 'game',
       'hope', 'ringtones', 'claims'],
      dtype='object', length=200)

In [93]:
list(aspectos_selecionados)

['phones',
 'works',
 'time',
 'thing',
 'features',
 'service',
 'keyboard',
 'quality',
 'bought',
 'things',
 'times',
 'use',
 'reviews',
 'everything',
 'life',
 'months',
 'condition',
 'problems',
 'experience',
 'days',
 'cell phone',
 'people',
 'day',
 'years',
 'button',
 'user',
 'reception',
 'issues',
 'volume',
 'issue',
 'minutes',
 'person',
 'feature',
 'way',
 'end',
 'option',
 'sim card',
 'version',
 'case',
 'son',
 'side',
 'design',
 'plan',
 'sound',
 'problem',
 'weeks',
 'piece',
 'seller',
 'recommend',
 'buttons',
 'cell',
 'reason',
 'touch',
 'work',
 'bit',
 'talk',
 'text',
 'hours',
 'deal',
 'stuff',
 'customer service',
 'something',
 'gps',
 'need',
 'hands',
 'year',
 'found',
 'blackberry',
 'signal',
 'store',
 'cover',
 'mobile',
 'network',
 'wifi',
 'functions',
 'keypad',
 'review',
 'line',
 'tracfone',
 'app',
 'bluetooth',
 'connection',
 'complaints',
 'owner',
 'waste',
 'amazon',
 'users',
 'words',
 'brand',
 'money',
 'reviewers',
 '

In [94]:
aspectos_escolhidos = ['phones',
 'works',
 'time',
 'thing',
 'features',
 'service',
 'keyboard',
 'quality',
 'bought',
 'things',
 'times',
 'use',
 'reviews',
 'everything',
 'life',
 'months',
 'condition',
 'problems',
 'experience',
 'days',
 'cell phone',
 'people',
 'day',
 'years',
 'button',
 'user',
 'reception',
 'issues',
 'volume',
 'issue',
 'minutes',
 'person',
 'feature',
 'way',
 'end',
 'option',
 'sim card',
 'version',
 'case',
 'son',
 'side',
 'design',
 'plan',
 'sound',
 'problem',
 'weeks',
 'piece',
 'seller',
 'recommend',
 'buttons',
 'cell',
 'reason',
 'touch',
 'work',
 'bit',
 'talk',
 'text',
 'hours',
 'deal',
 'stuff',
 'customer service',
 'something',
 'gps',
 'need',
 'hands',
 'year',
 'found',
 'blackberry',
 'signal',
 'store',
 'cover',
 'mobile',
 'network',
 'wifi',
 'functions',
 'keypad',
 'review',
 'line',
 'tracfone',
 'app',
 'bluetooth',
 'connection',
 'complaints',
 'owner',
 'waste',
 'amazon',
 'users',
 'words',
 'brand',
 'money',
 'reviewers',
 'devices',
 'daughter',
 'difference',
 'want',
 'calls',
 'charger',
 'number',
 'sim',
 'amount',
 'hand',
 'purchase',
 'box',
 'call',
 'model',
 'month',
 'menu',
 'looks',
 'touch phone',
 'browser',
 'mode',
 'customer',
 'verizon',
 'luck',
 'music',
 'smartphones',
 'support',
 'news',
 'improvement',
 'internet',
 'hardware',
 'data',
 'player',
 'cell phones',
 'nokia',
 'process',
 'messages',
 'options',
 'package',
 'speaker',
 'usage',
 'touch screen',
 'ability',
 'space',
 'versions',
 'models',
 'upgrade',
 'networks',
 'carriers',
 'touchscreen',
 'texts',
 'card',
 'fan',
 'stars',
 'lot',
 'radio',
 'pocket',
 'love',
 'carrier',
 'questions',
 'contract',
 'set',
 'pain',
 'look',
 'note',
 'try',
 'trouble',
 'freezes',
 'unit',
 'flash',
 'idea',
 'cable',
 'fits',
 'lg',
 'week',
 'function',
 'tiles',
 'coverage',
 'point',
 'saw',
 'sim cards',
 'headset',
 'car charger',
 'system',
 'sites',
 'move',
 'ways',
 'functionality',
 'complaint',
 'capability',
 'job',
 'put',
 'husband',
 'slide',
 'brand phone',
 'information',
 'areas',
 'plans',
 'email',
 'cards',
 'company',
 'game',
 'hope',
 'ringtones',
 'claims']

In [95]:
len(aspectos_escolhidos)

195

In [96]:
dataset_refinado = database_sample[database_sample['aspecto'].isin(aspectos_escolhidos)]
dataset_refinado

Unnamed: 0,aspecto,sentenca
18956,works,"after 5 months or so of using, it works very w..."
98598,luck,may you have better luck than i.
69278,mobile,"virgin mobile has the worst customer service, ..."
124301,design,"i also like the design, it looks very nice."
82161,company,never seen a company act this badly.
...,...,...
30387,day,my first day was spent just putting in settings.
55244,case,you may need to get a hard case cover so that ...
103035,quality,the call quality is astoundingly bad.
38071,things,the tech i spoke to was very informative and c...


In [97]:
dataset_refinado.to_csv(save_csv[dominio],index=False)

# Pré-processamento

In [98]:
dataframe = pd.read_csv(save_csv[dominio])
dataframe

Unnamed: 0,aspecto,sentenca
0,works,"after 5 months or so of using, it works very w..."
1,luck,may you have better luck than i.
2,mobile,"virgin mobile has the worst customer service, ..."
3,design,"i also like the design, it looks very nice."
4,company,never seen a company act this badly.
...,...,...
2870,day,my first day was spent just putting in settings.
2871,case,you may need to get a hard case cover so that ...
2872,quality,the call quality is astoundingly bad.
2873,things,the tech i spoke to was very informative and c...


In [99]:
dataframe['aspecto'].value_counts()

phones       137
works        104
time          94
thing         91
features      72
            ... 
put            5
sim cards      5
hope           4
ringtones      4
claims         4
Name: aspecto, Length: 195, dtype: int64

In [100]:
datasample = dataframe
datasample

Unnamed: 0,aspecto,sentenca
0,works,"after 5 months or so of using, it works very w..."
1,luck,may you have better luck than i.
2,mobile,"virgin mobile has the worst customer service, ..."
3,design,"i also like the design, it looks very nice."
4,company,never seen a company act this badly.
...,...,...
2870,day,my first day was spent just putting in settings.
2871,case,you may need to get a hard case cover so that ...
2872,quality,the call quality is astoundingly bad.
2873,things,the tech i spoke to was very informative and c...


In [101]:
datasample['aspecto'].value_counts()

phones       137
works        104
time          94
thing         91
features      72
            ... 
put            5
sim cards      5
hope           4
ringtones      4
claims         4
Name: aspecto, Length: 195, dtype: int64

In [102]:
#Lendo o dataset - obtendo os aspectos corretos e suas sentenças
tam = len(datasample)
for ind in range(tam):
    word = datasample.aspecto[ind].strip()
    sentenca = datasample.sentenca[ind].strip()
    count_frequencia = get_frequencia(word,sentenca)
    if(count_frequencia > 0):
        if(word not in dados.keys()):
            dados[word] = set()
            frequencias[word] = 0
        #Caso tudo esteja correto eu pego a palavra e o documento que ela aparece
        dados[word].add(sentenca)            
        documentos = documentos.union(set([sentenca]))
        frequencias[word] += count_frequencia

In [103]:
len(dados.keys()) #Número de aspectos corretos

193

In [104]:
len(documentos) #Número de documentos

2862

In [105]:
# Calculando as frequencias dos aspectos
valor_frequencias = {}
for key in sorted(frequencias.keys()):
    freq = frequencias[key]
    if(freq not in valor_frequencias.keys()):
        valor_frequencias[freq] = []
    valor_frequencias[freq].append(key)

In [106]:
valor_frequencias.keys() #Frequencias dos termos

dict_keys([7, 10, 9, 14, 5, 11, 49, 8, 28, 15, 6, 21, 17, 30, 4, 32, 12, 13, 34, 31, 19, 22, 33, 74, 23, 26, 58, 24, 20, 29, 145, 16, 18, 35, 52, 25, 39, 64, 94, 45, 96, 46, 48, 108, 27])

In [107]:
len(valor_frequencias.keys()) #Total de frequencias encontrado

45

In [108]:
# Selecionando as 100 palavras de maior frequência
cont = 0
palavras_utilizadas = []
documentos_utilizados = set()
for valor in sorted(valor_frequencias.keys(),reverse=True):
    if(cont < 150):
        for palavra in valor_frequencias[valor]:
            print(palavra,valor)
            palavras_utilizadas.append(palavra)
            documentos_utilizados = documentos_utilizados.union(dados[palavra])
            cont += 1

phones 145
works 108
time 96
thing 94
features 74
service 64
keyboard 58
quality 52
bought 49
use 48
times 46
things 45
reviews 39
problems 35
day 34
everything 33
months 33
condition 32
life 32
days 31
experience 31
cell phone 30
people 29
button 28
years 27
issues 26
reception 25
user 25
way 25
minutes 24
issue 23
volume 23
end 22
feature 22
person 22
case 21
sim card 21
version 21
option 20
side 20
design 19
son 19
plan 18
problem 18
cell 17
sound 17
piece 16
text 16
touch 16
weeks 16
buttons 15
reason 15
recommend 15
seller 15
work 15
app 14
bit 14
sim 14
talk 14
year 14
customer service 13
deal 13
gps 13
hands 13
hours 13
network 13
something 13
store 13
stuff 13
cover 12
found 12
functions 12
line 12
need 12
blackberry 11
keypad 11
mobile 11
review 11
signal 11
tracfone 11
wifi 11
amazon 10
bluetooth 10
brand 10
call 10
calls 10
charger 10
complaints 10
connection 10
hand 10
money 10
number 10
owner 10
users 10
waste 10
words 10
amount 9
box 9
customer 9
daughter 9
devices 9
diff

In [109]:
cont

153

In [110]:
print(len(palavras_utilizadas)) #Número de palavras selecionadas
print(len(documentos_utilizados)) #Número de documentos em que os termos aparecem

153
2665


In [111]:
# Palavras selecionadas
for c in palavras_utilizadas:
    print(c)

phones
works
time
thing
features
service
keyboard
quality
bought
use
times
things
reviews
problems
day
everything
months
condition
life
days
experience
cell phone
people
button
years
issues
reception
user
way
minutes
issue
volume
end
feature
person
case
sim card
version
option
side
design
son
plan
problem
cell
sound
piece
text
touch
weeks
buttons
reason
recommend
seller
work
app
bit
sim
talk
year
customer service
deal
gps
hands
hours
network
something
store
stuff
cover
found
functions
line
need
blackberry
keypad
mobile
review
signal
tracfone
wifi
amazon
bluetooth
brand
call
calls
charger
complaints
connection
hand
money
number
owner
users
waste
words
amount
box
customer
daughter
devices
difference
model
purchase
reviewers
support
want
browser
contract
data
hardware
improvement
internet
looks
luck
menu
mode
month
music
news
smartphones
verizon
ability
card
cell phones
lot
love
messages
networks
nokia
options
package
player
process
radio
set
space
speaker
touch screen
usage
carrier
carri

In [112]:
documentos_utilizados = sorted(documentos_utilizados) #Ordenando os documentos
len(documentos_utilizados) #Número de documentos

2665

In [113]:
palavras_utilizadas.sort() #Ordenando as termos
palavras_utilizadas #Aspectos utilizados

['ability',
 'amazon',
 'amount',
 'app',
 'bit',
 'blackberry',
 'bluetooth',
 'bought',
 'box',
 'brand',
 'browser',
 'button',
 'buttons',
 'call',
 'calls',
 'card',
 'carrier',
 'carriers',
 'case',
 'cell',
 'cell phone',
 'cell phones',
 'charger',
 'complaints',
 'condition',
 'connection',
 'contract',
 'cover',
 'customer',
 'customer service',
 'data',
 'daughter',
 'day',
 'days',
 'deal',
 'design',
 'devices',
 'difference',
 'end',
 'everything',
 'experience',
 'fan',
 'feature',
 'features',
 'flash',
 'found',
 'functions',
 'gps',
 'hand',
 'hands',
 'hardware',
 'hours',
 'improvement',
 'internet',
 'issue',
 'issues',
 'keyboard',
 'keypad',
 'life',
 'line',
 'looks',
 'lot',
 'love',
 'luck',
 'menu',
 'messages',
 'minutes',
 'mobile',
 'mode',
 'model',
 'models',
 'money',
 'month',
 'months',
 'music',
 'need',
 'network',
 'networks',
 'news',
 'nokia',
 'number',
 'option',
 'options',
 'owner',
 'package',
 'people',
 'person',
 'phones',
 'piece',
 'pla

In [114]:
documents = documentos_utilizados#[:2000] #Selecionando os primeiros 2000 documentos do dataset
len(documents) #Número de documentos a ser utilizada

2665

In [115]:
mapeamentos = {} #Variável que mapeia os domentos para os termos
frequencias = {} #Variável que mapeia as frequências para os termos
for doc in documents:
    for p in palavras_utilizadas:
        count = get_frequencia(p,doc)
        if(count > 0):
            if(p not in mapeamentos.keys()):
                mapeamentos[p] = set()
            if(p not in frequencias.keys()):
                frequencias[p] = 0
            mapeamentos[p].add(doc) #Mapeando as sentenças para os termos
            frequencias[p] += count #Obtendo as frequências dos termos nos documentos

## Salvando os dados em arquivos

In [116]:
# Salvando as palavras selecionadas em um arquivo
arq = open(save_palavras[dominio],'w')
for p in palavras_utilizadas:
    arq.write(p + ' ->> ' + str(mapeamentos[p]) + '\n')
arq.close()

In [117]:
# Salvando as frequências em um arquivo
arq = open(save_frequencias[dominio],'w')
for p in palavras_utilizadas:
    arq.write(p + ': ' + str(frequencias[p]) + '\n')
arq.close()

In [118]:
# Salvando os documentos em um arquivo
arq = open(save_sentencas[dominio],'w')
for doc in documents:
    arq.write(doc + '\n\n')
arq.close()

In [119]:
# Salvando os mapeamentos em um arquivo
arq = open(save_mapeamentos[dominio],'w')
for p in palavras_utilizadas:
    arq.write(p + ' ->> ' + str(mapeamentos[p]) + '\n')
arq.close()

## Pré-processamento do contexto das palavras

In [120]:
n = len(documents) #Número de documentos utilizados
n_palavras = len(palavras_utilizadas) #Número de palavras utilizadas no processo
contextualizacao = {} #Variável que salva os documentos que cada termo aparece

In [121]:
# Inicializando a variável - uso esta variável para calcular o contexto entre as palavras
for p in palavras_utilizadas:
    contextualizacao[p] = []

In [122]:
# Fazendo o mapeamento dos documentos para os termos
for p in palavras_utilizadas:
    for doc in documents:
        count = get_frequencia(p,doc)
        if(count > 0):
            contextualizacao[p].append([doc])
        else:
            contextualizacao[p].append([])

In [123]:
# Salvando a variável de contextualização
arq = open(save_contextualizacao[dominio],'w')
for p in sorted(contextualizacao.keys()):
    arq.write(p + ' ->> ' + str(contextualizacao[p]) + '\n')
arq.close()

# Obtendo os dados dos arquivos

In [124]:
# Variáveis utilizadas
palavras_utilizadas = []
documentos_utilizados = []
contextualizacao = {}

In [125]:
# Obtendo a variável responsável pela contextualização das palavras
arq = open(save_contextualizacao[dominio],'r')
for linha in arq.readlines():
    valores = linha.split(' ->> ')
    classe = valores[0]
    conjunto = valores[1]
    palavras_utilizadas.append(classe)
    contextualizacao[classe] = ast.literal_eval(conjunto)
arq.close()

In [126]:
# Obtendo os documentos
arq = open(save_sentencas[dominio],'r')
for linha in arq.readlines():
    if(linha != '\n'):
        documentos_utilizados.append(linha.strip())
arq.close()

In [127]:
len(palavras_utilizadas) #Quantidade de palavras a serem utilizadas

153

In [128]:
len(documentos_utilizados) #Quantidade de documentos a ser utilizado

2665

In [129]:
n_palavras = len(palavras_utilizadas) #Número de palavras(aspectos)
n = len(documentos_utilizados) #Número de documentos

# Calculando a similaridade entre as palavras

In [130]:
matrizG = [[1.0]*n_palavras for i in range(n_palavras)] #Matriz de similaridade
# Locais onde salvar as similaridades
caminhoG = ['../similaridades/similaridades-camera-others.txt','../similaridades/similaridades-cell-others.txt']

In [62]:
import spacy #Importando a biblioteca do spacy

In [63]:
nlp = spacy.load("en_vectors_web_lg") #Carregando o modelo do spacy

In [131]:
print("Processando...")
#Salvando as similaridades num arquivo
arq = open(caminhoG[dominio],'w')
for ind in range(len(palavras_utilizadas)):
    token1 = nlp(palavras_utilizadas[ind]) #Palavra 1
    for j in range(ind+1,len(palavras_utilizadas)):
        token2 = nlp(palavras_utilizadas[j]) #Palavra 2
        try:
            similaridade = token1.similarity(token2) #Similaridade entre a palavra 1 e palavra 2
            if(similaridade < 0):
                similaridade = 0.0
            if(similaridade > 1.0):
                similaridade = 1.0
        except:
            similaridade = 0.0
        #Salvando a similaridade no arquivo
        arq.write(palavras_utilizadas[ind]+'->>'+palavras_utilizadas[j]+'->>'+str(similaridade)+'\n')
        matrizG[ind][j] = similaridade
        matrizG[j][ind] = similaridade
arq.close()
print("Concluido!\n")

Processando...
Concluido!



## Word2vec

In [20]:
import gensim
import ast

In [23]:
caminho_word2vec = ['../similaridades/similaridades-camera-others-word2vec.txt','../similaridades/similaridades-cell-others-word2vec.txt']
matriz_word2vec = [[0.0]*n_palavras for i in range(n_palavras)]
caminho = '../../model/GoogleNews-vectors-negative300.bin'

In [None]:
#Carregando o modelo do Word2Vec
model = gensim.models.KeyedVectors.load_word2vec_format(caminho,binary=True)

In [None]:
#Salvando as similaridades do Word2Vec num arquivo
arq = open(caminho_word2vec[dominio], 'w')
for ind in range(len(palavras_utilizadas)):
    for j in range(ind+1, len(palavras_utilizadas)):
        try:
            #Obtendo a similaridade usando o modelo
            similaridade = model.similarity(palavras_utilizadas[ind], palavras_utilizadas[j])
            if(similaridade < 0):
                similaridade = 0.0
            if(similaridade > 1.0):
                similaridade = 1.0
        except:
            similaridade = 0.0
        matriz_word2vec[i][j] = similaridade
        matriz_word2vec[j][i] = similaridade
        #Escrevendo o resultado no arquivo
        arq.write(palavras_utilizadas[ind].replace('_', ' ')+'->>'+palavras_utilizadas[j].replace('_', ' ')+'->>'+str(similaridade)+'\n')
arq.close()
print("Concluido!\n")

# Calculando o Contexto entre as palavras

In [65]:
import math #Importando a biblioteca utilizada para cálculos matemáticos

In [132]:
# Obtendo a contextualização das palavras
mapeamento = {}
for p in sorted(contextualizacao.keys()):
    vetor = contextualizacao[p]
    vetor = [set(v) for v in vetor]
    mapeamento[p] = vetor

In [133]:
#Função que retorna o NPMI das palavras
def contexto(p1,p2,mapeamento):
    fx = 0 #Variavel que guarda o numero de documentos que p1 aparece
    fy = 0 #Variavel que guarda o numero de documentos que p2 aparece
    fxy = 0 #VAriavel que guarda o numero de documentos que p1 e p2 aparecem em uma sentenca
    n = len(mapeamento[p1]) #Numero de documentos
    #Percorrendo os documentos
    for i in list(range(n)):
        if(len(mapeamento[p1][i].intersection(mapeamento[p2][i])) > 0):
            fxy += 1
        if(len(mapeamento[p1][i]) > 0):
            fx += 1
        if(len(mapeamento[p2][i]) > 0):
            fy += 1
    #Obtendo o NPMI
    npmi = 0
    fator1 = (n*fxy)/(fx*fy)
    fator2 = (fxy/n)
    try:
        valor = math.log10(fator1)/-math.log10(fator2)
        npmi = (valor+1)/2 #Normalizando para o intervalo [0,1]
    except:
        npmi = 0.0
    return npmi

In [134]:
matrizT = [[1.0]*n_palavras for i in range(n_palavras)] #Matriz de contextualização

In [135]:
tam = n_palavras #Quantidade de palavras no gabarito
total = (1+tam)*tam/2 #Total de iteracoes que serão feitas
cont = 0 #Variavel contadora
percent = 0 #Valor da porcentagem do processamento
aux = 0 #Variavel auxiliar
#Salvando o valor do NPMI entre as palavras
arq = open(save_contextos[dominio],'w')
print('Calculando o contexto entre as palavras...')
for i in range(n_palavras):
    for j in range(i+1,n_palavras):
        cont += 1
        #valor = NPMI(words[i],words[j],documentos)
        valor = contexto(palavras_utilizadas[i],palavras_utilizadas[j],mapeamento) #Valor do NPMI
        matrizT[i][j] = valor
        matrizT[j][i] = valor
        resp = str(palavras_utilizadas[i]) + '->>' + str(palavras_utilizadas[j]) + '->>' + str(valor) + '\n'
        arq.write(resp) #Salvando as palavras num arquivo
        #Printando o valor da porcentagem na tela
        aux = int(cont*100/total)
        if(aux > percent):
            print(str(aux) + "% concluido")
            percent = aux
arq.close()
print("Concluido\n")

Calculando o contexto entre as palavras...
1% concluido
2% concluido
3% concluido
4% concluido
5% concluido
6% concluido
7% concluido
8% concluido
9% concluido
10% concluido
11% concluido
12% concluido
13% concluido
14% concluido
15% concluido
16% concluido
17% concluido
18% concluido
19% concluido
20% concluido
21% concluido
22% concluido
23% concluido
24% concluido
25% concluido
26% concluido
27% concluido
28% concluido
29% concluido
30% concluido
31% concluido
32% concluido
33% concluido
34% concluido
35% concluido
36% concluido
37% concluido
38% concluido
39% concluido
40% concluido
41% concluido
42% concluido
43% concluido
44% concluido
45% concluido
46% concluido
47% concluido
48% concluido
49% concluido
50% concluido
51% concluido
52% concluido
53% concluido
54% concluido
55% concluido
56% concluido
57% concluido
58% concluido
59% concluido
60% concluido
61% concluido
62% concluido
63% concluido
64% concluido
65% concluido
66% concluido
67% concluido
68% concluido
69% concluido


# Gerando e salvando as matrizes

In [136]:
indices = {} #Variável que mapeia os termos para os seus indices nas matrizes
ind = 0
for i in range(n_palavras):
    word = palavras_utilizadas[i]
    indices[word] = ind
    ind += 1

In [137]:
# Obtendo as similaridades entre os termos
arq = open(caminhoG[dominio],'r')
for linha in arq.readlines():
    valores = linha.split('->>')
    p1 = valores[0]
    p2 = valores[1]
    valor = float(valores[2])
    x = indices[p1]
    y = indices[p2]
    #Colocando os valores das similaridades nas matrizes
    matrizG[x][y] = valor
    matrizG[y][x] = valor
arq.close()

In [138]:
# Obtendo o NPMI entre as palavras
arq = open(save_contextos[dominio],'r')
for linha in arq.readlines():
    valores = linha.split('->>')
    p1 = valores[0]
    p2 = valores[1]
    valor = float(valores[2])
    x = indices[p1]
    y = indices[p2]
    #Salvando o NPMI na matriz de contextualização
    matrizT[x][y] = valor
    matrizT[y][x] = valor
arq.close()

In [139]:
# Diretórios dos locais para salvar as matrizes de similaridade e contexto
save_matrizG = ['../matrizes/similaridades-camera-others.txt','../matrizes/similaridades-cell-others.txt']
save_matrizT = ['../matrizes/contextos-camera-others.txt','../matrizes/contextos-cell-others.txt']

In [140]:
# Salvando a matriz de similaridade
arq = open(save_matrizG[dominio],'w')
for vetor in matrizG:
    arq.write(str(vetor) + '\n')
arq.close()

In [141]:
# Salvando a matriz de contexto
arq = open(save_matrizT[dominio],'w')
for vetor in matrizT:
    arq.write(str(vetor) + '\n')
arq.close()