In [1]:
import os
import xmltodict
import random
import pickle
import gzip
import pandas as pd
import numpy
from pprint import pprint
from utils.tui import Progress
from utils.lexical import LexicalProcessing
from sklearn.feature_extraction.text import TfidfVectorizer

# Root directory of the training corpus; get_count/get_corpora walk it as
# <product>/<rank>/<review file>.
BASE_DIR = '../data/corpora/trainset'
# Share of reviews meant for the test split (create_dataframe flags a row as
# test with probability 0.2).
TEST_PERCENT = 0.2
# Shared text-processing helper used by the tokenize/lemmatize steps.
LP = LexicalProcessing()

In [2]:
def get_count(base_dir=None):
    """Count the review XML files stored under the corpus directory.

    The corpus is laid out as ``<base_dir>/<product>/<rank>/<file>``; every
    file whose name contains ``.xml`` is counted. Entries that are not
    directories at the product or rank level are ignored.

    Args:
        base_dir: Corpus root to scan. Defaults to the module-level
            ``BASE_DIR`` when omitted.

    Returns:
        int: Total number of matching files across all products and ranks.
    """
    if base_dir is None:
        base_dir = BASE_DIR
    count = 0
    for product in os.listdir(base_dir):
        product_path = os.path.join(base_dir, product)
        if not os.path.isdir(product_path):
            # Stray files next to the product folders would make listdir fail.
            continue
        for rank in os.listdir(product_path):
            rank_path = os.path.join(product_path, rank)
            if not os.path.isdir(rank_path):
                continue
            count += sum(1 for name in os.listdir(rank_path) if '.xml' in name)
    return count

In [3]:
def get_corpora():
    """Parse every review XML under ``BASE_DIR`` into nested dictionaries.

    The directory tree is walked as ``<BASE_DIR>/<product>/<rank>/<file>``.
    Each file whose name contains ``.xml`` is parsed with ``xmltodict`` and its
    ``review`` element is kept when it has a non-empty ``opinion``.

    Returns:
        tuple: ``(corpora_kabum, errors)`` where ``corpora_kabum`` maps
        ``product -> rank -> list of parsed review dicts`` and ``errors`` is
        the list of file paths that failed to parse or had no opinion text.
    """
    corpora_kabum = {}
    errors = []
    # The progress bar needs the total number of review files up front.
    P = Progress(get_count(), '')
    for product in os.listdir(BASE_DIR):
        product_path = '{}/{}'.format(BASE_DIR, product)
        if not os.path.isdir(product_path):
            # Stray files next to the product folders are not products.
            continue
        corpus = {}
        for rank in os.listdir(product_path):
            rank_path = '{}/{}'.format(product_path, rank)
            if not os.path.isdir(rank_path):
                continue

            reviews = []
            for fl_name in os.listdir(rank_path):
                if '.xml' not in fl_name:
                    continue
                fl_path = '{}/{}'.format(rank_path, fl_name)
                with open(fl_path) as fl_:
                    try:
                        r = xmltodict.parse(fl_.read())['review']
                        if r['opinion']:
                            reviews.append(r)
                        else:
                            # Reviews without opinion text are useless here.
                            errors.append(fl_path)
                    except Exception:
                        # Malformed XML, missing keys, undecodable bytes, ...
                        # Deliberately not a bare ``except`` so Ctrl-C still
                        # interrupts a long parsing run.
                        errors.append(fl_path)
                P.progressStep()
            corpus[rank] = reviews
        corpora_kabum[product] = corpus
    return (corpora_kabum, errors)

In [4]:
def create_dataframe(corpora_kabum, test_percent=None, seed=None):
    """Flatten the nested corpus into a DataFrame with a random test flag.

    Args:
        corpora_kabum: Mapping ``category -> stars -> list of review dicts``.
            Each review dict must provide ``review['category']['@value']``,
            ``review['stars']['@value']`` and ``review['opinion']``.
        test_percent: Probability of flagging a row as test (``test == 1``).
            Defaults to the module-level ``TEST_PERCENT``.
        seed: Optional seed for a private random generator, which makes the
            train/test assignment reproducible. When omitted, the global
            ``random`` module state is used.

    Returns:
        pandas.DataFrame: One row per review with the columns ``test``,
        ``category``, ``score`` (float) and ``review``.

    Raises:
        ValueError: If ``test_percent`` is outside ``[0, 1]``.
    """
    if test_percent is None:
        test_percent = TEST_PERCENT
    if not 0 <= test_percent <= 1:
        raise ValueError(
            'test_percent must be within [0, 1], got {!r}'.format(test_percent)
        )

    rng = random if seed is None else random.Random(seed)
    weights = [1 - test_percent, test_percent]

    dataset = dict(test=[], category=[], score=[], review=[])
    for corpus in corpora_kabum.values():
        for reviews in corpus.values():
            for review in reviews:
                dataset['test'].append(rng.choices([0, 1], weights=weights, k=1)[0])
                dataset['category'].append(review['category']['@value'])
                dataset['score'].append(float(review['stars']['@value']))
                dataset['review'].append(review['opinion'])
    return pd.DataFrame(data=dataset)

In [5]:
# Gzip-compressed pickle file used to cache the preprocessed DataFrame.
DF_PATH = 'df_kabum.pkl.gz'

def save(data, path):
    """Pickle ``data`` and write it gzip-compressed to ``path``.

    Args:
        data: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    with gzip.open(path, 'wb') as f:
        # Stream straight into the gzip file instead of first materialising
        # the whole pickle as an intermediate bytes object.
        pickle.dump(data, f)

def load(path):
    """Read a gzip-compressed pickle from ``path`` and return the object.

    Args:
        path: File previously written by ``save``.

    Returns:
        The unpickled object.

    Warning:
        Unpickling can execute arbitrary code; only load files you created
        yourself or otherwise trust.
    """
    with gzip.open(path, 'rb') as f:
        # Let pickle pull from the decompressing stream instead of reading the
        # full payload into memory first.
        return pickle.load(f)
    

In [7]:
def tokenize_review(text):
    """Run a raw review through the LP pipeline and return its tokens.

    Applies ``LP.lowercase``, ``LP.remove_punctuation`` and
    ``LP.tokenize_words`` in that order, then advances the module-level
    ``progess_tokenize`` progress bar by one step.
    """
    cleaned = LP.remove_punctuation(LP.lowercase(text))
    words = LP.tokenize_words(cleaned)
    progess_tokenize.progressStep()
    return words

def normalize_review(tokens):
    """Join the tokens into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

def lemmatize_words(text):
    """Return ``LP.lemmatize_sentence(text)`` and tick the lemma progress bar.

    The module-level ``progess_lemma`` progress bar is advanced once per call.
    """
    lemmatized = LP.lemmatize_sentence(text)
    progess_lemma.progressStep()
    return lemmatized

In [8]:
# Set to False to rebuild the DataFrame from the raw XML corpus instead of
# reading the cached pickle at DF_PATH.
LOAD_PICKLE = True
if LOAD_PICKLE:
    df = load(DF_PATH)
else:
    c_ = get_corpora()
    df = create_dataframe(c_[0])
    # tokenize_review / lemmatize_words tick these module-level progress bars
    # on every call, so they must exist before the .apply() calls run;
    # otherwise a fresh kernel fails with NameError in this branch.
    progess_tokenize = Progress(len(df), 'tokenize')
    df['tokens'] = df.review.apply(tokenize_review)
    df['normalized'] = df.tokens.apply(normalize_review)
    progess_lemma = Progress(len(df), 'lemma')
    df['lemmas'] = df.normalized.apply(lemmatize_words)
    # Cache the expensive preprocessing for the next run.
    save(df, DF_PATH)

df.head()

Unnamed: 0,test,category,score,review,tokens,normalized,lemmas
0,0,Joystick / Controle,1.0,Comprei e já vou devolver pois nas especificaç...,"[comprei, e, já, vou, devolver, pois, nas, esp...",comprei e já vou devolver pois nas especificaç...,"[comprar, e, já, ir, devolver, pois, o, especi..."
1,1,Joystick / Controle,1.0,Esta guitarra NÃO é compatível com Nintendo Wi...,"[esta, guitarra, não, é, compatível, com, nint...",esta guitarra não é compatível com nintendo wi...,"[este, guitarra, não, ser, compatível, com, ni..."
2,1,Joystick / Controle,1.0,Não funciona corretamente no Wii. No Rockband ...,"[não, funciona, corretamente, no, wii, no, roc...",não funciona corretamente no wii no rockband o...,"[não, funcionar, corretamente, o, wii, o, rock..."
3,0,Joystick / Controle,1.0,"Não sei em outros consoles, mas no wii esse pr...","[não, sei, em, outros, consoles, mas, no, wii,...",não sei em outros consoles mas no wii esse pro...,"[não, saber, em, outro, consolar, mas, o, wii,..."
4,1,Joystick / Controle,1.0,Produto nada funcional e ainda com preço exage...,"[produto, nada, funcional, e, ainda, com, preç...",produto nada funcional e ainda com preço exage...,"[produto, nado, funcional, e, ainda, com, preç..."


In [10]:
from gensim.models import KeyedVectors

# Pre-trained word embeddings in word2vec text format; calc_vec looks tokens
# up in this model. The file name suggests 50-dimensional CBOW vectors —
# TODO confirm.
model = KeyedVectors.load_word2vec_format('../models/cbow_s50.txt')

In [11]:
# Collects the token lists for which calc_vec found no embedding at all.
ERRR = []
# Progress bar ticked once per calc_vec call; sized to the number of rows.
progess_vecs = Progress(len(df), 'vecs')
def calc_vec(tokens):
    """Average the word vectors of ``tokens`` into a single vector.

    Tokens that are missing from ``model`` are skipped. When none of the
    tokens is known, the token list is appended to ``ERRR`` and a zero vector
    is returned. The ``progess_vecs`` progress bar is advanced once per call.

    Args:
        tokens: Iterable of word strings.

    Returns:
        numpy.ndarray: 1-D vector with ``model.vector_size`` entries.
    """
    vecs = []
    for w in tokens:
        try:
            vecs.append(model[w])
        except KeyError:
            # Out-of-vocabulary word: it contributes nothing to the average.
            continue
    progess_vecs.progressStep()
    if not vecs:
        ERRR.append(tokens)
        # Take the dimension from the loaded model rather than a literal so
        # the fallback stays valid if a different embedding file is loaded.
        return numpy.zeros((model.vector_size, ))

    return numpy.average(vecs, axis=0)

def aaaa(a):
    """Convert ``a`` to ``int``; applied to the ``score`` column."""
    as_int = int(a)
    return as_int

# Embed every review by averaging the word vectors of its lemmas.
df['vec'] = df.lemmas.apply(calc_vec)
# Cast the star ratings to int; they are used as class labels further down.
df['score'] = df.score.apply(aaaa)



In [13]:
# Rows flagged test == 0 form the training set, test == 1 the held-out set.
train_rows = df[df.test == 0]
test_rows = df[df.test == 1]

# Features are the averaged embeddings, labels the integer star ratings.
train_x, train_y = train_rows.vec.values.tolist(), train_rows.score.values.tolist()
test_x, test_y = test_rows.vec.values.tolist(), test_rows.score.values.tolist()

In [14]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Baseline: Gaussian naive Bayes on the averaged review embeddings.
clf = GaussianNB()
clf.fit(train_x, train_y)

nb_test_acc = accuracy_score(test_y, clf.predict(test_x))
nb_train_acc = accuracy_score(train_y, clf.predict(train_x))
print('Test accuracy: {:0.3f}'.format(nb_test_acc))
print('Train accuracy: {:0.3f}'.format(nb_train_acc))

Test accuracy: 0.278
Train accuracy: 0.277


In [15]:
from sklearn.svm import SVC

# Support vector classifier with default settings; verbose=True makes the
# underlying solver log to the cell output while fitting.
svc = SVC(verbose=True)
svc.fit(train_x, train_y)

svc_test_acc = accuracy_score(test_y, svc.predict(test_x))
svc_train_acc = accuracy_score(train_y, svc.predict(train_x))
print('Test accuracy: {:0.3f}'.format(svc_test_acc))
print('Train accuracy: {:0.3f}'.format(svc_train_acc))



[LibSVM]Test accuracy: 0.484
Train accuracy: 0.497


In [16]:
from sklearn.neural_network import MLPClassifier

# Two hidden layers of 100 units, mini-batches of 200, up to 1000 iterations.
mplc = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    verbose=True,
    batch_size=200,
    max_iter=1000,
)
mplc.fit(train_x, train_y)

mplc_test_acc = accuracy_score(test_y, mplc.predict(test_x))
mplc_train_acc = accuracy_score(train_y, mplc.predict(train_x))
print('Test accuracy: {:0.3f}'.format(mplc_test_acc))
print('Train accuracy: {:0.3f}'.format(mplc_train_acc))

Iteration 1, loss = 1.26545734
Iteration 2, loss = 1.17592148
Iteration 3, loss = 1.15174574
Iteration 4, loss = 1.13794054
Iteration 5, loss = 1.12882821
Iteration 6, loss = 1.12293978
Iteration 7, loss = 1.11727585
Iteration 8, loss = 1.11280913
Iteration 9, loss = 1.10794831
Iteration 10, loss = 1.10383481
Iteration 11, loss = 1.10037255
Iteration 12, loss = 1.09770207
Iteration 13, loss = 1.09389154
Iteration 14, loss = 1.09166823
Iteration 15, loss = 1.08938453
Iteration 16, loss = 1.08681106
Iteration 17, loss = 1.08422374
Iteration 18, loss = 1.08309410
Iteration 19, loss = 1.08069385
Iteration 20, loss = 1.07750598
Iteration 21, loss = 1.07664381
Iteration 22, loss = 1.07508988
Iteration 23, loss = 1.07267008
Iteration 24, loss = 1.07194206
Iteration 25, loss = 1.06950676
Iteration 26, loss = 1.06767112
Iteration 27, loss = 1.06631770
Iteration 28, loss = 1.06375808
Iteration 29, loss = 1.06351070
Iteration 30, loss = 1.06156228
Iteration 31, loss = 1.05978161
Iteration 32, los

Iteration 253, loss = 0.91127674
Iteration 254, loss = 0.91122801
Iteration 255, loss = 0.91152617
Iteration 256, loss = 0.91028030
Iteration 257, loss = 0.91120398
Iteration 258, loss = 0.91007149
Iteration 259, loss = 0.91150564
Iteration 260, loss = 0.90973251
Iteration 261, loss = 0.90851796
Iteration 262, loss = 0.90906742
Iteration 263, loss = 0.90840042
Iteration 264, loss = 0.90939029
Iteration 265, loss = 0.90667559
Iteration 266, loss = 0.90629887
Iteration 267, loss = 0.90742953
Iteration 268, loss = 0.90727660
Iteration 269, loss = 0.90814717
Iteration 270, loss = 0.90534963
Iteration 271, loss = 0.90582160
Iteration 272, loss = 0.90535102
Iteration 273, loss = 0.90544396
Iteration 274, loss = 0.90610264
Iteration 275, loss = 0.90509581
Iteration 276, loss = 0.90402659
Iteration 277, loss = 0.90354310
Iteration 278, loss = 0.90308922
Iteration 279, loss = 0.90270994
Iteration 280, loss = 0.90355227
Iteration 281, loss = 0.90184839
Iteration 282, loss = 0.90078228
Iteration 

Iteration 502, loss = 0.84667436
Iteration 503, loss = 0.84654269
Iteration 504, loss = 0.84595894
Iteration 505, loss = 0.84779720
Iteration 506, loss = 0.84606726
Iteration 507, loss = 0.84660243
Iteration 508, loss = 0.84409156
Iteration 509, loss = 0.84534113
Iteration 510, loss = 0.84431950
Iteration 511, loss = 0.84561154
Iteration 512, loss = 0.84403719
Iteration 513, loss = 0.84339211
Iteration 514, loss = 0.84375362
Iteration 515, loss = 0.84398956
Iteration 516, loss = 0.84458263
Iteration 517, loss = 0.84267814
Iteration 518, loss = 0.84307023
Iteration 519, loss = 0.84267050
Iteration 520, loss = 0.84391653
Iteration 521, loss = 0.84294461
Iteration 522, loss = 0.84314480
Iteration 523, loss = 0.84202922
Iteration 524, loss = 0.84079957
Iteration 525, loss = 0.84090995
Iteration 526, loss = 0.84250004
Iteration 527, loss = 0.84263492
Iteration 528, loss = 0.84195909
Iteration 529, loss = 0.84015231
Iteration 530, loss = 0.84069395
Iteration 531, loss = 0.84182923
Iteration 



Test accuracy: 0.499
Train accuracy: 0.679


In [18]:
from sklearn.neural_network import MLPClassifier

# Same architecture as the previous MLP, but with larger mini-batches (500)
# and a lower iteration cap (500).
mlp2 = MLPClassifier(
    hidden_layer_sizes=(100, 100),
    verbose=True,
    batch_size=500,
    max_iter=500,
)
mlp2.fit(train_x, train_y)

mlp2_test_acc = accuracy_score(test_y, mlp2.predict(test_x))
mlp2_train_acc = accuracy_score(train_y, mlp2.predict(train_x))
print('Test accuracy: {:0.3f}'.format(mlp2_test_acc))
print('Train accuracy: {:0.3f}'.format(mlp2_train_acc))

Iteration 1, loss = 1.32070953
Iteration 2, loss = 1.21480830
Iteration 3, loss = 1.18536413
Iteration 4, loss = 1.16460585
Iteration 5, loss = 1.15005299
Iteration 6, loss = 1.14129363
Iteration 7, loss = 1.13264478
Iteration 8, loss = 1.12643961
Iteration 9, loss = 1.12128550
Iteration 10, loss = 1.11553639
Iteration 11, loss = 1.11242584
Iteration 12, loss = 1.10919701
Iteration 13, loss = 1.10591694
Iteration 14, loss = 1.10265346
Iteration 15, loss = 1.09996974
Iteration 16, loss = 1.09856417
Iteration 17, loss = 1.09674743
Iteration 18, loss = 1.09553844
Iteration 19, loss = 1.09276293
Iteration 20, loss = 1.09110938
Iteration 21, loss = 1.08887701
Iteration 22, loss = 1.08664748
Iteration 23, loss = 1.08461879
Iteration 24, loss = 1.08394354
Iteration 25, loss = 1.08242331
Iteration 26, loss = 1.08107288
Iteration 27, loss = 1.07991951
Iteration 28, loss = 1.07887741
Iteration 29, loss = 1.07865425
Iteration 30, loss = 1.07558140
Iteration 31, loss = 1.07521728
Iteration 32, los

Iteration 253, loss = 0.96067132
Iteration 254, loss = 0.96175029
Iteration 255, loss = 0.96194740
Iteration 256, loss = 0.96215379
Iteration 257, loss = 0.96292780
Iteration 258, loss = 0.96016641
Iteration 259, loss = 0.95998749
Iteration 260, loss = 0.95909264
Iteration 261, loss = 0.96005104
Iteration 262, loss = 0.95903827
Iteration 263, loss = 0.95839669
Iteration 264, loss = 0.95904593
Iteration 265, loss = 0.95877199
Iteration 266, loss = 0.95841228
Iteration 267, loss = 0.95797153
Iteration 268, loss = 0.95690245
Iteration 269, loss = 0.95737576
Iteration 270, loss = 0.95730537
Iteration 271, loss = 0.95712453
Iteration 272, loss = 0.95693582
Iteration 273, loss = 0.95724775
Iteration 274, loss = 0.95657141
Iteration 275, loss = 0.95514375
Iteration 276, loss = 0.95477479
Iteration 277, loss = 0.95569750
Iteration 278, loss = 0.95492893
Iteration 279, loss = 0.95441507
Iteration 280, loss = 0.95390335
Iteration 281, loss = 0.95489409
Iteration 282, loss = 0.95638259
Iteration 



Test accuracy: 0.513
Train accuracy: 0.636


In [38]:
# Ad-hoc check: embed a short negative phrase ("pessimo produto", roughly
# "terrible product") with the same LP steps used for the training data,
# minus the tokenize/join round trip.
txt = 'pessimo produto'
a = LP.lowercase(txt)
a = LP.remove_punctuation(a)
a = LP.lemmatize_sentence(a)
# NOTE: despite its name, `tokens` holds the averaged embedding returned by
# calc_vec, not a list of tokens. The call also ticks progess_vecs once.
tokens = calc_vec(a)
tokens



array([-0.024314  , -0.168973  ,  0.142887  ,  0.259714  ,  0.020199  ,
        0.0711055 , -0.0186905 , -0.189174  ,  0.23010701,  0.045157  ,
       -0.0785355 ,  0.152928  , -0.308636  ,  0.0351135 ,  0.073931  ,
       -0.1494035 , -0.1500845 , -0.1053205 , -0.1948435 ,  0.11555099,
       -0.272222  ,  0.1495595 , -0.0141165 ,  0.2729825 ,  0.079673  ,
       -0.047302  , -0.1266935 , -0.0120105 ,  0.0519605 ,  0.08785599,
        0.13849701, -0.0003675 ,  0.137727  ,  0.083528  ,  0.2879265 ,
        0.095863  ,  0.1848495 , -0.061666  , -0.0999755 ,  0.2678085 ,
        0.110423  , -0.028264  , -0.054922  , -0.1629705 ,  0.1017225 ,
        0.036644  ,  0.135204  ,  0.0420545 , -0.1898075 ,  0.15031   ],
      dtype=float32)

In [39]:
# Predict the star rating for the single embedded phrase; predict() expects a
# 2-D batch, hence the one-element list.
# NOTE(review): the recorded output is class 5 for a negative phrase —
# possibly the unaccented "pessimo" is missing from the embedding vocabulary;
# confirm before trusting this check.
mlp2.predict([tokens])

array([5])

In [40]:
import wget

ModuleNotFoundError: No module named 'wget'