In [1]:
import os
import xmltodict
import random
import pickle
import gzip
import pandas as pd
import numpy
from pprint import pprint
from utils.tui import Progress
from utils.lexical import Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

BASE_DIR = '../data/corpora/trainset'
TEST_PERCENT = 0.2
PP = Preprocessing()

In [2]:
def get_count():
    count = 0
    for product in os.listdir(BASE_DIR):
        corpus = {}

        product_path = '{}/{}'.format(BASE_DIR, product)
        for rank in os.listdir(product_path):  
            rank_path = '{}/{}'.format(product_path, rank)
            fls = os.listdir(rank_path)
            fls = [ x for x in fls if '.xml' in x ]
            count += len(fls)
    return count

In [3]:
def get_corpora():
    corpora_kabum = {}
    errors = []
    count = get_count()
    P = Progress(count, '')
    for product in os.listdir(BASE_DIR):
        corpus = {}

        product_path = '{}/{}'.format(BASE_DIR, product)
        for rank in os.listdir(product_path):  
            rank_path = '{}/{}'.format(product_path, rank)
            fls = os.listdir(rank_path)
            fls = [ x for x in fls if '.xml' in x ]

            reviews = []
            for fl_name in fls:
                fl_path = '{}/{}'.format(rank_path, fl_name)
                with open(fl_path) as fl_:
                    try:
                        r = xmltodict.parse(fl_.read())['review']
                        if not r['opinion']:
                            raise Exception(fl_path)
                        reviews.append(r)
                    except:
                        errors.append(fl_path)
                P.progressStep()
            corpus[rank] = reviews
        corpora_kabum[product] = corpus
    return (corpora_kabum, errors)

In [4]:
def create_dataframe(corpora_kabum):
    dataset = dict(test=[], category=[], score=[], review=[])
    for cat in corpora_kabum.keys():
        for stars in corpora_kabum[cat].keys():
            for review in corpora_kabum[cat][stars]:
                dataset['test'].append(random.choices([0,1], weights=[0.8, 0.2],k=1)[0])
                dataset['category'].append(review['category']['@value'])
                dataset['score'].append(float(review['stars']['@value']))
                dataset['review'].append(review['opinion'])
    return pd.DataFrame(data=dataset)

In [5]:
DF_PATH = 'df_kabum.pkl.gz'

def save(data, path):
    with gzip.open(path, 'wb') as f:
        f.write(pickle.dumps(data))

def load(path):
    with gzip.open(path, 'rb') as f:
        return pickle.loads(f.read())
    

In [6]:
def tokenize_review(text):
    text = PP.lowercase(text)
    text = PP.remove_punctuation(text)
    tokens = PP.tokenize_words(text)
    return tokens

def normalize_review(tokens):
    return ' '.join(tokens)

In [7]:
LOAD_PICKLE = True
if LOAD_PICKLE:
    df = load(DF_PATH)
    
else:
    c_ = get_corpora()
    df = create_dataframe(c_[0])
    df['tokens'] = df.review.apply(tokenize_review)
    df['normalized'] = df['tokens'].apply(normalize_review)
    save(df, DF_PATH)

df.head()

Unnamed: 0,test,category,score,review,tokens,normalized
0,0,Joystick / Controle,1.0,Comprei e já vou devolver pois nas especificaç...,"[comprei, e, já, vou, devolver, pois, nas, esp...",comprei e já vou devolver pois nas especificaç...
1,1,Joystick / Controle,1.0,Esta guitarra NÃO é compatível com Nintendo Wi...,"[esta, guitarra, não, é, compatível, com, nint...",esta guitarra não é compatível com nintendo wi...
2,1,Joystick / Controle,1.0,Não funciona corretamente no Wii. No Rockband ...,"[não, funciona, corretamente, no, wii, no, roc...",não funciona corretamente no wii no rockband o...
3,0,Joystick / Controle,1.0,"Não sei em outros consoles, mas no wii esse pr...","[não, sei, em, outros, consoles, mas, no, wii,...",não sei em outros consoles mas no wii esse pro...
4,1,Joystick / Controle,1.0,Produto nada funcional e ainda com preço exage...,"[produto, nada, funcional, e, ainda, com, preç...",produto nada funcional e ainda com preço exage...


In [8]:
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('cbow_s50.txt')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [9]:
ERRR = []
def calc_vec(tokens):
    vecs = []
    for w in tokens:
        try:
            vecs.append(model[w])
        except:
            pass
    if not vecs:
        ERRR.append(tokens)
        return numpy.zeros((50, ))
        
    return numpy.average(vecs, axis=0)

def aaaa(a):
    return int(a)

df['vec'] = df.tokens.apply(calc_vec)
df['score'] = df.score.apply(aaaa)

In [10]:
train_x = df[df.test == 0].vec.values.tolist()
train_y = df[df.test == 0].score.values.tolist()

test_x = df[df.test == 1].vec.values.tolist()
test_y = df[df.test == 1].score.values.tolist()

In [16]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

clf = GaussianNB()
clf.fit(train_x, train_y)
print('Test accuracy: {:0.3f}'.format(accuracy_score(test_y, clf.predict(test_x))))
print('Train accuracy: {:0.3f}'.format(accuracy_score(train_y, clf.predict(train_x))))

Test accuracy: 0.279
Train accuracy: 0.286


In [17]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(train_x, train_y)
print('Test accuracy: {:0.3f}'.format(accuracy_score(test_y, svc.predict(test_x))))
print('Train accuracy: {:0.3f}'.format(accuracy_score(train_y, svc.predict(train_x))))



[LibSVM]Test accuracy: 0.476
Train accuracy: 0.488


In [26]:
from sklearn.neural_network import MLPClassifier

mplc = MLPClassifier(
    hidden_layer_sizes=(100, 100),verbose=True, batch_size=200, max_iter=1000
)
mplc.fit(train_x, train_y)
print('Test accuracy: {:0.3f}'.format(accuracy_score(test_y, mplc.predict(test_x))))
print('Train accuracy: {:0.3f}'.format(accuracy_score(train_y, mplc.predict(train_x))))

Iteration 1, loss = 1.26626224
Iteration 2, loss = 1.17226473
Iteration 3, loss = 1.14355934
Iteration 4, loss = 1.12958398
Iteration 5, loss = 1.12078397
Iteration 6, loss = 1.11405616
Iteration 7, loss = 1.10785356
Iteration 8, loss = 1.10405135
Iteration 9, loss = 1.09904393
Iteration 10, loss = 1.09669037
Iteration 11, loss = 1.09240084
Iteration 12, loss = 1.08964475
Iteration 13, loss = 1.08720379
Iteration 14, loss = 1.08453770
Iteration 15, loss = 1.08154340
Iteration 16, loss = 1.07857400
Iteration 17, loss = 1.07697364
Iteration 18, loss = 1.07406957
Iteration 19, loss = 1.07246755
Iteration 20, loss = 1.06930700
Iteration 21, loss = 1.06937924
Iteration 22, loss = 1.06649919
Iteration 23, loss = 1.06527769
Iteration 24, loss = 1.06335473
Iteration 25, loss = 1.06217435
Iteration 26, loss = 1.05899675
Iteration 27, loss = 1.05718949
Iteration 28, loss = 1.05557599
Iteration 29, loss = 1.05395985
Iteration 30, loss = 1.05206031
Iteration 31, loss = 1.05044745
Iteration 32, los

Iteration 253, loss = 0.89633654
Iteration 254, loss = 0.89537429
Iteration 255, loss = 0.89588566
Iteration 256, loss = 0.89550585
Iteration 257, loss = 0.89381433
Iteration 258, loss = 0.89432507
Iteration 259, loss = 0.89423696
Iteration 260, loss = 0.89322251
Iteration 261, loss = 0.89279755
Iteration 262, loss = 0.89306169
Iteration 263, loss = 0.89199518
Iteration 264, loss = 0.89171418
Iteration 265, loss = 0.89132016
Iteration 266, loss = 0.89073788
Iteration 267, loss = 0.89181470
Iteration 268, loss = 0.89080238
Iteration 269, loss = 0.88985632
Iteration 270, loss = 0.88956576
Iteration 271, loss = 0.89079482
Iteration 272, loss = 0.88829096
Iteration 273, loss = 0.88964482
Iteration 274, loss = 0.88824550
Iteration 275, loss = 0.88808457
Iteration 276, loss = 0.88795543
Iteration 277, loss = 0.88684993
Iteration 278, loss = 0.88747736
Iteration 279, loss = 0.88626186
Iteration 280, loss = 0.88634769
Iteration 281, loss = 0.88586713
Iteration 282, loss = 0.88654564
Iteration 

Iteration 502, loss = 0.83219050
Iteration 503, loss = 0.83077586
Iteration 504, loss = 0.83152520
Iteration 505, loss = 0.83068417
Iteration 506, loss = 0.82993995
Iteration 507, loss = 0.82980231
Iteration 508, loss = 0.83127030
Iteration 509, loss = 0.83032909
Iteration 510, loss = 0.82900871
Iteration 511, loss = 0.82773615
Iteration 512, loss = 0.82917262
Iteration 513, loss = 0.82923400
Iteration 514, loss = 0.83025005
Iteration 515, loss = 0.82821095
Iteration 516, loss = 0.83153804
Iteration 517, loss = 0.82967873
Iteration 518, loss = 0.82935980
Iteration 519, loss = 0.82863702
Iteration 520, loss = 0.82830741
Iteration 521, loss = 0.82835416
Iteration 522, loss = 0.82795831
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Test accuracy: 0.505
Train accuracy: 0.676


In [34]:
save(svc, 'm_svc_00.pkl.gz')

In [30]:
dict(mlp)

{}