In [1]:
import os
import xmltodict
import random
import pickle
import gzip
import pandas as pd
import numpy
from pprint import pprint
from utils.tui import Progress
from utils.lexical import Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

# Root of the review corpus; get_count() and get_corpora() walk it as
# product folder -> rank folder -> review XML files.
BASE_DIR = '../data/corpora/trainset'
# Fraction of reviews meant to be held out for testing.
# NOTE(review): no visible cell reads this constant — confirm it is kept in
# sync with the split applied inside create_dataframe().
TEST_PERCENT = 0.2
# Shared Preprocessing instance used by tokenize_review().
PP = Preprocessing()
# NOTE(review): no visible cell reads this flag; presumably it was meant to
# switch between rebuilding the corpus and loading the cached DataFrame — confirm.
LOAD_CORPUS = False

In [2]:
def get_count(base_dir=None):
    """Count the review XML files stored under the corpus directory.

    The corpus is expected to be laid out as product folder / rank folder /
    review file. Every file whose name contains ``.xml`` is counted.

    Args:
        base_dir: Corpus root to scan. When omitted, the notebook-level
            ``BASE_DIR`` is used.

    Returns:
        int: Total number of matching files across all products and ranks.
    """
    if base_dir is None:
        base_dir = BASE_DIR

    count = 0
    for product in os.listdir(base_dir):
        product_path = '{}/{}'.format(base_dir, product)
        if not os.path.isdir(product_path):
            # Stray files next to the product folders (e.g. .DS_Store) would
            # make os.listdir() raise; they hold no reviews, so skip them.
            continue
        for rank in os.listdir(product_path):
            rank_path = '{}/{}'.format(product_path, rank)
            if not os.path.isdir(rank_path):
                continue
            # Same name filter as get_corpora() so the progress total matches.
            count += sum(1 for name in os.listdir(rank_path) if '.xml' in name)
    return count

In [3]:
def get_corpora():
    """Load every review XML under ``BASE_DIR`` into a nested dictionary.

    The directory tree is walked as product folder / rank folder / file, and
    each file whose name contains ``.xml`` is parsed with ``xmltodict``.

    Returns:
        tuple: ``(corpora, errors)``. ``corpora`` maps
        ``product -> rank -> list of parsed 'review' elements``. ``errors``
        lists the paths of files that could not be read or parsed, had no
        ``review``/``opinion`` entry, or had an empty opinion.
    """
    corpora_kabum = {}
    errors = []
    progress = Progress(get_count(), '')
    for product in os.listdir(BASE_DIR):
        product_path = '{}/{}'.format(BASE_DIR, product)
        if not os.path.isdir(product_path):
            # Stray files beside the product folders hold no reviews.
            continue

        corpus = {}
        for rank in os.listdir(product_path):
            rank_path = '{}/{}'.format(product_path, rank)
            if not os.path.isdir(rank_path):
                continue

            reviews = []
            for fl_name in os.listdir(rank_path):
                if '.xml' not in fl_name:
                    continue
                fl_path = '{}/{}'.format(rank_path, fl_name)
                with open(fl_path) as fl_:
                    try:
                        review = xmltodict.parse(fl_.read())['review']
                        opinion = review['opinion']
                    except Exception:
                        # Best effort: a malformed file is recorded and skipped.
                        # Catching Exception (not a bare except) keeps
                        # KeyboardInterrupt/SystemExit able to stop the run.
                        review, opinion = None, None
                if opinion:
                    reviews.append(review)
                else:
                    errors.append(fl_path)
                progress.progressStep()
            corpus[rank] = reviews
        corpora_kabum[product] = corpus
    return (corpora_kabum, errors)

In [4]:
def create_dataframe(corpora_kabum, test_percent=0.2):
    """Flatten the nested corpora dictionary into a pandas DataFrame.

    Args:
        corpora_kabum: Mapping ``category -> stars -> list of reviews``, where
            each review exposes ``review['category']['@value']``,
            ``review['stars']['@value']`` and ``review['opinion']``. Note that
            get_corpora() returns a ``(corpora, errors)`` tuple; pass only the
            first element here.
        test_percent: Probability with which each review is flagged as
            belonging to the test split (``test == 1``).

    Returns:
        pandas.DataFrame: One row per review with the columns ``test``
        (0 or 1), ``category``, ``score`` (float) and ``review`` (opinion text).
    """
    dataset = dict(test=[], category=[], score=[], review=[])
    split_weights = [1 - test_percent, test_percent]
    for ranks in corpora_kabum.values():
        for reviews in ranks.values():
            for review in reviews:
                # One independent draw per review; seed `random` beforehand
                # for a reproducible split.
                dataset['test'].append(random.choices([0, 1], weights=split_weights, k=1)[0])
                dataset['category'].append(review['category']['@value'])
                dataset['score'].append(float(review['stars']['@value']))
                dataset['review'].append(review['opinion'])
    # The original built the frame but never returned it, so callers got None.
    return pd.DataFrame(data=dataset)

In [5]:
# Gzip-compressed pickle file used by save()/load() as the on-disk cache for
# the reviews DataFrame.
DF_PATH = 'df_kabum.pkl.gz'

def save(data, path):
    """Pickle ``data`` and write it gzip-compressed to ``path``.

    Args:
        data: Any picklable object.
        path: Destination file; it is created or overwritten.
    """
    with gzip.open(path, 'wb') as gz_out:
        serialized = pickle.dumps(data)
        gz_out.write(serialized)

def load(path):
    """Read a gzip-compressed pickle from ``path`` and return the object.

    Unpickling can execute arbitrary code, so only open files that this
    notebook (or another trusted source) produced.
    """
    with gzip.open(path, 'rb') as gz_in:
        raw_bytes = gz_in.read()
    return pickle.loads(raw_bytes)
    

In [6]:
def tokenize_review(text):
    """Turn a raw review string into a list of tokens.

    The text is passed through the shared ``PP`` preprocessor in three steps:
    lowercasing, punctuation removal, then word tokenization.
    """
    lowered = PP.lowercase(text)
    without_punctuation = PP.remove_punctuation(lowered)
    return PP.tokenize_words(without_punctuation)

def normalize_review(tokens):
    """Join a sequence of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(tokens)

In [9]:
# One-off recipe that rebuilds the cached DataFrame from the raw corpus; it is
# kept commented out because the pickle at DF_PATH is loaded below instead.
# NOTE(review): get_corpora() returns a (corpora, errors) tuple, so its result
# has to be unpacked before being handed to create_dataframe().
# NOTE(review): normalize_review() joins a list of tokens, yet the recipe
# applies it to the raw `review` column — presumably `tokens` was intended; confirm.
# c = get_corpora()
# df = create_dataframe(c)
# df['tokens'] = df.review.apply(tokenize_review)
# df['normalized'] = df['review'].apply(normalize_review)
# save(df, DF_PATH)

# Load the cached DataFrame and compute the token list of every review.
df = load(DF_PATH)
df['tokens'] = df['review'].apply(tokenize_review)
# Preview only the split flag, the score and the tokens.
a = df[['test','score','tokens']]
a.head()

Unnamed: 0,test,score,tokens
0,0,3.0,"[pra, quem, quer, emagrecer, sem, sair, de, ca..."
1,1,3.0,"[o, aparelho, é, muito, instável, com, pessoas..."
2,0,5.0,"[comprei, sem, muita, certeza, da, resistencia..."
3,0,5.0,"[ótimo, produtoadorei, o, design, exatamente, ..."
4,0,5.0,"[uso, o, de, minha, vizinha, e, acho, muito, b..."


In [22]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from gensim.models import KeyedVectors
# Load pre-trained word vectors (word2vec text format) from the working directory.
# NOTE(review): the file name suggests 50-dimensional CBOW embeddings — confirm.
model = KeyedVectors.load_word2vec_format('cbow_s50.txt')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [34]:
# Tokenize every review into the `tokenize` column read by the training cells.
df['tokenize'] = df['review'].apply(tokenize_review)

In [61]:
# Rebind `a` to the score, token-list and train/test-flag columns; the
# feature-building loop reads `tokenize` and `score` from it.
a = df[['score','tokenize','test']]

In [63]:
# Build one feature vector per review by averaging the embeddings of the
# tokens that exist in the word-vector vocabulary; the label is the score.
# NOTE(review): rows are taken by position and the `test` flag is ignored, so
# held-out reviews can end up in the training data — confirm this is intended.
N_TRAIN_SAMPLES = 100  # number of leading rows of `a` used for training

train_x = []
train_y = []
training_rows = a.iloc[:N_TRAIN_SAMPLES]
for tokens, score in zip(training_rows['tokenize'], training_rows['score']):
    vectors = []
    for word in tokens:
        try:
            vectors.append(model[word])
        except KeyError:
            # Out-of-vocabulary token: it has no embedding, so leave it out.
            continue
    if not vectors:
        # Averaging an empty list yields NaN instead of a vector, which would
        # break clf.fit(); drop reviews without any known token.
        continue
    train_x.append(numpy.average(vectors, axis=0))
    train_y.append(score)

In [60]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier created with its default arguments.
clf = GaussianNB()

In [65]:
# Fit the classifier on the averaged-embedding features (train_x) and their
# review scores (train_y); the fitted estimator is echoed as the cell output.
clf.fit(train_x, train_y)

GaussianNB(priors=None, var_smoothing=1e-09)

In [69]:
# Sanity check on a single training example: predicted score, then actual score.
# The example was part of the data the model was fitted on, so this does not
# measure generalization.
sample_features = train_x[5]
print(clf.predict([sample_features]))
print(train_y[5])

[5.]
5.0


In [None]:
# FIXME: unfinished scratch cell. The row index inside `a.iloc[...]` was never
# filled in, which made this cell a SyntaxError and stopped the notebook from
# running top to bottom. It is commented out until it is completed with a
# concrete row index (ROW_INDEX below) or deleted.
# l
# for w in a.iloc[ROW_INDEX].tokenize:
#     l.append(model[w])