# Analise exploratória

In [3]:
from pathlib import Path
import sys

parent = Path().absolute().parents[0].as_posix()

sys.path.insert(0, parent)

from tqdm import tqdm

import pandas as pd
import numpy as np

import yake
import spacy
import pickle

from nlpiper.core import Compose
from nlpiper.transformers import cleaners
from nlpiper.core import Document


from gensim.corpora.dictionary import Dictionary
from gensim import models 

from sklearn.cluster import DBSCAN, KMeans
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from resources.stopwords import WORDS

In [2]:
spacy.__version__

'3.2.4'

In [None]:
nlp = spacy.load("pt_core_news_lg")

In [None]:
for word in nlp('Esta é uma fila.'):
    print(word.pos_)

In [4]:
data = pd.read_csv('../data/scraping_data.csv.gz', compression='gzip')
data_v2 = pd.read_csv('../data/scraping_data_v2.csv.gz', compression='gzip')
data_political_parties = pd.read_csv('../data/scraping_political_parties.csv.gz', compression='gzip')

In [5]:
data_v2.head()

Unnamed: 0.1,Unnamed: 0,city,title,content,year,tstamp,link,source
0,0,Lisboa,Expresso | PSP de Lisboa detém falso polícia p...,Expresso | PSP de Lisboa detém falso polícia p...,2019,20191014194634,https://arquivo.pt/wayback/20191014194634/http...,expresso.pt
1,1,Lisboa,Lisboa | Lisboa | PÚBLICO,Lisboa | Lisboa | PÚBLICO Ir para o conteúdo I...,2019,20190831124050,https://arquivo.pt/wayback/20190831124050/http...,publico.pt
2,2,Lisboa,Treze pessoas detidas por furtos qualificados ...,Treze pessoas detidas por furtos qualificados ...,2019,20190522144307,https://arquivo.pt/wayback/20190522144307/http...,observador.pt
3,3,Porto,Expresso | Porto,Expresso | Porto Assinar Loja Siga-nos Faceboo...,2019,20190522154115,https://arquivo.pt/wayback/20190522154115/http...,expresso.pt
4,4,Porto,Porto | Porto | PÚBLICO,Porto | Porto | PÚBLICO Ir para o conteúdo Ir ...,2019,20190831124749,https://arquivo.pt/wayback/20190831124749/http...,publico.pt


In [None]:
data.shape

In [None]:
data.isna().any()

In [None]:
data.dropna(inplace=True)
data.shape

In [None]:
data_political_parties.shape

In [None]:
data.city.value_counts()

In [None]:
data.groupby(['year', 'city']).size()

## Data Pre-processing

In [2]:
#docs_on_tokens = np.load('../data/processed/docs_cleaned.npz', allow_pickle=True)['files']
import pickle
with open('../data/processed/docs_cleaned.pickle', 'rb') as handle:
    docs_on_tokens = pickle.load(handle)

In [3]:
len(docs_on_tokens)

30712

In [None]:
top_n = 50
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs_on_tokens[0:1000])
feature_array = vectorizer.get_feature_names()

tf_idf_scores = sorted(list(zip(vectorizer.get_feature_names(), 
                                             X.sum(0).getA1())), 
                                 key=lambda x: x[1], reverse=True)

print('tf_idf scores: \n', tf_idf_scores[:top_n])


print('idf values: \n', sorted(list(zip(feature_array,vectorizer.idf_,)),
       key = lambda x: x[1], reverse=True)[:top_n])

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs_on_tokens[0:top_n])
feature_array = vectorizer.get_feature_names()
print('Frequency: \n', sorted(list(zip(vectorizer.get_feature_names(), 
                                         X.sum(0).getA1())),
                            key=lambda x: x[1], reverse=True)[:top_n])       


In [None]:
tf_idf_scores[0]

In [None]:
len(X.indices)

In [None]:
len(X.data)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ["This is very strange",
          "This is very nice"]
vectorizer = TfidfVectorizer(norm='l2')
corpus = vectorizer.fit_transform(corpus)

print(corpus)

In [None]:
vectorizer.vocabulary_

In [None]:
print(dict(zip(vectorizer.vocabulary_, vectorizer.idf_)))


## Topic modeling

In [None]:
# Create a corpus from a list of texts
dictionary = Dictionary(docs_on_tokens)
corpus = [dictionary.doc2bow(text) for text in docs_on_tokens]

In [None]:
dictionary.token2id.keys()

In [None]:
corpus

In [None]:
model = models.LdaModel(corpus, num_topics=50)

In [None]:
print(model.print_topics())


In [None]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

pyLDAvis.enable_notebook()
gensimvis.prepare(model, corpus, dictionary)


## Keyword Detection

In [None]:
data.link.iloc[-1]

In [None]:
language = "pt"
max_ngram_size = 3
deduplication_thresold = 0.9
deduplication_algo = 'seqm'
windowSize = 1
numOfKeywords = 5

custom_kw_extractor = yake.KeywordExtractor(
    lan=language, 
    n=max_ngram_size, 
    dedupLim=deduplication_thresold, 
    dedupFunc=deduplication_algo, 
    windowsSize=windowSize, 
    top=numOfKeywords, 
    features=None
)
keywords = custom_kw_extractor.extract_keywords(docs[0])

for kw in keywords:
    print(kw)
    

In [None]:
keywords_on_docs = []
for idx in range(len(data)):
    keywords_on_docs.append(custom_kw_extractor.extract_keywords(simple_pipeline(Document(data.content.iloc[idx].lower())).cleaned))

In [None]:
custom_kw_extractor.extract_keywords(simple_pipeline(Document(data.content.iloc[0].lower())).cleaned)

In [11]:
with open('../data/processed/docs_keywords.pickle', 'rb') as handle:
        docs_keywords = pickle.load(handle)

In [25]:
t = []
for keywords in docs_keywords[-50]:
        t.append((keywords[0], model.predict(keywords[0])[0][0]))
t

[('arbitros recusaram dirigir', '__label__Negative'),
 ('recusaram dirigir jogo', '__label__Negative'),
 ('dirigir jogo beiramar', '__label__Negative'),
 ('recusaram dirigir', '__label__Negative'),
 ('arbitros recusaram', '__label__Negative'),
 ('jogo beiramar suspensos.', '__label__Positive'),
 ('beiramar sporting suspensos', '__label__Positive'),
 ('dirigir jogo', '__label__Positive'),
 ('jogo beiramar sporting', '__label__Positive'),
 ('recusaram dirigir partida', '__label__Negative')]

In [27]:
from collections import Counter
Counter([v[1] for v in t])

Counter({'__label__Negative': 6, '__label__Positive': 4})

In [15]:
model.predict(docs_keywords[50][-2][0])[0][0]

'__label__Positive'

## Embeddings

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

In [None]:
model = Word2Vec(sentences=docs_on_tokens, vector_size=100, window=5, min_count=1, workers=4, sg=0, epochs=30)

In [None]:
model.wv.most_similar('rei', topn=10)

In [None]:
model.wv.similarity('homen', 'rei')

## Clustering

In [10]:
from sklearn.cluster import DBSCAN, KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt


In [5]:
with open('../data/processed/words_embedded.pickle', 'rb') as handle:
    words_emb = pickle.load(handle)

In [6]:
len(words_emb)

92921

In [7]:
words_emb[0]

(array([ 0.13466482, -0.33339703,  0.03049505, -0.14442785,  0.4825119 ,
         0.09325964, -0.30376583, -0.16224334, -0.3036018 ,  0.3862049 ,
        -0.02629075,  0.11636933, -0.09413078,  0.15269203, -0.57769305,
         0.13305959,  0.14668672, -0.1670519 , -0.14813097,  0.526141  ,
        -0.02352826,  0.01080366, -0.17984863, -0.12925963, -0.1232844 ,
         0.11996371, -0.15587   , -0.51917297, -0.22072664, -0.3079447 ,
         0.08658405,  0.11462795, -0.20802052, -0.2547001 ,  0.11560781,
        -0.3833881 ,  0.29114178,  0.1305361 , -0.01813705,  0.03185189,
        -0.102152  ,  0.4646123 ,  0.0557541 ,  0.20979929,  0.03325607,
        -0.2373364 ,  0.0556756 ,  0.11174974, -0.55730236, -0.04179006,
        -0.00191056, -0.09842405,  0.48407012, -0.06027829, -0.07975583,
        -0.11629441,  0.3252713 , -0.26345727,  0.12171944,  0.3118627 ,
        -0.27259988, -0.29362404,  0.1995114 ,  0.29705742,  0.07984243,
        -0.25047162,  0.65206164, -0.27045733, -0.3

In [8]:
len(words_emb[0])

2

In [None]:
X = [val[0] for val in words_emb]
ks = range(2, 12)
results = {}
for k in ks: 
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    results[k] = silhouette_score(X, kmeans.labels_)
    print(results)

In [6]:
kmeans_preds.labels_ 

array([3, 0, 0, ..., 0, 0, 0], dtype=int32)

In [None]:
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(X)

In [None]:
kmeans = MiniBatchKMeans(n_clusters=20,
                         random_state=0,
                         batch_size=12)

In [None]:
kmeans.fit(X)

In [None]:
distortions = []
cluster_numbers = range(2, 20)
for k in cluster_numbers:
    k_means = KMeans(n_clusters=k, random_state=42)
    k_means.fit(X)
    distortions.append(sum(np.min(cdist(X, k_means.cluster_centers_, 'euclidean'), axis=1)) / len(X))

In [None]:
X_line = [cluster_numbers[0], cluster_numbers[-1]]
Y_line = [distortions[0], distortions[-1]]

plt.figure(figsize=(10,10))
plt.plot(cluster_numbers, distortions, 'b-')
plt.plot(X_line, Y_line, 'r')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [8]:
res = pd.DataFrame()
res['Word'] = [val[1] for val in words_emb]
res['Emb'] = [val[0] for val in words_emb]
res['Concept'] = kmeans.labels_

In [9]:
res.head()

Unnamed: 0,Word,Emb,Concept
0,aa,"[0.023016687, 0.084352694, 0.15179141, -0.1909...",3
1,aaai,"[0.0070010833, 0.015572468, 0.030469012, -0.03...",0
2,aaba,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
3,aabreuexpresso,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0
4,aachoo,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0


In [10]:
res.Concept.value_counts()

0     65815
6      7400
11     6261
8      4583
4      2698
3      2532
10     2209
7      1910
9      1818
2      1348
1       929
5       436
Name: Concept, dtype: int64

In [None]:
res[res.Concept == 8].head(50)

In [None]:
import gensim
model = gensim.models.Word2Vec.load('../models/trained/word2vec.model')

In [None]:
print(model.wv.most_similar('furto', topn=10))

In [None]:
for concept in np.unique(kmeans_preds.labels_):
    print('Concept:', concept)
    print(model.wv.most_similar(positive=[np.mean(res[res.Concept==concept]['Emb'])], topn=10))

In [19]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('../models/trained/fasttext-sentiment.vec')

In [20]:
print(model.most_similar('furto', topn=10))

[('esperansosa', 0.9991340041160583), ('fantasticamente', 0.9990586042404175), ('elisabeth', 0.9987783432006836), ('mtooo', 0.9987491965293884), ('oskar', 0.998711109161377), ('multi', 0.9986676573753357), ('9,0.', 0.9986391067504883), ("he's", 0.9986375570297241), ('ame', 0.9986228346824646), ('*-*,', 0.9986050724983215)]


In [None]:
for concept in np.unique(kmeans_preds.labels_):
    print('Concept:', concept)
    print(model.most_similar(positive=[np.mean(res[res.Concept==concept]['Emb'])], topn=10))

In [10]:
import fasttext
model = fasttext.load_model('../models/trained/fasttext-sentiment.bin')



In [16]:
print(model.get_nearest_neighbors('desporto'))


[(0.9591214656829834, 'ouvidas.'), (0.9582967162132263, 'inspirador!'), (0.9571285843849182, 'insano,'), (0.956961989402771, 'albergue...tanto'), (0.9567334651947021, 'filmaco!!!'), (0.956638753414154, 'recomendacao'), (0.9565740823745728, 'velho.'), (0.9564656019210815, '1**'), (0.9563958644866943, 'delicia,'), (0.9563903212547302, '.agora')]


In [18]:
# get all words from model
words = model.get_words()

with open('../models/trained/fasttext-sentiment.vec','w') as file_out:
    
    # the first line must contain number of total words and vector dimension
    file_out.write(str(len(words)) + " " + str(model.get_dimension()) + "\n")

    # line by line, you append vectors to VEC file
    for w in words:
        v = model.get_word_vector(w)
        vstr = ""
        for vi in v:
            vstr += " " + str(vi)
        try:
            file_out.write(w + vstr+'\n')
        except:
            pass

In [None]:
for concept in np.unique(kmeans_preds.labels_):
    print('Concept:', concept)
    print(model.most_similar(positive=[np.mean(res[res.Concept==concept]['Emb'])], topn=10))


## Sentiment Analysis

In [None]:
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load("pt_core_news_lg")
nlp.add_pipe('spacytextblob')
text = 'Este restaurante é uma vergonha!'
doc = nlp(text)
doc._.blob.polarity                         # Polarity: -0.125
doc._.blob.subjectivity                        # Subjectivity: 0.9
doc._.blob.sentiment_assessments.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]
doc._.blob.ngrams()                       

In [None]:
doc._.blob.polarity 

In [None]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
from textblob import TextBlob

nlp = spacy.load("pt_core_news_lg")
nlp.add_pipe('spacytextblob')
text = 'mau'
doc = nlp(text)
doc._.blob.polarity                            # Polarity: -0.125
doc._.blob.subjectivity                        # Subjectivity: 0.9
doc._.blob.sentiment_assessments.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]
#doc._.blob.ngrams()      

In [None]:
doc._.blob.sentiment_assessments.assessments

## External data

In [2]:
import ast
from sklearn.model_selection import StratifiedShuffleSplit


In [3]:
df = pd.read_csv('../data/external_corpus/concatenated.csv')

  df = pd.read_csv('../data/external_corpus/concatenated.csv')


In [4]:
df.head()

Unnamed: 0,dataset,original_index,review_text,review_text_processed,review_text_tokenized,polarity,rating,kfold_polarity,kfold_rating
0,b2w,11955,Bem macio e felpudo...recomendo. Preço imbatí...,bem macio e felpudo...recomendo. preco imbati...,"['bem', 'macio', 'felpudo', 'recomendo', 'prec...",1.0,4.0,1,1
1,b2w,35478,Produto excepcional! recomendo!!! inovador e ...,produto excepcional! recomendo!!! inovador e ...,"['produto', 'excepcional', 'recomendo', 'inova...",1.0,5.0,1,1
2,b2w,122760,recebi o produto antes do prazo mas veio com d...,recebi o produto antes do prazo mas veio com d...,"['recebi', 'produto', 'antes', 'do', 'prazo', ...",0.0,1.0,1,1
3,b2w,17114,Bom custo beneficio. Adequado para pessoas que...,bom custo beneficio. adequado para pessoas que...,"['bom', 'custo', 'beneficio', 'adequado', 'par...",1.0,5.0,1,1
4,b2w,19112,Além de higiênico tem o tamanho ideal. Só falt...,alem de higienico tem o tamanho ideal. so falt...,"['alem', 'de', 'higienico', 'tem', 'tamanho', ...",,3.0,-1,1


In [5]:
def sentiment(val):
    if val < 3:
        return 'Negative'
    elif val == 3:
        return 'Neutral'
    elif val > 3:
        return 'Positive'

df['sentiment'] = df['rating'].apply(lambda val : sentiment(val))       

In [6]:
df['dataset'].value_counts()

utlc_movies    1487449
utlc_apps      1039535
b2w             132373
buscape          84991
olist            41744
Name: dataset, dtype: int64

In [7]:
df['sentiment'].value_counts()

Positive    1976535
Negative     409629
Neutral      399928
Name: sentiment, dtype: int64

In [8]:
df[df.sentiment == 'Neutral']['']

Unnamed: 0,dataset,original_index,review_text,review_text_processed,review_text_tokenized,polarity,rating,kfold_polarity,kfold_rating,sentiment
4,b2w,19112,Além de higiênico tem o tamanho ideal. Só falt...,alem de higienico tem o tamanho ideal. so falt...,"['alem', 'de', 'higienico', 'tem', 'tamanho', ...",,3.0,-1,1,Neutral
14,b2w,50688,Ué não entendi! Estava procurando recomendaçõe...,ue nao entendi! estava procurando recomendacoe...,"['ue', 'nao', 'entendi', 'estava', 'procurando...",,3.0,-1,1,Neutral
30,b2w,130351,Não chegou ainda então não tem como ser avalia...,nao chegou ainda entao nao tem como ser avalia...,"['nao', 'chegou', 'ainda', 'entao', 'nao', 'te...",,3.0,-1,1,Neutral
38,b2w,8590,Qualida do produto dentro das espectativas. En...,qualida do produto dentro das espectativas. en...,"['qualida', 'do', 'produto', 'dentro', 'das', ...",,3.0,-1,1,Neutral
52,b2w,41736,"Bom produto, chegou antes do prazo, fácil de m...","bom produto, chegou antes do prazo, facil de m...","['bom', 'produto', 'chegou', 'antes', 'do', 'p...",,3.0,-1,1,Neutral
...,...,...,...,...,...,...,...,...,...,...
2786063,buscape,0_316604,"Não é ruim, mas não me parece tão bom como out...","nao e ruim, mas nao me parece tao bom como out...","['nao', 'ruim', 'mas', 'nao', 'me', 'parece', ...",,3.0,-1,10,Neutral
2786071,buscape,0_195702,"É bonito, barato e parece ser bem feito e func...","e bonito, barato e parece ser bem feito e func...","['bonito', 'barato', 'parece', 'ser', 'bem', '...",,3.0,-1,10,Neutral
2786080,buscape,2_395354,esta é uma máquina de desempenho muito bom\n\n...,esta e uma maquina de desempenho muito bom\n\n...,"['esta', 'uma', 'maquina', 'de', 'desempenho',...",,3.0,-1,10,Neutral
2786088,buscape,minus_1_150466,O esquema antigo de desmontagem e limpeza das ...,o esquema antigo de desmontagem e limpeza das ...,"['esquema', 'antigo', 'de', 'desmontagem', 'li...",,3.0,-1,10,Neutral


In [157]:
df = df[df.dataset.isin(['olist', 'b2w'])]

In [158]:
df.rating.unique()

array([4., 5., 1., 3., 2.])

In [159]:
df.polarity.unique()

array([ 1.,  0., nan])

In [160]:
df['polarity'].value_counts()

1.0    106971
0.0     47166
Name: polarity, dtype: int64

In [161]:
reviews = [(ast.literal_eval(doc), r) for doc, r in zip(df['review_text_tokenized'].to_list(), df['sentiment'].tolist())]
print(len(reviews))
data = [(doc, r) for doc, r in reviews if len(doc) > 0]
print(len(data))


174117
173841


In [123]:
X = np.array([' '.join(val[0]) for val in data])
Y = np.array([val[1] for val in data])

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
for train_index, test_index in sss.split(X, Y ):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

In [124]:
np.unique(y_test, return_counts=True)

(array(['Negative', 'Neutral', 'Positive'], dtype='<U8'),
 array([1139,  365, 2653]))

In [125]:
np.unique(y_train, return_counts=True)

(array(['Negative', 'Neutral', 'Positive'], dtype='<U8'),
 array([10254,  3286, 23872]))

In [126]:
X_train

array(['avaliacao acima expressa minha opiniao',
       'produto venho em otimo estado perfeitamente lacrado chegou bem antes do prazo recomendo',
       'interfone nao funciona que eu faco', ..., 'tudo ok',
       'entregue antes do prazo estipulado', 'excelente'], dtype='<U206')

In [127]:
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model
from scipy.sparse import vstack, hstack
from sklearn.metrics import classification_report

In [128]:
count_vect = TfidfVectorizer()
X_train_ = count_vect.fit_transform(X_train)

In [129]:
#nb=naive_bayes.BernoulliNB()
#nb.fit(X_train,y_train)
#prob=nb.feature_log_prob_ #index 0 is positive

#r=prob[0]-prob[1]

#print('Weighing features')
#X_train=[x.multiply(r).tocsr() for x in X_train_NB]
#X_train=vstack(X_train)

X_test_=count_vect.transform(X_test)
#X_test=[x.multiply(r).tocsr() for x in X_test_pre]
#X_test=vstack(X_test)


In [130]:
print('Training classifier')

svc = linear_model.LogisticRegression()
svc.fit(X_train_,y_train)

print('Testing classifier')
print('Accuracy=', svc.score(X_test_, y_test)*100)
print(classification_report(y_test, svc.predict(X_test_)))

Training classifier
Testing classifier
Accuracy= 84.55617031513111
              precision    recall  f1-score   support

    Negative       0.77      0.88      0.82      1139
     Neutral       0.32      0.06      0.11       365
    Positive       0.89      0.94      0.91      2653

    accuracy                           0.85      4157
   macro avg       0.66      0.63      0.61      4157
weighted avg       0.81      0.85      0.82      4157



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [131]:
svc = linear_model.SGDClassifier(loss='hinge', class_weight='balanced')
svc.fit(X_train_, y_train)

print('Testing classifier')
print('Accuracy=',svc.score(X_test_, y_test)*100)
print(classification_report(y_test, svc.predict(X_test_)))

Testing classifier
Accuracy= 83.32932403175367
              precision    recall  f1-score   support

    Negative       0.76      0.89      0.82      1139
     Neutral       0.31      0.27      0.29       365
    Positive       0.94      0.89      0.91      2653

    accuracy                           0.83      4157
   macro avg       0.67      0.68      0.67      4157
weighted avg       0.83      0.83      0.83      4157



In [132]:
import fasttext

In [133]:
df_train = pd.DataFrame() 
df_train['sentiment'] = y_train
df_train['review'] = X_train

df_train['sentiment'] = df_train['sentiment'].apply(lambda val: f'__label__{val}')

np.savetxt('olist-train.txt', df_train.values, fmt = "%s")

print(df_train.shape)

df_test = pd.DataFrame() 
df_test['sentiment'] = y_test
df_test['review'] = X_test

df_test['sentiment'] = df_test['sentiment'].apply(lambda val: f'__label__{val}')

np.savetxt('olist-test.txt', df_test.values, fmt = "%s")

print(df_test.shape)

(37412, 2)
(4157, 2)


In [146]:
model = fasttext.train_supervised(input="olist-train.txt", lr=0.05, epoch=50, wordNgrams=3)

Read 0M words
Number of words:  14831
Number of labels: 3
Progress: 100.0% words/sec/thread:  688938 lr:  0.000000 avg.loss:  0.188959 ETA:   0h 0m 0s


In [147]:
model.test("olist-train.txt")

(37412, 0.9863145514808083, 0.9863145514808083)

In [148]:
model.test("olist-test.txt")

(4157, 0.8373827279287948, 0.8373827279287948)

In [149]:
df_test['Pred'] = df_test['review'].apply(lambda val: model.predict(val)[0][0])

In [150]:
df_test.head()

Unnamed: 0,sentiment,review,Pred
0,__label__Negative,targaryen para entrega de bijuterias relogios ...,__label__Positive
1,__label__Positive,estou satisfeita com servico rapido bem embala...,__label__Positive
2,__label__Positive,recomendados,__label__Positive
3,__label__Negative,material de pessima qualidade,__label__Negative
4,__label__Negative,era pra ter chego ate dia 06 06 2018 ja fui no...,__label__Negative


In [151]:
print(classification_report(df_test['sentiment'], df_test['Pred']))

                   precision    recall  f1-score   support

__label__Negative       0.77      0.86      0.81      1139
 __label__Neutral       0.29      0.14      0.19       365
__label__Positive       0.90      0.92      0.91      2653

         accuracy                           0.84      4157
        macro avg       0.66      0.64      0.64      4157
     weighted avg       0.81      0.84      0.82      4157

