In [20]:
import unicodedata
import pandas as pd
import gensim
import nltk

In [2]:
# Load the gzipped posts dump; rows the CSV parser cannot handle are skipped.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines='skip'` and removed in 2.0 -- switch when upgrading pandas.
posts = pd.read_csv(
    'posts_count_me_pol.csv.gz',
    compression='gzip',
    error_bad_lines=False,
)
# Apply the row filters one after another (each sees only the rows kept by the
# previous one), then renumber the rows. `reset_index()` keeps the old index
# as an extra 'index' column.
posts = (
    posts
    .pipe(lambda df: df[df['me'] > 2])
    .pipe(lambda df: df[df['polarity'] > 2])
    .pipe(lambda df: df[df.contentCount.apply(lambda x: str(x).isnumeric())])
    .reset_index()
)
posts.shape

(37746, 20)

In [16]:
def remove_accents(input_str):
    """Return `input_str` with diacritical marks stripped.

    The text is first decomposed with Unicode NFKD normalization, so an
    accented letter becomes a base character followed by combining marks;
    every combining mark is then discarded and the remaining characters
    are joined back into a single string.

    Parameters
    ----------
    input_str : str
        Text to process.

    Returns
    -------
    str
        The NFKD-normalized text without combining characters.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = []
    for ch in decomposed:
        # combining() returns 0 for characters that are not combining marks
        if unicodedata.combining(ch) == 0:
            kept.append(ch)
    return "".join(kept)

In [33]:
# Portuguese Punkt sentence splitter, plus a word tokenizer that keeps runs of
# word characters (so punctuation is dropped).
sent_tokenizer = nltk.data.load('tokenizers/punkt/portuguese.pickle')
reg_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

# Split every post into sentences. Operating on the 'content' column directly
# avoids building a Series for each row (which `DataFrame.apply(axis=1)` does),
# and a missing content value is treated as an empty post instead of crashing
# the tokenizer.
sentences_list = posts['content'].fillna('').apply(sent_tokenizer.tokenize)

print("Process contents:", len(sentences_list))

# Flatten the per-post sentence lists into one corpus-wide list of sentences.
sentences = [
    sentence
    for post_sentences in sentences_list
    for sentence in post_sentences
]

print("Process sentences:", len(sentences))

# One token list per sentence; tokens are lower-cased and accent-stripped.
# Sentences that yield no tokens are kept as empty lists, so the lengths of
# `sentences` and `sentences_tokens` always match.
sentences_tokens = [
    [remove_accents(token.lower()) for token in reg_tokenizer.tokenize(sentence)]
    for sentence in sentences
]

print("Process tokens:", len(sentences_tokens))

Process contents: 37746
Process sentences: 847660
Process tokens: 847660


In [34]:
# Train Word2Vec on the tokenized sentences.
model = gensim.models.Word2Vec(
    sentences_tokens,
    size=300,      # embedding dimensionality
    workers=16,    # training threads
    iter=10,       # passes over the corpus
    negative=20,   # negative samples per positive example
)
# Keep only the L2-normalized vectors to save memory. Per the gensim docs the
# model cannot be trained any further after `replace=True`, and everything
# saved below therefore holds the normalized vectors.
model.init_sims(replace=True)
# Persist the full gensim model and, separately, the vectors in the binary
# word2vec format.
model.save('post_word2vec.mdl')
model.wv.save_word2vec_format('post_word2vec.bin', binary=True)

In [35]:
# Create a plain dict mapping each vocabulary token to its embedding vector.
w2v = {
    token: vector
    for token, vector in zip(model.wv.index2word, model.wv.syn0)
}
print("Number of tokens in Word2Vec:", len(w2v))

Number of tokens in Word2Vec: 66112


In [40]:
# Ten nearest neighbours of 'amor' under the multiplicative (3CosMul) similarity.
# Queried through the KeyedVectors object (`model.wv`), like the other vector
# accesses in this notebook: the method of the same name on the Word2Vec model
# is a deprecated forwarder that newer gensim releases no longer provide.
model.wv.most_similar_cosmul('amor', topn=10)

[('sentimento', 0.8009729385375977),
 ('afeto', 0.7898211479187012),
 ('egoismo', 0.7882105112075806),
 ('gozo', 0.7722449898719788),
 ('contentamento', 0.7682509422302246),
 ('zelo', 0.766608715057373),
 ('sofrimento', 0.7666018605232239),
 ('arrependimento', 0.7656380534172058),
 ('perdao', 0.7592653632164001),
 ('anseio', 0.7586473226547241)]