# Treinamento das Word Embeddings

In [1]:
import os
import pandas as pd
import numpy as np
import spacy
import pt_core_news_sm
import unidecode
# Load the Portuguese spaCy pipeline with the parser, NER, tagger and textcat
# components disabled: the pre-processing below only reads the token-level
# attributes `is_stop`, `is_alpha` and `text`.
nlp = pt_core_news_sm.load(disable=['parser', 'ner', 'tagger', 'textcat'])

In [2]:
# Load the dataset whose location is given by the DATASET_PATH environment
# variable. os.environ[...] is used instead of os.getenv(...) so that a missing
# variable fails right here with KeyError('DATASET_PATH') instead of handing
# None to pd.read_csv and surfacing as an unrelated-looking error.
dataframe = pd.read_csv(os.environ['DATASET_PATH'])

## Pré-processamento dos textos

In [3]:
def to_lower(texts):
    """Lazily lower-case an iterable of texts.

    Args:
        texts: Iterable of strings. Missing values (``None`` or a float NaN,
            which is what pandas yields for empty CSV cells) are accepted and
            mapped to an empty string instead of raising ``AttributeError``.

    Returns:
        A generator yielding one lower-cased string per input item, in input
        order, so the output stays aligned with the input.
    """
    def _is_missing(value):
        # NaN is the only float that compares unequal to itself.
        return value is None or (isinstance(value, float) and value != value)

    return ('' if _is_missing(text) else text.lower() for text in texts)

In [4]:
def preprocessing_text(doc):
    """Build a cleaned, space-separated string from a tokenized document.

    Only tokens that are not stop words and are alphabetic are kept; the text
    of each kept token is passed through ``unidecode.unidecode``.

    Args:
        doc: Iterable of token objects exposing ``is_stop``, ``is_alpha`` and
            ``text``.

    Returns:
        The kept tokens joined by single spaces ('' when no token is kept).
    """
    kept = (
        unidecode.unidecode(tok.text)
        for tok in doc
        if not tok.is_stop and tok.is_alpha
    )
    return ' '.join(kept)

In [5]:
# Lower-case every title, stream the titles through the spaCy pipeline in
# batches of 1000 (n_process=-1), and clean each resulting document.
processed_texts = list(
    map(
        preprocessing_text,
        nlp.pipe(to_lower(dataframe['title']), batch_size=1000, n_process=-1),
    )
)

## Word Embeddings
Nesta etapa são criados os vetores de palavras (word embeddings), treinando um modelo Word2Vec (CBOW) sobre os títulos pré-processados.

In [6]:
import logging
# Configure the root logger at INFO level with a "<timestamp> : <message>"
# format; the progress logs shown under the cells below use this format.
logging.basicConfig(format='%(asctime)s : %(message)s', level= logging.INFO)

In [7]:
from gensim.models import Word2Vec

# Create the (still untrained) CBOW Word2Vec model; the vocabulary is built and
# the model is trained in the cells that follow.
modelo_cbow = Word2Vec(
    sg=0,             # 0 selects the CBOW architecture
    window=2,         # context window: 2 words before and 2 after
    size=300,         # dimensionality of each word vector
    min_count=0,      # minimum word frequency to be kept; 0 keeps every word
    alpha=0.03,       # initial learning rate
    min_alpha=0.007,  # value the learning rate decays to during training
)

In [8]:
# Tokenize each pre-processed title on whitespace.
# str.split() with no argument is used instead of split(" ") because
# ''.split(" ") returns ['']: a title whose tokens were all filtered out would
# otherwise contribute an empty-string "word" to the vocabulary (min_count=0
# keeps it). Such titles now produce an empty list, so there is still exactly
# one entry per title.
list_titles_tokens = [title.split() for title in processed_texts]

In [9]:
# Scan the tokenized titles to build the model vocabulary, logging progress
# every 5000 sentences.
modelo_cbow.build_vocab(list_titles_tokens, progress_per=5000)

2021-03-24 10:44:05,309 : collecting all words and their counts
2021-03-24 10:44:05,317 : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-03-24 10:44:05,328 : PROGRESS: at sentence #5000, processed 18630 words, keeping 2663 word types
2021-03-24 10:44:05,352 : PROGRESS: at sentence #10000, processed 37135 words, keeping 3601 word types
2021-03-24 10:44:05,378 : PROGRESS: at sentence #15000, processed 55522 words, keeping 4234 word types
2021-03-24 10:44:05,418 : PROGRESS: at sentence #20000, processed 74079 words, keeping 4805 word types
2021-03-24 10:44:05,470 : PROGRESS: at sentence #25000, processed 92438 words, keeping 5208 word types
2021-03-24 10:44:05,515 : PROGRESS: at sentence #30000, processed 110999 words, keeping 5592 word types
2021-03-24 10:44:05,539 : PROGRESS: at sentence #35000, processed 129376 words, keeping 5902 word types
2021-03-24 10:44:05,569 : collected 6121 word types from a corpus of 142380 raw words and 38507 sentences
2021-03-24 10:44

In [10]:
# Display the number of sentences counted while building the vocabulary.
modelo_cbow.corpus_count

38507

In [11]:
# Train for 50 epochs over the tokenized titles. total_examples is the sentence
# count collected by build_vocab. The call is the last expression of the cell,
# so its return value is displayed as the cell output.
modelo_cbow.train(
    list_titles_tokens,
    epochs=50,
    total_examples=modelo_cbow.corpus_count,
)

2021-03-24 10:44:07,449 : training model with 3 workers on 6121 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
2021-03-24 10:44:07,767 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:07,769 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:07,772 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:07,776 : EPOCH - 1 : training on 142380 raw words (114200 effective words) took 0.3s, 375339 effective words/s
2021-03-24 10:44:08,056 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:08,075 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:08,084 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:08,087 : EPOCH - 2 : training on 142380 raw words (114102 effective words) took 0.3s, 423019 effective words/s
2021-03-24 10:44:08,413 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:

2021-03-24 10:44:16,386 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:16,398 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:16,402 : EPOCH - 22 : training on 142380 raw words (114248 effective words) took 0.3s, 366462 effective words/s
2021-03-24 10:44:16,721 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:16,739 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:16,762 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:16,772 : EPOCH - 23 : training on 142380 raw words (114192 effective words) took 0.3s, 331858 effective words/s
2021-03-24 10:44:17,025 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:17,030 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:17,039 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:17,042 : EPOCH - 24 : training on 142380 raw words (1

2021-03-24 10:44:22,864 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:22,871 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:22,899 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:22,903 : EPOCH - 44 : training on 142380 raw words (114430 effective words) took 0.4s, 309957 effective words/s
2021-03-24 10:44:23,251 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:23,287 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:23,288 : worker thread finished; awaiting finish of 0 more threads
2021-03-24 10:44:23,301 : EPOCH - 45 : training on 142380 raw words (114293 effective words) took 0.4s, 302132 effective words/s
2021-03-24 10:44:23,690 : worker thread finished; awaiting finish of 2 more threads
2021-03-24 10:44:23,700 : worker thread finished; awaiting finish of 1 more threads
2021-03-24 10:44:23,707 : worker thread finished; awaiting finish of 0

(5714149, 7119000)

In [12]:
# Quick sanity check of the model: list the words most similar to 'santo'.
modelo_cbow.wv.most_similar('santo')

2021-03-24 10:44:25,417 : precomputing L2-norms of word weight vectors


[('espirito', 0.8411768674850464),
 ('antonio', 0.7855128645896912),
 ('padroeiro', 0.6689670085906982),
 ('salvador', 0.6618492603302002),
 ('catequistas', 0.6208908557891846),
 ('divino', 0.5763355493545532),
 ('mineiro', 0.5510334968566895),
 ('medalhao', 0.520987868309021),
 ('resplendor', 0.5057092905044556),
 ('religiosa', 0.5013446807861328)]

### Salva os vetores de palavras para serem utilizados posteriormente

In [13]:
# Export the trained word vectors in the plain-text word2vec format to the
# location given by the WORD_EMBEDDINGS_PATH environment variable.
# os.environ[...] is used instead of os.getenv(...) so that a missing variable
# fails with KeyError('WORD_EMBEDDINGS_PATH') rather than passing None as the
# output file name.
modelo_cbow.wv.save_word2vec_format(os.environ['WORD_EMBEDDINGS_PATH'], binary=False)

2021-03-24 10:44:25,668 : storing 6121x300 projection weights into /usr/src/data/modelo_cbow.txt
