# **Word2Vec modelling using gensim**

The model trained in this notebook can be used to discover relations between words in a dataset, to compute the similarity between them, or to provide vector representations of those words as input for other applications such as text classification or clustering.

The data used in the examples belongs to Grupo CincoM.


In [0]:
# Installs
# Colab-only setup cell: mounts Google Drive and installs the packages and
# Spanish spaCy models that the cells below import.

from google.colab import drive
drive.mount('/content/gdrive') # grant access to google drive filesystem

# NOTE(review): none of the installs below pin a version, so a fresh runtime may
# pull releases that differ from the ones this notebook was written against.
!pip3 install spacy
# Spanish spaCy models: the 'es' shortcut and the medium model that is imported
# later as es_core_news_md.
!python3 -m spacy download es
!python -m spacy download es_core_news_md
# NOTE(review): 'spacy.es.download' looks like a legacy (pre-2.0) spaCy command
# and may simply fail on current versions -- confirm whether it is still needed.
!python -m spacy.es.download all
# newspaper3k provides the `newspaper.Article` class imported below.
!pip install newspaper3k
!pip install ipywidgets

Mounted at /content/gdrive
Collecting es_core_news_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.1.0/es_core_news_sm-2.1.0.tar.gz#egg=es_core_news_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.1.0/es_core_news_sm-2.1.0.tar.gz (11.1MB)
[K     |████████████████████████████████| 11.1MB 490kB/s 
[?25hBuilding wheels for collected packages: es-core-news-sm
  Building wheel for es-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for es-core-news-sm: filename=es_core_news_sm-2.1.0-cp36-none-any.whl size=11111557 sha256=6754ebe16218db9a7616eb328e796336403b7b69c3195bc712e8a634637d24c4
  Stored in directory: /tmp/pip-ephem-wheel-cache-bjdq8sn_/wheels/cc/ee/c4/68922955901918a9aaa82e828d4f7ee1ccfc861285277e79b7
Successfully built es-core-news-sm
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-2.1.0
[38;5;2m✔ Download and installation succ

In [0]:
# imports 
import gzip
import gensim 
import logging
from newspaper import Article 
import spacy
from spacy.lang.es.stop_words import STOP_WORDS
import csv
import shutil
import os
import unicodedata
import pandas as pd

# Load the medium Spanish spaCy pipeline installed by the setup cell.
# NOTE(review): in the visible cells `nlp` is only referenced from commented-out
# code -- confirm it is used further down before paying the load cost.
import es_core_news_md
nlp = es_core_news_md.load()

# Show timestamped INFO-level log records in the cell output (the training
# progress lines printed when the model is trained use this format).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
# Initializations

# Extend spaCy's Spanish stop-word set with extra tokens in a single call
STOP_WORDS.update(('y', 'a', 'o', 'publicidad', 'expansion', 'para', '\n'))

# Drive folder that holds the model files
path = '/content/gdrive/My Drive/datos/verbs'

# Possible values: 'Adn Político', 'Expansión', 'Life and Style', 'Manufactura', 'Obras', 'Quién', 'Todos'
modeloNombre = 'Expansión'

# Model file name: publication name without spaces, lower-cased, with the
# accents dropped (NFKD decomposition, then non-ASCII bytes are ignored)
modeloArchivo = (
    unicodedata.normalize('NFKD', modeloNombre.replace(" ", "").lower())
    .encode('ASCII', 'ignore')
    .decode('unicode_escape')
    + '_model.txt'
)

documents = []
urls = []

In [0]:
def trainVectorModel(publicacion, path='/content/gdrive/My Drive/datos/', epochs=666):
    """Train a Word2Vec model on the notes of one publication and save it.

    Reads ``content.csv`` from ``path``, tokenizes the ``texto`` column of the
    selected rows with ``gensim.utils.simple_preprocess``, trains a
    200-dimensional Word2Vec model and writes it in word2vec text format to
    ``<path>/verbs/<name>_model.txt``, plus a gzip-compressed copy
    (``<name>_model.txt.gz``) next to it.  ``<name>`` is ``publicacion`` with
    spaces removed, lower-cased and stripped of accents.

    Parameters
    ----------
    publicacion : str
        Value of the ``sitio`` column to train on, or ``'Todos'`` to use every
        row.  Values listed in the notebook: 'Adn Político', 'Expansión',
        'Life and Style', 'Manufactura', 'Obras', 'Quién', 'Todos'.
    path : str, optional
        Folder containing ``content.csv``; the model is written to its
        ``verbs`` subfolder (created if missing).
    epochs : int, optional
        Number of training passes over the corpus.

    Raises
    ------
    ValueError
        If no row with a non-null ``texto`` matches ``publicacion``.
    """
    notesFile = u'content.csv'

    # Decode as ASCII: 'unicode_escape' would reinterpret any backslash
    # sequence present in the publication name.
    modeloArchivo = (
        unicodedata.normalize('NFKD', publicacion.replace(" ", "").lower())
        .encode('ASCII', 'ignore')
        .decode('ASCII')
        + '_model.txt'
    )

    # Read the notes and keep only rows that actually have text.
    notes = pd.read_csv(os.path.join(path, notesFile)).dropna(subset=['texto'])
    if publicacion != 'Todos':
        notes = notes.loc[notes['sitio'] == publicacion]

    documents = [gensim.utils.simple_preprocess(texto) for texto in notes['texto']]
    if not documents:
        raise ValueError(
            "No documents with text found for publication %r in %s"
            % (publicacion, os.path.join(path, notesFile)))

    # Scraping webpages (disabled)
    #for url in urls:
    #    article = Article(url)
    #    article.download()
    #    article.parse()
    #    documents.append(gensim.utils.simple_preprocess(article.text))

    # Build the vocabulary and train exactly once.  Passing the corpus to the
    # constructor would already run a full training pass, and a later train()
    # call would then train a second time with a restarted learning-rate
    # schedule.
    # NOTE: `size` is the gensim 3.x keyword (renamed `vector_size` in gensim 4).
    model = gensim.models.Word2Vec(
        size=200,     # Dimensionality of the word vectors.
        window=8,     # Maximum distance between the current and predicted word within a sentence.
        min_count=3,  # Ignores all words with total frequency lower than this.
        workers=10)   # Use these many worker threads to train the model.
    model.build_vocab(documents)
    model.train(documents, total_examples=len(documents), epochs=epochs)

    # Save the vectors as text, making sure the output folder exists.
    outputDir = os.path.join(path, 'verbs')
    os.makedirs(outputDir, exist_ok=True)
    modelPath = os.path.join(outputDir, modeloArchivo)
    model.wv.save_word2vec_format(modelPath)

    # Save a gzip-compressed copy of the model file next to the original.
    with open(modelPath, 'rb') as inp_f, gzip.open(modelPath + '.gz', 'wb') as out_f:
        shutil.copyfileobj(inp_f, out_f)




In [0]:
# Train one publication per run: only the uncommented call below executes.
# To train another model, comment it out and uncomment the desired line.

#trainVectorModel('Adn Político')
#trainVectorModel('Expansión')
#trainVectorModel('Life and Style')
#trainVectorModel('Manufactura')
#trainVectorModel('Obras')
trainVectorModel('Quién')
#trainVectorModel('Todos')



2019-08-23 16:05:53,864 : INFO : collecting all words and their counts
2019-08-23 16:05:53,865 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-08-23 16:05:53,994 : INFO : collected 36323 word types from a corpus of 661352 raw words and 2255 sentences
2019-08-23 16:05:53,995 : INFO : Loading a fresh vocabulary
2019-08-23 16:05:54,051 : INFO : effective_min_count=3 retains 14840 unique words (40% of original 36323, drops 21483)
2019-08-23 16:05:54,052 : INFO : effective_min_count=3 leaves 634307 word corpus (95% of original 661352, drops 27045)
2019-08-23 16:05:54,097 : INFO : deleting the raw counts dictionary of 36323 items
2019-08-23 16:05:54,099 : INFO : sample=0.001 downsamples 31 most-common words
2019-08-23 16:05:54,100 : INFO : downsampling leaves estimated 466493 word corpus (73.5% of prior 634307)
2019-08-23 16:05:54,139 : INFO : estimated required memory for 14840 words and 200 dimensions: 31164000 bytes
2019-08-23 16:05:54,140 : INFO : resettin