In [None]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import gensim
import sys
import os
from string import punctuation
from gensim.utils import deaccent
import utils
from multiprocessing import Pool,cpu_count
from itertools import chain
from pathlib import Path
import pickle

In [None]:
class BillionWordCorpus(object):
    """Re-iterable stream of token lists built from a directory of text files.

    On construction, every raw file in ``dirname`` is cleaned in a process
    pool and written next to the original as ``<name>_preprocessed``: a
    sequence of pickled token lists, one list per input line. Iterating the
    object reads those files back and yields one token list at a time, so the
    whole corpus never has to be held in memory.

    Args:
        workers: Number of worker processes; ``-1`` means ``cpu_count()``.
        override: When true, regenerate every preprocessed file; when false,
            only process raw files that have no ``_preprocessed`` sibling yet.
        dirname: Directory holding the raw corpus files. ``None`` keeps the
            historical default location.
    """

    # Marker appended to a raw file name to name its preprocessed sibling.
    _SUFFIX = "_preprocessed"
    _DEFAULT_DIR = "../data/clean_corpus/spanish_billion_words"

    def __init__(self, workers=-1, override=True, dirname=None):
        self.override = override
        self.workers = workers if workers != -1 else cpu_count()
        self.dirname = Path(self._DEFAULT_DIR if dirname is None else dirname)
        self.prepare()

    def prepare(self):
        """Preprocess the raw files of ``self.dirname`` in parallel.

        Raises:
            FileNotFoundError: If ``self.dirname`` does not exist.
        """
        log = logging.getLogger(__name__)

        # Match on the file *name* so that a parent directory containing the
        # marker cannot hide every file. Leftover ``*_preprocessed.tmp`` files
        # also contain the marker and are therefore never treated as raw input.
        files = sorted(
            path for path in self.dirname.iterdir()
            if path.is_file() and self._SUFFIX not in path.name
        )
        log.info(f"starting preprocessing with {self.workers} cores")

        if not self.override:
            files = [path for path in files if not os.path.exists(f"{path}{self._SUFFIX}")]

        log.info(f"files to preprocess: {len(files)}")

        # Nothing to do: avoid spawning a pool of idle worker processes.
        if files:
            with Pool(self.workers, maxtasksperchild=2) as pool:
                pool.map(self.aux, files)

        log.info("finished preprocessing")

    @staticmethod
    def aux(fname, encoding="utf-8"):
        """Clean one raw file and write ``<fname>_preprocessed``.

        Each line has the characters in ``utils.non_words`` removed, is
        de-accented, split on whitespace and lower-cased; tokens found in
        ``utils.stopwords`` are dropped. The resulting list is pickled, one
        ``pickle.dump`` per input line (empty lists included).

        Args:
            fname: Path of the raw text file.
            encoding: Text encoding of the raw file. The default assumes the
                corpus is UTF-8 -- TODO confirm against the data source.

        Raises:
            OSError: If the raw file cannot be read or the output written.
        """
        log = logging.getLogger(__name__)
        log.info(f"starting {fname}")
        fname_preprocessed = f"{fname}_preprocessed"
        # Write to a sibling temp file and rename at the end, so an
        # interrupted run never leaves a truncated "_preprocessed" file that
        # a later ``override=False`` run would mistake for a finished one.
        fname_partial = f"{fname_preprocessed}.tmp"

        # Hoisted: these lookups do not change while the file is processed.
        non_words = utils.non_words
        stopwords = utils.stopwords

        try:
            # Open the input first: if it is unreadable no output is created.
            with open(fname, encoding=encoding) as source, open(fname_partial, 'wb') as sink:
                for line in source:
                    line = "".join(ch for ch in line if ch not in non_words)
                    # ``split()`` already discards surrounding whitespace.
                    lowered = (word.lower() for word in deaccent(line).split())
                    tokens = [token for token in lowered if token not in stopwords]
                    pickle.dump(tokens, sink, protocol=pickle.HIGHEST_PROTOCOL)
            os.replace(fname_partial, fname_preprocessed)
        except BaseException:
            # Do not leave half-written temp files behind on failure.
            if os.path.exists(fname_partial):
                os.remove(fname_partial)
            raise

        log.info(f"finished {fname}")

    def __iter__(self):
        """Yield every stored token list, file by file in sorted name order.

        Sorting makes the sentence order reproducible between runs; the order
        returned by ``iterdir()`` is arbitrary.
        """
        files = sorted(
            path for path in self.dirname.iterdir()
            if path.is_file() and path.name.endswith(self._SUFFIX)
        )
        for path in files:
            with open(path, 'rb') as stream:
                while True:
                    # NOTE: unpickling can execute arbitrary code; only point
                    # this class at directories whose contents you trust.
                    try:
                        tokens = pickle.load(stream)
                    except EOFError:
                        # End of this file's pickle stream.
                        break
                    yield tokens

In [None]:
# Build the corpus stream. override=False only preprocesses raw files that do
# not have a "_preprocessed" sibling yet; workers=-1 uses every CPU core.
sentences = BillionWordCorpus(workers=-1, override=False)

In [None]:
# When True the model is retrained even if a saved copy already exists.
override = True
if os.path.exists("model_bwc") and not override:
    # A saved model is available and retraining was not requested: reuse it.
    model_bwc = gensim.models.Word2Vec.load("model_bwc")
else:
    # The optional bigram step, gensim.models.Phrases(sentences), is disabled.
    # NOTE(review): `size` is the pre-4.0 gensim keyword (renamed to
    # `vector_size` in gensim 4) -- confirm against the installed version.
    model_bwc = gensim.models.Word2Vec(
        sentences,
        min_count=40,
        size=300,
        workers=8,
        window=5,
    )
    model_bwc.save("model_bwc")

In [None]:
# Analogy query: rey - hombre + mujer, keeping only the closest word.
model_bwc.wv.most_similar(
    positive=["rey", "mujer"],
    negative=["hombre"],
    topn=1,
)

In [None]:
# Analogy query: roma - italia + francia, keeping only the closest word.
model_bwc.wv.most_similar(
    positive=["roma", "francia"],
    negative=["italia"],
    topn=1,
)

In [None]:
# Analogy query: buenos - bueno + caballo, keeping only the closest word.
model_bwc.wv.most_similar(
    positive=["buenos", "caballo"],
    negative=["bueno"],
    topn=1,
)

In [None]:
# Analogy query: lindo - feo + bueno, keeping only the closest word.
model_bwc.wv.most_similar(
    positive=["lindo", "bueno"],
    negative=["feo"],
    topn=1,
)