# Paso 1

Se tienen los libros de:

* Arthur Conan Doyle
* Lewis Carroll
* William Shakespeare

In [None]:
!pip install gensim

In [None]:
import multiprocessing
import re
import nltk

from gensim.models.word2vec import Word2Vec
from io import TextIOWrapper

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch the NLTK stop-word corpus (without progress output) so that
# stopwords.words(...) can load it further down.
nltk.download('stopwords', quiet=True)
# CPU count reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [None]:
def serialize_text(f: TextIOWrapper):
    """Extract the body paragraphs of a Project Gutenberg plain-text book.

    Every line read from ``f`` is stripped. Lines before the one starting with
    ``*** START OF THE PROJECT GUTENBERG EBOOK`` are ignored, and reading stops
    at the line starting with ``*** END OF THE PROJECT GUTENBERG EBOOK``.
    Between the two markers, consecutive non-blank lines are joined with single
    spaces into one paragraph; a blank line closes the current paragraph.

    Args:
        f: An iterable of text lines, typically an open text file.

    Returns:
        A list of paragraph strings in reading order. The list is empty when
        the start marker never appears.
    """
    begun = False
    full_text = []
    pending = []  # stripped lines of the paragraph currently being built

    for base_line in f:
        line = base_line.strip()

        if not line:
            # A blank line is the paragraph separator.
            if pending:
                full_text.append(" ".join(pending))
                pending = []
            continue

        if line.startswith("*** START OF THE PROJECT GUTENBERG EBOOK"):
            begun = True
            continue

        if line.startswith("*** END OF THE PROJECT GUTENBERG EBOOK"):
            break

        if begun:
            pending.append(line)

    # Flush the paragraph still open when the end marker or end of input is
    # reached without a preceding blank line; otherwise it would be lost.
    if pending:
        full_text.append(" ".join(pending))

    return full_text

stemmer = SnowballStemmer('english')
stops = stopwords.words('english')
# Frozen copy of the stop-word list for constant-time membership tests.
_STOP_SET = frozenset(stops)


def tokenize(text: str):
    """Normalise ``text`` into a space-separated string of English stems.

    Steps, in order:
      1. Replace every non-word character and every run of digits with a space.
      2. Lower-case and split on whitespace.
      3. Drop single ASCII letters and stop words.
      4. Stem the remaining words and drop stems that are themselves stop words.

    Args:
        text: Raw text; it is passed through ``str()`` first.

    Returns:
        The surviving stems joined by single spaces, or ``""`` when nothing
        survives.
    """
    cleaned = re.sub(r'\W', ' ', str(text))
    cleaned = re.sub(r'[0-9]+', ' ', cleaned)

    stems = []
    for word in cleaned.lower().split():
        # Working per token removes single letters wherever they occur
        # (start, end, or adjacent to one another), which whitespace-anchored
        # regex substitutions on the whole string cannot do reliably.
        if re.fullmatch(r'[a-z]', word):
            continue
        # The stop-word list holds unstemmed forms, so it has to be checked
        # before stemming (e.g. "very" stems to "veri" and would slip through).
        if word in _STOP_SET:
            continue
        stem = stemmer.stem(word)
        # Also drop words whose stem collapses onto a stop word.
        if stem not in _STOP_SET:
            stems.append(stem)

    return " ".join(stems)

In [None]:
# Read the book with an explicit encoding so the result does not depend on
# the platform's default; the file is only needed while it is being parsed.
with open("./arthur-return-sherlock.txt", encoding="utf-8") as f:
    lines = serialize_text(f)

# Split each paragraph into rough sentences on ". " and drop empty pieces.
sentences_original = [
    piece.strip()
    for text in lines
    for piece in text.split(". ")
    if piece.strip()
]

# One token list per sentence, index-aligned with sentences_original.
# split() without an argument yields [] for an empty tokenisation, whereas
# split(" ") yields [''] and would feed an empty-string token to the model.
sentences = [tokenize(sentence).split() for sentence in sentences_original]
print(sentences)

In [None]:
# Configure the Word2Vec model: words seen fewer than 5 times are ignored,
# the context window spans 3 tokens, and embeddings have 1024 dimensions.
w2v_model = Word2Vec(
    min_count=5,
    window=3,
    vector_size=1024,
    # Leave one CPU free, but never request fewer than one worker:
    # cores - 1 evaluates to 0 on a single-CPU host.
    workers=max(1, cores - 1),
)

# Build the vocabulary from the tokenised sentences, then train for 30 epochs
# over the same corpus.
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30)

In [None]:
# Persist the trained model to "books.model".
w2v_model.save("books.model")
# Additionally export the word vectors to "books_word2vec.txt" through the
# keyed-vectors object's save_word2vec_format method.
w2v_model.wv.save_word2vec_format("books_word2vec.txt")