# HW04 - NLP
## Punto I

Choose three authors whose works are available in the Gutenberg project. For each of these authors, it is imperative to carefully select and download a minimum of three books, totaling at least nine books in all. Utilize these selected literary works to train word embeddings using the GENSIM library and the word2vec model.
- Prepare the training dataset using appropriate text preprocessing steps.
- Try different embeddings dimensionalities (at least 3) and save them to disk using appropriate GENSIM methods:
    - Books_<size_1>_<group_code>
    - Books_<size_2>_<group_code>
    - Books_<size_3>_<group_code>

In [None]:
#!pip install gensim



In [21]:
import multiprocessing
import re
import os

from gensim.models.word2vec import Word2Vec
from io import TextIOWrapper

# Number of CPUs reported by the OS; the Word2Vec training cell uses it to size its worker pool.
cores = multiprocessing.cpu_count()

In [22]:
def serialize_text(f: TextIOWrapper):
    """Extract the body paragraphs of a Project Gutenberg plain-text book.

    Lines are read from ``f`` until a line starting with
    ``*** END OF THE PROJECT GUTENBERG EBOOK`` is found (or the input ends).
    Only lines that come after a line starting with
    ``*** START OF THE PROJECT GUTENBERG EBOOK`` are kept. Consecutive
    non-blank lines are joined with single spaces into one paragraph; blank
    lines separate paragraphs.

    Args:
        f: Iterable of text lines, e.g. an open text file.

    Returns:
        The paragraphs as a list of strings, in document order, each without
        leading or trailing whitespace. Empty when no START marker is present.
    """
    start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
    end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

    begun = False
    full_text = []
    current = []  # stripped lines of the paragraph being accumulated

    def flush():
        # Close the paragraph in progress, if any.
        if current:
            full_text.append(" ".join(current))
            current.clear()

    for base_line in f:
        line = base_line.strip()

        if not line:
            flush()
            continue

        if line.startswith(start_marker):
            begun = True
            continue

        if line.startswith(end_marker):
            break

        if begun:
            current.append(line)

    # The last paragraph may not be followed by a blank line (END marker right
    # after it, or end of input); without this flush it would be dropped.
    flush()
    return full_text

def tokenize(text: str):
    """Split ``text`` into lowercase word tokens.

    Every character other than ``a``-``z``, whitespace and the apostrophe is
    replaced by a space, so only letters and apostrophes survive inside
    tokens. Tokens of a single character are discarded.
    """
    cleaned = re.sub(r'[^a-z\s\']', ' ', text.lower())
    # A bare split() already collapses whitespace runs and ignores leading or
    # trailing whitespace, so no separate normalisation pass is required.
    return [word for word in cleaned.split() if len(word) > 1]

In [23]:
base_path = "./books"
# Keep only the plain-text books: stray entries such as .DS_Store,
# .ipynb_checkpoints or sub-directories would otherwise crash open() or
# pollute the corpus. Sorting makes the sentence order independent of the
# filesystem's listing order.
books = sorted(name for name in os.listdir(base_path) if name.endswith(".txt"))
sentences = []

for book in books:
    path = os.path.join(base_path, book)
    with open(path, encoding="utf-8") as f:
        paragraphs = serialize_text(f)

    # Approximate sentence segmentation: split each paragraph on ". " and
    # keep every piece that still contains at least one token.
    for paragraph in paragraphs:
        for piece in paragraph.split(". "):
            tokens = tokenize(piece)
            if tokens:
                sentences.append(tokens)

print(f"Total sentences: {len(sentences)}")

Total sentences: 24099


In [24]:
# One trained Word2Vec model per embedding dimensionality, keyed by vector size.
w2v_models = {}
vector_sizes = [1024, 512, 128]

# Leave one core free for the notebook itself, but never ask for zero worker
# threads: on a single-core machine `cores - 1` is 0 and training cannot run.
n_workers = max(1, cores - 1)

for vector_size in vector_sizes:
    w2v_model = Word2Vec(
        min_count=5,  # ignore words seen fewer than 5 times
        window=3,
        vector_size=vector_size,
        workers=n_workers,
    )

    # Vocabulary must be built before training; corpus_count is set by build_vocab.
    w2v_model.build_vocab(sentences)
    w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=30)

    w2v_models[vector_size] = w2v_model

In [25]:
os.makedirs("./vectors", exist_ok=True)

for vector_size in vector_sizes:
    # File stem follows the assignment's Books_<size>_<group_code> scheme.
    path = f"./vectors/Books_{vector_size}_001"
    model = w2v_models[vector_size]

    # Both gensim writers create (or overwrite) their target files, so no
    # empty placeholder files need to be written beforehand.
    model.save(f"{path}.model")  # full model, can be reloaded and trained further
    model.wv.save_word2vec_format(f"{path}.txt")  # word vectors only, word2vec text format