In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.18.0-py3-

In [None]:
from gensim.models import Word2Vec
import pandas as pd
import re
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_short, stem_text
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
from datasets import load_dataset

# Fetch the "ParaCrawl" configuration of the large Spanish corpus.
# The first run downloads several GB of data and builds the train split.
dataset_corpus = load_dataset(
    path="large_spanish_corpus",
    name="ParaCrawl",
)

Downloading builder script:   0%|          | 0.00/3.93k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.79G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15510649 [00:00<?, ? examples/s]

In [None]:
# Show the DatasetDict summary: available splits, column names and row counts.
dataset_corpus

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15510649
    })
})

In [None]:
# Work on the first one million rows of the train split to keep
# preprocessing and training time manageable.
subset = dataset_corpus['train'].select(range(1_000_000))


In [None]:
# Peek at rows 0 and 2; the result is a dict mapping each column name to a
# list holding the selected rows' values.
subset[0, 2]

{'text': ['lavado de cerebro a través de los medios de comunicación, y amenaza de fuerza a través de los militares.',
  'En realidad, el Nuevo OM sólo se puede mantener la ilusión de supremacía mágica, siempre y cuando reprima y desvíe el potencial humano, donde mora la verdadera magia: es decir, en la capacidad innata de nuestra especie de magia interactiva con los poderes de animación de la diosa planetaria.']}

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Download the NLTK resources the cleaning step relies on:
# the stop-word lists and the 'punkt' tokenizer models.
for resource in ('stopwords', 'punkt'):
  nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Procesamiento de texto

In [None]:
def clean_text(sentence_batch):
  """Clean and tokenize a batch of Spanish sentences.

  Written to be passed to ``Dataset.map(..., batched=True)``: the input is a
  mapping whose ``'text'`` entry is a list of raw strings, and the returned
  mapping replaces that entry with one list of tokens per input string.

  For each sentence the steps are: lowercase, drop URLs, drop ``@``/``#``
  tags, strip punctuation, strip digits, drop words shorter than two
  characters, tokenize, and filter out Spanish stop words.

  Args:
    sentence_batch: Mapping with a ``'text'`` key holding a list of strings.

  Returns:
    dict: ``{'text': [...]}`` with one token list per input string, in the
    same order as the input.
  """
  # Build the stop-word set once per batch. Doing it per sentence would
  # re-read the NLTK word list for every one of the rows being mapped.
  stop_words = set(stopwords.words('spanish'))

  cleaned_text_list = []

  for text in sentence_batch['text']:
    # Normalize case so that the vocabulary is case-insensitive.
    text = text.lower()

    # Remove URLs.
    text = re.sub(r"http\S+", "", text)

    # Remove social-media mentions (@user) and hashtags (#topic).
    text = re.sub(r"@\S+", "", text)
    text = re.sub(r"#\S+", "", text)

    # Remove punctuation. gensim's strip_punctuation is built from
    # string.punctuation (ASCII only), so Spanish and typographic marks are
    # removed explicitly; otherwise tokens such as '¿qué' survive and slip
    # past the stop-word filter below.
    text = strip_punctuation(text)
    text = re.sub(r"[¿¡«»“”‘’…–—]", " ", text)

    # Remove digits.
    text = strip_numeric(text)

    # Remove words shorter than two characters.
    text = strip_short(text, minsize=2)

    # Tokenize and drop Spanish stop words.
    filtered_text = [
        word for word in word_tokenize(text) if word not in stop_words
    ]
    cleaned_text_list.append(filtered_text)

  # Return the cleaned tokens under the same column name.
  return {'text': cleaned_text_list}

In [None]:
# Run clean_text over the subset in batches; the 'text' column of the result
# holds one token list per original sentence.
sentences_corpus = subset.map(function=clean_text, batched=True)



Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [None]:
# Preview only the first few cleaned rows. Evaluating sentences_corpus['text']
# directly would materialize and print every token list in the column.
sentences_corpus[:5]['text']

[['lavado',
  'cerebro',
  'través',
  'medios',
  'comunicación',
  'amenaza',
  'fuerza',
  'través',
  'militares'],
 ['constante',
  'aluvión',
  'doble',
  'cañón',
  'requiriendo',
  'complicidad',
  'seres',
  'humanos',
  'reprimir',
  'engañar',
  'semejantes',
  'tan',
  'cacareada',
  'magia',
  'rápidamente',
  'desvanecería',
  'disiparía'],
 ['realidad',
  'nuevo',
  'om',
  'sólo',
  'puede',
  'mantener',
  'ilusión',
  'supremacía',
  'mágica',
  'siempre',
  'reprima',
  'desvíe',
  'potencial',
  'humano',
  'mora',
  'verdadera',
  'magia',
  'decir',
  'capacidad',
  'innata',
  'especie',
  'magia',
  'interactiva',
  'poderes',
  'animación',
  'diosa',
  'planetaria'],
 ['menos',
  'nuevo',
  'om',
  'pueda',
  'tiempo',
  'suprimir',
  'capacidad',
  'manera',
  'brutal',
  'natural',
  'espontáneamente',
  'afirmará',
  'misma'],
 ['haga',
  'verdadera',
  'magia',
  'anthropos',
  'niño',
  'luminoso',
  'entrará',
  'inmediato',
  'acción'],
 ['sophia',
  'm

In [None]:
# Train a Word2Vec model (sg=1 selects skip-gram) on the tokenized sentences.
model = Word2Vec(
    sentences=sentences_corpus['text'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=6,
    sg=1,
)

# Persist the trained model so it can be reloaded later without retraining.
model.save("word2vec.model")


In [None]:
# Look up the embedding vector the model learned for the word 'rey'.
model.wv['rey']

array([-4.7621585e-02,  3.9839599e-01,  4.8014563e-01, -2.4782281e-02,
       -3.2940465e-01, -1.3604504e-01, -1.5344818e-01,  4.8451424e-01,
       -4.3801153e-01, -5.5046093e-01, -6.9141823e-01, -2.0335713e-01,
       -4.6941120e-04, -8.5356809e-02, -2.7640197e-01,  1.6081220e-02,
       -3.9780596e-01, -6.6929233e-01,  3.8406175e-02, -1.7596897e-01,
        1.6429284e-01,  9.9311328e-01, -2.2006106e-02, -6.1447084e-01,
        5.5180812e-01,  3.7671229e-01, -1.8067293e-01, -1.7508706e-01,
       -2.4473664e-01, -2.4074218e-01, -2.5335833e-01,  4.3585524e-01,
        4.0107259e-01, -1.8833454e-01, -2.3448601e-01,  5.8438838e-01,
        4.2057693e-02, -2.2725542e-01, -3.8843066e-01, -6.1544228e-01,
       -3.4959993e-01,  1.1861115e-01,  5.9657562e-01, -7.4290454e-01,
        2.5636178e-01,  5.9737229e-01, -1.2362590e+00,  2.5330812e-01,
        4.0826678e-01,  6.3538617e-01, -4.0549290e-01, -6.5254933e-01,
       -7.4612752e-02, -1.6330540e-01,  4.1698360e-01, -3.1338912e-01,
      

In [None]:
# Five vocabulary entries most similar to 'television', with their scores.
# NOTE(review): the query is spelled without an accent, while the cleaned
# corpus keeps accents (e.g. 'comunicación'). The Spanish form 'televisión'
# may have been the intended query; confirm.
model.wv.most_similar('television', topn=5)


[('gsm', 0.8683694005012512),
 ('entertainment', 0.8665755391120911),
 ('supren', 0.8458159565925598),
 ('player', 0.8333703279495239),
 ('broadcaster', 0.833152174949646)]

In [None]:
# Pull the keyed vectors out of the model, then the vocabulary (in index
# order) and the embedding matrix that goes with it.
word_vectors = model.wv
words = word_vectors.index_to_key
vectors = word_vectors.vectors

### Almacenamiento de embeddings

In [None]:
# Write the embedding matrix as tab-separated values, one row per word.
# Only the index is suppressed; a header row of column labels is still written.
df_vectors = pd.DataFrame(data=vectors)
df_vectors.to_csv(path_or_buf='embeddings.tsv', sep='\t', index=False)


In [None]:
# Write the vocabulary as tab-separated values, one word per row, in the same
# order as the embedding rows. A header row is written here as well.
df_words = pd.DataFrame(data=words)
df_words.to_csv(path_or_buf='words.tsv', sep='\t', index=False)