In [None]:
from gensim.models import Word2Vec
from datasets import load_dataset
import gensim
import logging

# Logging setup: emit INFO-level (and above) records formatted as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

def read_corpus(dataset, limit=None):
    """
    Stream tokenized lines from ``dataset`` until a byte budget is exhausted.

    Every example's ``'text'`` field is split into lines and each line is
    tokenized with ``gensim.utils.simple_preprocess``. The running size is
    measured in UTF-8 bytes (each token's encoded length plus one byte for a
    separator), so ``limit`` really is a byte budget even for text containing
    accented, multi-byte characters.

    Args:
        dataset: Iterable of mappings that expose a ``'text'`` string.
        limit: Maximum cumulative size, in bytes, of the tokenized text to
            emit. ``None`` (the default) disables the limit.

    Yields:
        list[str]: The tokens of one line, in corpus order. Lines that
        tokenize to nothing are yielded as empty lists.

    Note:
        Iteration stops *before* yielding the line that would push the
        running total above ``limit``, so the emitted text never exceeds it.
    """
    current_size = 0
    for example in dataset:
        for line in example['text'].splitlines():
            tokens = gensim.utils.simple_preprocess(line)
            # Count encoded bytes, not code points: len(str) would undercount
            # characters such as "à" or "ç" that take two bytes in UTF-8.
            current_size += sum(len(token.encode('utf-8')) + 1 for token in tokens)
            # Compare against None explicitly so that limit=0 is honoured as
            # a real (empty) budget instead of meaning "unlimited".
            if limit is not None and current_size > limit:
                return
            yield tokens

# Load the 'train' split of the 'catalan_general_crawling' dataset through
# `datasets.load_dataset`.
dataset = load_dataset(
    "projecte-aina/catalan_general_crawling",
    split='train',
)

## Model de 100 MB

In [None]:
# Size budget handed to read_corpus: 100 * 1024 * 1024 (nominally 100 MB of text).
limit_size = 100 * 1024 ** 2
# Materialize the tokenized lines in a list before handing them to Word2Vec.
sentences = [tokens for tokens in read_corpus(dataset, limit=limit_size)]

# Configure and train the Word2Vec model on the collected sentences.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=10,
    workers=4,
    epochs=25,
)

# Save the trained model to disk.
model.save("catalan_word2vec_100MB.model")

## Model de 500 MB

In [None]:
# Size budget handed to read_corpus: 500 * 1024 * 1024 (nominally 500 MB of text).
limit_size = 500 * 1024 ** 2
# Materialize the tokenized lines in a list before handing them to Word2Vec.
sentences = [tokens for tokens in read_corpus(dataset, limit=limit_size)]

# Configure and train the Word2Vec model on the collected sentences.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=10,
    workers=4,
    epochs=25,
)

# Save the trained model to disk.
model.save("catalan_word2vec_500MB.model")

## Model de 1 GB

In [None]:
# Size budget handed to read_corpus: 1000 * 1024 * 1024 (nominally 1000 MB of text).
limit_size = 1000 * 1024 ** 2
# Materialize the tokenized lines in a list before handing them to Word2Vec.
sentences = [tokens for tokens in read_corpus(dataset, limit=limit_size)]

# Configure and train the Word2Vec model on the collected sentences.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=10,
    workers=4,
    epochs=25,
)

# Save the trained model to disk.
model.save("catalan_word2vec_1GB.model")

## Model complet (tot el corpus, sense límit de mida)

In [None]:
# No size limit here: collect every tokenized line that read_corpus yields.
sentences = [tokens for tokens in read_corpus(dataset)]

# Configure and train the Word2Vec model on the collected sentences.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=10,
    workers=4,
    epochs=25,
)

# Save the trained model to disk.
model.save("catalan_word2vec_complete.model")