In [1]:
import os

In [2]:
# Root directory of the downloaded wiki dump; the notebook stops here if it is missing.
wikidump_folder = "/media/hugo/Seagate Expansion Drive/wiki_dump"
assert os.path.exists(wikidump_folder), (
    f"Please download the wiki dump and put it in the folder {wikidump_folder}"
)

In [3]:
# Derived from wikidump_folder so the dump location is configured in one place.
# The trailing separator is kept so the resulting string is the same as the
# previously hard-coded value.
processed_text_folder = os.path.join(wikidump_folder, "preprocessed_texts/")
assert os.path.exists(processed_text_folder), "Please run the notebook preprocess_wiki_dump.ipynb"

In [4]:
# Collect the full path of every ".json" entry in the preprocessed-text folder.
processed_text_jsons = []
for entry in os.listdir(processed_text_folder):
    if entry.endswith(".json"):
        processed_text_jsons.append(os.path.join(processed_text_folder, entry))

print(f"Found {len(processed_text_jsons)} json files")

Found 605 json files


In [5]:
# Switch for the optional corpus-statistics pass: the statistics cell further
# down only iterates the corpus and prints word/sentence counts when this is True.
CALCULATE_CORPUS_STATS = False

In [6]:
# Output folder for the trained word vectors, derived from wikidump_folder so
# the dump location is configured in one place (same resulting string as the
# previously hard-coded path, trailing separator included).
model_folder = os.path.join(wikidump_folder, "wordvectors/")
# exist_ok=True creates the folder when missing and is a no-op when present,
# without the check-then-create gap of a separate os.path.exists test.
os.makedirs(model_folder, exist_ok=True)
assert os.path.isdir(model_folder)

In [7]:
# Folder for the per-file model checkpoints, nested under model_folder (same
# resulting string as the previously hard-coded path).
model_folder_temp = os.path.join(model_folder, "temp/")
# exist_ok=True creates the folder when missing and is a no-op when present.
os.makedirs(model_folder_temp, exist_ok=True)
assert os.path.isdir(model_folder_temp)

In [8]:
# Folder for the per-file vocabulary dumps, nested under model_folder (same
# resulting string as the previously hard-coded path).
vocab_folder = os.path.join(model_folder, "vocab/")
# exist_ok=True creates the folder when missing and is a no-op when present.
os.makedirs(vocab_folder, exist_ok=True)
assert os.path.isdir(vocab_folder)

In [9]:
from gensim.test.utils import datapath
from gensim import utils
import json
from tqdm.notebook import tqdm

class MyCorpus:
    """Re-iterable stream over the items stored in a folder of JSON files.

    Every ``.json`` file in the folder is loaded in turn and each element of
    its top-level list is yielded unchanged. The items are presumably sentence
    strings (the statistics cell calls ``.split()`` on them) -- they are NOT
    tokenized here.

    Args:
        jsons_folder: Path of the folder that holds the JSON files.
    """

    def __init__(self, jsons_folder:str):
        self.folder = jsons_folder
        self.print_nr_files()

    def _json_files(self):
        """Return the sorted names of the ``.json`` entries in the folder.

        Filtering keeps stray files and sub-folders from reaching
        ``json.load``; sorting makes the iteration order reproducible, since
        ``os.listdir`` returns entries in arbitrary order.
        """
        return sorted(name for name in os.listdir(self.folder) if name.endswith(".json"))

    def print_nr_files(self):
        """Print how many JSON files were found in the folder."""
        nr_files = len(self._json_files())
        print("Found {} files in {}".format(nr_files, self.folder))

    def __iter__(self):
        """Yield every item of every JSON file, one file after the other."""
        for json_name in tqdm(self._json_files(), desc="Reading files"):
            with open(os.path.join(self.folder, json_name)) as f:
                lines = json.load(f)
            # The file is fully parsed at this point, so it is closed before
            # the consumer starts pulling items.
            yield from lines

# count = 0
# corpus = MyCorpus(processed_text_folder)
# for i, x in enumerate(corpus):
#     count += 1

# # Print how many sentences were found in the corpus nicely formatted
# print("Found {:_} sentences in the corpus".format(count))


In [10]:
# Running totals for the optional statistics pass. They keep these initial
# values when CALCULATE_CORPUS_STATS is False.
nr_words = 0
words_set = set()
nr_sentences = 0


if CALCULATE_CORPUS_STATS:
    # One full pass over the corpus: each item is split on whitespace and
    # counted; the set collects the distinct tokens.
    corpus = MyCorpus(processed_text_folder)
    for line in corpus:
        words = line.split()
        nr_words += len(words)
        words_set.update(words)
        nr_sentences += 1

    # Total number of tokens
    print("Found {:_} words in the corpus".format(nr_words))

    # Number of distinct tokens
    print("Found {:_} unique words in the corpus".format(len(words_set)))

    # Type/token ratio -- skipped for an empty corpus to avoid dividing by zero
    if nr_words:
        print("Type token ratio: {}".format(len(words_set)/nr_words))

    # Number of sentences
    print("Found {:_} sentences in the corpus".format(nr_sentences))

    # Average sentence length -- skipped when no sentence was read
    if nr_sentences:
        print("Average sentence length: {:.2f}".format(nr_words/nr_sentences))

    # Average number of sentences per file -- skipped when no json file was listed
    if processed_text_jsons:
        print("Average nr of sentences per file: {:_}".format(int(nr_sentences/len(processed_text_jsons))))


In [11]:
class MyCorpus2:
    """Re-iterable view over a single JSON file.

    Each element of the file's top-level list is split on whitespace and
    yielded as a list of tokens. The file is re-read on every iteration.
    """

    def __init__(self, json_file:str):
        self.json_file = json_file

    def __iter__(self):
        with open(self.json_file) as handle:
            for sentence in json.load(handle):
                yield sentence.split()

In [12]:
# import gensim.models

# sentences = MyCorpus(processed_text_folder)
# model = gensim.models.Word2Vec(sentences=sentences, vector_size=200, window=5, min_count=5, workers=4)
# model.save(temporary_filepath)

In [21]:
from tqdm.notebook import tqdm
import gensim.models


# All json files in the folder. The listing is sorted because os.listdir
# returns entries in arbitrary order, while the checkpoint/resume logic below
# identifies each file by its position in this list.
json_files = sorted(
    os.path.join(processed_text_folder, f)
    for f in os.listdir(processed_text_folder)
    if f.endswith(".json")
)
assert json_files, "No json files found in {}".format(processed_text_folder)


def _checkpoint_path(index):
    """Return the path of the temporary model saved after processing file `index`."""
    return os.path.join(model_folder_temp, "word2vec_{}.model".format(index))


# Index of the file whose checkpoint the in-memory `model` corresponds to
# (None while no model has been built or loaded in this run).
model = None
model_index = None

for i, json_file in tqdm(enumerate(json_files), total=len(json_files), desc="Processing files"):
    print("Processing file {} of {}".format(i, len(json_files)))
    if os.path.exists(_checkpoint_path(i)):
        print("File {} already processed".format(i))
        continue
    sentences = MyCorpus2(json_file)
    if i == 0:
        # NOTE: `size=` and `model.wv.vocab` below are the gensim 3.x names
        # (gensim 4 renamed them to `vector_size` and `key_to_index`).
        model = gensim.models.Word2Vec(sentences=sentences, size=200, window=5, min_count=5, workers=4)
    else:
        # Continue from the checkpoint of the previous file.
        # NOTE(review): with update=True, min_count is presumably applied to
        # the counts of the new file only -- confirm against the gensim docs.
        model = gensim.models.Word2Vec.load(_checkpoint_path(i - 1))
        model.build_vocab(sentences, update=True)
        model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
    model_index = i

    # Vocabulary as a plain list so it can be serialized to JSON
    vocab = list(model.wv.vocab.keys())
    print(f"\t- Vocabulary size: {len(vocab):_}", flush=True)  

    # Save vocabulary to json file
    with open(os.path.join(vocab_folder, "word2vec_{}.json".format(i)), "w") as f:
        json.dump(vocab, f)

    model.save(_checkpoint_path(i))

# When the last file(s) were skipped because their checkpoints already exist,
# the in-memory model is missing or stale: load the final checkpoint instead.
final_index = len(json_files) - 1
if model_index != final_index:
    model = gensim.models.Word2Vec.load(_checkpoint_path(final_index))

model.save(os.path.join(model_folder, "final_word2vec.model"))

# dict_keys is not JSON serializable, so materialize the vocabulary as a list.
vocab = list(model.wv.vocab.keys())
# Print vocabulary size
print(f"\t- Vocabulary size: {len(vocab):_}")  
with open(os.path.join(model_folder, "word2vec_{}.json".format(final_index)), "w") as f:
    json.dump(vocab, f)



Processing files:   0%|          | 0/605 [00:00<?, ?it/s]

Processing file 0 of 605
	- Vocabulary size: 17_784
Processing file 1 of 605
	- Vocabulary size: 18_228
Processing file 2 of 605
	- Vocabulary size: 18_351
Processing file 3 of 605
	- Vocabulary size: 18_421
Processing file 4 of 605
	- Vocabulary size: 18_481
Processing file 5 of 605
	- Vocabulary size: 18_514
Processing file 6 of 605
	- Vocabulary size: 18_515
Processing file 7 of 605
	- Vocabulary size: 18_552
Processing file 8 of 605
	- Vocabulary size: 18_594
Processing file 9 of 605
	- Vocabulary size: 18_617
Processing file 10 of 605
	- Vocabulary size: 18_629
Processing file 11 of 605
	- Vocabulary size: 18_652
Processing file 12 of 605
	- Vocabulary size: 18_672
Processing file 13 of 605
	- Vocabulary size: 18_681
Processing file 14 of 605
	- Vocabulary size: 18_690
Processing file 15 of 605
	- Vocabulary size: 18_696
Processing file 16 of 605
	- Vocabulary size: 18_702
Processing file 17 of 605
	- Vocabulary size: 18_708
Processing file 18 of 605
	- Vocabulary size: 18_708
Pro

ValueError: Cannot specify '_' with '3'.