In [None]:
import gensim 
from gensim.models import Word2Vec, Phrases
import logging          
import cython # for performance (multicore)
import preprocessing # from same directory
import os
import json
import ijson

In [7]:
print(gensim.models.word2vec.FAST_VERSION) # if 1, cython's being used

1


In [5]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) # set logging configuration

In [17]:
class SentenceIterator:
    def __init__(self, batch_dirs):
        self.batch_dirs = batch_dirs

    def __iter__(self):

        for directory in self.batch_dirs: # iterate through directories
            
            # get all json files in the directory
            for filename in os.listdir(directory): 
                if filename.endswith(".json"):
                    filepath = os.path.join(directory, filename)

                    # process each file using the ijson parser
                    with open(filepath, 'rb') as f:
                        parser = ijson.items(f, 'item') # assumes json is an array at the top level (which it should be in this case)
                        for sentence in parser: 
                            yield sentence

In [None]:
batch_dirs = ["wrangling/data/processed_sentences"]

In [25]:
sentences = SentenceIterator(batch_dirs)

In [26]:
model = Word2Vec(
    sentences = sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=6,
    sg=1,
    epochs=5
)

2025-03-10 09:18:33,252 : INFO : collecting all words and their counts
2025-03-10 09:18:33,259 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-03-10 09:18:33,312 : INFO : PROGRESS: at sentence #10000, processed 166539 words, keeping 9101 word types
2025-03-10 09:18:33,360 : INFO : PROGRESS: at sentence #20000, processed 338255 words, keeping 12981 word types
2025-03-10 09:18:33,408 : INFO : PROGRESS: at sentence #30000, processed 507335 words, keeping 15789 word types
2025-03-10 09:18:33,455 : INFO : PROGRESS: at sentence #40000, processed 680168 words, keeping 19000 word types
2025-03-10 09:18:33,491 : INFO : PROGRESS: at sentence #50000, processed 797044 words, keeping 20720 word types
2025-03-10 09:18:33,523 : INFO : PROGRESS: at sentence #60000, processed 910379 words, keeping 23130 word types
2025-03-10 09:18:33,562 : INFO : PROGRESS: at sentence #70000, processed 1047072 words, keeping 24888 word types
2025-03-10 09:18:33,607 : INFO : PROGRESS: at 

In [None]:
model.save("word2vec_full_vanilla")

2025-03-10 15:02:05,427 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec_full_vanilla', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-03-10T15:02:05.427201', 'gensim': '4.3.3', 'python': '3.11.2 (v3.11.2:878ead1ac1, Feb  7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-15.3.1-arm64-arm-64bit', 'event': 'saving'}
2025-03-10 15:02:05,435 : INFO : storing np array 'vectors' to word2vec_full_vanilla.wv.vectors.npy
2025-03-10 15:02:05,825 : INFO : saved word2vec_full_vanilla


In [29]:
model.wv.save("word2vec_full_vanilla.kv")

2025-03-10 13:50:29,443 : INFO : KeyedVectors lifecycle event {'fname_or_handle': 'word2vec_full_vanilla.kv', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-03-10T13:50:29.442798', 'gensim': '4.3.3', 'python': '3.11.2 (v3.11.2:878ead1ac1, Feb  7 2023, 10:02:41) [Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-15.3.1-arm64-arm-64bit', 'event': 'saving'}
2025-03-10 13:50:29,448 : INFO : storing np array 'vectors' to word2vec_full_vanilla.kv.vectors.npy
2025-03-10 13:50:29,826 : INFO : saved word2vec_full_vanilla.kv


In [30]:
word_vectors = model.wv

In [45]:
sims = word_vectors.most_similar('sky', topn=10)
print(sims)

[('clouds', 0.806491494178772), ('skies', 0.7932544350624084), ('sun', 0.7737993597984314), ('mist', 0.7320118546485901), ('moon', 0.7308470606803894), ('moonlight', 0.7252644300460815), ('sea', 0.7220116257667542), ('ocean', 0.7080819606781006), ('starlit', 0.6974241137504578), ('glistening', 0.6945892572402954)]


In [87]:
print(word_vectors.most_similar(word_vectors['yoghurt'] - word_vectors['he'] + word_vectors['she'], topn=1))

[('yoghurt', 0.8011300563812256)]
