# Homework 2 - Kristiyan Dimitrov

In [36]:
import os
from gensim import utils
import gensim, logging
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import json
import gzip
import multiprocessing

In [2]:
# This shows the number of lines
!wc -l enwiki-latest.json.gz

 26603591 enwiki-latest.json.gz


# Preprocessing

In [19]:
def remove_stopwords(tokenized_sentence:str):
    
    return [word for word in tokenized_sentence if word not in stopwords.words('english')]


def preprocess_text(text:str):
    """Converts text to sentences; tokenizes sentences; removes all stopword tokens"""
        
    # Split text into sentences
    sentences = sent_tokenize(text)
    
    # Preprocess each sentence converting it into a list of lower case tokens, which are not too long or too short
    tokenized_sentences = [gensim.utils.simple_preprocess(sent) for sent in sentences]
    
    # Remove any tokens which are stop words while preserving the sentence structure of the article (i.e. list of lists)
    no_stops = [remove_stopwords(tokenized_sentence) for tokenized_sentence in tokenized_sentences]
        
    return no_stops


def process_article(article):
    """Preprocesses all the sections of a Wikipedia article, then
       save to txt file"""
    
    if article['title']+'.txt' in os.listdir('data'):
#         print(f"{article['title']} already processed")
        pass
        
    article_sentences = list()
    
    for text in article['section_texts']:
        article_sentences.extend(preprocess_text(text))
    
    try:
        with open(os.path.join('data', article['title']+'.txt'), 'w') as filehandle:
            for listitem in article_sentences:
                filehandle.write(f'{" ".join(listitem)}\n')
    except Exception as e:
            print(f"FAILED TO SAVE {article['title']}")
            print(e)
            
            
def take_batch(filehandle, batch_size:int):
    
    batch_of_lines = list()
    
    for _ in range(batch_size):
        
        batch_of_lines.append(json.loads(filehandle.readline()))
    
    return batch_of_lines

In [20]:
def run(batch_size, n_batches, n_cpus):

    with gzip.open('enwiki-latest.json.gz', 'r') as f:

        counter=0
    
        while counter < batch_size * n_batches:

            batch_of_articles = take_batch(f, batch_size)

            pool=multiprocessing.Pool(n_cpus) # multiprocessing.cpu_count()

            pool.map(process_article, batch_of_articles)

            pool.close()
            
#             if counter % 1000 == 0:
            print(counter)

            counter += batch_size

In [21]:
# Start at 7:22 PM
run(batch_size = 10_000, n_batches = 10, n_cpus = 10 )

FAILED TO SAVE Dragon 32/64
[Errno 2] No such file or directory: 'data/Dragon 32/64.txt'
FAILED TO SAVE Alliance 90/The Greens
[Errno 2] No such file or directory: 'data/Alliance 90/The Greens.txt'
FAILED TO SAVE ISO/IEC 8859-1
[Errno 2] No such file or directory: 'data/ISO/IEC 8859-1.txt'
FAILED TO SAVE ISO/IEC 8859
[Errno 2] No such file or directory: 'data/ISO/IEC 8859.txt'
FAILED TO SAVE IC 342/Maffei Group
[Errno 2] No such file or directory: 'data/IC 342/Maffei Group.txt'
FAILED TO SAVE Aoraki / Mount Cook
[Errno 2] No such file or directory: 'data/Aoraki / Mount Cook.txt'
0
FAILED TO SAVE PL/I
[Errno 2] No such file or directory: 'data/PL/I.txt'
FAILED TO SAVE OS/2
[Errno 2] No such file or directory: 'data/OS/2.txt'
FAILED TO SAVE IBM System/360
[Errno 2] No such file or directory: 'data/IBM System/360.txt'
FAILED TO SAVE 56 kbit/s line
[Errno 2] No such file or directory: 'data/56 kbit/s line.txt'
FAILED TO SAVE Z/OS
[Errno 2] No such file or directory: 'data/Z/OS.txt'
FAILED 

In [8]:
# 12 cpus working on 48 articles at a time; 10 batches
%timeit run(batch_size=48,n_batches=10,n_cpus=12)

1min 16s ± 5.38 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
# 6 cpus working on 48 articles at a time; 10 batches
%timeit -n2 run(batch_size=48,n_batches=10,n_cpus=6)

1min 10s ± 5.02 s per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [12]:
# 10 cpus working on 100 articles at a time; 10 batches
%timeit -r1 run(batch_size=100,n_batches=10,n_cpus=10)

2min 1s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [13]:
# 10 cpus working on 1000 articles at a time; 1 batches
%timeit -r1 run(batch_size=1000,n_batches=1,n_cpus=10)

1min 52s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


- 12 cpus working on 48 articles at a time; 10 batches -> 
1min 16s ± 5.38 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
- 6 cpus working on 48 articles at a time; 10 batches
1min 10s ± 5.02 s per loop (mean ± std. dev. of 7 runs, 2 loops each)
- 10 cpus working on 100 articles at a time; 10 batches
2min 1s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
- 10 cpus working on 1000 articles at a time; 1 batches
1min 52s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

From the above, I suspect that larger batches work better

# Model Training

In [22]:
class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname
 
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            if fname in ['.ipynb_checkpoints', '.DS_Store']:
                continue
#             print(fname)
            for line in open(os.path.join(self.dirname, fname)):
#                 yield bytes(line, 'utf-8').decode('utf-8', 'ignore').split()
                yield line.split()

Default model parameters are:
- size = 100
- window = 5
- sg default=0 (short for skip-gram){0,1} 1 for skip-gram, 0 for CBOW
- hs default=0 (short for hierarchical softmax) if 1 then negative sampling is used
- min_count=5 ignores all words with total frequency below this
- workers = 3
- iter = 5 (number of runs over data; the first run is to built the vocabulary; the remaining ones are training epochs

### NOTE: Below you will see A LOT of logging messages; scroll down to the end for some qualitative model evaluation

In [24]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

sentences = MySentences('data') # a memory-friendly iterator

# model = gensim.models.Word2Vec(sentences)
model = gensim.models.Word2Vec(sentences, min_count=25, size=300, workers=10, sg=0, iter=5, window=5)

In [37]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
%timeit -r1 gensim.models.Word2Vec(sentences, min_count=25, size=300, workers=10, sg=0, iter=5, window=5)

12min 8s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [25]:
# I can potentially save the model and keep training it later?
model.save("word2vec_100k.model")
# model = gensim.models.Word2Vec.load("word2vec_default.model")
model = gensim.models.Word2Vec.load("word2vec_100k.model")

# Using the model 

In [28]:
model.wv.most_similar(positive=['woman','queen'], negative=['king'], topn=1)

[('prostitute', 0.508292555809021)]

In [29]:
model.wv.most_similar(positive=['green','red'], topn=1)

[('blue', 0.6379384994506836)]

In [30]:
model.wv.most_similar(positive=['cat','cats'], negative=['dog'],topn=1)

[('sphynx', 0.4912039637565613)]

In [31]:
model.wv.most_similar(positive=['male','man'], negative=['female'],topn=1)

[('men', 0.4589303731918335)]

In [32]:
model.wv.most_similar(positive=['continent', 'river'],topn=1)

[('estuary', 0.6394079327583313)]

In [33]:
model.wv.most_similar(positive=['two', 'three'],topn=1)

[('four', 0.9416286945343018)]

In [34]:
model.wv.most_similar(positive=['jobs'],topn=1)

[('employment', 0.6662395596504211)]

In [26]:
len(model.wv.vocab)

122749

In [27]:
model.wv.vectors.shape

(122749, 300)

# Appendix