# Comparison of CBOW, SkipGram and SkipGram with Subword Information

### Imports 

In [3]:
import gzip
import gensim 
import logging
import warnings

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.filterwarnings('ignore')

### Dataset 

Let's take a closer look at the dataset that we will be using. We will print the first line. 

In [4]:
data_file="../word2vec/reviews_data.txt.gz"

# Peek at the raw data: open the gzip-compressed reviews in binary mode and
# print only the first line (a bytes object), then stop reading.
with gzip.open(data_file, 'rb') as f:
    for line in f:
        print(line)
        break
        
        


b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

### Read files into a list
Now that we've had a sneak peek at our dataset, we can read it into a list so that we can pass it on to the Word2Vec model. Notice in the code below that I am directly reading the 
compressed file. I'm also doing some mild pre-processing of the reviews using `gensim.utils.simple_preprocess (line)`. This does some basic pre-processing such as tokenization, lowercasing, etc., and returns a list of tokens (words). Documentation of this pre-processing method can be found on the official [Gensim documentation site](https://radimrehurek.com/gensim/utils.html). 



In [5]:

def read_input(input_file):
    """Lazily tokenize a gzip-compressed file, one line (review) at a time.

    The file is opened in binary mode and every line is handed to
    ``gensim.utils.simple_preprocess``; the value it returns is yielded
    unchanged. A progress message is logged at INFO level for line 0 and
    for every 10000th line after that.

    Args:
        input_file: path of the gzip-compressed input file.

    Yields:
        The result of ``gensim.utils.simple_preprocess`` for each line.
    """
    with gzip.open(input_file, 'rb') as reviews:
        line_no = 0
        for raw_review in reviews:
            # periodic progress report, including the very first line
            if line_no % 10000 == 0:
                logging.info("read {0} reviews".format(line_no))

            # pre-process the raw bytes into a list of words for this review
            yield gensim.utils.simple_preprocess(raw_review)
            line_no += 1

# Drain the read_input generator so the whole corpus is held in memory:
# one entry per review, each entry being that review's tokens
# (i.e. a list of lists).
documents = [*read_input(data_file)]

2020-04-16 17:40:24,107 : INFO : read 0 reviews
2020-04-16 17:40:26,237 : INFO : read 10000 reviews
2020-04-16 17:40:28,448 : INFO : read 20000 reviews
2020-04-16 17:40:31,011 : INFO : read 30000 reviews
2020-04-16 17:40:33,289 : INFO : read 40000 reviews
2020-04-16 17:40:35,953 : INFO : read 50000 reviews
2020-04-16 17:40:38,366 : INFO : read 60000 reviews
2020-04-16 17:40:40,406 : INFO : read 70000 reviews
2020-04-16 17:40:42,259 : INFO : read 80000 reviews
2020-04-16 17:40:44,251 : INFO : read 90000 reviews
2020-04-16 17:40:46,164 : INFO : read 100000 reviews
2020-04-16 17:40:48,051 : INFO : read 110000 reviews
2020-04-16 17:40:49,945 : INFO : read 120000 reviews
2020-04-16 17:40:51,898 : INFO : read 130000 reviews
2020-04-16 17:40:54,444 : INFO : read 140000 reviews
2020-04-16 17:40:56,368 : INFO : read 150000 reviews
2020-04-16 17:40:58,353 : INFO : read 160000 reviews
2020-04-16 17:41:00,284 : INFO : read 170000 reviews
2020-04-16 17:41:02,330 : INFO : read 180000 reviews
2020-04

## Training the CBOW, SkipGram and SkipGram with Subword Information Models

### Train a CBOW model

In [6]:
model_cbow = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10)
%time model_cbow.train(documents,total_examples=len(documents),epochs=10)

2020-04-16 17:41:18,809 : INFO : collecting all words and their counts
2020-04-16 17:41:18,810 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-16 17:41:19,056 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2020-04-16 17:41:19,364 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2020-04-16 17:41:19,674 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2020-04-16 17:41:19,979 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2020-04-16 17:41:20,310 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2020-04-16 17:41:20,618 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2020-04-16 17:41:20,918 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2020-04-16 17:41:21,181 : INFO : PROG

2020-04-16 17:42:08,203 : INFO : EPOCH 2 - PROGRESS: at 44.91% examples, 1558821 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:42:09,207 : INFO : EPOCH 2 - PROGRESS: at 50.32% examples, 1557968 words/s, in_qsize 16, out_qsize 3
2020-04-16 17:42:10,212 : INFO : EPOCH 2 - PROGRESS: at 55.75% examples, 1561753 words/s, in_qsize 19, out_qsize 1
2020-04-16 17:42:11,213 : INFO : EPOCH 2 - PROGRESS: at 61.31% examples, 1565980 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:42:12,232 : INFO : EPOCH 2 - PROGRESS: at 66.87% examples, 1566003 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:42:13,234 : INFO : EPOCH 2 - PROGRESS: at 72.20% examples, 1569353 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:42:14,237 : INFO : EPOCH 2 - PROGRESS: at 77.43% examples, 1570425 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:42:15,248 : INFO : EPOCH 2 - PROGRESS: at 82.71% examples, 1571582 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:42:16,249 : INFO : EPOCH 2 - PROGRESS: at 88.09% examples, 1572962

2020-04-16 17:43:00,402 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 17:43:00,405 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-04-16 17:43:00,415 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 17:43:00,416 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 17:43:00,419 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 17:43:00,421 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 17:43:00,422 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 17:43:00,424 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 17:43:00,424 : INFO : EPOCH - 4 : training on 41519355 raw words (30348500 effective words) took 21.3s, 1427210 effective words/s
2020-04-16 17:43:01,446 : INFO : EPOCH 5 - PROGRESS: at 4.47% examples, 1356235 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:43:

2020-04-16 17:43:45,988 : INFO : EPOCH 2 - PROGRESS: at 4.36% examples, 1334489 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:43:46,995 : INFO : EPOCH 2 - PROGRESS: at 8.67% examples, 1331424 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:43:48,012 : INFO : EPOCH 2 - PROGRESS: at 12.26% examples, 1332650 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:43:49,015 : INFO : EPOCH 2 - PROGRESS: at 16.26% examples, 1339089 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:43:50,018 : INFO : EPOCH 2 - PROGRESS: at 19.93% examples, 1346276 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:43:51,024 : INFO : EPOCH 2 - PROGRESS: at 23.75% examples, 1349026 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:43:52,033 : INFO : EPOCH 2 - PROGRESS: at 28.39% examples, 1347296 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:43:53,040 : INFO : EPOCH 2 - PROGRESS: at 32.58% examples, 1323749 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:43:54,048 : INFO : EPOCH 2 - PROGRESS: at 36.50% examples, 1301662 w

2020-04-16 17:44:40,976 : INFO : EPOCH 4 - PROGRESS: at 20.11% examples, 1350299 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:44:41,980 : INFO : EPOCH 4 - PROGRESS: at 23.85% examples, 1347194 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:44:42,985 : INFO : EPOCH 4 - PROGRESS: at 28.08% examples, 1327088 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:44:43,995 : INFO : EPOCH 4 - PROGRESS: at 31.89% examples, 1291674 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:44:44,999 : INFO : EPOCH 4 - PROGRESS: at 36.10% examples, 1284168 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:44:46,028 : INFO : EPOCH 4 - PROGRESS: at 39.79% examples, 1258066 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:44:47,043 : INFO : EPOCH 4 - PROGRESS: at 44.05% examples, 1247025 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:44:48,044 : INFO : EPOCH 4 - PROGRESS: at 47.99% examples, 1237379 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:44:49,045 : INFO : EPOCH 4 - PROGRESS: at 51.62% examples, 1221940

2020-04-16 17:45:36,366 : INFO : EPOCH 6 - PROGRESS: at 18.78% examples, 1262288 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:45:37,375 : INFO : EPOCH 6 - PROGRESS: at 22.21% examples, 1252838 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:45:38,391 : INFO : EPOCH 6 - PROGRESS: at 25.88% examples, 1254191 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:45:39,392 : INFO : EPOCH 6 - PROGRESS: at 30.33% examples, 1251521 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:45:40,392 : INFO : EPOCH 6 - PROGRESS: at 34.87% examples, 1255326 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:45:41,412 : INFO : EPOCH 6 - PROGRESS: at 39.48% examples, 1256881 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:45:42,415 : INFO : EPOCH 6 - PROGRESS: at 43.96% examples, 1254328 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:45:43,433 : INFO : EPOCH 6 - PROGRESS: at 47.90% examples, 1241631 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:45:44,457 : INFO : EPOCH 6 - PROGRESS: at 51.32% examples, 1219203

2020-04-16 17:46:30,735 : INFO : EPOCH 8 - PROGRESS: at 37.23% examples, 1194647 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:46:31,746 : INFO : EPOCH 8 - PROGRESS: at 42.32% examples, 1211744 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:46:32,746 : INFO : EPOCH 8 - PROGRESS: at 46.85% examples, 1216420 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:46:33,748 : INFO : EPOCH 8 - PROGRESS: at 51.51% examples, 1225600 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:46:34,751 : INFO : EPOCH 8 - PROGRESS: at 56.07% examples, 1234161 words/s, in_qsize 20, out_qsize 3
2020-04-16 17:46:35,760 : INFO : EPOCH 8 - PROGRESS: at 60.88% examples, 1244201 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:46:36,765 : INFO : EPOCH 8 - PROGRESS: at 65.77% examples, 1253462 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:46:37,777 : INFO : EPOCH 8 - PROGRESS: at 70.21% examples, 1257807 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:46:38,779 : INFO : EPOCH 8 - PROGRESS: at 74.82% examples, 1262265

2020-04-16 17:47:24,784 : INFO : EPOCH 10 - PROGRESS: at 44.22% examples, 1067581 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:47:25,788 : INFO : EPOCH 10 - PROGRESS: at 47.91% examples, 1066600 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:47:26,793 : INFO : EPOCH 10 - PROGRESS: at 51.56% examples, 1064675 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:47:27,794 : INFO : EPOCH 10 - PROGRESS: at 55.78% examples, 1076422 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:47:28,815 : INFO : EPOCH 10 - PROGRESS: at 59.77% examples, 1079754 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:47:29,840 : INFO : EPOCH 10 - PROGRESS: at 62.97% examples, 1070272 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:47:30,874 : INFO : EPOCH 10 - PROGRESS: at 65.41% examples, 1047036 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:47:31,878 : INFO : EPOCH 10 - PROGRESS: at 67.67% examples, 1028664 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:47:32,895 : INFO : EPOCH 10 - PROGRESS: at 70.55% examples

CPU times: user 21min 51s, sys: 8.97 s, total: 22min
Wall time: 4min 17s


(303488617, 415193550)

### Train a char n-gram model (subword information) with fastText

In [7]:
from gensim.models.fasttext import FastText
model_subword = FastText(documents, size=150, window=10, min_count=2, workers=10, min_n=3, max_n=6)  # instantiate
%time model_subword.train(documents,total_examples=len(documents),epochs=10)

2020-04-16 17:47:40,197 : INFO : resetting layer weights
2020-04-16 17:47:40,198 : INFO : Total number of ngrams is 0
2020-04-16 17:47:40,210 : INFO : collecting all words and their counts
2020-04-16 17:47:40,211 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-16 17:47:40,505 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2020-04-16 17:47:40,783 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2020-04-16 17:47:41,092 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2020-04-16 17:47:41,363 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2020-04-16 17:47:41,652 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2020-04-16 17:47:41,945 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2020-04-16 17:47:42,182 : INFO : 

2020-04-16 17:50:19,548 : INFO : EPOCH 1 - PROGRESS: at 21.65% examples, 205082 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:50:20,585 : INFO : EPOCH 1 - PROGRESS: at 22.28% examples, 205745 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:50:21,591 : INFO : EPOCH 1 - PROGRESS: at 22.83% examples, 205615 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:50:22,619 : INFO : EPOCH 1 - PROGRESS: at 23.25% examples, 204819 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:50:23,646 : INFO : EPOCH 1 - PROGRESS: at 23.64% examples, 203320 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:50:24,653 : INFO : EPOCH 1 - PROGRESS: at 24.00% examples, 201997 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:50:25,663 : INFO : EPOCH 1 - PROGRESS: at 24.41% examples, 200731 words/s, in_qsize 17, out_qsize 1
2020-04-16 17:50:26,691 : INFO : EPOCH 1 - PROGRESS: at 25.10% examples, 200627 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:50:27,699 : INFO : EPOCH 1 - PROGRESS: at 25.82% examples, 200622 words/s,

2020-04-16 17:51:33,938 : INFO : EPOCH 1 - PROGRESS: at 70.86% examples, 197156 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:51:34,973 : INFO : EPOCH 1 - PROGRESS: at 71.61% examples, 197269 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:51:36,020 : INFO : EPOCH 1 - PROGRESS: at 72.37% examples, 197491 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:51:37,113 : INFO : EPOCH 1 - PROGRESS: at 73.11% examples, 197359 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:51:38,213 : INFO : EPOCH 1 - PROGRESS: at 73.82% examples, 197096 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:51:39,347 : INFO : EPOCH 1 - PROGRESS: at 74.38% examples, 196659 words/s, in_qsize 16, out_qsize 3
2020-04-16 17:51:40,381 : INFO : EPOCH 1 - PROGRESS: at 75.00% examples, 196577 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:51:41,397 : INFO : EPOCH 1 - PROGRESS: at 75.61% examples, 196547 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:51:42,423 : INFO : EPOCH 1 - PROGRESS: at 76.20% examples, 196477 words/s,

2020-04-16 17:52:39,605 : INFO : EPOCH 2 - PROGRESS: at 14.48% examples, 193600 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:40,635 : INFO : EPOCH 2 - PROGRESS: at 15.13% examples, 194834 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:41,635 : INFO : EPOCH 2 - PROGRESS: at 15.89% examples, 197035 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:42,662 : INFO : EPOCH 2 - PROGRESS: at 16.53% examples, 198038 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:43,702 : INFO : EPOCH 2 - PROGRESS: at 17.16% examples, 199139 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:44,708 : INFO : EPOCH 2 - PROGRESS: at 17.85% examples, 200907 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:45,748 : INFO : EPOCH 2 - PROGRESS: at 18.34% examples, 199929 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:46,791 : INFO : EPOCH 2 - PROGRESS: at 18.89% examples, 199446 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:52:47,862 : INFO : EPOCH 2 - PROGRESS: at 19.26% examples, 197729 words/s,

2020-04-16 17:53:54,001 : INFO : EPOCH 2 - PROGRESS: at 70.81% examples, 218888 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:53:55,006 : INFO : EPOCH 2 - PROGRESS: at 71.59% examples, 218997 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:53:56,012 : INFO : EPOCH 2 - PROGRESS: at 72.44% examples, 219320 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:53:57,054 : INFO : EPOCH 2 - PROGRESS: at 73.33% examples, 219469 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:53:58,062 : INFO : EPOCH 2 - PROGRESS: at 74.19% examples, 219697 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:53:59,111 : INFO : EPOCH 2 - PROGRESS: at 75.02% examples, 219966 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:54:00,132 : INFO : EPOCH 2 - PROGRESS: at 75.77% examples, 220105 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:54:01,198 : INFO : EPOCH 2 - PROGRESS: at 76.58% examples, 220392 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:54:02,211 : INFO : EPOCH 2 - PROGRESS: at 77.35% examples, 220593 words/s,

2020-04-16 17:54:59,830 : INFO : EPOCH 3 - PROGRESS: at 15.43% examples, 180719 words/s, in_qsize 20, out_qsize 2
2020-04-16 17:55:00,868 : INFO : EPOCH 3 - PROGRESS: at 16.14% examples, 182974 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:55:01,906 : INFO : EPOCH 3 - PROGRESS: at 16.90% examples, 185701 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:55:02,920 : INFO : EPOCH 3 - PROGRESS: at 17.55% examples, 187778 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:55:03,980 : INFO : EPOCH 3 - PROGRESS: at 18.28% examples, 189824 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:55:05,009 : INFO : EPOCH 3 - PROGRESS: at 18.93% examples, 191061 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:55:06,017 : INFO : EPOCH 3 - PROGRESS: at 19.48% examples, 192107 words/s, in_qsize 17, out_qsize 2
2020-04-16 17:55:07,079 : INFO : EPOCH 3 - PROGRESS: at 20.16% examples, 193805 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:55:08,085 : INFO : EPOCH 3 - PROGRESS: at 20.78% examples, 195185 words/s,

2020-04-16 17:56:13,513 : INFO : EPOCH 3 - PROGRESS: at 73.48% examples, 220382 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:56:14,566 : INFO : EPOCH 3 - PROGRESS: at 74.15% examples, 220013 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:56:15,568 : INFO : EPOCH 3 - PROGRESS: at 74.88% examples, 220033 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:56:16,570 : INFO : EPOCH 3 - PROGRESS: at 75.54% examples, 220004 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:56:17,581 : INFO : EPOCH 3 - PROGRESS: at 76.24% examples, 220069 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:56:18,595 : INFO : EPOCH 3 - PROGRESS: at 77.05% examples, 220341 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:56:19,598 : INFO : EPOCH 3 - PROGRESS: at 77.69% examples, 220287 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:56:20,766 : INFO : EPOCH 3 - PROGRESS: at 78.43% examples, 219974 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:56:21,803 : INFO : EPOCH 3 - PROGRESS: at 79.25% examples, 220122 words/s,

2020-04-16 17:57:19,412 : INFO : EPOCH 4 - PROGRESS: at 21.74% examples, 235865 words/s, in_qsize 20, out_qsize 1
2020-04-16 17:57:20,447 : INFO : EPOCH 4 - PROGRESS: at 22.42% examples, 236337 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:57:21,460 : INFO : EPOCH 4 - PROGRESS: at 23.04% examples, 236506 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:57:22,464 : INFO : EPOCH 4 - PROGRESS: at 23.66% examples, 236719 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:57:23,524 : INFO : EPOCH 4 - PROGRESS: at 24.32% examples, 236934 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:57:24,555 : INFO : EPOCH 4 - PROGRESS: at 25.14% examples, 236946 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:57:25,618 : INFO : EPOCH 4 - PROGRESS: at 26.00% examples, 236774 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:57:26,688 : INFO : EPOCH 4 - PROGRESS: at 26.89% examples, 236397 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:57:27,722 : INFO : EPOCH 4 - PROGRESS: at 27.87% examples, 236417 words/s,

2020-04-16 17:58:33,917 : INFO : EPOCH 4 - PROGRESS: at 80.54% examples, 233278 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:58:34,973 : INFO : EPOCH 4 - PROGRESS: at 81.32% examples, 233258 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:58:35,985 : INFO : EPOCH 4 - PROGRESS: at 82.13% examples, 233269 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:58:37,046 : INFO : EPOCH 4 - PROGRESS: at 82.94% examples, 233185 words/s, in_qsize 20, out_qsize 2
2020-04-16 17:58:38,140 : INFO : EPOCH 4 - PROGRESS: at 83.76% examples, 233157 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:58:39,163 : INFO : EPOCH 4 - PROGRESS: at 84.47% examples, 233073 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:58:40,215 : INFO : EPOCH 4 - PROGRESS: at 85.25% examples, 233065 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:58:41,228 : INFO : EPOCH 4 - PROGRESS: at 86.11% examples, 233259 words/s, in_qsize 20, out_qsize 0
2020-04-16 17:58:42,230 : INFO : EPOCH 4 - PROGRESS: at 87.00% examples, 233305 words/s,

2020-04-16 17:59:39,343 : INFO : EPOCH 5 - PROGRESS: at 29.19% examples, 236561 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:59:40,409 : INFO : EPOCH 5 - PROGRESS: at 30.01% examples, 236370 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:59:41,426 : INFO : EPOCH 5 - PROGRESS: at 30.95% examples, 236681 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:59:42,454 : INFO : EPOCH 5 - PROGRESS: at 31.88% examples, 236394 words/s, in_qsize 20, out_qsize 1
2020-04-16 17:59:43,526 : INFO : EPOCH 5 - PROGRESS: at 32.66% examples, 236033 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:59:44,529 : INFO : EPOCH 5 - PROGRESS: at 33.48% examples, 236063 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:59:45,536 : INFO : EPOCH 5 - PROGRESS: at 34.33% examples, 236407 words/s, in_qsize 18, out_qsize 1
2020-04-16 17:59:46,567 : INFO : EPOCH 5 - PROGRESS: at 35.08% examples, 236155 words/s, in_qsize 19, out_qsize 0
2020-04-16 17:59:47,579 : INFO : EPOCH 5 - PROGRESS: at 35.89% examples, 235858 words/s,

2020-04-16 18:00:53,265 : INFO : EPOCH 5 - PROGRESS: at 87.80% examples, 233235 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:00:54,271 : INFO : EPOCH 5 - PROGRESS: at 88.54% examples, 233011 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:00:55,306 : INFO : EPOCH 5 - PROGRESS: at 89.38% examples, 232993 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:00:56,315 : INFO : EPOCH 5 - PROGRESS: at 90.23% examples, 233021 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:00:57,348 : INFO : EPOCH 5 - PROGRESS: at 91.11% examples, 233126 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:00:58,356 : INFO : EPOCH 5 - PROGRESS: at 91.94% examples, 233157 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:00:59,368 : INFO : EPOCH 5 - PROGRESS: at 92.72% examples, 233108 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:01:00,414 : INFO : EPOCH 5 - PROGRESS: at 93.55% examples, 233177 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:01:01,415 : INFO : EPOCH 5 - PROGRESS: at 94.26% examples, 232928 words/s,

2020-04-16 18:03:33,770 : INFO : EPOCH 1 - PROGRESS: at 39.13% examples, 275565 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:03:34,806 : INFO : EPOCH 1 - PROGRESS: at 39.94% examples, 274538 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:03:35,822 : INFO : EPOCH 1 - PROGRESS: at 40.84% examples, 273983 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:03:36,882 : INFO : EPOCH 1 - PROGRESS: at 41.92% examples, 273493 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:03:37,886 : INFO : EPOCH 1 - PROGRESS: at 42.82% examples, 273461 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:03:38,969 : INFO : EPOCH 1 - PROGRESS: at 43.77% examples, 272612 words/s, in_qsize 19, out_qsize 1
2020-04-16 18:03:39,973 : INFO : EPOCH 1 - PROGRESS: at 44.79% examples, 272464 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:03:40,993 : INFO : EPOCH 1 - PROGRESS: at 45.69% examples, 271974 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:03:42,031 : INFO : EPOCH 1 - PROGRESS: at 46.56% examples, 271554 words/s,

2020-04-16 18:04:47,580 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-04-16 18:04:47,583 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-04-16 18:04:47,606 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 18:04:47,648 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-04-16 18:04:47,649 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 18:04:47,683 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 18:04:47,706 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 18:04:47,745 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 18:04:47,746 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 18:04:47,752 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 18:04:47,753 : INFO : EPOCH - 1 : training on 41519355 raw words (30344443 effe

2020-04-16 18:05:54,209 : INFO : EPOCH 2 - PROGRESS: at 49.04% examples, 230438 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:05:55,262 : INFO : EPOCH 2 - PROGRESS: at 49.83% examples, 230358 words/s, in_qsize 17, out_qsize 2
2020-04-16 18:05:56,345 : INFO : EPOCH 2 - PROGRESS: at 50.77% examples, 230610 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:05:57,349 : INFO : EPOCH 2 - PROGRESS: at 51.56% examples, 230574 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:05:58,377 : INFO : EPOCH 2 - PROGRESS: at 52.33% examples, 230671 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:05:59,434 : INFO : EPOCH 2 - PROGRESS: at 53.05% examples, 230578 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:06:00,464 : INFO : EPOCH 2 - PROGRESS: at 53.75% examples, 230188 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:06:01,481 : INFO : EPOCH 2 - PROGRESS: at 54.49% examples, 229968 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:06:02,526 : INFO : EPOCH 2 - PROGRESS: at 55.37% examples, 229847 words/s,

2020-04-16 18:07:00,487 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 18:07:00,488 : INFO : EPOCH - 2 : training on 41519355 raw words (30352224 effective words) took 132.7s, 228679 effective words/s
2020-04-16 18:07:01,516 : INFO : EPOCH 3 - PROGRESS: at 0.65% examples, 205635 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:07:02,526 : INFO : EPOCH 3 - PROGRESS: at 1.21% examples, 192356 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:07:03,542 : INFO : EPOCH 3 - PROGRESS: at 1.88% examples, 194414 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:07:04,542 : INFO : EPOCH 3 - PROGRESS: at 2.61% examples, 203212 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:07:05,638 : INFO : EPOCH 3 - PROGRESS: at 3.51% examples, 211843 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:07:06,659 : INFO : EPOCH 3 - PROGRESS: at 4.34% examples, 216540 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:07:07,667 : INFO : EPOCH 3 - PROGRESS: at 5.11% examples, 220410 words/s, in

2020-04-16 18:08:14,021 : INFO : EPOCH 3 - PROGRESS: at 55.79% examples, 235270 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:15,034 : INFO : EPOCH 3 - PROGRESS: at 56.56% examples, 235173 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:08:16,120 : INFO : EPOCH 3 - PROGRESS: at 57.41% examples, 234959 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:17,132 : INFO : EPOCH 3 - PROGRESS: at 58.17% examples, 234857 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:18,155 : INFO : EPOCH 3 - PROGRESS: at 59.07% examples, 235026 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:19,176 : INFO : EPOCH 3 - PROGRESS: at 59.85% examples, 234925 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:20,203 : INFO : EPOCH 3 - PROGRESS: at 60.65% examples, 234823 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:21,223 : INFO : EPOCH 3 - PROGRESS: at 61.45% examples, 234718 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:08:22,243 : INFO : EPOCH 3 - PROGRESS: at 62.29% examples, 234797 words/s,

2020-04-16 18:09:20,046 : INFO : EPOCH 4 - PROGRESS: at 6.96% examples, 228682 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:09:21,058 : INFO : EPOCH 4 - PROGRESS: at 7.66% examples, 228535 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:22,096 : INFO : EPOCH 4 - PROGRESS: at 8.30% examples, 225379 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:23,115 : INFO : EPOCH 4 - PROGRESS: at 8.97% examples, 224850 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:24,122 : INFO : EPOCH 4 - PROGRESS: at 9.59% examples, 224005 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:09:25,132 : INFO : EPOCH 4 - PROGRESS: at 10.14% examples, 223745 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:26,176 : INFO : EPOCH 4 - PROGRESS: at 10.67% examples, 220351 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:27,190 : INFO : EPOCH 4 - PROGRESS: at 11.21% examples, 219793 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:09:28,227 : INFO : EPOCH 4 - PROGRESS: at 11.77% examples, 219539 words/s, in_q

2020-04-16 18:10:34,482 : INFO : EPOCH 4 - PROGRESS: at 56.32% examples, 208311 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:35,510 : INFO : EPOCH 4 - PROGRESS: at 57.02% examples, 208182 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:10:36,520 : INFO : EPOCH 4 - PROGRESS: at 57.71% examples, 208078 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:37,562 : INFO : EPOCH 4 - PROGRESS: at 58.37% examples, 207826 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:10:38,621 : INFO : EPOCH 4 - PROGRESS: at 59.14% examples, 207785 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:39,632 : INFO : EPOCH 4 - PROGRESS: at 59.85% examples, 207790 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:40,649 : INFO : EPOCH 4 - PROGRESS: at 60.53% examples, 207622 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:41,710 : INFO : EPOCH 4 - PROGRESS: at 61.15% examples, 207109 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:10:42,769 : INFO : EPOCH 4 - PROGRESS: at 61.81% examples, 206930 words/s,

2020-04-16 18:11:40,052 : INFO : EPOCH 5 - PROGRESS: at 1.86% examples, 180319 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:11:41,084 : INFO : EPOCH 5 - PROGRESS: at 2.59% examples, 190572 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:11:42,104 : INFO : EPOCH 5 - PROGRESS: at 3.34% examples, 196160 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:11:43,122 : INFO : EPOCH 5 - PROGRESS: at 4.06% examples, 198887 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:11:44,126 : INFO : EPOCH 5 - PROGRESS: at 4.78% examples, 201240 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:11:45,151 : INFO : EPOCH 5 - PROGRESS: at 5.48% examples, 202329 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:11:46,171 : INFO : EPOCH 5 - PROGRESS: at 6.16% examples, 203252 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:11:47,258 : INFO : EPOCH 5 - PROGRESS: at 6.94% examples, 204181 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:11:48,259 : INFO : EPOCH 5 - PROGRESS: at 7.60% examples, 205152 words/s, in_qsize

2020-04-16 18:12:54,098 : INFO : EPOCH 5 - PROGRESS: at 51.78% examples, 208509 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:12:55,133 : INFO : EPOCH 5 - PROGRESS: at 52.49% examples, 208507 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:12:56,163 : INFO : EPOCH 5 - PROGRESS: at 53.13% examples, 208517 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:12:57,166 : INFO : EPOCH 5 - PROGRESS: at 53.85% examples, 208611 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:12:58,238 : INFO : EPOCH 5 - PROGRESS: at 54.61% examples, 208631 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:12:59,284 : INFO : EPOCH 5 - PROGRESS: at 55.50% examples, 208780 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:13:00,318 : INFO : EPOCH 5 - PROGRESS: at 56.21% examples, 208616 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:13:01,341 : INFO : EPOCH 5 - PROGRESS: at 56.95% examples, 208668 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:13:02,388 : INFO : EPOCH 5 - PROGRESS: at 57.69% examples, 208647 words/s,

2020-04-16 18:13:59,444 : INFO : EPOCH - 5 : training on 41519355 raw words (30351490 effective words) took 142.6s, 212782 effective words/s
2020-04-16 18:14:00,460 : INFO : EPOCH 6 - PROGRESS: at 0.68% examples, 215468 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:14:01,598 : INFO : EPOCH 6 - PROGRESS: at 1.38% examples, 205666 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:14:02,621 : INFO : EPOCH 6 - PROGRESS: at 2.14% examples, 214122 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:14:03,642 : INFO : EPOCH 6 - PROGRESS: at 2.89% examples, 215207 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:14:04,650 : INFO : EPOCH 6 - PROGRESS: at 3.63% examples, 216562 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:14:05,668 : INFO : EPOCH 6 - PROGRESS: at 4.34% examples, 214724 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:14:06,682 : INFO : EPOCH 6 - PROGRESS: at 5.03% examples, 214654 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:14:07,695 : INFO : EPOCH 6 - PROGRESS: at 5.73% exampl

2020-04-16 18:15:13,632 : INFO : EPOCH 6 - PROGRESS: at 51.02% examples, 214132 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:15:14,652 : INFO : EPOCH 6 - PROGRESS: at 51.72% examples, 213984 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:15:15,720 : INFO : EPOCH 6 - PROGRESS: at 52.28% examples, 213348 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:15:16,766 : INFO : EPOCH 6 - PROGRESS: at 52.89% examples, 212980 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:15:17,797 : INFO : EPOCH 6 - PROGRESS: at 53.42% examples, 212284 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:15:18,827 : INFO : EPOCH 6 - PROGRESS: at 54.15% examples, 212263 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:15:19,846 : INFO : EPOCH 6 - PROGRESS: at 54.88% examples, 212280 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:15:20,851 : INFO : EPOCH 6 - PROGRESS: at 55.73% examples, 212321 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:15:21,894 : INFO : EPOCH 6 - PROGRESS: at 56.46% examples, 212258 words/s,

2020-04-16 18:16:23,613 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 18:16:23,637 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 18:16:23,681 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 18:16:23,706 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 18:16:23,707 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 18:16:23,707 : INFO : EPOCH - 6 : training on 41519355 raw words (30347870 effective words) took 144.3s, 210373 effective words/s
2020-04-16 18:16:24,725 : INFO : EPOCH 7 - PROGRESS: at 0.51% examples, 164891 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:16:25,770 : INFO : EPOCH 7 - PROGRESS: at 1.21% examples, 190117 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:16:26,803 : INFO : EPOCH 7 - PROGRESS: at 1.93% examples, 196341 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:16:27,829 : INFO : EPOCH 7 - PROGRESS: at 2.61% exa

2020-04-16 18:17:33,367 : INFO : EPOCH 7 - PROGRESS: at 47.63% examples, 214105 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:17:34,378 : INFO : EPOCH 7 - PROGRESS: at 48.43% examples, 214410 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:17:35,384 : INFO : EPOCH 7 - PROGRESS: at 49.27% examples, 214522 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:17:36,391 : INFO : EPOCH 7 - PROGRESS: at 49.97% examples, 214421 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:17:37,437 : INFO : EPOCH 7 - PROGRESS: at 50.71% examples, 214310 words/s, in_qsize 20, out_qsize 2
2020-04-16 18:17:38,503 : INFO : EPOCH 7 - PROGRESS: at 51.34% examples, 213750 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:17:39,515 : INFO : EPOCH 7 - PROGRESS: at 52.00% examples, 213539 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:17:40,555 : INFO : EPOCH 7 - PROGRESS: at 52.67% examples, 213368 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:17:41,594 : INFO : EPOCH 7 - PROGRESS: at 53.21% examples, 212827 words/s,

2020-04-16 18:18:47,679 : INFO : EPOCH 7 - PROGRESS: at 99.79% examples, 210380 words/s, in_qsize 9, out_qsize 1
2020-04-16 18:18:47,680 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-04-16 18:18:47,696 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-04-16 18:18:47,702 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 18:18:47,717 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-04-16 18:18:47,719 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 18:18:47,747 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 18:18:47,753 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 18:18:47,789 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 18:18:47,808 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 18:18:47,820 : INFO : worker thread finished; awaiting fi

2020-04-16 18:19:52,264 : INFO : EPOCH 8 - PROGRESS: at 43.04% examples, 211532 words/s, in_qsize 17, out_qsize 2
2020-04-16 18:19:53,306 : INFO : EPOCH 8 - PROGRESS: at 43.96% examples, 211929 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:19:54,307 : INFO : EPOCH 8 - PROGRESS: at 44.79% examples, 211981 words/s, in_qsize 17, out_qsize 2
2020-04-16 18:19:55,328 : INFO : EPOCH 8 - PROGRESS: at 45.60% examples, 212086 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:19:56,350 : INFO : EPOCH 8 - PROGRESS: at 46.36% examples, 212184 words/s, in_qsize 18, out_qsize 2
2020-04-16 18:19:57,366 : INFO : EPOCH 8 - PROGRESS: at 47.10% examples, 212379 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:19:58,392 : INFO : EPOCH 8 - PROGRESS: at 47.84% examples, 212264 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:19:59,405 : INFO : EPOCH 8 - PROGRESS: at 48.56% examples, 212179 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:20:00,433 : INFO : EPOCH 8 - PROGRESS: at 49.34% examples, 212044 words/s,

2020-04-16 18:21:06,456 : INFO : EPOCH 8 - PROGRESS: at 96.63% examples, 211999 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:21:07,521 : INFO : EPOCH 8 - PROGRESS: at 97.50% examples, 212128 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:21:08,523 : INFO : EPOCH 8 - PROGRESS: at 98.28% examples, 212256 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:21:09,538 : INFO : EPOCH 8 - PROGRESS: at 99.16% examples, 212475 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:21:10,362 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-04-16 18:21:10,370 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-04-16 18:21:10,375 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 18:21:10,407 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-04-16 18:21:10,409 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 18:21:10,419 : INFO : worker thread finished; awaiting finish of 4 more thread

2020-04-16 18:22:12,715 : INFO : EPOCH 9 - PROGRESS: at 43.59% examples, 221532 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:13,718 : INFO : EPOCH 9 - PROGRESS: at 44.49% examples, 221672 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:14,724 : INFO : EPOCH 9 - PROGRESS: at 45.25% examples, 221453 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:22:15,745 : INFO : EPOCH 9 - PROGRESS: at 46.09% examples, 221526 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:16,755 : INFO : EPOCH 9 - PROGRESS: at 46.81% examples, 221498 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:22:17,781 : INFO : EPOCH 9 - PROGRESS: at 47.63% examples, 221766 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:18,802 : INFO : EPOCH 9 - PROGRESS: at 48.41% examples, 221718 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:19,834 : INFO : EPOCH 9 - PROGRESS: at 49.31% examples, 221949 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:22:20,842 : INFO : EPOCH 9 - PROGRESS: at 50.11% examples, 222042 words/s,

2020-04-16 18:23:22,659 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 18:23:22,663 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 18:23:22,672 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 18:23:22,704 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 18:23:22,712 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 18:23:22,719 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 18:23:22,720 : INFO : EPOCH - 9 : training on 41519355 raw words (30351077 effective words) took 132.2s, 229540 effective words/s
2020-04-16 18:23:23,844 : INFO : EPOCH 10 - PROGRESS: at 0.69% examples, 201113 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:23:24,848 : INFO : EPOCH 10 - PROGRESS: at 1.52% examples, 228460 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:23:25,865 : INFO : EPOCH 10 - PROGRESS: at 2.32% examples, 234565 words

2020-04-16 18:24:31,556 : INFO : EPOCH 10 - PROGRESS: at 53.31% examples, 241248 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:24:32,581 : INFO : EPOCH 10 - PROGRESS: at 54.03% examples, 240708 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:24:33,597 : INFO : EPOCH 10 - PROGRESS: at 54.77% examples, 240437 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:24:34,696 : INFO : EPOCH 10 - PROGRESS: at 55.75% examples, 240265 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:24:35,705 : INFO : EPOCH 10 - PROGRESS: at 56.55% examples, 240217 words/s, in_qsize 20, out_qsize 2
2020-04-16 18:24:36,708 : INFO : EPOCH 10 - PROGRESS: at 57.46% examples, 240388 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:24:37,750 : INFO : EPOCH 10 - PROGRESS: at 58.23% examples, 240122 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:24:38,770 : INFO : EPOCH 10 - PROGRESS: at 59.07% examples, 240041 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:24:39,776 : INFO : EPOCH 10 - PROGRESS: at 59.90% examples, 240114

CPU times: user 2h 42min 32s, sys: 30.8 s, total: 2h 43min 2s
Wall time: 24min 19s


### Train a SkipGram model

In [8]:
# Build and train the SkipGram variant of Word2Vec (sg=1) on `documents`.
# NOTE(review): `documents` is handed to the constructor AND to train().
# The log output of this cell shows EPOCH 1-5 running first, after which the
# explicit train() call restarts at EPOCH 1 and runs through EPOCH 10, so the
# model appears to be trained twice (15 passes in total). %time wraps only the
# explicit train() call, so the reported wall time excludes the first 5
# passes -- confirm this is intended and matches the other models being compared.
model_skipgram = gensim.models.Word2Vec (documents, size=150, window=10, min_count=2, workers=10, sg=1)
%time model_skipgram.train(documents,total_examples=len(documents),epochs=10)

2020-04-16 18:26:19,742 : INFO : collecting all words and their counts
2020-04-16 18:26:19,743 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-04-16 18:26:19,950 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2020-04-16 18:26:20,165 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2020-04-16 18:26:20,417 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2020-04-16 18:26:20,639 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2020-04-16 18:26:20,892 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2020-04-16 18:26:21,145 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2020-04-16 18:26:21,377 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2020-04-16 18:26:21,591 : INFO : PROG

2020-04-16 18:27:16,620 : INFO : EPOCH 1 - PROGRESS: at 41.52% examples, 348599 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:17,706 : INFO : EPOCH 1 - PROGRESS: at 42.76% examples, 348103 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:27:18,709 : INFO : EPOCH 1 - PROGRESS: at 44.09% examples, 348232 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:19,728 : INFO : EPOCH 1 - PROGRESS: at 45.31% examples, 347481 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:20,735 : INFO : EPOCH 1 - PROGRESS: at 46.44% examples, 347045 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:21,798 : INFO : EPOCH 1 - PROGRESS: at 47.65% examples, 346836 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:22,821 : INFO : EPOCH 1 - PROGRESS: at 48.92% examples, 346806 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:23,825 : INFO : EPOCH 1 - PROGRESS: at 50.09% examples, 346415 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:27:24,855 : INFO : EPOCH 1 - PROGRESS: at 51.29% examples, 346479 words/s,

2020-04-16 18:28:20,868 : INFO : EPOCH 2 - PROGRESS: at 10.83% examples, 307487 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:21,900 : INFO : EPOCH 2 - PROGRESS: at 11.67% examples, 309217 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:22,936 : INFO : EPOCH 2 - PROGRESS: at 12.48% examples, 308272 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:28:23,951 : INFO : EPOCH 2 - PROGRESS: at 13.56% examples, 310110 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:24,954 : INFO : EPOCH 2 - PROGRESS: at 14.41% examples, 309493 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:25,981 : INFO : EPOCH 2 - PROGRESS: at 15.30% examples, 308615 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:28:27,023 : INFO : EPOCH 2 - PROGRESS: at 16.24% examples, 308807 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:28,033 : INFO : EPOCH 2 - PROGRESS: at 17.10% examples, 309063 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:28:29,039 : INFO : EPOCH 2 - PROGRESS: at 17.92% examples, 309027 words/s,

2020-04-16 18:29:34,148 : INFO : EPOCH 2 - PROGRESS: at 85.37% examples, 309177 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:29:35,163 : INFO : EPOCH 2 - PROGRESS: at 86.41% examples, 308963 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:36,182 : INFO : EPOCH 2 - PROGRESS: at 87.59% examples, 309004 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:37,208 : INFO : EPOCH 2 - PROGRESS: at 88.73% examples, 309007 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:38,251 : INFO : EPOCH 2 - PROGRESS: at 89.76% examples, 308797 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:39,272 : INFO : EPOCH 2 - PROGRESS: at 90.78% examples, 308433 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:40,280 : INFO : EPOCH 2 - PROGRESS: at 91.74% examples, 308030 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:41,289 : INFO : EPOCH 2 - PROGRESS: at 92.80% examples, 308096 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:29:42,294 : INFO : EPOCH 2 - PROGRESS: at 93.87% examples, 308176 words/s,

2020-04-16 18:30:39,355 : INFO : EPOCH 3 - PROGRESS: at 51.44% examples, 311437 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:40,357 : INFO : EPOCH 3 - PROGRESS: at 52.29% examples, 310410 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:41,390 : INFO : EPOCH 3 - PROGRESS: at 53.15% examples, 309661 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:42,400 : INFO : EPOCH 3 - PROGRESS: at 54.22% examples, 309756 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:43,476 : INFO : EPOCH 3 - PROGRESS: at 55.43% examples, 309618 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:44,484 : INFO : EPOCH 3 - PROGRESS: at 56.54% examples, 309852 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:45,542 : INFO : EPOCH 3 - PROGRESS: at 57.67% examples, 309817 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:46,557 : INFO : EPOCH 3 - PROGRESS: at 58.84% examples, 310228 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:30:47,636 : INFO : EPOCH 3 - PROGRESS: at 59.90% examples, 309845 words/s,

2020-04-16 18:31:44,211 : INFO : EPOCH 4 - PROGRESS: at 16.72% examples, 297954 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:45,220 : INFO : EPOCH 4 - PROGRESS: at 17.62% examples, 299561 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:46,226 : INFO : EPOCH 4 - PROGRESS: at 18.45% examples, 299568 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:47,240 : INFO : EPOCH 4 - PROGRESS: at 19.21% examples, 299038 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:48,243 : INFO : EPOCH 4 - PROGRESS: at 19.81% examples, 296298 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:49,257 : INFO : EPOCH 4 - PROGRESS: at 20.57% examples, 296613 words/s, in_qsize 19, out_qsize 1
2020-04-16 18:31:50,327 : INFO : EPOCH 4 - PROGRESS: at 21.72% examples, 296459 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:51,338 : INFO : EPOCH 4 - PROGRESS: at 22.53% examples, 297124 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:31:52,346 : INFO : EPOCH 4 - PROGRESS: at 23.35% examples, 297779 words/s,

2020-04-16 18:32:57,792 : INFO : EPOCH 4 - PROGRESS: at 93.11% examples, 307318 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:32:58,863 : INFO : EPOCH 4 - PROGRESS: at 94.30% examples, 307434 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:32:59,886 : INFO : EPOCH 4 - PROGRESS: at 95.41% examples, 307467 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:33:00,901 : INFO : EPOCH 4 - PROGRESS: at 96.51% examples, 307606 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:33:01,914 : INFO : EPOCH 4 - PROGRESS: at 97.55% examples, 307495 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:33:02,956 : INFO : EPOCH 4 - PROGRESS: at 98.64% examples, 307461 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:33:03,937 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-04-16 18:33:03,947 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-04-16 18:33:03,955 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 18:33:03,986 : INFO : EPOCH 4 - 

2020-04-16 18:34:02,593 : INFO : EPOCH 5 - PROGRESS: at 58.17% examples, 307517 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:03,602 : INFO : EPOCH 5 - PROGRESS: at 59.16% examples, 307043 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:04,650 : INFO : EPOCH 5 - PROGRESS: at 60.15% examples, 306519 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:05,662 : INFO : EPOCH 5 - PROGRESS: at 61.24% examples, 306546 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:06,665 : INFO : EPOCH 5 - PROGRESS: at 62.37% examples, 306961 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:07,665 : INFO : EPOCH 5 - PROGRESS: at 63.47% examples, 306793 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:08,704 : INFO : EPOCH 5 - PROGRESS: at 64.67% examples, 306539 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:34:09,727 : INFO : EPOCH 5 - PROGRESS: at 65.57% examples, 306131 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:34:10,727 : INFO : EPOCH 5 - PROGRESS: at 66.51% examples, 305834 words/s,

2020-04-16 18:35:04,060 : INFO : EPOCH 1 - PROGRESS: at 15.75% examples, 283567 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:05,089 : INFO : EPOCH 1 - PROGRESS: at 16.50% examples, 282988 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:06,104 : INFO : EPOCH 1 - PROGRESS: at 17.27% examples, 283079 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:07,111 : INFO : EPOCH 1 - PROGRESS: at 18.05% examples, 283177 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:08,119 : INFO : EPOCH 1 - PROGRESS: at 18.84% examples, 283250 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:09,132 : INFO : EPOCH 1 - PROGRESS: at 19.52% examples, 282898 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:10,183 : INFO : EPOCH 1 - PROGRESS: at 20.26% examples, 282421 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:35:11,222 : INFO : EPOCH 1 - PROGRESS: at 20.97% examples, 282244 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:35:12,275 : INFO : EPOCH 1 - PROGRESS: at 22.05% examples, 281753 words/s,

2020-04-16 18:36:17,737 : INFO : EPOCH 1 - PROGRESS: at 86.00% examples, 286112 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:18,752 : INFO : EPOCH 1 - PROGRESS: at 86.98% examples, 285863 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:19,772 : INFO : EPOCH 1 - PROGRESS: at 88.01% examples, 285756 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:20,790 : INFO : EPOCH 1 - PROGRESS: at 89.04% examples, 285741 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:21,845 : INFO : EPOCH 1 - PROGRESS: at 90.09% examples, 285759 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:22,872 : INFO : EPOCH 1 - PROGRESS: at 91.23% examples, 286080 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:23,928 : INFO : EPOCH 1 - PROGRESS: at 92.39% examples, 286313 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:24,932 : INFO : EPOCH 1 - PROGRESS: at 93.36% examples, 286534 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:36:25,933 : INFO : EPOCH 1 - PROGRESS: at 94.30% examples, 286341 words/s,

2020-04-16 18:37:22,407 : INFO : EPOCH 2 - PROGRESS: at 50.27% examples, 306277 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:23,420 : INFO : EPOCH 2 - PROGRESS: at 51.24% examples, 305851 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:24,452 : INFO : EPOCH 2 - PROGRESS: at 52.25% examples, 305580 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:25,468 : INFO : EPOCH 2 - PROGRESS: at 53.31% examples, 306216 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:26,469 : INFO : EPOCH 2 - PROGRESS: at 54.38% examples, 306311 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:27,530 : INFO : EPOCH 2 - PROGRESS: at 55.52% examples, 306055 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:28,592 : INFO : EPOCH 2 - PROGRESS: at 56.65% examples, 306066 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:29,634 : INFO : EPOCH 2 - PROGRESS: at 57.76% examples, 306169 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:37:30,635 : INFO : EPOCH 2 - PROGRESS: at 58.88% examples, 306478 words/s,

2020-04-16 18:38:27,740 : INFO : EPOCH 3 - PROGRESS: at 14.34% examples, 292153 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:28,782 : INFO : EPOCH 3 - PROGRESS: at 15.22% examples, 292091 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:29,784 : INFO : EPOCH 3 - PROGRESS: at 16.13% examples, 293126 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:30,824 : INFO : EPOCH 3 - PROGRESS: at 17.05% examples, 294457 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:31,901 : INFO : EPOCH 3 - PROGRESS: at 17.83% examples, 293457 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:32,942 : INFO : EPOCH 3 - PROGRESS: at 18.63% examples, 292593 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:33,955 : INFO : EPOCH 3 - PROGRESS: at 19.34% examples, 292060 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:35,004 : INFO : EPOCH 3 - PROGRESS: at 20.16% examples, 292752 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:38:36,036 : INFO : EPOCH 3 - PROGRESS: at 20.97% examples, 293694 words/s,

2020-04-16 18:39:41,574 : INFO : EPOCH 3 - PROGRESS: at 88.51% examples, 300075 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:42,583 : INFO : EPOCH 3 - PROGRESS: at 89.68% examples, 300408 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:43,621 : INFO : EPOCH 3 - PROGRESS: at 90.81% examples, 300480 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:44,664 : INFO : EPOCH 3 - PROGRESS: at 91.98% examples, 300759 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:45,669 : INFO : EPOCH 3 - PROGRESS: at 93.02% examples, 300911 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:46,684 : INFO : EPOCH 3 - PROGRESS: at 94.19% examples, 301197 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:47,732 : INFO : EPOCH 3 - PROGRESS: at 95.28% examples, 301216 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:48,767 : INFO : EPOCH 3 - PROGRESS: at 96.42% examples, 301430 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:39:49,769 : INFO : EPOCH 3 - PROGRESS: at 97.48% examples, 301490 words/s,

2020-04-16 18:40:46,335 : INFO : EPOCH 4 - PROGRESS: at 53.96% examples, 309442 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:47,366 : INFO : EPOCH 4 - PROGRESS: at 55.07% examples, 309452 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:48,404 : INFO : EPOCH 4 - PROGRESS: at 56.18% examples, 309131 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:49,405 : INFO : EPOCH 4 - PROGRESS: at 57.29% examples, 309284 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:50,425 : INFO : EPOCH 4 - PROGRESS: at 58.37% examples, 309439 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:51,458 : INFO : EPOCH 4 - PROGRESS: at 59.41% examples, 309045 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:52,465 : INFO : EPOCH 4 - PROGRESS: at 60.56% examples, 309432 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:53,495 : INFO : EPOCH 4 - PROGRESS: at 61.63% examples, 309423 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:40:54,505 : INFO : EPOCH 4 - PROGRESS: at 62.83% examples, 309858 words/s,

2020-04-16 18:41:50,882 : INFO : EPOCH 5 - PROGRESS: at 18.82% examples, 310591 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:51,910 : INFO : EPOCH 5 - PROGRESS: at 19.59% examples, 310033 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:52,927 : INFO : EPOCH 5 - PROGRESS: at 20.41% examples, 310009 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:53,948 : INFO : EPOCH 5 - PROGRESS: at 21.48% examples, 309587 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:54,981 : INFO : EPOCH 5 - PROGRESS: at 22.35% examples, 310044 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:56,064 : INFO : EPOCH 5 - PROGRESS: at 23.16% examples, 309318 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:57,067 : INFO : EPOCH 5 - PROGRESS: at 23.95% examples, 309533 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:58,073 : INFO : EPOCH 5 - PROGRESS: at 24.64% examples, 307623 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:41:59,116 : INFO : EPOCH 5 - PROGRESS: at 25.72% examples, 306498 words/s,

2020-04-16 18:43:04,357 : INFO : EPOCH 5 - PROGRESS: at 95.71% examples, 309967 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:43:05,359 : INFO : EPOCH 5 - PROGRESS: at 96.79% examples, 310106 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:43:06,421 : INFO : EPOCH 5 - PROGRESS: at 97.89% examples, 309959 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:43:07,432 : INFO : EPOCH 5 - PROGRESS: at 98.99% examples, 309936 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:43:08,145 : INFO : worker thread finished; awaiting finish of 9 more threads
2020-04-16 18:43:08,166 : INFO : worker thread finished; awaiting finish of 8 more threads
2020-04-16 18:43:08,194 : INFO : worker thread finished; awaiting finish of 7 more threads
2020-04-16 18:43:08,199 : INFO : worker thread finished; awaiting finish of 6 more threads
2020-04-16 18:43:08,220 : INFO : worker thread finished; awaiting finish of 5 more threads
2020-04-16 18:43:08,233 : INFO : worker thread finished; awaiting finish of 4 more thread

2020-04-16 18:44:09,857 : INFO : EPOCH 6 - PROGRESS: at 62.23% examples, 311522 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:10,934 : INFO : EPOCH 6 - PROGRESS: at 63.46% examples, 311474 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:11,948 : INFO : EPOCH 6 - PROGRESS: at 64.71% examples, 311488 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:12,957 : INFO : EPOCH 6 - PROGRESS: at 65.70% examples, 311620 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:13,975 : INFO : EPOCH 6 - PROGRESS: at 66.72% examples, 311468 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:14,985 : INFO : EPOCH 6 - PROGRESS: at 67.81% examples, 311390 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:16,008 : INFO : EPOCH 6 - PROGRESS: at 68.89% examples, 311470 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:17,013 : INFO : EPOCH 6 - PROGRESS: at 69.90% examples, 311513 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:44:18,045 : INFO : EPOCH 6 - PROGRESS: at 70.92% examples, 311665 words/s,

2020-04-16 18:45:14,971 : INFO : EPOCH 7 - PROGRESS: at 26.81% examples, 307014 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:15,971 : INFO : EPOCH 7 - PROGRESS: at 28.01% examples, 307097 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:16,992 : INFO : EPOCH 7 - PROGRESS: at 29.21% examples, 307915 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:17,995 : INFO : EPOCH 7 - PROGRESS: at 30.26% examples, 307929 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:18,998 : INFO : EPOCH 7 - PROGRESS: at 31.57% examples, 308638 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:20,022 : INFO : EPOCH 7 - PROGRESS: at 32.64% examples, 308444 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:21,042 : INFO : EPOCH 7 - PROGRESS: at 33.66% examples, 308134 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:22,110 : INFO : EPOCH 7 - PROGRESS: at 34.77% examples, 308261 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:45:23,151 : INFO : EPOCH 7 - PROGRESS: at 35.86% examples, 307983 words/s,

2020-04-16 18:46:24,310 : INFO : worker thread finished; awaiting finish of 4 more threads
2020-04-16 18:46:24,317 : INFO : worker thread finished; awaiting finish of 3 more threads
2020-04-16 18:46:24,344 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-04-16 18:46:24,350 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-04-16 18:46:24,371 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-04-16 18:46:24,371 : INFO : EPOCH - 7 : training on 41519355 raw words (30349705 effective words) took 99.0s, 306618 effective words/s
2020-04-16 18:46:25,416 : INFO : EPOCH 8 - PROGRESS: at 0.69% examples, 217128 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:46:26,439 : INFO : EPOCH 8 - PROGRESS: at 1.63% examples, 249414 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:46:27,442 : INFO : EPOCH 8 - PROGRESS: at 2.71% examples, 278025 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:46:28,470 : INFO : EPOCH 8 - PROGRESS: at 3.73% exam

2020-04-16 18:47:34,113 : INFO : EPOCH 8 - PROGRESS: at 69.88% examples, 306926 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:47:35,130 : INFO : EPOCH 8 - PROGRESS: at 70.83% examples, 306799 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:47:36,158 : INFO : EPOCH 8 - PROGRESS: at 71.88% examples, 306931 words/s, in_qsize 18, out_qsize 1
2020-04-16 18:47:37,207 : INFO : EPOCH 8 - PROGRESS: at 73.06% examples, 306956 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:47:38,208 : INFO : EPOCH 8 - PROGRESS: at 74.24% examples, 307283 words/s, in_qsize 20, out_qsize 1
2020-04-16 18:47:39,216 : INFO : EPOCH 8 - PROGRESS: at 75.22% examples, 307279 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:47:40,224 : INFO : EPOCH 8 - PROGRESS: at 76.15% examples, 307190 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:47:41,226 : INFO : EPOCH 8 - PROGRESS: at 77.15% examples, 307299 words/s, in_qsize 17, out_qsize 2
2020-04-16 18:47:42,230 : INFO : EPOCH 8 - PROGRESS: at 78.10% examples, 307304 words/s,

2020-04-16 18:48:39,452 : INFO : EPOCH 9 - PROGRESS: at 35.16% examples, 309688 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:40,466 : INFO : EPOCH 9 - PROGRESS: at 36.37% examples, 309953 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:41,468 : INFO : EPOCH 9 - PROGRESS: at 37.43% examples, 310121 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:42,510 : INFO : EPOCH 9 - PROGRESS: at 38.65% examples, 310298 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:43,511 : INFO : EPOCH 9 - PROGRESS: at 39.70% examples, 310285 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:44,522 : INFO : EPOCH 9 - PROGRESS: at 40.87% examples, 310534 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:48:45,572 : INFO : EPOCH 9 - PROGRESS: at 42.16% examples, 310645 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:46,583 : INFO : EPOCH 9 - PROGRESS: at 43.28% examples, 310878 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:48:47,586 : INFO : EPOCH 9 - PROGRESS: at 44.54% examples, 311343 words/s,

2020-04-16 18:49:43,862 : INFO : EPOCH 10 - PROGRESS: at 3.90% examples, 297595 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:44,918 : INFO : EPOCH 10 - PROGRESS: at 4.97% examples, 299878 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:45,931 : INFO : EPOCH 10 - PROGRESS: at 6.03% examples, 304155 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:46,966 : INFO : EPOCH 10 - PROGRESS: at 7.07% examples, 304460 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:47,973 : INFO : EPOCH 10 - PROGRESS: at 8.11% examples, 308311 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:48,990 : INFO : EPOCH 10 - PROGRESS: at 9.05% examples, 307845 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:49,995 : INFO : EPOCH 10 - PROGRESS: at 9.91% examples, 310530 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:51,028 : INFO : EPOCH 10 - PROGRESS: at 10.81% examples, 309594 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:49:52,036 : INFO : EPOCH 10 - PROGRESS: at 11.62% examples, 309957 words/

2020-04-16 18:50:57,306 : INFO : EPOCH 10 - PROGRESS: at 79.14% examples, 312708 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:50:58,384 : INFO : EPOCH 10 - PROGRESS: at 80.16% examples, 312467 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:50:59,401 : INFO : EPOCH 10 - PROGRESS: at 81.19% examples, 312531 words/s, in_qsize 20, out_qsize 0
2020-04-16 18:51:00,401 : INFO : EPOCH 10 - PROGRESS: at 82.28% examples, 312582 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:51:01,409 : INFO : EPOCH 10 - PROGRESS: at 83.33% examples, 312524 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:51:02,411 : INFO : EPOCH 10 - PROGRESS: at 84.35% examples, 312649 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:51:03,424 : INFO : EPOCH 10 - PROGRESS: at 85.31% examples, 312545 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:51:04,450 : INFO : EPOCH 10 - PROGRESS: at 86.47% examples, 312669 words/s, in_qsize 19, out_qsize 0
2020-04-16 18:51:05,496 : INFO : EPOCH 10 - PROGRESS: at 87.64% examples, 312566

CPU times: user 2h 5min 47s, sys: 18 s, total: 2h 6min 5s
Wall time: 16min 31s


(303491013, 415193550)

### Save the models

In [9]:
# Persist just the word vectors (the `.wv` attribute) of each trained model,
# in the order CBOW, subword, SkipGram.
for trained_model, vector_file in (
    (model_cbow, "cbow_vector.bin"),
    (model_subword, "subword_vector.bin"),
    (model_skipgram, "skipgram_vector.bin"),
):
    trained_model.wv.save(vector_file)

2020-04-16 18:51:17,117 : INFO : saving Word2VecKeyedVectors object under cbow_vector.bin, separately None
2020-04-16 18:51:17,118 : INFO : storing np array 'vectors' to cbow_vector.bin.vectors.npy
2020-04-16 18:51:18,814 : INFO : not storing attribute vectors_norm
2020-04-16 18:51:19,090 : INFO : saved cbow_vector.bin
2020-04-16 18:51:19,091 : INFO : saving FastTextKeyedVectors object under subword_vector.bin, separately None
2020-04-16 18:51:19,092 : INFO : storing np array 'vectors' to subword_vector.bin.vectors.npy
2020-04-16 18:51:21,169 : INFO : storing np array 'vectors_vocab' to subword_vector.bin.vectors_vocab.npy
2020-04-16 18:51:23,616 : INFO : storing np array 'vectors_ngrams' to subword_vector.bin.vectors_ngrams.npy
2020-04-16 18:51:33,844 : INFO : not storing attribute vectors_norm
2020-04-16 18:51:33,846 : INFO : not storing attribute vectors_vocab_norm
2020-04-16 18:51:33,846 : INFO : not storing attribute vectors_ngrams_norm
2020-04-16 18:51:33,848 : INFO : not storing

In [10]:
from gensim.models import KeyedVectors
from IPython.display import display_html
import pandas as pd 

# reload the saved vectors in the same order they were written
cbow_vectors, subword_vectors, skipgram_vectors = (
    KeyedVectors.load(vector_path)
    for vector_path in ("cbow_vector.bin", "subword_vector.bin", "skipgram_vector.bin")
)

2020-04-16 18:51:37,336 : INFO : loading Word2VecKeyedVectors object from cbow_vector.bin
2020-04-16 18:51:38,283 : INFO : loading vectors from cbow_vector.bin.vectors.npy with mmap=None
2020-04-16 18:51:38,320 : INFO : setting ignored attribute vectors_norm to None
2020-04-16 18:51:38,322 : INFO : loaded cbow_vector.bin
2020-04-16 18:51:38,322 : INFO : loading Word2VecKeyedVectors object from subword_vector.bin
2020-04-16 18:51:38,479 : INFO : loading vectors from subword_vector.bin.vectors.npy with mmap=None
2020-04-16 18:51:38,509 : INFO : loading vectors_vocab from subword_vector.bin.vectors_vocab.npy with mmap=None
2020-04-16 18:51:38,538 : INFO : loading vectors_ngrams from subword_vector.bin.vectors_ngrams.npy with mmap=None
2020-04-16 18:51:38,650 : INFO : setting ignored attribute vectors_norm to None
2020-04-16 18:51:38,651 : INFO : setting ignored attribute vectors_vocab_norm to None
2020-04-16 18:51:38,652 : INFO : setting ignored attribute vectors_ngrams_norm to None
2020-

## Get most similar concepts

In [11]:
def display_html_table(html_str):
    """Render an HTML string inline, restyling every table it contains.

    Args:
        html_str: HTML markup, e.g. the output of ``DataFrame.to_html()``.

    An inline ``style`` attribute is injected into each opening ``<table``
    tag. Only the opening tag is matched, so closing ``</table>`` tags and
    cell text that merely contains the substring "table" (for example
    "comfortable") are left untouched.
    """
    table_style='style="padding:20px;display:inline;color:navy;font-size:1.1em"'
    styled_html=html_str.replace('<table','<table '+table_style)

    display_html(styled_html,raw=True)
    
def display_side_by_side(*args):
    """Show several DataFrames next to each other.

    Every positional argument is converted with ``to_html()``; the fragments
    are concatenated in argument order and handed to ``display_html_table``.
    """
    combined_html=''.join(frame.to_html() for frame in args)
    display_html_table(combined_html)
 
def display_similar(positive:list,topn=10):
    """Show the most similar words from the three models side by side.

    Args:
        positive: list of words passed as ``positive`` to ``most_similar``.
        topn: number of neighbours requested from each model.

    The results are displayed in the order CBOW, SkipGram, SkipGram with
    subword information ("skipgramsi").
    """

    # Query with the argument itself rather than a notebook-level variable,
    # so the function gives the right answer for any caller.
    topn_cbow=cbow_vectors.wv.most_similar(positive=positive, topn=topn)
    topn_subword=subword_vectors.wv.most_similar(positive=positive, topn=topn)
    topn_skipgram=skipgram_vectors.wv.most_similar(positive=positive, topn=topn)

    display_side_by_side(
                     pd.DataFrame(topn_cbow,columns=['cbow','cosine_sim']),
                     pd.DataFrame(topn_skipgram,columns=['skipgram','cosine_sim']),
                     pd.DataFrame(topn_subword,columns=['skipgramsi','cosine_sim']))


In [12]:
# query: top 8 neighbours per model for 'room' and 'hotel' taken together
w1=['room','hotel']
display_similar(w1,topn=8)

2020-04-16 18:51:38,804 : INFO : precomputing L2-norms of word weight vectors
2020-04-16 18:51:38,884 : INFO : precomputing L2-norms of word weight vectors
2020-04-16 18:51:38,947 : INFO : precomputing L2-norms of ngram weight vectors
2020-04-16 18:51:39,317 : INFO : precomputing L2-norms of word weight vectors


Unnamed: 0,cbow,cosine_sim
0,property,0.569428
1,rooms,0.543698
2,accommodation,0.481973
3,accomodation,0.474268
4,rooom,0.456527
5,accomodations,0.452022
6,establishment,0.451631
7,suite,0.440867

Unnamed: 0,skipgram,cosine_sim
0,it,0.719486
1,rooms,0.719015
2,and,0.712526
3,but,0.712267
4,the,0.693895
5,was,0.689339
6,very,0.641216
7,is,0.641195

Unnamed: 0,skipgramsi,cosine_sim
0,hotelroom,0.903827
1,roomex,0.764336
2,roomn,0.752075
3,roomone,0.748121
4,roomd,0.747376
5,roomroom,0.743782
6,roomm,0.740087
7,roomon,0.730841


In [13]:
# query: top 8 neighbours per model for 'bathroom'
w1=['bathroom']
display_similar(w1,topn=8)

Unnamed: 0,cbow,cosine_sim
0,bath,0.804668
1,washroom,0.766167
2,bathrooms,0.729341
3,bathtub,0.706509
4,bathroon,0.677728
5,shower,0.661831
6,bathrrom,0.606226
7,sink,0.601708

Unnamed: 0,skipgram,cosine_sim
0,shower,0.810073
1,washroom,0.80131
2,bath,0.791331
3,vanity,0.777241
4,bathtub,0.770735
5,bathrooms,0.748299
6,sink,0.734606
7,tub,0.72198

Unnamed: 0,skipgramsi,cosine_sim
0,bathrooom,0.979597
1,bathroomn,0.971657
2,thebathroom,0.96861
3,bathroomi,0.963778
4,etcbathroom,0.961953
5,bathrroom,0.95967
6,bathroomno,0.958648
7,bathroomhad,0.956708


In [14]:
# query: top 8 neighbours per model for 'cheap'
w1=['cheap']
display_similar(w1,topn=8)

Unnamed: 0,cbow,cosine_sim
0,inexpensive,0.68663
1,expensive,0.59337
2,affordable,0.532053
3,basic,0.509702
4,overpriced,0.50596
5,fancy,0.505349
6,cheep,0.491824
7,reasonable,0.489795

Unnamed: 0,skipgram,cosine_sim
0,inexpensive,0.692817
1,expensive,0.625671
2,ratescons,0.592748
3,cheep,0.574069
4,reasonable,0.563898
5,cheapest,0.554841
6,yukkie,0.552849
7,coffeshops,0.549907

Unnamed: 0,skipgramsi,cosine_sim
0,cheapy,0.889195
1,cheapp,0.870639
2,scheap,0.866285
3,cheapo,0.84207
4,cheapish,0.830611
5,cheapos,0.819328
6,cheapie,0.817418
7,inexpensivegood,0.73502


In [55]:
# query: top 8 neighbours per model for 'fire'
w1=['fire']
display_similar(w1,topn=8)

Unnamed: 0,cbow,cosine_sim
0,evacuation,0.554263
1,police,0.539269
2,gas,0.49944
3,sprinkler,0.476637
4,emergency,0.471629
5,firetrucks,0.464275
6,roared,0.453321
7,evacuate,0.447788

Unnamed: 0,skipgram,cosine_sim
0,alarms,0.73733
1,alarm,0.706461
2,evacuation,0.660267
3,suppressor,0.642252
4,detection,0.638741
5,trooped,0.635955
6,evacuate,0.627489
7,extinguishers,0.620106

Unnamed: 0,skipgramsi,cosine_sim
0,firenze,0.843334
1,firey,0.816708
2,firefox,0.801221
3,befire,0.800566
4,gunfire,0.798251
5,firefly,0.787477
6,firehall,0.780089
7,firebrigade,0.765683


## Compute similarity between words

In [24]:
# similarity between two related words
def get_word_sim(w1,w2,concept_type):
    """Score one word pair with the CBOW, SkipGram and subword models.

    Returns a dict holding both words, each model's ``similarity`` score for
    the pair, and the supplied ``concept_type`` label.
    """
    record={"a_word":w1,"b_word":w2}
    record["score_cbow"]=model_cbow.wv.similarity(w1=w1,w2=w2)
    record["score_skipgram"]=model_skipgram.wv.similarity(w1=w1,w2=w2)
    record["score_skipgramsi"]=model_subword.wv.similarity(w1=w1,w2=w2)
    record["concept_type"]=concept_type
    return record

# [word_a, word_b, relation type] triples to score
word_pairs=[
    ['friendly','staff','neighboring'],
    ['shower','curtain','neighboring'],
    ['very','clean','neighboring'],
    ['hotel','property','synonymous'],
    ['dirty','filthy','synonymous'],
    ['washroom','bathroom','synonymous'],
    ['staff','staffs','near_duplicates'],
    ['calendar','calender','near_duplicates'],
    ['bathrroom','bathrooms','near_duplicates'],
]

# score every triple with all three models
results=[]
for p in word_pairs:
    results.append(get_word_sim(*p))

# tabulate the scores and render them
df=pd.DataFrame(results)
display_html_table(df.to_html())

Unnamed: 0,a_word,b_word,concept_type,score_cbow,score_skipgram,score_skipgramsi
0,friendly,staff,neighboring,0.114944,0.749117,0.194114
1,shower,curtain,neighboring,0.26286,0.717065,0.333938
2,very,clean,neighboring,0.397924,0.678147,0.443427
3,hotel,property,synonymous,0.807957,0.667862,0.81629
4,dirty,filthy,synonymous,0.865373,0.878625,0.878053
5,washroom,bathroom,synonymous,0.766167,0.80131,0.89995
6,staff,staffs,near_duplicates,0.830538,0.623231,0.943355
7,calendar,calender,near_duplicates,0.209005,0.497958,0.696005
8,bathrroom,bathrooms,near_duplicates,0.165828,0.506214,0.830746


## Evaluate Similarity Between Phrases

In [16]:
def get_phrase_similarity(p1,p2,model,threshold=0.6):
    """Decide whether two phrases are similar according to ``model``.

    Each phrase is split on whitespace and tokens that are not in
    ``model.wv.vocab`` are dropped. The remaining token lists are compared
    with ``model.wv.n_similarity``.

    Args:
        p1: first phrase (string).
        p2: second phrase (string).
        model: object exposing ``wv.vocab`` and ``wv.n_similarity``.
        threshold: score that must be strictly exceeded to count as a match.
            Defaults to 0.6.

    Returns:
        1 if the similarity score is greater than ``threshold``, otherwise 0.
        Also 0 when either phrase has no in-vocabulary token.
    """
    vocab=model.wv.vocab
    tokens_1=[t for t in p1.split() if t in vocab]
    tokens_2=[t for t in p2.split() if t in vocab]

    # nothing to compare when a phrase has no known tokens
    if not tokens_1 or not tokens_2:
        return 0

    # similarity between the two token sets, computed from the word embeddings
    cosine=model.wv.n_similarity(tokens_1,tokens_2)
    return 1 if cosine > threshold else 0

# example phrase pairs; not referenced again in this cell
items=[
    ["room was very tidy","neat rooms"],
    ["took a shower","had a bath"],
    ["staff was friendly","very polite manager"],
    ["friendly staff","polite manager"],
    ["friendly staff","rude personnel"],
    ["the room was super dirty","very clean room"],
]

import pandas as pd
df=pd.read_csv("similarity_test.txt")

# add one 0/1 prediction column per model: cbow, skipgram, then subword
for sim_col,sim_model in (
    ("cbow_sim",model_cbow),
    ("skipgram_sim",model_skipgram),
    ("skipgramsi_sim",model_subword),
):
    df[sim_col]=df.apply(lambda row:get_phrase_similarity(row.phrase1,row.phrase2,sim_model),axis=1)

display_html_table(df.to_html())

Unnamed: 0,phrase1,phrase2,similar,cbow_sim,skipgram_sim,skipgramsi_sim
0,polite staff,rude staff,0,1,1,1
1,friendly manager,rude manager,0,1,1,1
2,room was huge,large rooms,1,0,1,0
3,staff was friendly,very polite manager,1,1,1,1
4,bathroom was very dirty,filthy bathroom,1,1,1,1
5,clean and tidy rooms,the room was a mess,0,0,1,0
6,the views were awesome,the breakfast was nice,0,0,1,0
7,what lovely breakfast,friendly staff,0,0,0,0
8,would recommend,highly recommended,1,0,1,0
9,the manager was rude,staff were arrogant and rude,1,1,1,1


### Evaluation of Phrase Similarity

In [19]:
from sklearn.metrics import precision_recall_fscore_support   

def get_prf(model_type,x_true,x_pred):
    """Return binary precision, recall and F-score for one model's predictions.

    ``x_true`` and ``x_pred`` are passed to
    ``precision_recall_fscore_support`` with ``average='binary'``. The result
    is a dict labelled with ``model_type``.
    """
    # the fourth element returned (support) is not needed here
    precision, recall, fscore, _ = precision_recall_fscore_support(
        x_true, x_pred, average='binary'
    )

    return {
        "a_model_type":model_type,
        "b_precision":precision,
        "c_recall":recall,
        "d_fscore":fscore,
    }
    

In [21]:
# each prediction column doubles as the model label in the results table
results=[
    get_prf(sim_col,df['similar'].values,df[sim_col].values)
    for sim_col in ("cbow_sim","skipgram_sim","skipgramsi_sim")
]

df_results=pd.DataFrame(results)
display_html_table(df_results.to_html())

Unnamed: 0,a_model_type,b_precision,c_recall,d_fscore
0,cbow_sim,0.777778,0.583333,0.666667
1,skipgram_sim,0.75,1.0,0.857143
2,skipgramsi_sim,0.714286,0.416667,0.526316
