preProcessing

Description: Creating the Word2Vec models for fake and real reviews from 600K yelp database (classified) from Kaggle

1. Remove stop-words, numbers, non-alphabet,...
2. Create w2v models

Data source:
@inproceedings{DBLP:conf/sigkdd/Akoglu15,
author = {Shebuti Rayana and Leman Akoglu},
title = {Collective Opinion Spam Detection: Bridging Review Networks and metadata},
booktitle = {Proceeding of the 21st ACM SIGKDD international conference
on Knowledge discovery and data mining, {KDD’15}},
year = {2015},
}


In [1]:
import pandas as pd
import json
import os
import pickle
import numpy as np

In [5]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [6]:
nlp = spacy.load('en_core_web_md', disable = ['parser','ner'])
def cleaning(doc):
    # Lemmatizes and removes stopwords
    # doc needs to be a spacy Doc object
    txt = [token.lemma_ for token in doc if not token.is_stop]
    # Word2Vec uses context words to learn the vector representation of a target word,
    # if a sentence is only one or two words long,
    # the benefit for the training is very small
    if len(txt) > 2:
        return ' '.join(txt)

In [13]:
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
import multiprocessing

INFO - 18:05:19: adding document #0 to Dictionary(0 unique tokens: [])
INFO - 18:05:19: built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
INFO - 18:05:19: Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2022-01-26T18:05:19.990418', 'gensim': '4.1.2', 'python': '3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [None]:
#use encode in open files to explicitly choose the codes
def splitBigJson(jsonFile, sizeSplit=100000):
#     try:
    with open(jsonFile, encoding="utf8") as file:
        data = [json.loads(line) for line in file]
    fileOriginal = jsonFile
#     sizeSplit=100000
    print('JSON size:', len(data))
    sizeSplit = int(input('Input chunk size (100000):'))
    if sizeSplit == 0:
        return False
    total = int(len(data) /sizeSplit) + 1
    for i in range(total):
        filePath = os.path.splitext(fileOriginal)[0] + str(i) + '.json'
        print(filePath)
    #     json.dump(data[i * sizeSplit:(i + 1) * sizeSplit], open(filePath, 'w',encoding='utf8'), ensure_ascii=False, indent=True)
        json.dump(data[i * sizeSplit:(i + 1) * sizeSplit], open(filePath, 'w',encoding='utf8'))
    return True
#     except:
#         print('failed')
#         return False

START

Create subset data
    each chunk has 500K review

Read the 135K review

In [2]:
dfYelp = pd.read_pickle('yelp-600K-review.pkl', compression='gzip')

dfYelp

In [None]:
dfFake = dfYelp[dfYelp['realFake']==-1].copy()

In [3]:
dfReal = dfYelp[dfYelp['realFake']==1].copy()

In [4]:
dfReview = dfReal

Remove unwanted words
- Stop words and lemma funct
- Remove number and non-alphabet

In [7]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower().replace('\d+', '') for row in dfReview['text'])

In [9]:
txt = []
count = 0
start = time()
with nlp.select_pipes(enable="lemmatizer"):
    for doc in nlp.pipe(brief_cleaning, batch_size=1000, n_process=-1):
        if count % 50000 == 0:
            print('processed:', count, 'Time {} mins'.format(round((time() - start) / 60, 2)))
        count += 1
        txt.append(cleaning(doc))

processed: 0 Time 1.01 mins
processed: 50000 Time 2.23 mins
processed: 100000 Time 3.41 mins
processed: 150000 Time 4.61 mins
processed: 200000 Time 5.78 mins
processed: 250000 Time 6.94 mins
processed: 300000 Time 8.14 mins
processed: 350000 Time 9.31 mins
processed: 400000 Time 10.51 mins
processed: 450000 Time 11.68 mins
processed: 500000 Time 12.86 mins


In [10]:
df_clean = pd.DataFrame({'clean': txt})

In [12]:
df_clean.to_pickle('dfReal-clean.pkl', compression='gzip')

In [14]:
# Phrase & bigram
sent = [str(row).split() for row in df_clean['clean']]
phrases = Phrases(sent, min_count=1, progress_per=10000)
bigram = Phraser(phrases)
sentences = bigram[sent]

INFO - 18:05:32: collecting all words and their counts
INFO - 18:05:32: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:05:33: PROGRESS: at sentence #10000, processed 516788 words and 343209 word types
INFO - 18:05:34: PROGRESS: at sentence #20000, processed 1071560 words and 608004 word types
INFO - 18:05:34: PROGRESS: at sentence #30000, processed 1636945 words and 860548 word types
INFO - 18:05:35: PROGRESS: at sentence #40000, processed 2159248 words and 1075278 word types
INFO - 18:05:36: PROGRESS: at sentence #50000, processed 2689731 words and 1278065 word types
INFO - 18:05:37: PROGRESS: at sentence #60000, processed 3208581 words and 1465388 word types
INFO - 18:05:38: PROGRESS: at sentence #70000, processed 3742411 words and 1647147 word types
INFO - 18:05:38: PROGRESS: at sentence #80000, processed 4276375 words and 1831994 word types
INFO - 18:05:39: PROGRESS: at sentence #90000, processed 4800011 words and 2006428 word types
INFO - 18:05:40: PROGRESS

In [15]:
#Build Word2Vec vocabularies
w2v_model = Word2Vec(min_count=3,
                     window=4,
                     vector_size=100,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=5,
                     sg=1,
                     workers=multiprocessing.cpu_count()-1)

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

# print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

INFO - 18:06:39: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.03)', 'datetime': '2022-01-26T18:06:39.505242', 'gensim': '4.1.2', 'python': '3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
INFO - 18:06:39: collecting all words and their counts
INFO - 18:06:39: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:06:43: PROGRESS: at sentence #50000, processed 2384819 words, keeping 101650 word types
INFO - 18:06:46: PROGRESS: at sentence #100000, processed 4744268 words, keeping 149834 word types
INFO - 18:06:49: PROGRESS: at sentence #150000, processed 7088063 words, keeping 183239 word types
INFO - 18:06:52: PROGRESS: at sentence #200000, processed 9422790 words, keeping 212010 word types
INFO - 18:06:55: PROGRESS: at sentence #250000, processed 11710259 words, keeping 235602 word types
INFO - 18:06:59: PROGRESS: at sentence #300000, pr

In [16]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=60, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

w2v_model.init_sims(replace=True)

INFO - 18:15:05: Word2Vec lifecycle event {'msg': 'training model with 7 workers on 153773 vocabulary and 100 features, using sg=0 hs=0 sample=1e-05 negative=20 window=4 shrink_windows=True', 'datetime': '2022-01-26T18:15:05.542292', 'gensim': '4.1.2', 'python': '3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'train'}
INFO - 18:15:06: EPOCH 1 - PROGRESS: at 1.79% examples, 137903 words/s, in_qsize 13, out_qsize 4
INFO - 18:15:07: EPOCH 1 - PROGRESS: at 4.05% examples, 167214 words/s, in_qsize 14, out_qsize 0
INFO - 18:15:08: EPOCH 1 - PROGRESS: at 6.36% examples, 177381 words/s, in_qsize 14, out_qsize 1
INFO - 18:15:09: EPOCH 1 - PROGRESS: at 8.64% examples, 178567 words/s, in_qsize 12, out_qsize 6
INFO - 18:15:10: EPOCH 1 - PROGRESS: at 11.10% examples, 183700 words/s, in_qsize 11, out_qsize 2
INFO - 18:15:11: EPOCH 1 - PROGRESS: at 13.42% examples, 183721 words/s, in_qsize 12, out_qsize 6
INFO - 18:15

INFO - 18:16:22: EPOCH 2 - PROGRESS: at 72.28% examples, 184328 words/s, in_qsize 14, out_qsize 0
INFO - 18:16:23: EPOCH 2 - PROGRESS: at 74.58% examples, 184458 words/s, in_qsize 13, out_qsize 0
INFO - 18:16:24: EPOCH 2 - PROGRESS: at 77.02% examples, 184421 words/s, in_qsize 14, out_qsize 6
INFO - 18:16:25: EPOCH 2 - PROGRESS: at 79.54% examples, 184907 words/s, in_qsize 10, out_qsize 8
INFO - 18:16:26: EPOCH 2 - PROGRESS: at 82.05% examples, 185414 words/s, in_qsize 14, out_qsize 0
INFO - 18:16:27: EPOCH 2 - PROGRESS: at 84.44% examples, 185684 words/s, in_qsize 11, out_qsize 3
INFO - 18:16:28: EPOCH 2 - PROGRESS: at 86.75% examples, 185969 words/s, in_qsize 14, out_qsize 2
INFO - 18:16:29: EPOCH 2 - PROGRESS: at 88.93% examples, 186506 words/s, in_qsize 13, out_qsize 0
INFO - 18:16:30: EPOCH 2 - PROGRESS: at 91.33% examples, 187044 words/s, in_qsize 13, out_qsize 0
INFO - 18:16:31: EPOCH 2 - PROGRESS: at 93.72% examples, 186705 words/s, in_qsize 13, out_qsize 6
INFO - 18:16:32: EPO

INFO - 18:17:37: EPOCH 4 - PROGRESS: at 45.24% examples, 195393 words/s, in_qsize 12, out_qsize 4
INFO - 18:17:38: EPOCH 4 - PROGRESS: at 47.79% examples, 195971 words/s, in_qsize 14, out_qsize 0
INFO - 18:17:39: EPOCH 4 - PROGRESS: at 50.12% examples, 195952 words/s, in_qsize 14, out_qsize 0
INFO - 18:17:40: EPOCH 4 - PROGRESS: at 52.44% examples, 195248 words/s, in_qsize 13, out_qsize 6
INFO - 18:17:41: EPOCH 4 - PROGRESS: at 54.94% examples, 196253 words/s, in_qsize 14, out_qsize 0
INFO - 18:17:42: EPOCH 4 - PROGRESS: at 57.43% examples, 196192 words/s, in_qsize 13, out_qsize 0
INFO - 18:17:43: EPOCH 4 - PROGRESS: at 59.87% examples, 196133 words/s, in_qsize 13, out_qsize 0
INFO - 18:17:44: EPOCH 4 - PROGRESS: at 62.31% examples, 195561 words/s, in_qsize 12, out_qsize 6
INFO - 18:17:45: EPOCH 4 - PROGRESS: at 64.94% examples, 196391 words/s, in_qsize 8, out_qsize 0
INFO - 18:17:46: EPOCH 4 - PROGRESS: at 67.07% examples, 195473 words/s, in_qsize 11, out_qsize 3
INFO - 18:17:47: EPOC

INFO - 18:18:52: EPOCH 6 - PROGRESS: at 18.49% examples, 194521 words/s, in_qsize 13, out_qsize 1
INFO - 18:18:53: EPOCH 6 - PROGRESS: at 20.79% examples, 192922 words/s, in_qsize 14, out_qsize 5
INFO - 18:18:54: EPOCH 6 - PROGRESS: at 23.40% examples, 194148 words/s, in_qsize 10, out_qsize 4
INFO - 18:18:55: EPOCH 6 - PROGRESS: at 25.69% examples, 194492 words/s, in_qsize 11, out_qsize 5
INFO - 18:18:56: EPOCH 6 - PROGRESS: at 28.36% examples, 196365 words/s, in_qsize 12, out_qsize 0
INFO - 18:18:57: EPOCH 6 - PROGRESS: at 30.34% examples, 195272 words/s, in_qsize 12, out_qsize 5
INFO - 18:18:58: EPOCH 6 - PROGRESS: at 33.02% examples, 196278 words/s, in_qsize 13, out_qsize 0
INFO - 18:18:59: EPOCH 6 - PROGRESS: at 35.44% examples, 196270 words/s, in_qsize 13, out_qsize 0
INFO - 18:19:00: EPOCH 6 - PROGRESS: at 37.84% examples, 195965 words/s, in_qsize 12, out_qsize 3
INFO - 18:19:01: EPOCH 6 - PROGRESS: at 40.33% examples, 196507 words/s, in_qsize 13, out_qsize 0
INFO - 18:19:02: EPO

INFO - 18:20:10: worker thread finished; awaiting finish of 3 more threads
INFO - 18:20:10: worker thread finished; awaiting finish of 2 more threads
INFO - 18:20:10: worker thread finished; awaiting finish of 1 more threads
INFO - 18:20:10: worker thread finished; awaiting finish of 0 more threads
INFO - 18:20:10: EPOCH - 7 : training on 24789697 raw words (8436627 effective words) took 43.2s, 195161 effective words/s
INFO - 18:20:11: EPOCH 8 - PROGRESS: at 1.82% examples, 153016 words/s, in_qsize 13, out_qsize 1
INFO - 18:20:12: EPOCH 8 - PROGRESS: at 4.09% examples, 175969 words/s, in_qsize 11, out_qsize 2
INFO - 18:20:13: EPOCH 8 - PROGRESS: at 6.52% examples, 185593 words/s, in_qsize 12, out_qsize 1
INFO - 18:20:14: EPOCH 8 - PROGRESS: at 8.77% examples, 184977 words/s, in_qsize 10, out_qsize 6
INFO - 18:20:15: EPOCH 8 - PROGRESS: at 11.21% examples, 187413 words/s, in_qsize 14, out_qsize 2
INFO - 18:20:16: EPOCH 8 - PROGRESS: at 13.63% examples, 187960 words/s, in_qsize 13, out_q

INFO - 18:21:27: EPOCH 9 - PROGRESS: at 79.33% examples, 195749 words/s, in_qsize 13, out_qsize 0
INFO - 18:21:28: EPOCH 9 - PROGRESS: at 81.65% examples, 195565 words/s, in_qsize 11, out_qsize 4
INFO - 18:21:29: EPOCH 9 - PROGRESS: at 84.19% examples, 195690 words/s, in_qsize 13, out_qsize 0
INFO - 18:21:30: EPOCH 9 - PROGRESS: at 86.39% examples, 195368 words/s, in_qsize 13, out_qsize 6
INFO - 18:21:31: EPOCH 9 - PROGRESS: at 88.62% examples, 195738 words/s, in_qsize 11, out_qsize 4
INFO - 18:21:32: EPOCH 9 - PROGRESS: at 91.06% examples, 195972 words/s, in_qsize 14, out_qsize 1
INFO - 18:21:33: EPOCH 9 - PROGRESS: at 91.77% examples, 192477 words/s, in_qsize 13, out_qsize 0
INFO - 18:21:34: EPOCH 9 - PROGRESS: at 94.52% examples, 192535 words/s, in_qsize 13, out_qsize 0
INFO - 18:21:35: EPOCH 9 - PROGRESS: at 96.91% examples, 192655 words/s, in_qsize 13, out_qsize 0
INFO - 18:21:36: EPOCH 9 - PROGRESS: at 99.76% examples, 193535 words/s, in_qsize 7, out_qsize 0
INFO - 18:21:36: work

INFO - 18:22:40: EPOCH 11 - PROGRESS: at 47.87% examples, 194504 words/s, in_qsize 12, out_qsize 2
INFO - 18:22:41: EPOCH 11 - PROGRESS: at 50.12% examples, 194425 words/s, in_qsize 13, out_qsize 0
INFO - 18:22:42: EPOCH 11 - PROGRESS: at 52.68% examples, 195533 words/s, in_qsize 8, out_qsize 0
INFO - 18:22:43: EPOCH 11 - PROGRESS: at 54.64% examples, 195013 words/s, in_qsize 13, out_qsize 0
INFO - 18:22:44: EPOCH 11 - PROGRESS: at 56.87% examples, 194363 words/s, in_qsize 14, out_qsize 6
INFO - 18:22:45: EPOCH 11 - PROGRESS: at 59.55% examples, 195231 words/s, in_qsize 14, out_qsize 0
INFO - 18:22:46: EPOCH 11 - PROGRESS: at 62.10% examples, 194911 words/s, in_qsize 13, out_qsize 0
INFO - 18:22:47: EPOCH 11 - PROGRESS: at 64.54% examples, 194682 words/s, in_qsize 13, out_qsize 1
INFO - 18:22:48: EPOCH 11 - PROGRESS: at 67.12% examples, 194871 words/s, in_qsize 13, out_qsize 0
INFO - 18:22:49: EPOCH 11 - PROGRESS: at 69.34% examples, 194920 words/s, in_qsize 13, out_qsize 0
INFO - 18:2

INFO - 18:23:53: EPOCH 13 - PROGRESS: at 16.14% examples, 190145 words/s, in_qsize 14, out_qsize 1
INFO - 18:23:54: EPOCH 13 - PROGRESS: at 18.35% examples, 191049 words/s, in_qsize 13, out_qsize 0
INFO - 18:23:55: EPOCH 13 - PROGRESS: at 20.57% examples, 189996 words/s, in_qsize 11, out_qsize 5
INFO - 18:23:56: EPOCH 13 - PROGRESS: at 23.07% examples, 190417 words/s, in_qsize 10, out_qsize 4
INFO - 18:23:57: EPOCH 13 - PROGRESS: at 25.37% examples, 190198 words/s, in_qsize 11, out_qsize 7
INFO - 18:23:58: EPOCH 13 - PROGRESS: at 27.97% examples, 192737 words/s, in_qsize 12, out_qsize 1
INFO - 18:23:59: EPOCH 13 - PROGRESS: at 30.01% examples, 192706 words/s, in_qsize 14, out_qsize 0
INFO - 18:24:00: EPOCH 13 - PROGRESS: at 32.60% examples, 193363 words/s, in_qsize 14, out_qsize 0
INFO - 18:24:01: EPOCH 13 - PROGRESS: at 34.89% examples, 192821 words/s, in_qsize 8, out_qsize 6
INFO - 18:24:02: EPOCH 13 - PROGRESS: at 37.29% examples, 193155 words/s, in_qsize 13, out_qsize 0
INFO - 18:2

INFO - 18:25:12: worker thread finished; awaiting finish of 5 more threads
INFO - 18:25:12: worker thread finished; awaiting finish of 4 more threads
INFO - 18:25:12: worker thread finished; awaiting finish of 3 more threads
INFO - 18:25:12: worker thread finished; awaiting finish of 2 more threads
INFO - 18:25:12: worker thread finished; awaiting finish of 1 more threads
INFO - 18:25:12: worker thread finished; awaiting finish of 0 more threads
INFO - 18:25:12: EPOCH - 14 : training on 24789697 raw words (8434787 effective words) took 43.3s, 195005 effective words/s
INFO - 18:25:13: EPOCH 15 - PROGRESS: at 1.82% examples, 153799 words/s, in_qsize 13, out_qsize 0
INFO - 18:25:14: EPOCH 15 - PROGRESS: at 4.12% examples, 176740 words/s, in_qsize 13, out_qsize 0
INFO - 18:25:15: EPOCH 15 - PROGRESS: at 6.56% examples, 188986 words/s, in_qsize 10, out_qsize 0
INFO - 18:25:16: EPOCH 15 - PROGRESS: at 8.95% examples, 189273 words/s, in_qsize 13, out_qsize 0
INFO - 18:25:17: EPOCH 15 - PROGRE

INFO - 18:26:28: EPOCH 16 - PROGRESS: at 68.79% examples, 176542 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:29: EPOCH 16 - PROGRESS: at 71.24% examples, 177197 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:30: EPOCH 16 - PROGRESS: at 73.55% examples, 177704 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:31: EPOCH 16 - PROGRESS: at 76.09% examples, 178270 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:32: EPOCH 16 - PROGRESS: at 78.67% examples, 178823 words/s, in_qsize 14, out_qsize 1
INFO - 18:26:33: EPOCH 16 - PROGRESS: at 81.02% examples, 179167 words/s, in_qsize 14, out_qsize 4
INFO - 18:26:34: EPOCH 16 - PROGRESS: at 83.55% examples, 179787 words/s, in_qsize 14, out_qsize 0
INFO - 18:26:35: EPOCH 16 - PROGRESS: at 86.01% examples, 180369 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:36: EPOCH 16 - PROGRESS: at 88.21% examples, 181012 words/s, in_qsize 13, out_qsize 0
INFO - 18:26:37: EPOCH 16 - PROGRESS: at 90.58% examples, 181589 words/s, in_qsize 13, out_qsize 1
INFO - 18:

INFO - 18:27:41: EPOCH 18 - PROGRESS: at 42.70% examples, 195037 words/s, in_qsize 14, out_qsize 6
INFO - 18:27:42: EPOCH 18 - PROGRESS: at 45.41% examples, 195871 words/s, in_qsize 13, out_qsize 1
INFO - 18:27:43: EPOCH 18 - PROGRESS: at 48.05% examples, 196297 words/s, in_qsize 14, out_qsize 0
INFO - 18:27:44: EPOCH 18 - PROGRESS: at 50.33% examples, 196254 words/s, in_qsize 10, out_qsize 3
INFO - 18:27:46: EPOCH 18 - PROGRESS: at 51.30% examples, 190752 words/s, in_qsize 12, out_qsize 0
INFO - 18:27:47: EPOCH 18 - PROGRESS: at 53.66% examples, 190654 words/s, in_qsize 13, out_qsize 2
INFO - 18:27:48: EPOCH 18 - PROGRESS: at 55.75% examples, 190779 words/s, in_qsize 10, out_qsize 6
INFO - 18:27:49: EPOCH 18 - PROGRESS: at 58.23% examples, 191432 words/s, in_qsize 14, out_qsize 0
INFO - 18:27:50: EPOCH 18 - PROGRESS: at 60.63% examples, 191115 words/s, in_qsize 14, out_qsize 4
INFO - 18:27:51: EPOCH 18 - PROGRESS: at 63.12% examples, 190956 words/s, in_qsize 14, out_qsize 5
INFO - 18:

INFO - 18:28:55: EPOCH 20 - PROGRESS: at 13.91% examples, 191496 words/s, in_qsize 13, out_qsize 1
INFO - 18:28:56: EPOCH 20 - PROGRESS: at 16.26% examples, 193053 words/s, in_qsize 13, out_qsize 0
INFO - 18:28:57: EPOCH 20 - PROGRESS: at 18.62% examples, 193657 words/s, in_qsize 12, out_qsize 6
INFO - 18:28:58: EPOCH 20 - PROGRESS: at 21.32% examples, 195540 words/s, in_qsize 13, out_qsize 0
INFO - 18:28:59: EPOCH 20 - PROGRESS: at 23.78% examples, 195535 words/s, in_qsize 11, out_qsize 4
INFO - 18:29:00: EPOCH 20 - PROGRESS: at 26.29% examples, 197395 words/s, in_qsize 13, out_qsize 0
INFO - 18:29:01: EPOCH 20 - PROGRESS: at 28.61% examples, 195986 words/s, in_qsize 14, out_qsize 5
INFO - 18:29:02: EPOCH 20 - PROGRESS: at 31.14% examples, 197953 words/s, in_qsize 10, out_qsize 3
INFO - 18:29:03: EPOCH 20 - PROGRESS: at 33.61% examples, 197820 words/s, in_qsize 9, out_qsize 6
INFO - 18:29:04: EPOCH 20 - PROGRESS: at 36.08% examples, 198411 words/s, in_qsize 14, out_qsize 0
INFO - 18:2

INFO - 18:30:14: worker thread finished; awaiting finish of 6 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 5 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 4 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 3 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 2 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 1 more threads
INFO - 18:30:14: worker thread finished; awaiting finish of 0 more threads
INFO - 18:30:14: EPOCH - 21 : training on 24789697 raw words (8436637 effective words) took 43.0s, 196121 effective words/s
INFO - 18:30:15: EPOCH 22 - PROGRESS: at 1.87% examples, 156960 words/s, in_qsize 14, out_qsize 0
INFO - 18:30:16: EPOCH 22 - PROGRESS: at 4.11% examples, 177814 words/s, in_qsize 13, out_qsize 0
INFO - 18:30:17: EPOCH 22 - PROGRESS: at 6.36% examples, 181694 words/s, in_qsize 14, out_qsize 5
INFO - 18:30:19: EPOCH 22 - PROGRESS: at 8.97% examples, 

INFO - 18:31:28: EPOCH 23 - PROGRESS: at 72.35% examples, 198484 words/s, in_qsize 14, out_qsize 0
INFO - 18:31:29: EPOCH 23 - PROGRESS: at 74.62% examples, 198166 words/s, in_qsize 11, out_qsize 4
INFO - 18:31:30: EPOCH 23 - PROGRESS: at 77.28% examples, 198043 words/s, in_qsize 13, out_qsize 6
INFO - 18:31:31: EPOCH 23 - PROGRESS: at 79.87% examples, 198549 words/s, in_qsize 13, out_qsize 0
INFO - 18:31:32: EPOCH 23 - PROGRESS: at 82.39% examples, 198435 words/s, in_qsize 13, out_qsize 0
INFO - 18:31:33: EPOCH 23 - PROGRESS: at 84.59% examples, 197901 words/s, in_qsize 14, out_qsize 6
INFO - 18:31:34: EPOCH 23 - PROGRESS: at 87.07% examples, 198341 words/s, in_qsize 9, out_qsize 6
INFO - 18:31:35: EPOCH 23 - PROGRESS: at 89.37% examples, 198576 words/s, in_qsize 14, out_qsize 0
INFO - 18:31:36: EPOCH 23 - PROGRESS: at 91.64% examples, 198606 words/s, in_qsize 9, out_qsize 4
INFO - 18:31:37: EPOCH 23 - PROGRESS: at 94.42% examples, 198617 words/s, in_qsize 13, out_qsize 0
INFO - 18:31

INFO - 18:32:41: EPOCH 25 - PROGRESS: at 40.82% examples, 198827 words/s, in_qsize 11, out_qsize 7
INFO - 18:32:42: EPOCH 25 - PROGRESS: at 43.60% examples, 198833 words/s, in_qsize 14, out_qsize 6
INFO - 18:32:43: EPOCH 25 - PROGRESS: at 46.22% examples, 199468 words/s, in_qsize 14, out_qsize 3
INFO - 18:32:44: EPOCH 25 - PROGRESS: at 48.85% examples, 199809 words/s, in_qsize 13, out_qsize 0
INFO - 18:32:45: EPOCH 25 - PROGRESS: at 51.14% examples, 199789 words/s, in_qsize 13, out_qsize 0
INFO - 18:32:46: EPOCH 25 - PROGRESS: at 53.43% examples, 199252 words/s, in_qsize 14, out_qsize 5
INFO - 18:32:47: EPOCH 25 - PROGRESS: at 55.81% examples, 200025 words/s, in_qsize 14, out_qsize 0
INFO - 18:32:48: EPOCH 25 - PROGRESS: at 58.04% examples, 199668 words/s, in_qsize 14, out_qsize 3
INFO - 18:32:49: EPOCH 25 - PROGRESS: at 60.62% examples, 199907 words/s, in_qsize 13, out_qsize 0
INFO - 18:32:50: EPOCH 25 - PROGRESS: at 63.16% examples, 199707 words/s, in_qsize 13, out_qsize 0
INFO - 18:

INFO - 18:33:55: EPOCH 27 - PROGRESS: at 15.06% examples, 175978 words/s, in_qsize 14, out_qsize 3
INFO - 18:33:56: EPOCH 27 - PROGRESS: at 17.44% examples, 178397 words/s, in_qsize 14, out_qsize 6
INFO - 18:33:57: EPOCH 27 - PROGRESS: at 20.02% examples, 181346 words/s, in_qsize 12, out_qsize 6
INFO - 18:33:58: EPOCH 27 - PROGRESS: at 22.57% examples, 183092 words/s, in_qsize 11, out_qsize 6
INFO - 18:33:59: EPOCH 27 - PROGRESS: at 25.03% examples, 185706 words/s, in_qsize 13, out_qsize 0
INFO - 18:34:00: EPOCH 27 - PROGRESS: at 27.32% examples, 186165 words/s, in_qsize 13, out_qsize 5
INFO - 18:34:01: EPOCH 27 - PROGRESS: at 29.72% examples, 188726 words/s, in_qsize 11, out_qsize 3
INFO - 18:34:02: EPOCH 27 - PROGRESS: at 32.56% examples, 190453 words/s, in_qsize 13, out_qsize 0
INFO - 18:34:03: EPOCH 27 - PROGRESS: at 34.89% examples, 190650 words/s, in_qsize 13, out_qsize 0
INFO - 18:34:04: EPOCH 27 - PROGRESS: at 37.36% examples, 190850 words/s, in_qsize 12, out_qsize 4
INFO - 18:

INFO - 18:35:11: worker thread finished; awaiting finish of 4 more threads
INFO - 18:35:11: worker thread finished; awaiting finish of 3 more threads
INFO - 18:35:11: worker thread finished; awaiting finish of 2 more threads
INFO - 18:35:11: worker thread finished; awaiting finish of 1 more threads
INFO - 18:35:11: worker thread finished; awaiting finish of 0 more threads
INFO - 18:35:11: EPOCH - 28 : training on 24789697 raw words (8435603 effective words) took 41.6s, 202799 effective words/s
INFO - 18:35:12: EPOCH 29 - PROGRESS: at 1.91% examples, 157135 words/s, in_qsize 13, out_qsize 0
INFO - 18:35:13: EPOCH 29 - PROGRESS: at 4.33% examples, 184334 words/s, in_qsize 13, out_qsize 0
INFO - 18:35:14: EPOCH 29 - PROGRESS: at 6.48% examples, 186381 words/s, in_qsize 11, out_qsize 5
INFO - 18:35:16: EPOCH 29 - PROGRESS: at 9.14% examples, 193248 words/s, in_qsize 14, out_qsize 0
INFO - 18:35:17: EPOCH 29 - PROGRESS: at 11.67% examples, 194330 words/s, in_qsize 13, out_qsize 0
INFO - 18:

INFO - 18:36:28: EPOCH 30 - PROGRESS: at 81.10% examples, 201903 words/s, in_qsize 9, out_qsize 6
INFO - 18:36:29: EPOCH 30 - PROGRESS: at 83.49% examples, 201560 words/s, in_qsize 13, out_qsize 6
INFO - 18:36:30: EPOCH 30 - PROGRESS: at 85.97% examples, 201787 words/s, in_qsize 11, out_qsize 4
INFO - 18:36:31: EPOCH 30 - PROGRESS: at 88.21% examples, 202076 words/s, in_qsize 13, out_qsize 0
INFO - 18:36:32: EPOCH 30 - PROGRESS: at 90.63% examples, 202234 words/s, in_qsize 13, out_qsize 0
INFO - 18:36:33: EPOCH 30 - PROGRESS: at 93.25% examples, 202164 words/s, in_qsize 13, out_qsize 0
INFO - 18:36:34: EPOCH 30 - PROGRESS: at 95.58% examples, 201730 words/s, in_qsize 14, out_qsize 6
INFO - 18:36:35: EPOCH 30 - PROGRESS: at 98.19% examples, 202237 words/s, in_qsize 13, out_qsize 0
INFO - 18:36:35: worker thread finished; awaiting finish of 6 more threads
INFO - 18:36:35: worker thread finished; awaiting finish of 5 more threads
INFO - 18:36:35: worker thread finished; awaiting finish of

INFO - 18:37:42: EPOCH 32 - PROGRESS: at 55.77% examples, 200160 words/s, in_qsize 14, out_qsize 5
INFO - 18:37:43: EPOCH 32 - PROGRESS: at 58.37% examples, 201066 words/s, in_qsize 6, out_qsize 4
INFO - 18:37:44: EPOCH 32 - PROGRESS: at 60.81% examples, 200674 words/s, in_qsize 13, out_qsize 0
INFO - 18:37:45: EPOCH 32 - PROGRESS: at 63.42% examples, 200308 words/s, in_qsize 14, out_qsize 3
INFO - 18:37:46: EPOCH 32 - PROGRESS: at 65.98% examples, 200475 words/s, in_qsize 13, out_qsize 0
INFO - 18:37:47: EPOCH 32 - PROGRESS: at 68.16% examples, 200161 words/s, in_qsize 12, out_qsize 6
INFO - 18:37:48: EPOCH 32 - PROGRESS: at 70.78% examples, 200683 words/s, in_qsize 13, out_qsize 0
INFO - 18:37:49: EPOCH 32 - PROGRESS: at 73.07% examples, 200097 words/s, in_qsize 13, out_qsize 6
INFO - 18:37:50: EPOCH 32 - PROGRESS: at 75.74% examples, 200392 words/s, in_qsize 13, out_qsize 0
INFO - 18:37:51: EPOCH 32 - PROGRESS: at 78.36% examples, 200576 words/s, in_qsize 13, out_qsize 0
INFO - 18:3

INFO - 18:38:55: EPOCH 34 - PROGRESS: at 31.42% examples, 200590 words/s, in_qsize 13, out_qsize 0
INFO - 18:38:57: EPOCH 34 - PROGRESS: at 33.77% examples, 199745 words/s, in_qsize 12, out_qsize 6
INFO - 18:38:58: EPOCH 34 - PROGRESS: at 36.36% examples, 200302 words/s, in_qsize 13, out_qsize 0
INFO - 18:38:59: EPOCH 34 - PROGRESS: at 38.76% examples, 199893 words/s, in_qsize 14, out_qsize 3
INFO - 18:39:00: EPOCH 34 - PROGRESS: at 41.31% examples, 200728 words/s, in_qsize 14, out_qsize 0
INFO - 18:39:01: EPOCH 34 - PROGRESS: at 43.77% examples, 199640 words/s, in_qsize 12, out_qsize 8
INFO - 18:39:02: EPOCH 34 - PROGRESS: at 46.43% examples, 200782 words/s, in_qsize 12, out_qsize 1
INFO - 18:39:03: EPOCH 34 - PROGRESS: at 48.89% examples, 200772 words/s, in_qsize 13, out_qsize 0
INFO - 18:39:04: EPOCH 34 - PROGRESS: at 51.24% examples, 200952 words/s, in_qsize 13, out_qsize 0
INFO - 18:39:05: EPOCH 34 - PROGRESS: at 53.69% examples, 200883 words/s, in_qsize 14, out_qsize 2
INFO - 18:

INFO - 18:40:08: EPOCH 36 - PROGRESS: at 4.24% examples, 180436 words/s, in_qsize 14, out_qsize 1
INFO - 18:40:09: EPOCH 36 - PROGRESS: at 6.56% examples, 186685 words/s, in_qsize 11, out_qsize 4
INFO - 18:40:11: EPOCH 36 - PROGRESS: at 9.06% examples, 191386 words/s, in_qsize 13, out_qsize 0
INFO - 18:40:12: EPOCH 36 - PROGRESS: at 11.54% examples, 193744 words/s, in_qsize 13, out_qsize 0
INFO - 18:40:13: EPOCH 36 - PROGRESS: at 13.99% examples, 194165 words/s, in_qsize 12, out_qsize 4
INFO - 18:40:14: EPOCH 36 - PROGRESS: at 16.34% examples, 195046 words/s, in_qsize 11, out_qsize 7
INFO - 18:40:15: EPOCH 36 - PROGRESS: at 18.84% examples, 196560 words/s, in_qsize 12, out_qsize 6
INFO - 18:40:16: EPOCH 36 - PROGRESS: at 21.43% examples, 198156 words/s, in_qsize 13, out_qsize 0
INFO - 18:40:17: EPOCH 36 - PROGRESS: at 23.93% examples, 198865 words/s, in_qsize 13, out_qsize 0
INFO - 18:40:18: EPOCH 36 - PROGRESS: at 26.25% examples, 199404 words/s, in_qsize 10, out_qsize 4
INFO - 18:40:

INFO - 18:41:28: EPOCH 37 - PROGRESS: at 95.56% examples, 200245 words/s, in_qsize 14, out_qsize 6
INFO - 18:41:29: EPOCH 37 - PROGRESS: at 98.20% examples, 200831 words/s, in_qsize 13, out_qsize 1
INFO - 18:41:30: worker thread finished; awaiting finish of 6 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 5 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 4 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 3 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 2 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 1 more threads
INFO - 18:41:30: worker thread finished; awaiting finish of 0 more threads
INFO - 18:41:30: EPOCH - 37 : training on 24789697 raw words (8434881 effective words) took 41.8s, 201947 effective words/s
INFO - 18:41:31: EPOCH 38 - PROGRESS: at 1.83% examples, 153239 words/s, in_qsize 13, out_qsize 0
INFO - 18:41:32: EPOCH 38 - PROGRESS: at 4.08% examples

INFO - 18:42:42: EPOCH 39 - PROGRESS: at 69.44% examples, 200868 words/s, in_qsize 13, out_qsize 0
INFO - 18:42:43: EPOCH 39 - PROGRESS: at 71.95% examples, 200780 words/s, in_qsize 14, out_qsize 0
INFO - 18:42:44: EPOCH 39 - PROGRESS: at 74.32% examples, 200903 words/s, in_qsize 12, out_qsize 0
INFO - 18:42:45: EPOCH 39 - PROGRESS: at 76.89% examples, 200747 words/s, in_qsize 14, out_qsize 0
INFO - 18:42:46: EPOCH 39 - PROGRESS: at 79.31% examples, 200385 words/s, in_qsize 13, out_qsize 5
INFO - 18:42:47: EPOCH 39 - PROGRESS: at 81.93% examples, 200640 words/s, in_qsize 13, out_qsize 0
INFO - 18:42:48: EPOCH 39 - PROGRESS: at 84.49% examples, 200670 words/s, in_qsize 13, out_qsize 0
INFO - 18:42:49: EPOCH 39 - PROGRESS: at 86.84% examples, 200694 words/s, in_qsize 13, out_qsize 0
INFO - 18:42:50: EPOCH 39 - PROGRESS: at 89.10% examples, 200961 words/s, in_qsize 13, out_qsize 0
INFO - 18:42:51: EPOCH 39 - PROGRESS: at 91.38% examples, 200809 words/s, in_qsize 14, out_qsize 3
INFO - 18:

INFO - 18:43:54: EPOCH 41 - PROGRESS: at 41.11% examples, 200639 words/s, in_qsize 13, out_qsize 0
INFO - 18:43:55: EPOCH 41 - PROGRESS: at 43.58% examples, 199946 words/s, in_qsize 14, out_qsize 4
INFO - 18:43:56: EPOCH 41 - PROGRESS: at 46.30% examples, 200937 words/s, in_qsize 13, out_qsize 0
INFO - 18:43:57: EPOCH 41 - PROGRESS: at 48.71% examples, 200528 words/s, in_qsize 13, out_qsize 3
INFO - 18:43:59: EPOCH 41 - PROGRESS: at 51.14% examples, 200328 words/s, in_qsize 14, out_qsize 4
INFO - 18:44:00: EPOCH 41 - PROGRESS: at 53.83% examples, 200974 words/s, in_qsize 13, out_qsize 0
INFO - 18:44:01: EPOCH 41 - PROGRESS: at 56.23% examples, 201641 words/s, in_qsize 12, out_qsize 0
INFO - 18:44:02: EPOCH 41 - PROGRESS: at 58.65% examples, 201304 words/s, in_qsize 13, out_qsize 0
INFO - 18:44:03: EPOCH 41 - PROGRESS: at 61.12% examples, 201020 words/s, in_qsize 10, out_qsize 7
INFO - 18:44:04: EPOCH 41 - PROGRESS: at 64.01% examples, 201253 words/s, in_qsize 14, out_qsize 0
INFO - 18:

INFO - 18:45:07: EPOCH 43 - PROGRESS: at 14.11% examples, 194596 words/s, in_qsize 13, out_qsize 0
INFO - 18:45:08: EPOCH 43 - PROGRESS: at 16.46% examples, 195328 words/s, in_qsize 13, out_qsize 3
INFO - 18:45:09: EPOCH 43 - PROGRESS: at 19.00% examples, 197962 words/s, in_qsize 10, out_qsize 2
INFO - 18:45:10: EPOCH 43 - PROGRESS: at 21.34% examples, 196593 words/s, in_qsize 13, out_qsize 2
INFO - 18:45:11: EPOCH 43 - PROGRESS: at 23.97% examples, 198184 words/s, in_qsize 14, out_qsize 0
INFO - 18:45:12: EPOCH 43 - PROGRESS: at 26.38% examples, 198539 words/s, in_qsize 13, out_qsize 0
INFO - 18:45:13: EPOCH 43 - PROGRESS: at 28.81% examples, 198738 words/s, in_qsize 11, out_qsize 4
INFO - 18:45:14: EPOCH 43 - PROGRESS: at 31.22% examples, 199341 words/s, in_qsize 13, out_qsize 0
INFO - 18:45:15: EPOCH 43 - PROGRESS: at 33.61% examples, 198465 words/s, in_qsize 13, out_qsize 5
INFO - 18:45:16: EPOCH 43 - PROGRESS: at 36.12% examples, 199007 words/s, in_qsize 14, out_qsize 3
INFO - 18:

INFO - 18:46:25: worker thread finished; awaiting finish of 5 more threads
INFO - 18:46:25: worker thread finished; awaiting finish of 4 more threads
INFO - 18:46:25: worker thread finished; awaiting finish of 3 more threads
INFO - 18:46:25: worker thread finished; awaiting finish of 2 more threads
INFO - 18:46:25: worker thread finished; awaiting finish of 1 more threads
INFO - 18:46:25: worker thread finished; awaiting finish of 0 more threads
INFO - 18:46:25: EPOCH - 44 : training on 24789697 raw words (8432613 effective words) took 42.5s, 198494 effective words/s
INFO - 18:46:26: EPOCH 45 - PROGRESS: at 1.98% examples, 165860 words/s, in_qsize 13, out_qsize 0
INFO - 18:46:27: EPOCH 45 - PROGRESS: at 4.28% examples, 183066 words/s, in_qsize 13, out_qsize 0
INFO - 18:46:28: EPOCH 45 - PROGRESS: at 6.52% examples, 185075 words/s, in_qsize 14, out_qsize 6
INFO - 18:46:30: EPOCH 45 - PROGRESS: at 9.10% examples, 190332 words/s, in_qsize 13, out_qsize 6
INFO - 18:46:31: EPOCH 45 - PROGRE

INFO - 18:47:39: EPOCH 46 - PROGRESS: at 75.45% examples, 198704 words/s, in_qsize 13, out_qsize 0
INFO - 18:47:41: EPOCH 46 - PROGRESS: at 77.94% examples, 198518 words/s, in_qsize 12, out_qsize 6
INFO - 18:47:42: EPOCH 46 - PROGRESS: at 80.33% examples, 194816 words/s, in_qsize 14, out_qsize 0
INFO - 18:47:43: EPOCH 46 - PROGRESS: at 82.69% examples, 194731 words/s, in_qsize 13, out_qsize 0
INFO - 18:47:44: EPOCH 46 - PROGRESS: at 84.90% examples, 194418 words/s, in_qsize 14, out_qsize 6
INFO - 18:47:45: EPOCH 46 - PROGRESS: at 87.35% examples, 194714 words/s, in_qsize 11, out_qsize 7
INFO - 18:47:46: EPOCH 46 - PROGRESS: at 89.61% examples, 195170 words/s, in_qsize 10, out_qsize 5
INFO - 18:47:47: EPOCH 46 - PROGRESS: at 92.26% examples, 195424 words/s, in_qsize 13, out_qsize 0
INFO - 18:47:48: EPOCH 46 - PROGRESS: at 94.82% examples, 195443 words/s, in_qsize 12, out_qsize 1
INFO - 18:47:49: EPOCH 46 - PROGRESS: at 97.19% examples, 195671 words/s, in_qsize 13, out_qsize 0
INFO - 18:

INFO - 18:48:54: EPOCH 48 - PROGRESS: at 50.72% examples, 200244 words/s, in_qsize 12, out_qsize 6
INFO - 18:48:55: EPOCH 48 - PROGRESS: at 53.27% examples, 200652 words/s, in_qsize 13, out_qsize 0
INFO - 18:48:56: EPOCH 48 - PROGRESS: at 55.47% examples, 200387 words/s, in_qsize 14, out_qsize 5
INFO - 18:48:57: EPOCH 48 - PROGRESS: at 57.90% examples, 200514 words/s, in_qsize 14, out_qsize 4
INFO - 18:48:58: EPOCH 48 - PROGRESS: at 60.50% examples, 200876 words/s, in_qsize 13, out_qsize 0
INFO - 18:48:59: EPOCH 48 - PROGRESS: at 63.16% examples, 200908 words/s, in_qsize 14, out_qsize 0
INFO - 18:49:00: EPOCH 48 - PROGRESS: at 65.52% examples, 200212 words/s, in_qsize 13, out_qsize 6
INFO - 18:49:01: EPOCH 48 - PROGRESS: at 68.04% examples, 200773 words/s, in_qsize 10, out_qsize 5
INFO - 18:49:02: EPOCH 48 - PROGRESS: at 70.46% examples, 200644 words/s, in_qsize 11, out_qsize 5
INFO - 18:49:03: EPOCH 48 - PROGRESS: at 72.77% examples, 200346 words/s, in_qsize 9, out_qsize 8
INFO - 18:4

INFO - 18:50:07: EPOCH 50 - PROGRESS: at 23.86% examples, 197678 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:08: EPOCH 50 - PROGRESS: at 26.24% examples, 198685 words/s, in_qsize 14, out_qsize 1
INFO - 18:50:09: EPOCH 50 - PROGRESS: at 28.70% examples, 198955 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:10: EPOCH 50 - PROGRESS: at 31.14% examples, 199587 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:11: EPOCH 50 - PROGRESS: at 33.81% examples, 200336 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:12: EPOCH 50 - PROGRESS: at 36.29% examples, 200719 words/s, in_qsize 11, out_qsize 0
INFO - 18:50:13: EPOCH 50 - PROGRESS: at 38.65% examples, 200090 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:14: EPOCH 50 - PROGRESS: at 40.94% examples, 199307 words/s, in_qsize 12, out_qsize 8
INFO - 18:50:15: EPOCH 50 - PROGRESS: at 43.82% examples, 200741 words/s, in_qsize 13, out_qsize 0
INFO - 18:50:16: EPOCH 50 - PROGRESS: at 46.27% examples, 200532 words/s, in_qsize 13, out_qsize 0
INFO - 18:

INFO - 18:51:21: EPOCH - 51 : training on 24789697 raw words (8432582 effective words) took 42.4s, 198929 effective words/s
INFO - 18:51:22: EPOCH 52 - PROGRESS: at 1.86% examples, 152882 words/s, in_qsize 9, out_qsize 6
INFO - 18:51:23: EPOCH 52 - PROGRESS: at 4.18% examples, 173907 words/s, in_qsize 14, out_qsize 5
INFO - 18:51:24: EPOCH 52 - PROGRESS: at 6.64% examples, 187347 words/s, in_qsize 9, out_qsize 5
INFO - 18:51:25: EPOCH 52 - PROGRESS: at 9.01% examples, 188958 words/s, in_qsize 9, out_qsize 8
INFO - 18:51:26: EPOCH 52 - PROGRESS: at 11.52% examples, 192471 words/s, in_qsize 14, out_qsize 5
INFO - 18:51:27: EPOCH 52 - PROGRESS: at 14.21% examples, 196725 words/s, in_qsize 14, out_qsize 0
INFO - 18:51:28: EPOCH 52 - PROGRESS: at 16.65% examples, 198352 words/s, in_qsize 11, out_qsize 2
INFO - 18:51:29: EPOCH 52 - PROGRESS: at 18.96% examples, 198244 words/s, in_qsize 11, out_qsize 5
INFO - 18:51:30: EPOCH 52 - PROGRESS: at 21.57% examples, 199101 words/s, in_qsize 14, out_

INFO - 18:52:41: EPOCH 53 - PROGRESS: at 89.37% examples, 195733 words/s, in_qsize 13, out_qsize 0
INFO - 18:52:42: EPOCH 53 - PROGRESS: at 91.73% examples, 195855 words/s, in_qsize 14, out_qsize 2
INFO - 18:52:43: EPOCH 53 - PROGRESS: at 94.49% examples, 195935 words/s, in_qsize 13, out_qsize 0
INFO - 18:52:44: EPOCH 53 - PROGRESS: at 96.91% examples, 196238 words/s, in_qsize 14, out_qsize 0
INFO - 18:52:45: EPOCH 53 - PROGRESS: at 99.76% examples, 197057 words/s, in_qsize 6, out_qsize 2
INFO - 18:52:45: worker thread finished; awaiting finish of 6 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 5 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 4 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 3 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 2 more threads
INFO - 18:52:45: worker thread finished; awaiting finish of 1 more threads
INFO - 18:52:45: worker thread finished; awaiting finish

INFO - 18:53:54: EPOCH 55 - PROGRESS: at 63.29% examples, 199607 words/s, in_qsize 14, out_qsize 0
INFO - 18:53:56: EPOCH 55 - PROGRESS: at 64.89% examples, 194156 words/s, in_qsize 12, out_qsize 4
INFO - 18:53:57: EPOCH 55 - PROGRESS: at 67.29% examples, 194249 words/s, in_qsize 12, out_qsize 5
INFO - 18:53:58: EPOCH 55 - PROGRESS: at 69.81% examples, 194785 words/s, in_qsize 13, out_qsize 0
INFO - 18:53:59: EPOCH 55 - PROGRESS: at 72.25% examples, 194815 words/s, in_qsize 13, out_qsize 0
INFO - 18:54:00: EPOCH 55 - PROGRESS: at 74.79% examples, 195391 words/s, in_qsize 10, out_qsize 0
INFO - 18:54:01: EPOCH 55 - PROGRESS: at 77.24% examples, 195135 words/s, in_qsize 13, out_qsize 0
INFO - 18:54:02: EPOCH 55 - PROGRESS: at 79.66% examples, 194971 words/s, in_qsize 13, out_qsize 7
INFO - 18:54:03: EPOCH 55 - PROGRESS: at 82.35% examples, 195615 words/s, in_qsize 14, out_qsize 0
INFO - 18:54:04: EPOCH 55 - PROGRESS: at 84.70% examples, 195608 words/s, in_qsize 11, out_qsize 3
INFO - 18:

INFO - 18:55:08: EPOCH 57 - PROGRESS: at 36.16% examples, 201475 words/s, in_qsize 14, out_qsize 0
INFO - 18:55:09: EPOCH 57 - PROGRESS: at 38.70% examples, 201290 words/s, in_qsize 14, out_qsize 1
INFO - 18:55:10: EPOCH 57 - PROGRESS: at 41.12% examples, 201392 words/s, in_qsize 14, out_qsize 0
INFO - 18:55:11: EPOCH 57 - PROGRESS: at 43.48% examples, 200379 words/s, in_qsize 14, out_qsize 6
INFO - 18:55:12: EPOCH 57 - PROGRESS: at 46.10% examples, 201422 words/s, in_qsize 13, out_qsize 0
INFO - 18:55:13: EPOCH 57 - PROGRESS: at 48.61% examples, 201495 words/s, in_qsize 13, out_qsize 0
INFO - 18:55:14: EPOCH 57 - PROGRESS: at 51.02% examples, 201666 words/s, in_qsize 13, out_qsize 1
INFO - 18:55:15: EPOCH 57 - PROGRESS: at 53.55% examples, 201636 words/s, in_qsize 13, out_qsize 0
INFO - 18:55:16: EPOCH 57 - PROGRESS: at 55.55% examples, 201160 words/s, in_qsize 14, out_qsize 6
INFO - 18:55:17: EPOCH 57 - PROGRESS: at 58.13% examples, 201576 words/s, in_qsize 11, out_qsize 5
INFO - 18:

INFO - 18:56:21: EPOCH 59 - PROGRESS: at 9.19% examples, 193554 words/s, in_qsize 13, out_qsize 2
INFO - 18:56:22: EPOCH 59 - PROGRESS: at 11.50% examples, 193206 words/s, in_qsize 14, out_qsize 4
INFO - 18:56:23: EPOCH 59 - PROGRESS: at 14.19% examples, 197250 words/s, in_qsize 13, out_qsize 0
INFO - 18:56:24: EPOCH 59 - PROGRESS: at 16.60% examples, 199049 words/s, in_qsize 14, out_qsize 0
INFO - 18:56:25: EPOCH 59 - PROGRESS: at 18.92% examples, 198860 words/s, in_qsize 12, out_qsize 3
INFO - 18:56:26: EPOCH 59 - PROGRESS: at 21.41% examples, 199563 words/s, in_qsize 9, out_qsize 5
INFO - 18:56:27: EPOCH 59 - PROGRESS: at 23.85% examples, 198662 words/s, in_qsize 13, out_qsize 6
INFO - 18:56:28: EPOCH 59 - PROGRESS: at 26.41% examples, 200454 words/s, in_qsize 13, out_qsize 0
INFO - 18:56:29: EPOCH 59 - PROGRESS: at 28.87% examples, 200748 words/s, in_qsize 13, out_qsize 0
INFO - 18:56:30: EPOCH 59 - PROGRESS: at 31.31% examples, 201104 words/s, in_qsize 13, out_qsize 0
INFO - 18:56

INFO - 18:57:40: EPOCH 60 - PROGRESS: at 97.37% examples, 199539 words/s, in_qsize 14, out_qsize 7
INFO - 18:57:41: worker thread finished; awaiting finish of 6 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 5 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 4 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 3 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 2 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 1 more threads
INFO - 18:57:41: worker thread finished; awaiting finish of 0 more threads
INFO - 18:57:41: EPOCH - 60 : training on 24789697 raw words (8434342 effective words) took 41.9s, 201282 effective words/s
INFO - 18:57:41: Word2Vec lifecycle event {'msg': 'training on 1487381820 raw words (506055495 effective words) took 2555.7s, 198010 effective words/s', 'datetime': '2022-01-26T18:57:41.253439', 'gensim': '4.1.2', 'python': '3.8.6 (tags/v3.8.6:db45529,

Time to train the model: 42.6 mins


In [17]:
w2v_model.save("word2vec-Yelp-real-500Kcomment-v2-100vec.model")

INFO - 17:19:29: Word2Vec lifecycle event {'fname_or_handle': 'word2vec-Yelp-real-500Kcomment-v2-100vec.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-01-27T17:19:29.204451', 'gensim': '4.1.2', 'python': '3.8.6 (tags/v3.8.6:db45529, Sep 23 2020, 15:52:53) [MSC v.1927 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'saving'}
INFO - 17:19:29: storing np array 'vectors' to word2vec-Yelp-real-500Kcomment-v2-100vec.model.wv.vectors.npy
INFO - 17:19:29: storing np array 'syn1neg' to word2vec-Yelp-real-500Kcomment-v2-100vec.model.syn1neg.npy
INFO - 17:19:30: not storing attribute cum_table
INFO - 17:19:31: saved word2vec-Yelp-real-500Kcomment-v2-100vec.model


--------

TEST

In [None]:
pathTip = 'D:\\t2n_projects\\yelp\\yelp_academic_dataset_tip.json'
pathCheckin = 'D:\\t2n_projects\\yelp\\yelp_academic_dataset_checkin.json'
pathReview = 'D:\\t2n_projects\\yelp\\original-data\\yelp_academic_dataset_review.json'
pathUser = 'D:\\t2n_projects\\yelp\\yelp_academic_dataset_user.json'
pathBusiness = 'D:\\t2n_projects\\yelp\\yelp_academic_dataset_business.json'

In [None]:
from sklearn.cluster import KMeans

import numpy as np

np.shape(w2v_model.wv.vectors)

w2v_model.wv['good']

# word_vectors = Word2Vec.load("../preprocessing_and_embeddings/word2vec.model").wv
word_vectors = w2v_model.wv
model = KMeans(n_clusters=5, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors)
positive_cluster_center = model.cluster_centers_[0]
negative_cluster_center = model.cluster_centers_[1]

with open('kmean-5cluster.pkl','wb') as f:
    pickle.dump(model,f)

word_vectors.vectors

w2v_model = Word2Vec.load('word2vec.model')

w2v_model.wv.most_similar(positive=['Trump'], topn=10)

dfContent = pd.read_csv('reviewContent', sep='\t', header=None)

dfContent.groupby(1).size()

dfContent.rename(columns={0:'userId', 2:'date', 3:'text'})

In [None]:
splitBigJson(pathReview)

pathReview = 'D:\\t2n_projects\\yelp\\reviews\\yelp_academic_dataset_review'

lstDF = []
for i in range(10):
    pathN = pathReview + str(i) + '.json'
    print('read file', pathN)
    with open(pathN, encoding="utf8") as file:
        data = [json.loads(line) for line in file]
    dfTmp = pd.DataFrame(data[0])
    lstDF.append(dfTmp)
dfReview = pd.concat(lstDF, ignore_index=True)
dfReview = dfReview.loc[:, ~dfReview.columns.str.contains('^Unnamed')]

len(dfReview)

dfReview.to_pickle('sampleReview5M.pkl', compression='gzip')