In [1]:
import numpy as np
# Load the saved article array from disk.
# NOTE(review): the later loops iterate article -> sentence strings, so this is presumably an
# object array of per-article sentence lists; if so, recent NumPy versions need
# allow_pickle=True to load it -- confirm against the NumPy version in use.
b=np.load('ny.eater-content.npy')
# Convert the array into plain (nested) Python lists for the cells below.
content=b.tolist()

In [2]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [3]:
# Load the English pipeline with both the NER and the parser components disabled;
# cleaning() only reads each token's lemma and stop-word flag.
# NOTE(review): the 'en' shortcut name looks like a spaCy 2.x model link; newer spaCy
# releases expect the full package name (e.g. 'en_core_web_sm') -- confirm before upgrading.
nlp = spacy.load('en', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a parsed document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``lemma_`` and ``is_stop`` (e.g. a spaCy Doc).

    Returns
    -------
    str or None
        The kept lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain after stop-word removal.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word from its neighbours, so a text with only one or
    # two remaining words adds almost nothing to training: reject it.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [5]:
# Flatten every article (a sequence of sentence strings) into a single string.
# The sentences are concatenated with no separator, exactly as before; str.join
# builds each text in one pass instead of re-copying the growing string per sentence,
# and the comprehension keeps its loop variable out of the notebook namespace.
new_content = [''.join(article) for article in content]

In [7]:
# Coarse pre-cleaning before spaCy: every run of characters other than ASCII letters
# and apostrophes is collapsed into one space, then the text is lowercased.
# Built as a list rather than a one-shot generator so that the cell consuming it can
# be re-run without silently receiving an already-exhausted iterator.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(article)).lower() for article in new_content]

In [8]:
t = time()

# Run every pre-cleaned article through the spaCy pipeline in batches and lemmatize it;
# cleaning() returns None for texts that end up too short, so `txt` may contain None.
# The former `n_threads=-1` argument is omitted: spaCy has deprecated it since v2.1
# and later releases no longer accept it.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.28 mins


In [9]:
# Put the cleaned texts in a single 'clean' column, then discard the missing entries
# (texts for which cleaning() returned None) and exact duplicate texts.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(243, 1)

In [10]:
from gensim.models.phrases import Phrases, Phraser

In [11]:
# Tokenize each cleaned text on whitespace: `sent` is a list with one token list per row.
sent = [row.split() for row in df_clean['clean']]

In [12]:
# Collect unigram/bigram counts over the tokenized corpus for phrase detection.
# min_count=30 is passed as the minimum count threshold; progress_per=10000 controls how
# often (in sentences) progress is logged.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 03:23:44: collecting all words and their counts
INFO - 03:23:44: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 03:23:44: collected 157893 word types from a corpus of 181837 words (unigram + bigrams) and 243 sentences
INFO - 03:23:44: using 157893 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>


In [14]:
# Build a Phraser from the collected phrase statistics; it is used below to transform
# the token lists via indexing (bigram[...]).
bigram = Phraser(phrases)

INFO - 03:26:10: source_vocab length 157893
INFO - 03:26:12: Phraser built with 56 phrasegrams


In [15]:
# Apply the bigram model to the whole tokenized corpus.
# NOTE: indexing with a corpus returns a lazy wrapper (a gensim TransformedCorpus),
# not a list; it is iterated by the counting and Word2Vec cells below.
sentences = bigram[sent]

In [19]:
# NOTE: this prints only the repr of the lazy TransformedCorpus wrapper, not the
# transformed sentences themselves; iterate over it to inspect actual tokens.
print(bigram[sent])

<gensim.interfaces.TransformedCorpus object at 0x000001B717BCE148>


In [20]:
# Count how often each token (single word or detected bigram) occurs in the corpus.
# The loop variables are deliberately not called `sent`: that name holds the tokenized
# corpus, and rebinding it here would make cells such as `bigram[sent]` operate on the
# last sentence only when they are re-run.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens seen.
len(word_freq)

17112

In [21]:
# The ten most frequent tokens, highest count first.
[word for word, count in sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)[:10]]

['s',
 'restaurant',
 'like',
 'be',
 'good',
 "'s",
 'dish',
 'menu',
 'chef',
 'chicken']

In [22]:
import multiprocessing

from gensim.models import Word2Vec

In [23]:
# Number of CPUs reported by the operating system; used to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [24]:
# Configure the Word2Vec model; no corpus is passed here, so the vocabulary is built
# and the model is trained in the following cells.
w2v_model = Word2Vec(min_count=20,      # ignore tokens that occur fewer than 20 times
                     window=2,          # context window size
                     size=300,          # dimensionality of the word vectors
                     sample=6e-5,       # downsampling threshold for frequent words
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # final learning rate
                     negative=20,       # number of negative samples
                     # Leave one core free, but never request 0 workers
                     # (cores - 1 is 0 on a single-core machine).
                     workers=max(1, cores - 1))

In [25]:
t = time()

# Scan the corpus once to build the model vocabulary (progress logged every 10000 sentences).
w2v_model.build_vocab(sentences, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

INFO - 03:29:57: collecting all words and their counts
INFO - 03:29:57: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 03:29:58: collected 17112 word types from a corpus of 177414 raw words and 243 sentences
INFO - 03:29:58: Loading a fresh vocabulary
INFO - 03:29:58: effective_min_count=20 retains 1738 unique words (10% of original 17112, drops 15374)
INFO - 03:29:58: effective_min_count=20 leaves 123955 word corpus (69% of original 177414, drops 53459)
INFO - 03:29:58: deleting the raw counts dictionary of 17112 items
INFO - 03:29:58: sample=6e-05 downsamples 1738 most-common words
INFO - 03:29:58: downsampling leaves estimated 48827 word corpus (39.4% of prior 123955)
INFO - 03:29:58: estimated required memory for 1738 words and 300 dimensions: 5040200 bytes
INFO - 03:29:58: resetting layer weights


Time to build vocab: 0.01 mins


In [26]:
t = time()

# Train for 30 passes over the corpus; total_examples reuses the sentence count
# recorded on the model (w2v_model.corpus_count).
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

INFO - 03:30:13: training model with 11 workers on 1738 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 03:30:13: worker thread finished; awaiting finish of 10 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 9 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 8 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 7 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 6 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 5 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 4 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 3 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 2 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 1 more threads
INFO - 03:30:13: worker thread finished; awaiting finish of 0 more threads
INFO - 03:30:13: EPOCH - 1 : training o

INFO - 03:30:17: worker thread finished; awaiting finish of 3 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 2 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 1 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 0 more threads
INFO - 03:30:17: EPOCH - 9 : training on 177414 raw words (48849 effective words) took 0.5s, 106774 effective words/s
INFO - 03:30:17: worker thread finished; awaiting finish of 10 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 9 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 8 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 7 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 6 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 5 more threads
INFO - 03:30:17: worker thread finished; awaiting finish of 4 more threads
INFO - 03:30:17: worker thread finished; awaiting finish

INFO - 03:30:22: worker thread finished; awaiting finish of 7 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 6 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 5 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 4 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 3 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 2 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 1 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 0 more threads
INFO - 03:30:22: EPOCH - 18 : training on 177414 raw words (48760 effective words) took 0.5s, 92999 effective words/s
INFO - 03:30:22: worker thread finished; awaiting finish of 10 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 9 more threads
INFO - 03:30:22: worker thread finished; awaiting finish of 8 more threads
INFO - 03:30:22: worker thread finished; awaiting finish

INFO - 03:30:27: worker thread finished; awaiting finish of 10 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 9 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 8 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 7 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 6 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 5 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 4 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 3 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 2 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 1 more threads
INFO - 03:30:27: worker thread finished; awaiting finish of 0 more threads
INFO - 03:30:27: EPOCH - 27 : training on 177414 raw words (48530 effective words) took 0.5s, 99146 effective words/s
INFO - 03:30:27: worker thread finished; awaiting finish

Time to train the model: 0.26 mins


In [27]:
# Precompute the L2-normalized word vectors used for similarity queries.
# NOTE(review): with replace=True the original vectors are presumably overwritten by the
# normalized ones, so the model should not be trained further afterwards -- confirm
# against the gensim documentation for the installed version.
w2v_model.init_sims(replace=True)

INFO - 03:30:55: precomputing L2-norms of word weight vectors


In [29]:
# Sanity check: list the tokens whose vectors are most similar to "dumpling",
# as (token, similarity score) pairs.
w2v_model.wv.most_similar(positive=["dumpling"])

[('masala', 0.9988456964492798),
 ('vegetable', 0.9986583590507507),
 ('noodle', 0.9985232353210449),
 ('casserole', 0.9984180927276611),
 ('cauliflower', 0.9978220462799072),
 ('bread', 0.9977096319198608),
 ('meatball', 0.9975254535675049),
 ('pea', 0.9973939061164856),
 ('eel', 0.997329831123352),
 ('stuff', 0.997185468673706)]