In [None]:
import gzip
import gensim
import logging
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.manifold import TSNE

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [17]:
input_file = 'reviews_data.txt.gz'
with gzip.open(input_file, 'rb') as f:
    for i, line in enumerate(f):
        print(line)
        print()
        print(gensim.utils.simple_preprocess(line))
        break

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [23]:
def read_input(input_file):
    logging.info("reading file {0}...this may take a while".format(input_file))
    with gzip.open(input_file, 'rb') as f:
        for i, line in enumerate(f):
            if (i % 10000 == 0):
                logging.info("read {0} reviews".format(i))
            yield gensim.utils.simple_preprocess(line)

In [24]:
## word2vec model expects a list of tokenized sentences 
## sentences with 2 and 3 words: [['word1', 'word2'], ['word3', 'word4', 'word5']]

# read the tokenized reviews into a list each review item becomes a serries of words
# so this becomes a list of lists
documents = list(read_input(input_file))
logging.info("Done reading data file")

2018-09-04 14:36:54,096 : INFO : reading file reviews_data.txt.gz...this may take a while
2018-09-04 14:36:54,099 : INFO : read 0 reviews
2018-09-04 14:36:57,748 : INFO : read 10000 reviews
2018-09-04 14:37:01,034 : INFO : read 20000 reviews
2018-09-04 14:37:05,054 : INFO : read 30000 reviews
2018-09-04 14:37:08,800 : INFO : read 40000 reviews
2018-09-04 14:37:13,245 : INFO : read 50000 reviews
2018-09-04 14:37:17,080 : INFO : read 60000 reviews
2018-09-04 14:37:20,291 : INFO : read 70000 reviews
2018-09-04 14:37:23,638 : INFO : read 80000 reviews
2018-09-04 14:37:26,822 : INFO : read 90000 reviews
2018-09-04 14:37:30,089 : INFO : read 100000 reviews
2018-09-04 14:37:33,180 : INFO : read 110000 reviews
2018-09-04 14:37:36,320 : INFO : read 120000 reviews
2018-09-04 14:37:39,311 : INFO : read 130000 reviews
2018-09-04 14:37:42,614 : INFO : read 140000 reviews
2018-09-04 14:37:45,616 : INFO : read 150000 reviews
2018-09-04 14:37:48,734 : INFO : read 160000 reviews
2018-09-04 14:37:52,568

In [26]:
## build vocabulary and train model
model = gensim.models.Word2Vec(documents, size=128, window=10, min_count=2, workers=10)
model.train(documents, total_examples=len(ducoments), epochs=10)
# note: sg=0,1 (skip gram or cbow by default)
# note: hs=0,1 (hierarchical softmax or negative sampling by default)

2018-09-04 14:45:49,963 : INFO : collecting all words and their counts
2018-09-04 14:45:49,965 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-09-04 14:45:50,322 : INFO : PROGRESS: at sentence #10000, processed 1655714 words, keeping 25777 word types
2018-09-04 14:45:50,703 : INFO : PROGRESS: at sentence #20000, processed 3317863 words, keeping 35016 word types
2018-09-04 14:45:51,154 : INFO : PROGRESS: at sentence #30000, processed 5264072 words, keeping 47518 word types
2018-09-04 14:45:51,566 : INFO : PROGRESS: at sentence #40000, processed 7081746 words, keeping 56675 word types
2018-09-04 14:45:52,023 : INFO : PROGRESS: at sentence #50000, processed 9089491 words, keeping 63744 word types
2018-09-04 14:45:52,508 : INFO : PROGRESS: at sentence #60000, processed 11013723 words, keeping 76781 word types
2018-09-04 14:45:52,886 : INFO : PROGRESS: at sentence #70000, processed 12637525 words, keeping 83194 word types
2018-09-04 14:45:53,231 : INFO : PROG

2018-09-04 14:46:41,218 : INFO : EPOCH 1 - PROGRESS: at 66.22% examples, 525669 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:46:42,227 : INFO : EPOCH 1 - PROGRESS: at 68.06% examples, 525702 words/s, in_qsize 16, out_qsize 3
2018-09-04 14:46:43,293 : INFO : EPOCH 1 - PROGRESS: at 69.79% examples, 525009 words/s, in_qsize 17, out_qsize 2
2018-09-04 14:46:44,300 : INFO : EPOCH 1 - PROGRESS: at 71.81% examples, 527557 words/s, in_qsize 20, out_qsize 4
2018-09-04 14:46:45,327 : INFO : EPOCH 1 - PROGRESS: at 73.77% examples, 527349 words/s, in_qsize 20, out_qsize 3
2018-09-04 14:46:46,390 : INFO : EPOCH 1 - PROGRESS: at 75.58% examples, 527571 words/s, in_qsize 19, out_qsize 1
2018-09-04 14:46:47,397 : INFO : EPOCH 1 - PROGRESS: at 77.19% examples, 527278 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:46:48,442 : INFO : EPOCH 1 - PROGRESS: at 78.81% examples, 526380 words/s, in_qsize 19, out_qsize 4
2018-09-04 14:46:49,442 : INFO : EPOCH 1 - PROGRESS: at 80.62% examples, 526994 words/s,

2018-09-04 14:47:46,144 : INFO : EPOCH 2 - PROGRESS: at 77.41% examples, 503662 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:47:47,146 : INFO : EPOCH 2 - PROGRESS: at 79.20% examples, 504679 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:47:48,160 : INFO : EPOCH 2 - PROGRESS: at 81.11% examples, 506431 words/s, in_qsize 18, out_qsize 1
2018-09-04 14:47:49,188 : INFO : EPOCH 2 - PROGRESS: at 82.98% examples, 506833 words/s, in_qsize 17, out_qsize 2
2018-09-04 14:47:50,204 : INFO : EPOCH 2 - PROGRESS: at 84.90% examples, 508706 words/s, in_qsize 17, out_qsize 2
2018-09-04 14:47:51,231 : INFO : EPOCH 2 - PROGRESS: at 86.85% examples, 509578 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:47:52,261 : INFO : EPOCH 2 - PROGRESS: at 88.61% examples, 508677 words/s, in_qsize 20, out_qsize 0
2018-09-04 14:47:53,275 : INFO : EPOCH 2 - PROGRESS: at 90.40% examples, 508642 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:47:54,284 : INFO : EPOCH 2 - PROGRESS: at 92.31% examples, 509159 words/s,

2018-09-04 14:48:50,783 : INFO : EPOCH 3 - PROGRESS: at 85.82% examples, 502779 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:48:51,800 : INFO : EPOCH 3 - PROGRESS: at 87.91% examples, 504019 words/s, in_qsize 18, out_qsize 5
2018-09-04 14:48:52,801 : INFO : EPOCH 3 - PROGRESS: at 89.68% examples, 504031 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:48:53,829 : INFO : EPOCH 3 - PROGRESS: at 91.45% examples, 503804 words/s, in_qsize 20, out_qsize 5
2018-09-04 14:48:54,862 : INFO : EPOCH 3 - PROGRESS: at 93.39% examples, 504804 words/s, in_qsize 20, out_qsize 1
2018-09-04 14:48:55,865 : INFO : EPOCH 3 - PROGRESS: at 95.46% examples, 506311 words/s, in_qsize 20, out_qsize 0
2018-09-04 14:48:56,869 : INFO : EPOCH 3 - PROGRESS: at 97.48% examples, 507606 words/s, in_qsize 20, out_qsize 1
2018-09-04 14:48:57,875 : INFO : EPOCH 3 - PROGRESS: at 99.17% examples, 507389 words/s, in_qsize 20, out_qsize 0
2018-09-04 14:48:58,176 : INFO : worker thread finished; awaiting finish of 9 more threa

2018-09-04 14:49:55,448 : INFO : EPOCH 4 - PROGRESS: at 97.43% examples, 518559 words/s, in_qsize 16, out_qsize 3
2018-09-04 14:49:56,449 : INFO : EPOCH 4 - PROGRESS: at 99.52% examples, 519945 words/s, in_qsize 19, out_qsize 0
2018-09-04 14:49:56,551 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-09-04 14:49:56,566 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-09-04 14:49:56,567 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-09-04 14:49:56,568 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-09-04 14:49:56,570 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-09-04 14:49:56,571 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-09-04 14:49:56,583 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-09-04 14:49:56,593 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-09-04 14:49:56,607 : INFO : worker thre

2018-09-04 14:50:53,594 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-09-04 14:50:53,600 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-09-04 14:50:53,601 : INFO : EPOCH - 5 : training on 41519355 raw words (30347573 effective words) took 57.0s, 532645 effective words/s
2018-09-04 14:50:53,602 : INFO : training on a 207596775 raw words (151746172 effective words) took 291.0s, 521423 effective words/s


NameError: name 'ducoments' is not defined

In [41]:
w1 = 'king'
model.wv.most_similar(positive=w1)

[('queen', 0.9223220348358154),
 ('kingsize', 0.7811055183410645),
 ('double', 0.7755463123321533),
 ('twin', 0.7431386709213257),
 ('kingsized', 0.6692562103271484),
 ('queensize', 0.6658189296722412),
 ('dbl', 0.6539133787155151),
 ('rollaway', 0.6269456148147583),
 ('murphy', 0.618873119354248),
 ('bunk', 0.6067122220993042)]

In [51]:
model.wv.most_similar(positive=w1) == model.wv.similar_by_word(word=w1)

True

In [60]:
queen_vector = model.wv['king'] - model.wv['man'] + model.wv['woman']

In [61]:
model.wv.similar_by_vector(vector=queen_vector)

[('king', 0.886066734790802),
 ('queen', 0.8435098528862),
 ('twin', 0.685721218585968),
 ('double', 0.6638386249542236),
 ('kingsize', 0.6603277921676636),
 ('rollaway', 0.5847185850143433),
 ('dbl', 0.5829118490219116),
 ('doubles', 0.5670033693313599),
 ('kingsized', 0.5556350946426392),
 ('queensize', 0.5528479814529419)]

In [45]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])[0]

('queen', 0.8155180811882019)

In [64]:
w1 = ['polite']
model.wv.most_similar(positive=w1, topn=6)

[('courteous', 0.9206915497779846),
 ('curteous', 0.8683255910873413),
 ('cordial', 0.8621584177017212),
 ('curtious', 0.8323178291320801),
 ('friendly', 0.8258741497993469),
 ('freindly', 0.8191377520561218)]

In [65]:
model.wv.most_similar(positive='france', topn=5)

[('germany', 0.75689697265625),
 ('canada', 0.7543196678161621),
 ('manchester', 0.7101644277572632),
 ('hawaii', 0.6989458799362183),
 ('england', 0.6869292855262756)]

In [69]:
model.wv.similarity(w1='sunset', w2='cliff')

0.4099631743394041

In [73]:
vocab = list(model.wv.vocab)

In [76]:
embedding_matrix = model.wv[vocab] # shape = [vocab_size, embedding_size]

In [None]:
tsne = TSNE(n_components=2, init='pca', n_iter=1000).fit_transform(embedding_matrix)

In [None]:
df = pd.DataFrame(tsne, index=vocab, columns=[
    'comp'+str(i) for i in range(0, embedding_matrix.shape[1])])

In [None]:
def find_clustered_embeddings(embeddings,distance_threshold,sample_threshold):
    ''' 
    Find only the closely clustered embeddings. 
    This gets rid of more sparsly distributed word embeddings and make the visualization clearer
    This is useful for t-SNE visualization
    
    distance_threshold: maximum distance between two points to qualify as neighbors
    sample_threshold: number of neighbors required to be considered a cluster
    '''
    
    # calculate cosine similarity
    cosine_sim = np.dot(embeddings,np.transpose(embeddings))
    norm = np.dot(np.sum(embeddings**2,axis=1).reshape(-1,1),np.sum(np.transpose(embeddings)**2,axis=0).reshape(1,-1))
    assert cosine_sim.shape == norm.shape
    cosine_sim /= norm
    
    # make all the diagonal entries zero otherwise this will be picked as highest
    np.fill_diagonal(cosine_sim, -1.0)
    
    argmax_cos_sim = np.argmax(cosine_sim, axis=1)
    mod_cos_sim = cosine_sim
    # find the maximums in a loop to count if there are more than n items above threshold
    for _ in range(sample_threshold-1):
        argmax_cos_sim = np.argmax(cosine_sim, axis=1)
        mod_cos_sim[np.arange(mod_cos_sim.shape[0]),argmax_cos_sim] = -1
    
    max_cosine_sim = np.max(mod_cos_sim,axis=1)

    return np.where(max_cosine_sim>distance_threshold)[0]