In [3]:
import gzip
import logging

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

# Show INFO-and-above log records in the notebook output, prefixed with
# timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [4]:
# Peek at the first raw record of the gzip-compressed corpus and at the token
# list that gensim's simple_preprocess produces for it.
input_file = 'reviews_data.txt.gz'
with gzip.open(input_file, 'rb') as f:
    first_line = next(f, None)  # None when the file has no lines at all

if first_line is not None:
    print(first_line)
    print()
    print(gensim.utils.simple_preprocess(first_line))

b"Oct 12 2009 \tNice trendy hotel location not too bad.\tI stayed in this hotel for one night. As this is a fairly new place some of the taxi drivers did not know where it was and/or did not want to drive there. Once I have eventually arrived at the hotel, I was very pleasantly surprised with the decor of the lobby/ground floor area. It was very stylish and modern. I found the reception's staff geeting me with 'Aloha' a bit out of place, but I guess they are briefed to say that to keep up the coroporate image.As I have a Starwood Preferred Guest member, I was given a small gift upon-check in. It was only a couple of fridge magnets in a gift box, but nevertheless a nice gesture.My room was nice and roomy, there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by 'bliss'.The location is not great. It is at the last metro stop and you then need to take a taxi, but if you are not planning on going to see the historic sites in Be

In [5]:
def read_input(input_file):
    """Lazily tokenize a gzip-compressed text file, one line at a time.

    input_file: path of the gzip file; it is opened in binary mode, so each
        line is handed to gensim's simple_preprocess as bytes.

    Yields the result of gensim.utils.simple_preprocess for every line.
    A progress record is logged before every 10000th line (starting at 0).
    """
    logging.info("reading file %s...this may take a while", input_file)
    with gzip.open(input_file, 'rb') as handle:
        for review_idx, raw_review in enumerate(handle):
            if not review_idx % 10000:
                logging.info("read %d reviews", review_idx)
            yield gensim.utils.simple_preprocess(raw_review)

In [6]:
## Word2Vec is fed a list of tokenized sentences, i.e. a list of token lists,
## e.g. for sentences of 2 and 3 words: [['word1', 'word2'], ['word3', 'word4', 'word5']]

# Drain the read_input generator so that every review becomes a list of
# words; `documents` is therefore a list of lists held in memory.
documents = [tokens for tokens in read_input(input_file)]
logging.info("Done reading data file")

2018-09-04 20:06:33,353 : INFO : reading file reviews_data.txt.gz...this may take a while
2018-09-04 20:06:33,355 : INFO : read 0 reviews
2018-09-04 20:06:36,211 : INFO : read 10000 reviews
2018-09-04 20:06:39,064 : INFO : read 20000 reviews
2018-09-04 20:06:42,381 : INFO : read 30000 reviews
2018-09-04 20:06:45,601 : INFO : read 40000 reviews
2018-09-04 20:06:48,974 : INFO : read 50000 reviews
2018-09-04 20:06:52,222 : INFO : read 60000 reviews
2018-09-04 20:06:54,982 : INFO : read 70000 reviews
2018-09-04 20:06:57,507 : INFO : read 80000 reviews
2018-09-04 20:07:00,248 : INFO : read 90000 reviews
2018-09-04 20:07:02,867 : INFO : read 100000 reviews
2018-09-04 20:07:05,456 : INFO : read 110000 reviews
2018-09-04 20:07:08,044 : INFO : read 120000 reviews
2018-09-04 20:07:11,068 : INFO : read 130000 reviews
2018-09-04 20:07:13,899 : INFO : read 140000 reviews
2018-09-04 20:07:16,538 : INFO : read 150000 reviews
2018-09-04 20:07:19,300 : INFO : read 160000 reviews
2018-09-04 20:07:21,938

In [None]:
%%time
## build vocabulary and train model
model = gensim.models.Word2Vec(documents, size=128, window=10, min_count=2, workers=10, sg=0, hs=0)
model.train(documents, total_examples=len(documents), epochs=10)
# note: sg=0,1 (skip gram or cbow by default)
# note: hs=0,1 (hierarchical softmax or negative sampling by default)

In [9]:
# Ten nearest neighbours of the query word in the embedding space.
# `w1` stays a plain string because the comparison cell below reuses it.
w1 = 'king'
model.wv.most_similar(positive=[w1], topn=10)

2018-09-04 20:19:04,576 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('queen', 0.9263930916786194),
 ('kingsize', 0.7762641906738281),
 ('double', 0.7677521705627441),
 ('twin', 0.7344762086868286),
 ('dbl', 0.7093721032142639),
 ('kingsized', 0.6804887056350708),
 ('queensize', 0.645519495010376),
 ('murphy', 0.6436346769332886),
 ('superking', 0.6359125375747681),
 ('rollaway', 0.6305528879165649)]

In [10]:
# Sanity check: both lookup APIs should give the same neighbour list for `w1`.
model.wv.similar_by_word(word=w1) == model.wv.most_similar(positive=w1)

  if np.issubdtype(vec.dtype, np.int):


True

In [11]:
# Word-analogy arithmetic on the raw vectors: king - man + woman.
# The result is looked up with similar_by_vector in the next cell.
queen_vector = model.wv['king'] - model.wv['man'] + model.wv['woman']

In [12]:
# Ten vocabulary words closest to the synthetic king - man + woman vector.
model.wv.similar_by_vector(queen_vector, topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('king', 0.8901026844978333),
 ('queen', 0.8384253978729248),
 ('twin', 0.6772350072860718),
 ('kingsize', 0.6649421453475952),
 ('double', 0.6635952591896057),
 ('dbl', 0.6494052410125732),
 ('rollaway', 0.588722288608551),
 ('kingsized', 0.5806706547737122),
 ('superking', 0.5766728520393372),
 ('murphy', 0.5696359872817993)]

In [13]:
# Same king - man + woman analogy, expressed through most_similar's
# positive/negative word lists; [0] keeps only the top-ranked (word, score) pair.
# NOTE(review): unlike the similar_by_vector result above, 'king' is not ranked
# first here - presumably most_similar leaves the query words out; confirm in the gensim docs.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'])[0]

  if np.issubdtype(vec.dtype, np.int):


('queen', 0.8070516586303711)

In [14]:
# Six nearest neighbours of "polite".
# The query is passed inline instead of rebinding `w1` to a list: `w1` is a
# plain string in the earlier cells (one of which passes it to
# similar_by_word), so changing its type here would break those cells when
# they are re-run out of order.
model.wv.most_similar(positive=['polite'], topn=6)

  if np.issubdtype(vec.dtype, np.int):


[('courteous', 0.9182888269424438),
 ('cordial', 0.8687198162078857),
 ('curteous', 0.849873960018158),
 ('friendly', 0.8324247598648071),
 ('freindly', 0.8151437640190125),
 ('courtious', 0.8142062425613403)]

In [15]:
# Five nearest neighbours of "france".
model.wv.most_similar(positive=['france'], topn=5)

  if np.issubdtype(vec.dtype, np.int):


[('germany', 0.7070672512054443),
 ('canada', 0.7017298936843872),
 ('florida', 0.6712579727172852),
 ('austria', 0.6668193340301514),
 ('england', 0.666599690914154)]

In [16]:
# Similarity score between the vectors of two individual words.
model.wv.similarity('sunset', 'cliff')

  if np.issubdtype(vec.dtype, np.int):


0.41794556

In [17]:
# Snapshot the vocabulary words as a list so they have a fixed order
# for the embedding-matrix lookup that follows.
vocab = [word for word in model.wv.vocab]

In [18]:
# Fetch the vectors of all vocabulary words in a single lookup.
# NOTE(review): rows are expected to follow the order of `vocab` (the DataFrame
# cell below relies on that when it uses `vocab` as the index) - confirm.
embedding_matrix = model.wv[vocab] # shape = [vocab_size, embedding_size]

In [19]:
%%time
tsne = TSNE(n_components=2, init='pca', n_iter=1000, verbose=1).fit_transform(embedding_matrix)

KeyboardInterrupt: 

In [20]:
# One column per t-SNE output dimension. The column count must come from the
# projected array (`tsne`), not from `embedding_matrix`: the latter has one
# column per embedding dimension, which does not match the 2-D projection and
# makes the DataFrame constructor fail with a shape mismatch.
df = pd.DataFrame(tsne, index=vocab, columns=[
    'comp'+str(i) for i in range(tsne.shape[1])])

NameError: name 'tsne' is not defined

In [None]:
def find_clustered_embeddings(embeddings,distance_threshold,sample_threshold):
    '''
    Find only the closely clustered embeddings.
    This gets rid of more sparsely distributed word embeddings and makes the
    visualization clearer. This is useful for t-SNE visualization.

    embeddings: 2-D array-like with one embedding per row. The input is not
        modified.
    distance_threshold: despite the name, this is a minimum cosine
        *similarity*; another row counts as a neighbor when its cosine
        similarity to the row is strictly greater than this value.
    sample_threshold: number of neighbors a row needs to be kept (values
        below 1 are treated as 1).

    Returns a 1-D array with the indices of the rows that have at least
    `sample_threshold` neighbors.

    Note: builds a dense (n_rows x n_rows) similarity matrix, so memory use
    grows quadratically with the number of embeddings.
    '''
    embeddings = np.asarray(embeddings)

    # Cosine similarity = dot product of the L2-normalized rows.
    # (Dividing the raw dot products by the product of *squared* norms, as was
    # done before, does not give cosine similarities.)
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    # An all-zero row has no direction; give it norm 1 so its similarity to
    # every row is 0 instead of NaN.
    norms[norms == 0] = 1.0
    unit_rows = embeddings / norms
    cosine_sim = np.dot(unit_rows, np.transpose(unit_rows))

    # A row is always maximally similar to itself; set the diagonal to the
    # minimum so a row never counts as its own neighbor.
    np.fill_diagonal(cosine_sim, -1.0)

    # "The k-th largest similarity exceeds the threshold" is the same as
    # "at least k similarities exceed the threshold", so count them directly.
    neighbor_counts = np.sum(cosine_sim > distance_threshold, axis=1)
    required_neighbors = max(sample_threshold, 1)

    return np.where(neighbor_counts >= required_neighbors)[0]