# Word2Vec example with gensim, NLTK

In [1]:
import MySQLdb as mysql
import pandas as pd
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords # Import the stop word list
import re # regular expression library

import numpy as np

In [2]:
# download text datasets including stop words
#nltk.download() 

In [3]:
# hook up to mysql
# to fix ascii problem when tokenising, important to specify character set
# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
ip = "localhost"
username = 'jfarrugia'
password = 'jfarrugia'

db = mysql.connect(ip, username, password, "yelp_db", charset='utf8',
use_unicode=True)
# load some data from a previously created table
pd_review = pd.read_sql("select id, name, text, stars from toronto_50K_random_reviews", con=db)

# close connection
db.close()

In [4]:
# confirm review shape
pd_review.shape

(50000, 4)

In [5]:
# show 1 review
pd_review["text"][0]

u"Not too bad overall. Got the 4 course menu which included a whole fish (grilled or seared) an appetizer and desert. The fish was wonderful. The pita appetizer was ok but one of the dips was not very good. I can't comment on dessert as I didn't try it. The others at my table seemed to think it was ok. The service was pretty good. Nothing special. We did think it was odd that the waiter spoke to people at the front door while he was taking our order. Not too bad overall. Would go back but won't be sprinting."

In [6]:
# details from https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words
# lower case all text
lc_review = pd_review["text"][0].lower()


In [7]:
# split one review into separate words
words = lc_review.split(" ")
# remove stop words from review text
words_no_stop = [w for w in words if w not in stopwords.words("english")]

In [8]:
from nltk.stem.porter import PorterStemmer
# removel morphological affices from words, leaving word stem
stemmer = PorterStemmer()
words_no_stop_stem = [stemmer.stem(w) for w in words_no_stop]
print words_no_stop_stem

[u'bad', u'overall.', u'got', u'4', u'cours', u'menu', u'includ', u'whole', u'fish', u'(grill', u'seared)', u'appet', u'desert.', u'fish', u'wonderful.', u'pita', u'appet', u'ok', u'one', u'dip', u'good.', u"can't", u'comment', u'dessert', u'tri', u'it.', u'other', u'tabl', u'seem', u'think', u'ok.', u'servic', u'pretti', u'good.', u'noth', u'special.', u'think', u'odd', u'waiter', u'spoke', u'peopl', u'front', u'door', u'take', u'order.', u'bad', u'overall.', u'would', u'go', u'back', u'sprinting.']


In [9]:
def process_review(base_review, remove_stop=False, stem = False, join=False):
    words = re.sub("[^a-zA-Z0-9]", " ", base_review) 
    # convert to lower case + split    
    words = words.lower().split(" ")    
    # searching a set is faster than a list    
    # might contemplate tweaking stop word list
    #stop = {x for x in set(stopwords.words("english")) if x not in ['not', 'no']
    if remove_stop:
        stop = set(stopwords.words("english"))
        words = [word for word in words if word not in stop]
    # run porter stemmer
    if stem:
        words = [stemmer.stem(w) for w in words]
    # return string
    if join:
        return " ".join(words)
    else:
        return words

# Word2Vec stuff

In [10]:
# word2vec requires review paragraphs split into individual sentences
# the datastructure to hold this data is a list of lists - 
# inner list holds sentences

In [11]:
# NLTK's punkt includes a pre-trained tokenizer for english which can
# be used to transform (split) new paragraph observations into sentences
punkt = nltk.data.load('tokenizers/punkt/english.pickle')

In [12]:
# split review corpus into sentences
# cannot use clean_reviews since punctuation was removed

#process_review(pd_review["text"][0], False, False, False)
def split_to_sentence(base_reviews, tokeniser, remove_stop=False):
    raw_sentences = tokeniser.tokenize(base_reviews.strip())
    sentences = []
    for rs in raw_sentences:
        # consider only strings with length >= 1
        if (len(rs) > 0):
            sentences.append( process_review(rs, remove_stop=remove_stop) )
    return sentences

sentences = pd_review["text"].apply(lambda x: split_to_sentence(x, punkt)).tolist()

In [13]:
# we need to flatten sentences list since we have a triple level list
# that we need to convert to a list of lists (2 levels)
sentence_list = [item for sublist in sentences for item in sublist]

# format will be ok with word2vector
print sentence_list[100]

[u'my', u'pasta', u'primavera', u'was', u'nice', u'', u'and', u'the', u'soup', u'special', u'we', u'had', u'was', u'delicious', u'']


In [14]:
# we have aroiund 444000 sentences minded from 50K reviews of
# Toronto restaurants
print len(sentence_list)

444454


In [15]:
# Import the built-in logging module and configure it to have clean messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)


# Set values for various parameters
num_features = 200    # Word vector dimensionality                      
min_word_count = 30   # Minimum word count                        
num_workers = 2       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# train word2vec model based on my 50K review sample

print "Training model..."
model = word2vec.Word2Vec(sentence_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# in case we need to port model without re-training
model_name = "200features_30minwords_10context"
model.save(model_name)


2018-01-27 16:53:10,030 : INFO : collecting all words and their counts
2018-01-27 16:53:10,033 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-27 16:53:10,086 : INFO : PROGRESS: at sentence #10000, processed 166483 words, keeping 9036 word types
2018-01-27 16:53:10,140 : INFO : PROGRESS: at sentence #20000, processed 332568 words, keeping 12587 word types


Training model...


2018-01-27 16:53:10,203 : INFO : PROGRESS: at sentence #30000, processed 497948 words, keeping 15344 word types
2018-01-27 16:53:10,263 : INFO : PROGRESS: at sentence #40000, processed 664084 words, keeping 17603 word types
2018-01-27 16:53:10,314 : INFO : PROGRESS: at sentence #50000, processed 829665 words, keeping 19661 word types
2018-01-27 16:53:10,356 : INFO : PROGRESS: at sentence #60000, processed 997906 words, keeping 21318 word types
2018-01-27 16:53:10,405 : INFO : PROGRESS: at sentence #70000, processed 1168609 words, keeping 23082 word types
2018-01-27 16:53:10,455 : INFO : PROGRESS: at sentence #80000, processed 1333955 words, keeping 24453 word types
2018-01-27 16:53:10,499 : INFO : PROGRESS: at sentence #90000, processed 1497282 words, keeping 25699 word types
2018-01-27 16:53:10,548 : INFO : PROGRESS: at sentence #100000, processed 1663384 words, keeping 26999 word types
2018-01-27 16:53:10,604 : INFO : PROGRESS: at sentence #110000, processed 1827603 words, keeping 28

2018-01-27 16:53:37,858 : INFO : PROGRESS: at 53.74% examples, 506178 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:38,861 : INFO : PROGRESS: at 56.20% examples, 509205 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:39,871 : INFO : PROGRESS: at 58.73% examples, 512560 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:40,876 : INFO : PROGRESS: at 60.78% examples, 511652 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:41,880 : INFO : PROGRESS: at 62.99% examples, 512149 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:42,884 : INFO : PROGRESS: at 65.17% examples, 512188 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:43,897 : INFO : PROGRESS: at 67.58% examples, 513916 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:44,900 : INFO : PROGRESS: at 69.80% examples, 514312 words/s, in_qsize 3, out_qsize 0
2018-01-27 16:53:45,903 : INFO : PROGRESS: at 71.94% examples, 514079 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:53:46,915 : INFO : PROGRESS: at 73.65% examples, 510741 wor

In [16]:
# should we need to load the model
model = word2vec.Word2Vec.load("200features_30minwords_10context")

2018-01-27 16:53:58,378 : INFO : loading Word2Vec object from 200features_30minwords_10context
2018-01-27 16:53:58,423 : INFO : loading wv recursively from 200features_30minwords_10context.wv.* with mmap=None
2018-01-27 16:53:58,424 : INFO : setting ignored attribute syn0norm to None
2018-01-27 16:53:58,426 : INFO : setting ignored attribute cum_table to None
2018-01-27 16:53:58,427 : INFO : loaded 200features_30minwords_10context


In [17]:
# get terms most similar to cantonese
model.wv.most_similar("cantonese")

2018-01-27 16:53:58,475 : INFO : precomputing L2-norms of word weight vectors


[(u'chinese', 0.7550970315933228),
 (u'malaysian', 0.7533572912216187),
 (u'szechuan', 0.7189332842826843),
 (u'northern', 0.7153961658477783),
 (u'hong', 0.697137713432312),
 (u'mein', 0.6929337978363037),
 (u'korean', 0.6914548277854919),
 (u'chow', 0.6905477046966553),
 (u'hakka', 0.6875939965248108),
 (u'kong', 0.6864943504333496)]

In [18]:
# model.wv.syn0 consists of a feature vector for each work
type(model.wv.syn0)
# with a min word count of 30, a vocab of 6,793 words as created
len(model.wv.vocab)
# shape of wv.syn0 should be 6793, 200
model.wv.syn0.shape

(6793, 200)

In [19]:
# simple word algebra example:
model.wv.most_similar(positive=['pasta','chinese'], negative=['italian'])


[(u'seafood', 0.6129480600357056),
 (u'udon', 0.5720325708389282),
 (u'spaghetti', 0.5618943572044373),
 (u'noodles', 0.5600389242172241),
 (u'bibimbap', 0.5471193790435791),
 (u'noodle', 0.5407552123069763),
 (u'rice', 0.5379495620727539),
 (u'vermicelli', 0.5336020588874817),
 (u'broccoli', 0.5316550731658936),
 (u'mein', 0.5210241079330444)]

In [20]:
# create a feature vector composed of the average of word vectors in
# a review's paragraph
def convert_review_feature_vector(word_list, model, feature_count):
    # initialise array of length feature_count (200 )
    feature_vector = np.zeros((feature_count,), dtype='float32')
    # stores count of words that are features in learned vocab
    word_count = 0.
    # convert learned vocab to set for faster processing
    vocab_set = set(model.wv.index2word)
    # iterate over words in word_list, adding feature vectors together
    for word in word_list:
        if word in vocab_set:
            word_count += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    
    # finally divide feature_vector by number of words ot get arithmetic vector mean
    feature_vector = np.divide(feature_vector, word_count)
    return feature_vector


In [21]:
clean_reviews2 = pd_review["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors =\
np.array(clean_reviews2.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [22]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time

search_string = "cantonese"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

Time taken for search:  2.83883404732 seconds.


In [23]:
print np.argsort(distances)
# print top 20 cosine similarity
results = [(pd_review["name"][x], pd_review["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    

[ 3055  5150 21927 ... 49828 49459 14855]
(u'Chopstick House', u'nPatYo3wQ7tcvx7nzOU4GQ', 0.344)
(u'Ajisen Ramen', u'6SAfQKe2oM5g_EtcYXyAMg', 0.355)
(u'Lotus Garden Hakka Indian Style Chinese', u'TBzgzTFSa7pJXiLD7emYaQ', 0.399)
(u'Kaiju', u'6EVBc9kdc3Hd8KZkLVPnGA', 0.417)
(u'Chinese Dumpling House', u'ag8gM2YKZkjndCvl2ti7kQ', 0.419)
(u"Lee's Thai Spring Roll", u'uaCYXxCsZSD3KMg8XiOdwg', 0.424)
(u'Sushi Garden', u'pPhuVbly0ZjyrhIhFazANA', 0.427)
(u'Jim Chai Kee Wonton Noodle', u'X9ftU-exKhTMOjtr3B52rw', 0.428)
(u'Sansotei', u'-BbnAc9YEO6pjvJGEtFbVQ', 0.43)
(u'Rol San', u'O1TvPrgkK2bUo5O5aSZ7lw', 0.433)
(u'Seor Ak San', u'4twpbw7n4DmsLxAm6-sMkg', 0.433)
(u'New Sky Restaurant', u'J_btDyZbIv0hZNjrw56zlA', 0.434)
(u'Seor Ak San', u'4twpbw7n4DmsLxAm6-sMkg', 0.441)
(u'Bi Bim Bap', u'ruR-mrEaNbFJGnM-WCbcgg', 0.446)
(u'Ho Su Bistro', u'QTSCFDPcuROE8UCvGS8Fiw', 0.448)
(u'Huibin', u'HuWUIXfaXt9hcP5MKG-Qyg', 0.448)
(u'Lime Asian Cuisine', u'Lft-0Xy72YbwRkn_n5hfXA', 0.45)
(u'Phoenix Restaurant', u'

In [24]:
# a newer technique which first concatenates all reviews for a particular
# resto together.  
# the review dataframe row count is reduced to the number of restaurants.
# the aggregated review becomes our new document

# first group by resto id and aggregate reviews by first converting to list
# and then joining
concat_query = pd_review.groupby('id')['text'].apply(lambda x: " ".join(list(x)))
# extract unique id restaurant tuples from original dataframe
uniq_restaurants = pd_review.loc[:,["id","name"]].drop_duplicates()
# join aggregated reviews with unique resto data frame
joint_reviews = uniq_restaurants.join(concat_query, on="id").reset_index(drop=True)

clean_reviews3 = joint_reviews["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors2 =\
np.array(clean_reviews3.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [25]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time
search_string = "cantonese"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors2:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

Time taken for search:  0.478939056396 seconds.


In [26]:
# print top 20 cosine similarity
results = [(joint_reviews["name"][x], joint_reviews["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    
# mixed results here.  The more reviews there are for a few place, the more
# penalised the restaurant is.  The mean of the review's representation in vector
# space depends on the total number of words. 
# sometimes shorter reviews (or less reviews) come up trumps
# on the other hand, we avoid duplice results

(u'Huibin', u'HuWUIXfaXt9hcP5MKG-Qyg', 0.448)
(u'Golden House', u'zTJg9_VFyXiQQ0PegucaJg', 0.467)
(u'Chopstick House', u'nPatYo3wQ7tcvx7nzOU4GQ', 0.497)
(u'Pepper Chili', u'Pejzx2YgZvywGXJo-thfnw', 0.5)
(u'River Tai Restaurant', u'Iov02yUKZVj-Z3A3u37ExA', 0.521)
(u'Thai Bright', u'UGG8EIfEfAIbyGhgLIX7Mw', 0.529)
(u'The 5 Spices Restaurant', u'UI4lGUcqc4YyzXJ2Uqe6BQ', 0.54)
(u'Red Mango', u'SXAXzOwp0I2wiA1V3iMtbg', 0.551)
(u'Sometime Cafe', u'baY3pCVhwAKyWFXagiOCNw', 0.555)
(u'China Ocean', u'hzdFL2bdWohzZ2RM4fiYYQ', 0.565)
(u'Sala Modern Thai Kitchen & Bar', u'lIEahf71RLPJ_rFBJ5fqzQ', 0.566)
(u'D Pavilion Restaurant Lounge', u'0K-XQZRh_56WCky5REiHmQ', 0.566)
(u'TAO Northern Chinese Cuisine', u'bwFxrxHrz9I36awGc-yjjw', 0.57)
(u'Chada Thai', u'E7zuWvHH3XoVKJE8yEGIyw', 0.571)
(u'Rice & Noodle', u'Mv3pO01Alty1pXQwi-Uy5A', 0.574)
(u'Szechuan Express', u'i3jZgPgXPtXbZIjv7obagQ', 0.579)
(u'Green Tea Restaurant', u'TBOKIAMxv0OHKJbarNvSeg', 0.582)
(u'California Thai', u'Ri_K4vaiRNQjlyutXgadog',

In [27]:
# pickle original reviews, review_vectors for use in application
import pickle;
pickle_out = open ("pd_review.pkl", "wb")
pickle.dump(pd_review, pickle_out)
pickle_out.close()

pickle_out = open ("review_vector.pkl", "wb")
pickle.dump(review_vectors, pickle_out)
pickle_out.close()

# Attempt to project onto 2D using PCA

In [28]:
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

%matplotlib inline

def plot_closestwords(model, word, feature_count):
    
    arr = np.empty((0,feature_count), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.wv.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model.wv[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model.wv[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    #np.set_printoptions(suppress=True)
    result = tsne.fit_transform(arr)

    x_coords = result[:, 0]
    y_coords = result[:, 1]
    # display scatter plot
    #fig = plt.figure(figsize=(20, 10))
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)