# Word2Vec example with gensim, NLTK

In [1]:
import MySQLdb as mysql
import pandas as pd
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords # Import the stop word list
import re # regular expression library

import numpy as np

In [None]:
# download text datasets including stop words
#nltk.download() 

In [2]:
# hook up to mysql
# to fix ascii problem when tokenising, important to specify character set
# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
db = mysql.connect("localhost", "jfarrugia", "jfarrugia", "yelp_db", charset='utf8',
use_unicode=True)
# load some data from a previously created table
pd_review = pd.read_sql("select id, name, text, stars from toronto_50K_random_reviews", con=db)

# close connection
db.close()

In [3]:
# confirm review shape
pd_review.shape

(50000, 4)

In [None]:
# show 1 review
pd_review["text"][0]

In [None]:
# details from https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words
# lower case all text
lc_review = pd_review["text"][0].lower()


In [None]:
# split one review into separate words
words = lc_review.split(" ")
# remove stop words from review text
words_no_stop = [w for w in words if w not in stopwords.words("english")]

In [None]:
from nltk.stem.porter import PorterStemmer
# removel morphological affices from words, leaving word stem
stemmer = PorterStemmer()
words_no_stop_stem = [stemmer.stem(w) for w in words_no_stop]
print words_no_stop_stem

In [12]:
def process_review(base_review, remove_stop=False, stem = False, join=False):
    words = re.sub("[^a-zA-Z0-9]", " ", base_review) 
    # convert to lower case + split    
    words = words.lower().split(" ")    
    # searching a set is faster than a list    
    # might contemplate tweaking stop word list
    #stop = {x for x in set(stopwords.words("english")) if x not in ['not', 'no']
    if remove_stop:
        stop = set(stopwords.words("english"))
        words = [word for word in words if word not in stop]
    # run porter stemmer
    if stem:
        words = [stemmer.stem(w) for w in words]
    # return string
    if join:
        return " ".join(words)
    else:
        return words

In [None]:
# test one review
print process_review(pd_review["text"][0])
clean_reviews = pd_review["text"].apply(process_review)


In [None]:
# now create bag-of-words with vectoriser
from sklearn.feature_extraction.text import CountVectorizer

# limit vocab to 5000 words for now
cv = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None,
                     stop_words = None, max_features=5000)

review_features = cv.fit_transform(clean_reviews)
# convert from sparse matrix to numpy array
review_features = review_features.toarray()

In [None]:
# check size of bag of words model
print review_features.shape
# have a look at the vocab
#print cv.get_feature_names()

def get_top_n_features(bow, cv, n):
    weights = bow.mean(axis=0).ravel().tolist()
    weights_df = pd.DataFrame({'term': cv.get_feature_names(), 'weight':weights})
    print weights_df.sort_values(by='weight', ascending=False).head(n)    

# print 50 top terms
get_top_n_features(review_features, cv, 10)

# Word2Vec stuff

In [None]:
# word2vec requires review paragraphs split into individual sentences
# the datastructure to hold this data is a list of lists - 
# inner list holds sentences

In [5]:
# NLTK's punkt includes a pre-trained tokenizer for english which can
# be used to transform (split) new paragraph observations into sentences
punkt = nltk.data.load('tokenizers/punkt/english.pickle')

In [6]:
# split review corpus into sentences
# cannot use clean_reviews since punctuation was removed

#process_review(pd_review["text"][0], False, False, False)
def split_to_sentence(base_reviews, tokeniser, remove_stop=False):
    raw_sentences = tokeniser.tokenize(base_reviews.strip())
    sentences = []
    for rs in raw_sentences:
        # consider only strings with length >= 1
        if (len(rs) > 0):
            sentences.append( process_review(rs, remove_stop=remove_stop) )
    return sentences

sentences = pd_review["text"].apply(lambda x: split_to_sentence(x, punkt)).tolist()

In [7]:
# we need to flatten sentences list since we have a triple level list
# that we need to convert to a list of lists (2 levels)
sentence_list = [item for sublist in sentences for item in sublist]

# format will be ok with word2vector
print sentence_list[100]

[u'my', u'pasta', u'primavera', u'was', u'nice', u'', u'and', u'the', u'soup', u'special', u'we', u'had', u'was', u'delicious', u'']


In [8]:
# we have aroiund 444000 sentences minded from 50K reviews of
# Toronto restaurants
print len(sentence_list)

444454


In [None]:
# Import the built-in logging module and configure it to have clean messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)


# Set values for various parameters
num_features = 200    # Word vector dimensionality                      
min_word_count = 30   # Minimum word count                        
num_workers = 2       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# train word2vec model based on my 50K review sample

print "Training model..."
model = word2vec.Word2Vec(sentence_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# in case we need to port model without re-training
model_name = "200features_30minwords_10context"
model.save(model_name)


In [4]:
# should we need to load the model
model = word2vec.Word2Vec.load("200features_30minwords_10context")

In [9]:
# terms most similar to sushi
#print model.wv.most_similar("sushi")
# terms most similar to sushi
model.wv.most_similar("cantonese")

[(u'chinese', 0.7390034794807434),
 (u'malaysian', 0.7228488922119141),
 (u'szechuan', 0.720445990562439),
 (u'hakka', 0.716964066028595),
 (u'shanghai', 0.7040140628814697),
 (u'mein', 0.6960002183914185),
 (u'kong', 0.6846320629119873),
 (u'hong', 0.6804869174957275),
 (u'northern', 0.6803617477416992),
 (u'korean', 0.6744514107704163)]

In [11]:
# model.wv.syn0 consists of a feature vector for each work
type(model.wv.syn0)
# with a min word count of 30, a vocab of 6,793 words as created
len(model.wv.vocab)
# shape of wv.syn0 should be 6793, 200
model.wv.syn0.shape

(6793, 200)

In [186]:
# 200 dimension feature vector returned for word 'italian'
model.wv["italian"].shape
model.wv.most_similar(positive=['pasta','chinese'], negative=['italian'])
#model.wv.most_similar('disgusting')
#model.wv.most_similar('fresh')
#model.wv.most_similar(positive=['kobe','beef'])

model.wv.most_similar("bolognese")

[(u'alla', 0.8697019815444946),
 (u'pappardelle', 0.8591920733451843),
 (u'carbonara', 0.8361091613769531),
 (u'gnocchi', 0.8305134773254395),
 (u'fettuccine', 0.8223904967308044),
 (u'rigatoni', 0.820175290107727),
 (u'penne', 0.8139102458953857),
 (u'spaghetti', 0.8118501901626587),
 (u'ravioli', 0.8116596937179565),
 (u'tagliatelle', 0.8072959184646606)]

In [11]:
# create a feature vector composed of the average of word vectors in
# a review's paragraph
def convert_review_feature_vector(word_list, model, feature_count):
    # initialise array of length feature_count (200 )
    feature_vector = np.zeros((feature_count,), dtype='float32')
    # stores count of words that are features in learned vocab
    word_count = 0.
    # convert learned vocab to set for faster processing
    vocab_set = set(model.wv.index2word)
    # iterate over words in word_list, adding feature vectors together
    for word in word_list:
        if word in vocab_set:
            word_count += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    
    # finally divide feature_vector by number of words ot get arithmetic vector mean
    feature_vector = np.divide(feature_vector, word_count)
    return feature_vector


In [13]:
clean_reviews2 = pd_review["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors =\
np.array(clean_reviews2.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [18]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time

search_string = "cantonese"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

Time taken for search:  3.03672599792 seconds.


In [20]:
print np.argsort(distances)
# print top 20 cosine similarity
results = [(pd_review["name"][x], pd_review["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    

[ 3055  5150 33156 ... 14855 22600 24955]
(u'Chopstick House', u'nPatYo3wQ7tcvx7nzOU4GQ', 0.331)
(u'Ajisen Ramen', u'6SAfQKe2oM5g_EtcYXyAMg', 0.366)
(u"Lee's Thai Spring Roll", u'uaCYXxCsZSD3KMg8XiOdwg', 0.393)
(u'Chinese Dumpling House', u'ag8gM2YKZkjndCvl2ti7kQ', 0.41)
(u'Kaiju', u'6EVBc9kdc3Hd8KZkLVPnGA', 0.428)
(u'Lotus Garden Hakka Indian Style Chinese', u'TBzgzTFSa7pJXiLD7emYaQ', 0.429)
(u'Green Papaya', u'kM91Woq__EKVzLjo4dOTaw', 0.433)
(u'Bi Bim Bap', u'ruR-mrEaNbFJGnM-WCbcgg', 0.433)
(u'Pho Vietnam', u'1Epby_tsFDci4sP0Nbjwsw', 0.445)
(u'Seor Ak San', u'4twpbw7n4DmsLxAm6-sMkg', 0.447)
(u'Lime Asian Cuisine', u'Lft-0Xy72YbwRkn_n5hfXA', 0.453)
(u'Jim Chai Kee Wonton Noodle', u'X9ftU-exKhTMOjtr3B52rw', 0.462)
(u'Sansotei', u'-BbnAc9YEO6pjvJGEtFbVQ', 0.463)
(u'Rol San', u'O1TvPrgkK2bUo5O5aSZ7lw', 0.465)
(u'Ho Su Bistro', u'QTSCFDPcuROE8UCvGS8Fiw', 0.465)
(u'Sabai Sabai Kitchen and Bar', u'DPA9MQKkCqT0qnevsG740A', 0.465)
(u'Flip Toss & Thai Kitchen', u'Et9tn7nEpEs043pQVa2HZg', 0.47)

Unnamed: 0,id,name,text,stars
901,nPatYo3wQ7tcvx7nzOU4GQ,Chopstick House,"I've tried two other hakka Chinese places, but...",5
3055,nPatYo3wQ7tcvx7nzOU4GQ,Chopstick House,"The food was awful, I love Hakka food but the ...",1
39338,nPatYo3wQ7tcvx7nzOU4GQ,Chopstick House,I think I had the worst Hakka Food ever. So ba...,1


In [22]:
# a newer technique which first concatenates all reviews for a particular
# resto together.  
# the review dataframe row count is reduced to the number of restaurants.
# the aggregated review becomes our new document

# first group by resto id and aggregate reviews by first converting to list
# and then joining
concat_query = pd_review.groupby('id')['text'].apply(lambda x: " ".join(list(x)))
# extract unique id restaurant tuples from original dataframe
uniq_restaurants = pd_review.loc[:,["id","name"]].drop_duplicates()
# join aggregated reviews with unique resto data frame
joint_reviews = uniq_restaurants.join(concat_query, on="id").reset_index(drop=True)

clean_reviews3 = joint_reviews["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors2 =\
np.array(clean_reviews3.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [23]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time
search_string = "crumpet cream"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors2:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

Time taken for search:  0.368130922318 seconds.


In [24]:
# print top 20 cosine similarity
results = [(joint_reviews["name"][x], joint_reviews["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    
# mixed results here.  The more reviews there are for a few place, the more
# penalised the restaurant is.  The mean of the review's representation in vector
# space depends on the total number of words. 
# sometimes shorter reviews (or less reviews) come up trumps
# on the other hand, we avoid duplice results

(u'Oven Fresh', u'hjZ4cVn3PZk_1hqfoxMXXg', 0.561)
(u"Summer's Ice Cream", u'fPISsMIXOYZP1uins2Bwyw', 0.574)
(u'Real Fruit Bubble Tea', u'ZPI1t-WcZruILq7OQYnZHg', 0.584)
(u"Zelden's Deli and Desserts", u'_eRcc1OFDbi3fnVBskXP9g', 0.599)
(u"Menchie's Frozen Yogurt", u'lnZhnZzBiG5rD9brVv5uXA', 0.622)
(u'Fugo Desserts', u'EK38MXW_OsC5CZVvIZodIw', 0.64)
(u"Annie's Tea House", u'kBYBXh7M9wKXvHgluJWiLQ', 0.642)
(u'Sugar N Spice Cafe', u'2nW_hHaOt0DpsL1lPpuMXA', 0.646)
(u'The Cups', u'z9SjyM0Ixr1ud8zI7Y93-A', 0.647)
(u'Fresh Cup Bubble Tea', u'aeEVWgcUf1-f46Dk-EzgTQ', 0.654)
(u'Sugar Marmalade', u'2PCz_uVX7GOXtGHNXAPXhw', 0.654)
(u'WaffleBar', u'XlUgRG9zJsqBKpPztvlpYQ', 0.654)
(u'The Mad Italian', u'Ik3CEeFyMFDnqrhc30NubQ', 0.655)
(u'FUEL+', u'GJX_96Dldb0JZLUlHBgAcA', 0.659)
(u'Petit Nuage', u'XC23vvqHdCJqev-bmeh2HQ', 0.664)
(u"Shopsy's Sports Grill", u'C3Q_yp1ylwXOvj8M-1-OLw', 0.664)
(u'Oscar Coffee & Espresso Bar', u'1nEYOQZAbtGa42ZGZ5fnaQ', 0.679)
(u'Nova Era Bakery', u'd_NZqSypBaSs-2-r32CWr

In [28]:
# pickle original reviews, review_vectors for use in application
import pickle;
pickle_out = open ("pd_review.pkl", "wb")
pickle.dump(pd_review, pickle_out)
pickle_out.close()

pickle_out = open ("review_vector.pkl", "wb")
pickle.dump(review_vectors, pickle_out)
pickle_out.close()

# Attempt to project onto 2D using PCA

In [None]:
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

%matplotlib inline

def plot_closestwords(model, word, feature_count):
    
    arr = np.empty((0,feature_count), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.wv.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model.wv[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model.wv[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    #np.set_printoptions(suppress=True)
    result = tsne.fit_transform(arr)

    x_coords = result[:, 0]
    y_coords = result[:, 1]
    # display scatter plot
    #fig = plt.figure(figsize=(20, 10))
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)