# Word2Vec example with gensim, NLTK

In [1]:
import MySQLdb as mysql
import pandas as pd
import nltk
from gensim.models import word2vec
from nltk.corpus import stopwords # Import the stop word list
import re # regular expression library

import numpy as np

In [2]:
# download text datasets including stop words
#nltk.download() 

In [3]:
# hook up to mysql
# to fix ascii problem when tokenising, important to specify character set
# https://stackoverflow.com/questions/21129020/how-to-fix-unicodedecodeerror-ascii-codec-cant-decode-byte
ip = "localhost"
username = 'jfarrugia'
password = 'jfarrugia'

db = mysql.connect(ip, username, password, "yelp_db", charset='utf8',
use_unicode=True)
# load some data from a previously created table
pd_review = pd.read_sql("select id, name, text, stars from toronto_50K_random_reviews", con=db)

# close connection
db.close()

In [4]:
# confirm review shape
pd_review.shape

(50000, 4)

In [5]:
# show 1 review
pd_review["text"][0]

u"Not too bad overall. Got the 4 course menu which included a whole fish (grilled or seared) an appetizer and desert. The fish was wonderful. The pita appetizer was ok but one of the dips was not very good. I can't comment on dessert as I didn't try it. The others at my table seemed to think it was ok. The service was pretty good. Nothing special. We did think it was odd that the waiter spoke to people at the front door while he was taking our order. Not too bad overall. Would go back but won't be sprinting."

In [6]:
# details from https://www.kaggle.com/c/word2vec-nlp-tutorial#part-1-for-beginners-bag-of-words
# lower case all text
lc_review = pd_review["text"][0].lower()


In [7]:
# split one review into separate words
words = lc_review.split(" ")
# remove stop words from review text
words_no_stop = [w for w in words if w not in stopwords.words("english")]

In [8]:
from nltk.stem.porter import PorterStemmer
# removel morphological affices from words, leaving word stem
stemmer = PorterStemmer()
words_no_stop_stem = [stemmer.stem(w) for w in words_no_stop]
print words_no_stop_stem

[u'bad', u'overall.', u'got', u'4', u'cours', u'menu', u'includ', u'whole', u'fish', u'(grill', u'seared)', u'appet', u'desert.', u'fish', u'wonderful.', u'pita', u'appet', u'ok', u'one', u'dip', u'good.', u"can't", u'comment', u'dessert', u'tri', u'it.', u'other', u'tabl', u'seem', u'think', u'ok.', u'servic', u'pretti', u'good.', u'noth', u'special.', u'think', u'odd', u'waiter', u'spoke', u'peopl', u'front', u'door', u'take', u'order.', u'bad', u'overall.', u'would', u'go', u'back', u'sprinting.']


In [9]:
def process_review(base_review, remove_stop=False, stem = False, join=False):
    words = re.sub("[^a-zA-Z0-9]", " ", base_review) 
    # convert to lower case + split    
    words = words.lower().split(" ")    
    # searching a set is faster than a list    
    # might contemplate tweaking stop word list
    #stop = {x for x in set(stopwords.words("english")) if x not in ['not', 'no']
    if remove_stop:
        stop = set(stopwords.words("english"))
        words = [word for word in words if word not in stop]
    # run porter stemmer
    if stem:
        words = [stemmer.stem(w) for w in words]
    # return string
    if join:
        return " ".join(words)
    else:
        return words

# Word2Vec stuff

In [None]:
# word2vec requires review paragraphs split into individual sentences
# the datastructure to hold this data is a list of lists - 
# inner list holds sentences

In [17]:
# NLTK's punkt includes a pre-trained tokenizer for english which can
# be used to transform (split) new paragraph observations into sentences
punkt = nltk.data.load('tokenizers/punkt/english.pickle')

In [18]:
# split review corpus into sentences
# cannot use clean_reviews since punctuation was removed

#process_review(pd_review["text"][0], False, False, False)
def split_to_sentence(base_reviews, tokeniser, remove_stop=False):
    raw_sentences = tokeniser.tokenize(base_reviews.strip())
    sentences = []
    for rs in raw_sentences:
        # consider only strings with length >= 1
        if (len(rs) > 0):
            sentences.append( process_review(rs, remove_stop=remove_stop) )
    return sentences

sentences = pd_review["text"].apply(lambda x: split_to_sentence(x, punkt)).tolist()

In [19]:
# we need to flatten sentences list since we have a triple level list
# that we need to convert to a list of lists (2 levels)
sentence_list = [item for sublist in sentences for item in sublist]

# format will be ok with word2vector
print sentence_list[100]

[u'my', u'pasta', u'primavera', u'was', u'nice', u'', u'and', u'the', u'soup', u'special', u'we', u'had', u'was', u'delicious', u'']


In [20]:
# we have aroiund 444000 sentences minded from 50K reviews of
# Toronto restaurants
print len(sentence_list)

444454


In [21]:
# Import the built-in logging module and configure it to have clean messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)


# Set values for various parameters
num_features = 200    # Word vector dimensionality                      
min_word_count = 30   # Minimum word count                        
num_workers = 2       # Number of threads to run in parallel
context = 10          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# train word2vec model based on my 50K review sample

print "Training model..."
model = word2vec.Word2Vec(sentence_list, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# in case we need to port model without re-training
model_name = "200features_30minwords_10context"
model.save(model_name)


2018-01-27 16:34:56,631 : INFO : collecting all words and their counts
2018-01-27 16:34:56,646 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-01-27 16:34:56,718 : INFO : PROGRESS: at sentence #10000, processed 166483 words, keeping 9036 word types
2018-01-27 16:34:56,797 : INFO : PROGRESS: at sentence #20000, processed 332568 words, keeping 12587 word types


Training model...


2018-01-27 16:34:56,865 : INFO : PROGRESS: at sentence #30000, processed 497948 words, keeping 15344 word types
2018-01-27 16:34:56,928 : INFO : PROGRESS: at sentence #40000, processed 664084 words, keeping 17603 word types
2018-01-27 16:34:56,983 : INFO : PROGRESS: at sentence #50000, processed 829665 words, keeping 19661 word types
2018-01-27 16:34:57,053 : INFO : PROGRESS: at sentence #60000, processed 997906 words, keeping 21318 word types
2018-01-27 16:34:57,111 : INFO : PROGRESS: at sentence #70000, processed 1168609 words, keeping 23082 word types
2018-01-27 16:34:57,161 : INFO : PROGRESS: at sentence #80000, processed 1333955 words, keeping 24453 word types
2018-01-27 16:34:57,215 : INFO : PROGRESS: at sentence #90000, processed 1497282 words, keeping 25699 word types
2018-01-27 16:34:57,286 : INFO : PROGRESS: at sentence #100000, processed 1663384 words, keeping 26999 word types
2018-01-27 16:34:57,352 : INFO : PROGRESS: at sentence #110000, processed 1827603 words, keeping 28

2018-01-27 16:35:24,666 : INFO : PROGRESS: at 54.24% examples, 511756 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:25,672 : INFO : PROGRESS: at 56.83% examples, 515476 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:26,679 : INFO : PROGRESS: at 59.43% examples, 519351 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:27,686 : INFO : PROGRESS: at 61.92% examples, 521825 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:28,697 : INFO : PROGRESS: at 64.21% examples, 522527 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:29,700 : INFO : PROGRESS: at 66.55% examples, 523501 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:30,724 : INFO : PROGRESS: at 68.25% examples, 519324 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:31,733 : INFO : PROGRESS: at 70.45% examples, 519250 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:32,742 : INFO : PROGRESS: at 73.10% examples, 522429 words/s, in_qsize 4, out_qsize 0
2018-01-27 16:35:33,748 : INFO : PROGRESS: at 75.87% examples, 526303 wor

In [None]:
# should we need to load the model
model = word2vec.Word2Vec.load("200features_30minwords_10context")

In [22]:
# get terms most similar to cantonese
model.wv.most_similar("cantonese")

[(u'chinese', 0.7640732526779175),
 (u'malaysian', 0.6850325465202332),
 (u'hakka', 0.6848489046096802),
 (u'shanghai', 0.6819875240325928),
 (u'northern', 0.6764723062515259),
 (u'szechuan', 0.6727597117424011),
 (u'korean', 0.6708306670188904),
 (u'kong', 0.6656321883201599),
 (u'vietnamese', 0.6430572271347046),
 (u'westernized', 0.6387507915496826)]

In [23]:
# model.wv.syn0 consists of a feature vector for each work
type(model.wv.syn0)
# with a min word count of 30, a vocab of 6,793 words as created
len(model.wv.vocab)
# shape of wv.syn0 should be 6793, 200
model.wv.syn0.shape

(6793, 200)

In [24]:
# simple word algebra example:
model.wv.most_similar(positive=['pasta','chinese'], negative=['italian'])


[(u'seafood', 0.6198824644088745),
 (u'udon', 0.5855196714401245),
 (u'vegetable', 0.5377663373947144),
 (u'noodles', 0.5332452654838562),
 (u'rice', 0.532392680644989),
 (u'broccoli', 0.5315754413604736),
 (u'bibimbap', 0.5264356136322021),
 (u'congee', 0.5161706209182739),
 (u'soba', 0.5133492946624756),
 (u'tripe', 0.5107491612434387)]

In [25]:
# create a feature vector composed of the average of word vectors in
# a review's paragraph
def convert_review_feature_vector(word_list, model, feature_count):
    # initialise array of length feature_count (200 )
    feature_vector = np.zeros((feature_count,), dtype='float32')
    # stores count of words that are features in learned vocab
    word_count = 0.
    # convert learned vocab to set for faster processing
    vocab_set = set(model.wv.index2word)
    # iterate over words in word_list, adding feature vectors together
    for word in word_list:
        if word in vocab_set:
            word_count += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    
    # finally divide feature_vector by number of words ot get arithmetic vector mean
    feature_vector = np.divide(feature_vector, word_count)
    return feature_vector


In [None]:
clean_reviews2 = pd_review["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors =\
np.array(clean_reviews2.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [None]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time

search_string = "cantonese"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

In [None]:
print np.argsort(distances)
# print top 20 cosine similarity
results = [(pd_review["name"][x], pd_review["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    

In [None]:
# a newer technique which first concatenates all reviews for a particular
# resto together.  
# the review dataframe row count is reduced to the number of restaurants.
# the aggregated review becomes our new document

# first group by resto id and aggregate reviews by first converting to list
# and then joining
concat_query = pd_review.groupby('id')['text'].apply(lambda x: " ".join(list(x)))
# extract unique id restaurant tuples from original dataframe
uniq_restaurants = pd_review.loc[:,["id","name"]].drop_duplicates()
# join aggregated reviews with unique resto data frame
joint_reviews = uniq_restaurants.join(concat_query, on="id").reset_index(drop=True)

clean_reviews3 = joint_reviews["text"].apply(lambda x: process_review(x, remove_stop=True))
# creates a 2D array of feature vector of size review count x feature count
review_vectors2 =\
np.array(clean_reviews3.apply(lambda x: 
                              convert_review_feature_vector(x, model, 200)).tolist())


In [None]:
# execute this code to compare each individual review with the search string
import time

start = time.time() # Start time
search_string = "cantonese"

search_vect = convert_review_feature_vector(search_string.split(), model, 200)

from scipy import spatial
# calculate cosine similarity of search string with review vectors
distances = []
for rv in review_vectors2:
    distances.append(np.round(spatial.distance.cosine(search_vect, rv),3))

end = time.time()
elapsed = end - start
print "Time taken for search: ", elapsed, "seconds."

In [None]:
# print top 20 cosine similarity
results = [(joint_reviews["name"][x], joint_reviews["id"][x], distances[x]) for x in np.argsort(distances)[:20]]
for result in results:
    print result
    
# mixed results here.  The more reviews there are for a few place, the more
# penalised the restaurant is.  The mean of the review's representation in vector
# space depends on the total number of words. 
# sometimes shorter reviews (or less reviews) come up trumps
# on the other hand, we avoid duplice results

In [None]:
# pickle original reviews, review_vectors for use in application
import pickle;
pickle_out = open ("pd_review.pkl", "wb")
pickle.dump(pd_review, pickle_out)
pickle_out.close()

pickle_out = open ("review_vector.pkl", "wb")
pickle.dump(review_vectors, pickle_out)
pickle_out.close()

# Attempt to project onto 2D using PCA

In [None]:
# https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import random

%matplotlib inline

def plot_closestwords(model, word, feature_count):
    
    arr = np.empty((0,feature_count), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.wv.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model.wv[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model.wv[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    #np.set_printoptions(suppress=True)
    result = tsne.fit_transform(arr)

    x_coords = result[:, 0]
    y_coords = result[:, 1]
    # display scatter plot
    #fig = plt.figure(figsize=(20, 10))
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.00005, x_coords.max()+0.00005)
    plt.ylim(y_coords.min()+0.00005, y_coords.max()+0.00005)