In [1]:
import nltk
import re
import sys
import pandas as pd
import numpy as np

from collections import defaultdict
from bs4 import BeautifulSoup 

from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

In [2]:
# Read DF
df = pd.read_csv('200Reviews.csv')
raw_reviews = df['review'].tolist()

## Preprocessing 

In [3]:
# Tokenization, lemmatization, stopword removal
stopWords = set(stopwords.words('english'))
WNL = WordNetLemmatizer()

def replacer(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " is ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = text.strip()
    return text

reviews = []

for i in range(len(raw_reviews)):
    sentence = BeautifulSoup(raw_reviews[i]).get_text()
    sentence = replacer(sentence)
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    words = nltk.word_tokenize(sentence)
    clean_sentence = ''
    for w in words:
        w = w.lower()
        w = WNL.lemmatize(w)
        if w not in stopWords:
            clean_sentence += w + ' '
    reviews.append(clean_sentence.rstrip(' '))

## Co-occurrence matrix

In [4]:
# Create co-occurrence matrix
window_size = 5

def co_occurrence(sentences, window_size):
    d = defaultdict(int)
    vocab = set()
    for text in sentences:
        text = text.split()
        # iterate over sentences
        for i in range(len(text)):
            token = text[i]
            vocab.add(token)  # add to vocab
            next_token = text[i+1 : i+1+window_size]
            for t in next_token:
                key = tuple( sorted([t, token]) )
                d[key] += 1

    # formulate the dictionary into dataframe
    vocab = sorted(vocab) # sort vocab
    df = pd.DataFrame(data=np.zeros((len(vocab), len(vocab)), dtype=np.int16),
                      index=vocab,
                      columns=vocab)
    
    for key, value in d.items():
        df.at[key[0], key[1]] = value
        df.at[key[1], key[0]] = value
        
    return df

df = co_occurrence(reviews, window_size)
df_array = np.array(df)
df

Unnamed: 0,0,000,1,10,100,101,11,117,12,13th,...,zhou,zigfield,zion,zip,zombi,zombie,zone,zoom,zuniga,êxtase
0,0,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
000,0,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,2,0,0,1,0,0,0,...,0,0,0,0,0,4,0,0,0,0
10,2,1,2,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zombie,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,2,13,0,0,0,0
zone,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zoom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
zuniga,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Create reference dictionaries
words = df.index.tolist()
word_to_index = {}
index_to_word = {}

for i in range(len(words)):
    word_to_index[words[i]] = i
    index_to_word[i] = words[i]

I fed the co-occurrence matrix into an SVD to output dense word vectors of length 100. I also created some dictionaries to be used in helping me map words to index to word embeddings, these will be used later on to compare the similarity between words.

In [6]:
# SVD
word_embedding_size = 100

svd = TruncatedSVD(n_components = word_embedding_size, n_iter = 7, random_state = 42)
svd_output = svd.fit_transform(df_array)
svd_output.shape # 6556 vectors of dimension 100 each

(6556, 100)

In [7]:
index_to_embedding = {} # Another reference dictionary
for i in range(svd_output.shape[0]):
    index_to_embedding[i] = svd_output[i]

I created a function svd_most_similar(word) that takes in a word and outputs the top 10 similar words found. Since word2vec uses cosine similarity in its most_similar function, I opted to use cosine similarity as well for consistency sake.

In [40]:
# Create word similarity function similar to word2vec's most_similar function
def svd_most_similar(word):
    idx = word_to_index[word]
    embedding = index_to_embedding[idx]
    diff_dict = {}
    
    x = cosine_similarity(np.array(embedding.tolist()).reshape((1, 100)), svd_output)
    for i in range(len(x[0])):
        diff_dict[i] = x[0][i]

    sorted_diff_dict = {k: v for k, v in sorted(diff_dict.items(), key=lambda item: item[1])}

    top_10_words = list(sorted_diff_dict.keys())[-11:len(sorted_diff_dict)-1][::-1]
    top_10_values = list(sorted_diff_dict.values())[-11:len(sorted_diff_dict)-1][::-1]
    
    output = []
    for x, y in zip(top_10_words, top_10_values):
        output.append((index_to_word[x], y))
        
    return output

## Word2Vec

In [9]:
# Convert input to list of lists before input into word2vec model
reviews_word2vec = []
for sentence in reviews:
    new_sentence = []
    for word in sentence.split():
        new_sentence.append(word)
    reviews_word2vec.append(new_sentence)

In [10]:
# Creating the model and setting values for the various parameters
num_features = 100  # Word vector dimensionality
min_word_count = 1 # Minimum word count
num_workers = 4     # Number of parallel threads
downsampling = 1e-3 # (0.001) Downsample setting for frequent words

# Initializing the train model
from gensim.models import word2vec
model = word2vec.Word2Vec(reviews_word2vec,
                          workers=num_workers,
                          size=num_features,
                          min_count=min_word_count,
                          sample=downsampling)

## Comparing results

In [41]:
# Function to compare outputs of the 2 models
def compare_models(word):
    print('Input word: ' + word + '\n')
    
    print('***** SVD *****')
    for i in svd_most_similar(word):
        print(i[0], i[1])
        
    print('\n***** Word2Vec *****')
    for i in model.wv.most_similar(word):
        print(i[0], i[1])

In [36]:
compare_models('zombie')

Input word: zombie

***** SVD *****
cannibal 0.7456572548014732
preferably 0.7293954645808435
gruesome 0.7191783274525281
killing 0.7073168188135955
attack 0.7046966135758148
bird 0.7016420484746623
canister 0.6840418880685827
dough 0.6690725971780732
raking 0.6681803862492934
assed 0.6643085694074632

***** Word2Vec *****
movie 0.946296751499176
wa 0.9317489862442017
film 0.9287232160568237
bad 0.9258207082748413
time 0.9248310327529907
character 0.9244245886802673
could 0.9228945970535278
ha 0.9227113127708435
get 0.9217485785484314
even 0.9197666049003601


In [37]:
compare_models('pleasant')

Input word: pleasant

***** SVD *****
activity 0.6775511592106531
delightful 0.6649102371021132
help 0.6494918047517543
unifying 0.6293391662251103
heartstrings 0.6292837517127319
cry 0.6280458558038033
simple 0.6258560734387171
williams 0.6250934180805605
tug 0.6174138520791097
connects 0.6109611203633987

***** Word2Vec *****
vampire 0.4110491871833801
imperative 0.37511640787124634
keep 0.35370296239852905
tear 0.3486976623535156
sarcastic 0.3375144600868225
glad 0.32664573192596436
somehow 0.3167136013507843
similarity 0.3162207305431366
element 0.31535667181015015
reunite 0.31460872292518616


In [38]:
compare_models('horrible')

Input word: horrible

***** SVD *****
substandard 0.8575662926996017
splatter 0.8255799390131067
gunk 0.7439345253922862
effect 0.6876224269840946
university 0.6637234780406462
junk 0.6442515511683667
resulted 0.6390119351595118
special 0.6338946697944025
jeremy 0.6319104283926702
script 0.6299355995607481

***** Word2Vec *****
way 0.6739404797554016
know 0.6629747152328491
would 0.6596165895462036
go 0.6570124626159668
wa 0.6563858985900879
thing 0.6500155925750732
work 0.6489824056625366
time 0.6457241773605347
show 0.640852689743042
ha 0.6395604610443115


In [39]:
compare_models('loved')

Input word: loved

***** SVD *****
saw 0.7240355996286192
amazing 0.7202507572225153
one 0.7160288545049089
great 0.7085565501348268
treat 0.7064553825062345
see 0.697794047790514
kind 0.6932661382686476
despite 0.6812781107393988
disappointment 0.6801986666524373
highly 0.6768343932707598

***** Word2Vec *****
joke 0.521105170249939
ever 0.5123059749603271
go 0.5119848847389221
8 0.4847155213356018
going 0.48196229338645935
truly 0.47532257437705994
human 0.4704762399196625
know 0.46724292635917664
work 0.46481919288635254
case 0.45912566781044006


## Conclusion
Judging from the results, I think that the word embeddings generated from SVD created more accurate results as the similar-words generated seem to make more sense than those produced from the word2vec model.

In the pleasant example, results produced from the SVD word embeddings include delightful, help, unifying, heartstrings etc which are indeed words that are associated with the word pleasant. On the other hand, the results produced by the word2vec model include vampire, imperative, sarcastic, tear etc which are hardly close the meaning of pleasant.

This same behaviour can be found in the other examples as well. Moreover, the cosine similarity scores for SVD word embedding results are generally higher than that of word2vec’s, with the exception of the zombie example. Hence I conclude that for this corpus, SVD word embedding is on average a better approach compared to using word2vec.