# Word Embedding (Word2Vec)

## Load the data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

import pandas as pd
# load the data into panda dataframe
data_file_name = "/content/Health_and_Personal_Care.json"
raw_df = pd.read_json(data_file_name, lines=True, nrows=10000, encoding_errors='ignore')
print("Data loaded")

Data loaded


In [3]:
##Check the top 5 rows of the DataFrame
raw_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5,Handy little gadget,1294185600,"01 5, 2011"
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4,Small & may need to encourage battery,1329523200,"02 18, 2012"
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4,Very good but not great,1275955200,"06 8, 2010"
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4,great addition to your purse,1202428800,"02 8, 2008"
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5,Very nice and convenient.,1313452800,"08 16, 2011"


In [4]:
# View information about the data
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewerID      10000 non-null  object
 1   asin            10000 non-null  object
 2   reviewerName    9965 non-null   object
 3   helpful         10000 non-null  object
 4   reviewText      10000 non-null  object
 5   overall         10000 non-null  int64 
 6   summary         10000 non-null  object
 7   unixReviewTime  10000 non-null  int64 
 8   reviewTime      10000 non-null  object
dtypes: int64(2), object(7)
memory usage: 703.2+ KB


In [5]:
raw_df['reviewText'][0]

"This is a great little gadget to have around.  We've already used it to look for splinters and a few other uses.  The light is great.  It's a handy size.  However, I do wish I'd bought one with a little higher magnification."

In [6]:
raw_df['reviewText'][1]

'I would recommend this for a travel magnifier for the occasional reading.I had read on another review about a magnifier having a problem with the light coming on. I did find that this one appeared to be DOA out of the box. But, after opening & shutting the viewer to turn on & off the light, the light began to come on. After several times of doing this, the light appears to be coming on all the time.It is small, but for taking it someplace & reading things like a menu in a dark corner of a restaurant, this is great.'

### Step 1 Data Preparation

In [7]:
# Convert all the review text into a long string and print its length
raw_corpus = u''.join(raw_df['reviewText']+" ")
print("Raw Corpus contains {0:,} characters".format(len(raw_corpus)))

Raw Corpus contains 4,366,255 characters


In [8]:
### Let's check what happened
raw_corpus[0:500]

"This is a great little gadget to have around.  We've already used it to look for splinters and a few other uses.  The light is great.  It's a handy size.  However, I do wish I'd bought one with a little higher magnification. I would recommend this for a travel magnifier for the occasional reading.I had read on another review about a magnifier having a problem with the light coming on. I did find that this one appeared to be DOA out of the box. But, after opening & shutting the viewer to turn on "

In [9]:
# import natural language toolkit
import nltk
# download the punkt tokenizer
nltk.download('punkt')
print("The punkt tokenizer is downloaded")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


The punkt tokenizer is downloaded


In [10]:
# Load the punkt tokenizer
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
print("The punkt tokenizer is loaded")

# we tokenize the raw string into raw sentences
raw_sentences = tokenizer.tokenize(raw_corpus)
print("We have {0:,} raw sentences".format(len(raw_sentences)))

The punkt tokenizer is loaded
We have 46,953 raw sentences


As shown earlier, we have converted the initial 10,000 reviews to 46,820 raw sentences.

In [11]:
# Recheck the raw review data once again
## Check the first review
raw_df['reviewText'][0]

"This is a great little gadget to have around.  We've already used it to look for splinters and a few other uses.  The light is great.  It's a handy size.  However, I do wish I'd bought one with a little higher magnification."

In [12]:
raw_sentences[0:5]

['This is a great little gadget to have around.',
 "We've already used it to look for splinters and a few other uses.",
 'The light is great.',
 "It's a handy size.",
 "However, I do wish I'd bought one with a little higher magnification."]

As you can see, the first review has been broken down to it's individual sentences in the first 5 entries of the `raw_sentences` list. Similarly the rest of the reviews have been broken down and stored in the same list

In [13]:
import re

# Clean and split sentence into words
def clean_and_split_str(string):
    strip_special_chars = re.compile("[^A-Za-z]+") ## only retrieves alphabets and removes any punctuations and special chars
    string = re.sub(strip_special_chars, " ", string)
    return string.strip().split()

In [14]:
### let's check the above function in action on a sample raw sentence
sample = "We've already used it to look for splinters and a few other uses."

In [15]:
clean_and_split_str(sample)

['We',
 've',
 'already',
 'used',
 'it',
 'to',
 'look',
 'for',
 'splinters',
 'and',
 'a',
 'few',
 'other',
 'uses']

In [16]:
# clean each raw sentences and build the list of sentences
sentences = []
for raw_sent in raw_sentences:
    if len(raw_sent) > 0:
        sentences.append(clean_and_split_str(raw_sent))
print("We have {0:,} clean sentences".format(len(sentences)))

We have 46,953 clean sentences


In [17]:
###Check one raw sentence and its equivalent clean sentence
### You can change the index and evaluate for multiple sentences
print(raw_sentences[30],'\n')
print(sentences[30])

I use this magnifier to inspect seeds and leaves. 

['I', 'use', 'this', 'magnifier', 'to', 'inspect', 'seeds', 'and', 'leaves']


In [18]:
token_count = sum([len(sentence) for sentence in sentences])
print("The dataset corpus contains {0:,} tokens".format(token_count))

The dataset corpus contains 822,299 tokens


### Step 2: Model Building

In [19]:
import multiprocessing

#Dimensionality of the resulting word vectors
num_features = 300

#Minimum word count threshold
min_word_count = 3

#Number of threads to run in parallel
num_workers = multiprocessing.cpu_count()

#Context window length
context_size = 7

#Seed for the RNG, to make the result reproducible
seed = 1

In [20]:
!pip install gensim



In [21]:
# Import necessary libraries
from gensim.models import Word2Vec
# Create a Word2Vec model
model = Word2Vec(sentences, vector_size=300, window=7, min_count=3, sg=1)

In [22]:
# Training the Word2Vec model
model.train(sentences, total_examples=len(sentences), epochs=10)




(6048440, 8222990)

In [23]:
#Save the model
model.save("/content/word2vec_model_trained_on_Health_and_Personal_Care.w2v")
print("Model saved")

Model saved


In [24]:
# find top N similar word to a given word in the vocabulary
print(model.wv.most_similar("water",topn=5))


[('kettle', 0.46838831901550293), ('pot', 0.45810410380363464), ('suds', 0.4517161548137665), ('rinse', 0.4493551552295685), ('faucet', 0.4437199831008911)]


In [25]:
# find top N similar word to a given word in the vocabulary
print(model.wv.most_similar("gadget",topn=5))


[('gem', 0.6163192987442017), ('Gross', 0.5993233919143677), ('mag', 0.5839505195617676), ('woth', 0.566505491733551), ('sounding', 0.5579785108566284)]


In [27]:
# find top N similar word to a given word in the vocabulary
print(model.wv.most_similar("OMRON",topn=5))


[('Pressure', 0.7183316946029663), ('Blood', 0.7159221768379211), ('Accessory', 0.7126169800758362), ('incl', 0.7110934257507324), ('Optiva', 0.7105442881584167)]


In [29]:
model.wv.similarity('OMRON','gadget')

0.26706067

In [30]:
model.wv.similarity('OMRON','Blood')

0.7159222

In [32]:
# find top N similar word to a given word in the vocabulary
print(model.wv.most_similar("Duracell",topn=5))


[('Maxell', 0.6987974643707275), ('Energizer', 0.6792564988136292), ('Procell', 0.6776551008224487), ('Coppertops', 0.6333888173103333), ('Coppertop', 0.632138192653656)]


In [37]:
model.wv["Duracell"]

array([-2.60598838e-01, -4.38623130e-02,  1.58521056e-01,  6.00926459e-01,
       -1.96570456e-01, -2.87018865e-01,  4.56014395e-01, -1.06914029e-01,
        2.34253585e-01,  1.09944142e-01, -5.37605107e-01, -6.18112721e-02,
        1.93524212e-01,  3.06071900e-03, -1.42543018e-01, -1.10967837e-01,
        5.63448071e-01,  4.45138626e-02,  2.90649414e-01, -8.88369232e-02,
       -1.70357618e-02,  3.02243114e-01,  7.92753231e-03,  2.09295422e-01,
        3.74492019e-01, -3.06681156e-01, -1.42414868e-01,  2.26710424e-01,
       -2.85753631e-03, -7.09159434e-01, -1.74335256e-01, -3.51907521e-01,
        8.40604901e-02, -4.18184698e-02,  8.94318372e-02,  3.05299044e-01,
        5.28163135e-01, -2.03131083e-02, -6.66462839e-01, -3.13157350e-01,
        1.11693852e-01, -2.29197294e-01, -5.09344816e-01, -1.31846726e-01,
        4.48593348e-01,  2.63343871e-01, -3.02440852e-01,  1.37320906e-02,
        5.43555200e-01,  3.34723741e-01,  3.28231931e-01,  2.08989769e-01,
        1.07380278e-01, -