<a href="https://colab.research.google.com/github/hirajya/NLP-Gensim_Word2Vec/blob/main/gensim_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gensim module for word2vec NLP




### Dataset
`Toys_and_Games_10`, a JSON Lines file of Amazon Toys and Games product reviews, available at http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/

In [1]:
import pandas as pd
import gensim

In [2]:
# Colab-only step: mount Google Drive so files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the reviews file on the mounted Drive.
path = '/content/drive/MyDrive/datasets/Toys_and_Games_10.json'

# lines=True: the file is read as one JSON record per line.
df = pd.read_json(path_or_buf=path, lines=True)
df.head(5)


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1QR76SYGTXJN5,073533417X,"Adam B. Shaeffer ""ashaef""","[0, 0]",I appreciate the materials used to create this...,4,A good product,1357516800,"01 7, 2013"
1,A1W415JP5WEAJK,073533417X,Alex S,"[0, 0]",I love the larger size pieces (these are about...,4,Big pieces for little hands,1357171200,"01 3, 2013"
2,AD0WUBKBO21KK,073533417X,Amazon Customer,"[0, 0]","We've picked up Melissa and Doug, Ravensburger...",5,Good puzzle with a carrying case!,1355875200,"12 19, 2012"
3,A1II2ZRPKZAQQD,073533417X,"Amy Mcfarland ""Mom of 2""","[0, 0]",My daughter is an official puzzle connoisseur....,4,Nice puzzle with a great box,1354233600,"11 30, 2012"
4,AYWHCM0TJ4737,073533417X,Ana Braga-Henebry,"[0, 0]",We are a puzzle-loving family! I believe there...,5,Oh what fun for little kids!,1352851200,"11 14, 2012"


In [4]:
# (rows, columns) of the loaded review table.
df.shape

(18637, 9)

## Preprocessing & Tokenization

In [6]:
# Tokenize every review with gensim's simple_preprocess (lowercases the text
# and splits it into word tokens). Missing or non-string entries are coerced
# to text first, because simple_preprocess raises on non-string values such
# as NaN; for rows that are already strings the result is unchanged.
review_text = (
    df.reviewText
    .fillna("")
    .astype(str)
    .apply(gensim.utils.simple_preprocess)
)

In [7]:
# Preview the tokenized reviews (one list of tokens per review).
review_text

0        [appreciate, the, materials, used, to, create,...
1        [love, the, larger, size, pieces, these, are, ...
2        [we, ve, picked, up, melissa, and, doug, raven...
3        [my, daughter, is, an, official, puzzle, conno...
4        [we, are, puzzle, loving, family, believe, the...
                               ...                        
18632    [my, four, year, old, niece, and, really, enjo...
18633    [my, two, year, old, niece, loves, this, plays...
18634    [got, this, for, my, year, old, who, loves, mi...
18635    [the, minnie, pony, stable, appealed, to, my, ...
18636    [fisher, price, version, of, my, little, ponie...
Name: reviewText, Length: 18637, dtype: object

In [8]:
# Token list of the review stored under index label 0.
review_text.loc[0]

['appreciate',
 'the',
 'materials',
 'used',
 'to',
 'create',
 'this',
 'puzzle',
 'and',
 'our',
 'three',
 'year',
 'old',
 'loves',
 'the',
 'picture',
 'that',
 'is',
 'formed',
 'when',
 'the',
 'puzzle',
 'is',
 'complete',
 'soy',
 'inks',
 'recycled',
 'cardboard',
 'wonderful',
 'have',
 'only',
 'one',
 'criticism',
 'and',
 'it',
 'minor',
 'one',
 'since',
 'the',
 'puzzle',
 'is',
 'intended',
 'for',
 'younger',
 'audience',
 'it',
 'might',
 'be',
 'nice',
 'if',
 'the',
 'pieces',
 'were',
 'bit',
 'thicker',
 'our',
 'son',
 'has',
 'good',
 'fine',
 'motor',
 'skills',
 'but',
 'the',
 'pieces',
 'can',
 'still',
 'bend',
 'and',
 'crease',
 'as',
 'he',
 'works',
 'to',
 'fit',
 'them',
 'together',
 'with',
 'that',
 'said',
 'however',
 'the',
 'creasing',
 'may',
 'be',
 'due',
 'more',
 'to',
 'his',
 'impatience',
 'when',
 'the',
 'pieces',
 'don',
 'seem',
 'to',
 'fit',
 'right',
 'than',
 'with',
 'the',
 'build',
 'quality',
 'of',
 'the',
 'puzzle']

In [9]:
# Raw text of the review at index label 0, for comparison with its tokenized form.
df.loc[0, "reviewText"]

"I appreciate the materials used to create this puzzle and our three year-old loves the picture that is formed when the puzzle is complete. Soy inks + recycled cardboard = Wonderful.I have only one criticism, and it's a minor one. Since the puzzle is intended for a younger audience, it might be nice if the pieces were a bit thicker. Our son has good fine motor skills, but the pieces can still bend and crease as he works to fit them together. With that said however, the creasing may be due more to his impatience when the pieces don't seem to fit right than with the build quality of the puzzle."

## Training the Word2Vec Model

In [10]:
# Create an untrained Word2Vec model. No corpus is passed to the constructor,
# so the vocabulary must be built and training run as explicit later steps.
# NOTE(review): per the gensim docs, training with more than one worker thread
# is not exactly reproducible between runs; similarity scores may vary slightly.
model = gensim.models.Word2Vec(
    workers=4,     # number of training worker threads
    min_count=2,   # ignore words that occur fewer than 2 times in the corpus
    window=10,     # maximum distance between the current and predicted word
)

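Build the vocabulary from the tokenized reviews (required before calling `train`, since no corpus was passed to the constructor).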

In [11]:
# Scan the tokenized reviews once to build the model's vocabulary.
# progress_per=1000 sets how many reviews are processed between progress
# log messages (per the gensim docs).
model.build_vocab(review_text, progress_per=1000)

In [12]:
# Train on the tokenized reviews, using the model's own recorded corpus size
# and its configured number of epochs. The returned tuple is displayed as the
# cell output.
model.train(
    review_text,
    epochs=model.epochs,
    total_examples=model.corpus_count,
)

(11824356, 15864375)

## Saving Model

In [13]:
# Persist the trained model to the current working directory so it can be
# reloaded later without retraining.
# NOTE(review): on Colab the working directory is presumably not on Drive,
# so the file may not outlive the session — confirm and save under the
# mounted Drive if persistence is needed.
model.save("./model_toys_reviews.model")

## Similarity between words

In [15]:
# The ten words whose vectors are closest to "fragile" (topn=10 is the
# library default, spelled out here for clarity).
model.wv.most_similar("fragile", topn=10)

[('flimsy', 0.7573012709617615),
 ('cushy', 0.7335811257362366),
 ('flexible', 0.7296791672706604),
 ('heavy', 0.710507869720459),
 ('lightweight', 0.7088773846626282),
 ('comfortable', 0.6911585927009583),
 ('rigid', 0.6842504739761353),
 ('thin', 0.680016279220581),
 ('forgiving', 0.6748815178871155),
 ('tough', 0.6591293215751648)]

In [18]:
# Similarity score between the vectors of two near-synonyms.
model.wv.similarity("cheap", "inexpensive")

0.45578113

In [17]:
# Similarity score between the vectors of two near-synonyms.
model.wv.similarity("pricey", "expensive")

0.7657304