## Building the Word2vec model using gensim:

In [2]:
!pip install -U gensim



In [4]:
import warnings
warnings.filterwarnings(action='ignore')

import nltk
nltk.download('stopwords')

#data processing
import pandas as pd
import re
from nltk.corpus import stopwords
stopWords = stopwords.words('english')

#modelling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
data = pd.read_csv('/content/text.csv',header=None)

In [6]:
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


# Preprocessing and preparing the dataset:

In [7]:
def pre_process(text):

    #convert to lowercase
    text = str(text).lower()

    #remove all special characters and keep only alpha numeric characters and spaces
    text = re.sub(r'[^A-Za-z0-9\s.]',r'',text)

    #remove new lines
    text = re.sub(r'\n',r' ',text)

    # remove stop words
    text = " ".join([word for word in text.split() if word not in stopWords])

    return text

In [8]:
pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [9]:
data[0] = data[0].map(lambda x: pre_process(x))

In [10]:
data[0].head()

0    room kind clean strong smell dogs. generally a...
1    stayed crown plaza april april . staff friendl...
2    booked hotel hotwire lowest price could find. ...
3    stayed husband sons way alaska cruise. loved h...
4    girlfriends stayed celebrate th birthdays. pla...
Name: 0, dtype: object

In [11]:
data[0][1].split('.')[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

In [12]:
corpus = []
for line in data[0][1].split('.'):
    words = [x for x in line.split()]
    corpus.append(words)

In [13]:
corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]

In [14]:
data = data[0].map(lambda x: x.split('.'))

corpus = []
for i in (range(len(data))):
    for line in data[i]:
        words = [x for x in line.split()]
        corpus.append(words)

corpus[:2]

[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

In [15]:
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)

In [16]:
for index,sentence in enumerate(corpus):
    corpus[index] = bigram[sentence]

In [17]:
corpus[111]

['connected', 'rivercenter', 'mall', 'downtown', 'san_antonio']

In [18]:
corpus[9]

['course', 'washington_dc']

# Building the model:

In [19]:
size = 100
window_size = 2
epochs = 100
min_count = 2
workers = 4
sg = 1

In [20]:
model = Word2Vec(corpus, sg=1,window=window_size,vector_size=size,min_count=min_count,workers=workers,epochs=epochs)

In [21]:
model.save('/content/model/word2vec.model')

In [22]:
model = Word2Vec.load('/content/model/word2vec.model')

# Evaluating the embeddings:

In [23]:
model.wv.most_similar('san_diego')

[('san_antonio', 0.7573949098587036),
 ('dallas', 0.7460066080093384),
 ('memphis', 0.7446577548980713),
 ('austin', 0.7370147705078125),
 ('seattle', 0.7355470657348633),
 ('boston', 0.732273519039154),
 ('sd', 0.7298120856285095),
 ('san_francisco', 0.7271774411201477),
 ('phoenix', 0.7263659834861755),
 ('chicago', 0.7155700922012329)]

In [24]:
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.7437578439712524)]

In [25]:
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']

model.wv.doesnt_match(text)

'holiday'

## Visualizing word embeddings in TensorBoard:

In [26]:
import warnings
warnings.filterwarnings(action='ignore')

import tensorflow as tf
# from tensorflow.contrib.tensorboard.plugins import projector
from tensorboard.plugins import projector

# tf.logging.set_verbosity(tf.logging.ERROR)

import numpy as np
import gensim
import os

In [27]:
file_name = "/content/model/word2vec.model"
model = gensim.models.keyedvectors.KeyedVectors.load(file_name)

In [28]:
max_size = len(model.wv.key_to_index)-1

In [29]:
w2v =np.zeros((max_size,model.layer1_size))

In [30]:
if not os.path.exists('projections'):
  os.makedirs('projections')

with open("projections/metadata.tsv",'w+') as file_metadata:

  for i, word in enumerate(model.wv.index_to_key[:max_size]):

    #store the embeddings of the word
    w2v[i] = model.wv[word]

    #write the word to a file
    file_metadata.write(word + '\n')

In [31]:
tf.compat.v1.disable_eager_execution()

sess = tf.compat.v1.InteractiveSession()

In [32]:
with tf.device("/cpu:0"):
    embedding = tf.Variable(w2v, trainable=False, name='embedding')

In [33]:
tf.compat.v1.global_variables_initializer().run()

In [34]:
saver = tf.compat.v1.train.Saver()

In [35]:
writer = tf.compat.v1.summary.FileWriter('projections',sess.graph)

In [36]:
config = projector.ProjectorConfig()
embed= config.embeddings.add()

In [37]:
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'

In [38]:
projector.visualize_embeddings(writer, config)

saver.save(sess, 'projections/model.ckpt', global_step=max_size)

'projections/model.ckpt-28070'

In [39]:
%load_ext tensorboard

In [40]:
%tensorboard --logdir=projections --port=8000

<IPython.core.display.Javascript object>

# Finding similar documents using doc2vec

In [41]:
import warnings
warnings.filterwarnings('ignore')

import os
import gensim
from gensim.models.doc2vec import TaggedDocument

from nltk import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\w+')
stopWords = set(stopwords.words('english'))

In [42]:
import zipfile
with zipfile.ZipFile('news_dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('data/')

In [43]:
docLabels = []
docLabels = [f for f in os.listdir('data/news_dataset') if f.endswith('.txt')]

data = []

for doc in docLabels:
  with open('data/news_dataset/'+doc, 'rb') as f:
    data.append(f.read())

In [44]:
docLabels[:5]

['Science_201.txt',
 'Electronics_663.txt',
 'Politics_494.txt',
 'Sports_618.txt',
 'Politics_993.txt']

In [45]:
class DocIterator(object):
  def __init__(self,doc_list,labels_list):
    self.labels_list=labels_list
    self.doc_list = doc_list

  def __iter__(self):
    for idx, doc in enumerate(self.doc_list):
      yield TaggedDocument(words=doc.split(), tags = [self.labels_list[idx]])

In [46]:
it = DocIterator(data, docLabels)

In [47]:
size = 100
alpha = 0.025
min_alpha = 0.025
dm = 1
min_count = 1

In [48]:
model = gensim.models.Doc2Vec(vector_size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, dm=dm,epochs=100)
model.build_vocab(it)

In [49]:
model.train(it,total_examples=4000,epochs=model.epochs)
model.alpha -= 0.002
model.min_alpha = model.alpha

In [50]:
model.save('model/doc2vec.model')

In [51]:
d2v_model = gensim.models.doc2vec.Doc2Vec.load('model/doc2vec.model')

In [52]:
model.docvecs.most_similar('Electronics_724.txt')

[('Electronics_407.txt', 0.9032192230224609),
 ('Electronics_726.txt', 0.8222894072532654),
 ('Sports_568.txt', 0.6805419921875),
 ('Electronics_118.txt', 0.6625266671180725),
 ('Electronics_381.txt', 0.658320963382721),
 ('Electronics_967.txt', 0.6569585204124451),
 ('Electronics_379.txt', 0.6534062623977661),
 ('Electronics_631.txt', 0.6446444392204285),
 ('Electronics_64.txt', 0.6442915797233582),
 ('Electronics_19.txt', 0.6382589340209961)]