# Learning Text Representation
Neural networks accept only numeric inputs. So when we have textual
data, we convert it into a numeric (vector) representation and feed that to the
network.

The **word2vec** algorithm converts textual input into meaningful vectors: it learns a semantic vector representation for each word in the given input text. Word2vec is one of the most popular and widely used models for generating
word embeddings. Word embeddings
are the vector representations of words in a vector space. The embeddings
generated by the word2vec model capture the syntactic and semantic
meanings of a word. Having a meaningful vector representation of a word
helps the neural network to understand the word better.

In [2]:
import numpy as np

In [1]:
#Single context CBOW
def Single_context_CBOW(x, label, W1, W2, loss, lr=None):
  """Run one gradient-descent step of the single-context CBOW model.

  Args:
    x: one-hot vector of the context word, length V.
    label: one-hot vector of the target word, length V.
    W1: input-to-hidden weight matrix, shape (V, N).
    W2: hidden-to-output weight matrix, shape (N, V).
    loss: running loss accumulated over previous calls.
    lr: learning rate. When omitted, a module-level ``lr`` is used if one
      is defined (what callers relied on before this parameter existed),
      otherwise 0.01.

  Returns:
    Tuple ``(W1, W2, loss)``: the updated weight matrices (new arrays, the
    arguments are not modified in place) and the updated running loss.
  """
  if lr is None:
    # backward compatibility with callers that set a global learning rate
    lr = globals().get('lr', 0.01)
  x = np.asarray(x, dtype=float)
  label = np.asarray(label, dtype=float)

  #forward propagation
  h = np.dot(W1.T, x)
  u = np.dot(W2.T, h)
  # softmax computed on max-shifted scores so np.exp cannot overflow
  u_max = np.max(u)
  exp_u = np.exp(u - u_max)
  sum_exp_u = np.sum(exp_u)
  y_pred = exp_u / sum_exp_u

  #error
  e = y_pred - label

  #backward propagation
  dW2 = np.outer(h, e)
  # The gradient w.r.t. the hidden layer is W2 @ e (length N). The previous
  # W2.T @ e only had compatible shapes when V happened to equal N.
  dW1 = np.outer(x, np.dot(W2, e))

  #update weights
  W1 = W1 - lr * dW1
  W2 = W2 - lr * dW2

  #loss function: -u[target] + log(sum(exp(u))), via a stable log-sum-exp
  loss += -float(np.dot(label, u)) + float(u_max + np.log(sum_exp_u))
  return W1, W2, loss

# Building the word2vec model using gensim

In [5]:
import nltk
# fetch the NLTK stop-word corpus that the stop-word filtering below relies on
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import warnings
warnings.filterwarnings(action='ignore')
#data processing
import pandas as pd
import re
from nltk.corpus import stopwords
# kept as a set: the cleaning step tests every word for membership, and a set
# makes each test O(1) instead of scanning the whole stop-word list
stopWords = set(stopwords.words('english'))
#modelling
from gensim.models import Word2Vec
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [8]:
# load the review texts; header=None makes pandas number the columns, and the
# cells below read the text from column 0
data = pd.read_csv('/content/text.csv',header=None)

In [9]:
# preview the first rows of the loaded frame
data.head()

Unnamed: 0,0
0,room kind clean strong smell dogs. generally a...
1,stayed crown plaza april april . staff friendl...
2,booked hotel hotwire lowest price could find. ...
3,stayed husband sons way alaska cruise. loved h...
4,girlfriends stayed celebrate th birthdays. pla...


In [10]:
#Preprocessing and preparing the dataset
def pre_process(text):
  """Return a cleaned, lower-cased copy of ``text`` with stop words removed.

  The input is converted with ``str()``, so non-string values are accepted.
  Letters, digits, whitespace and '.' are kept; every other character is
  dropped. Words found in the module-level ``stopWords`` are removed and the
  remaining words are joined with single spaces.
  """
  lowered = str(text).lower()

  # drop everything except alphanumerics, whitespace and full stops
  stripped = re.sub(r'[^A-Za-z0-9\s.]',r'',lowered)

  # turn line breaks into spaces
  single_line = re.sub(r'\n',r' ',stripped)

  # filter out stop words and re-join with single spaces
  kept_words = [token for token in single_line.split() if token not in stopWords]
  return " ".join(kept_words)

In [11]:
# spot-check the cleaning function on a single review (row 50)
pre_process(data[0][50])

'agree fancy. everything needed. breakfast pool hot tub nice shuttle airport later checkout time. noise issue tough sleep through. awhile forget noisy door nearby noisy guests. complained management later email credit compd us amount requested would return.'

In [12]:
# clean every review in column 0
data[0] = data[0].map(pre_process)

In [13]:
# first five '.'-separated sentences of the second review
data[0][1].split('.')[:5]

['stayed crown plaza april april ',
 ' staff friendly attentive',
 ' elevators tiny ',
 ' food restaurant delicious priced little high side',
 ' course washington dc']

In [14]:
# tokenise the second review: one list of words per '.'-separated sentence
corpus = [sentence.split() for sentence in data[0][1].split('.')]

In [15]:
# first two tokenised sentences
corpus[:2]

[['stayed', 'crown', 'plaza', 'april', 'april'],
 ['staff', 'friendly', 'attentive']]

In [16]:
# Split every review into '.'-separated sentences, then every sentence into
# words, giving one flat list of tokenised sentences for the whole dataset
data = data[0].map(lambda review: review.split('.'))
corpus = [sentence.split() for sentences in data for sentence in sentences]

In [17]:
# first two tokenised sentences of the full corpus
corpus[:2]

[['room', 'kind', 'clean', 'strong', 'smell', 'dogs'],
 ['generally', 'average', 'ok', 'overnight', 'stay', 'youre', 'fussy']]

In [18]:
# learn which word pairs occur together often and rewrite every sentence so
# that each such pair becomes one underscore-joined token
phrases = Phrases(sentences=corpus,min_count=25,threshold=50)
bigram = Phraser(phrases)
corpus[:] = [bigram[tokens] for tokens in corpus]

In [19]:
# example sentence after the bigram pass
corpus[111]

['connected', 'rivercenter', 'mall', 'downtown', 'san_antonio']

In [20]:
# another example sentence after the bigram pass
corpus[9]

['course', 'washington_dc']

# Building the model

In [21]:
# hyperparameters for the Word2Vec model built in the next cell
# dimensionality of the word vectors (passed as vector_size)
size = 100
# context window (passed as window)
window_size = 2
# number of training epochs
epochs = 100
# minimum word frequency (passed as min_count)
min_count = 2
# number of worker threads
workers = 4
sg = 1 #sg=1 implies we use the skip-gram model for training, sg=0 implies we use CBOW model

In [25]:
# train the model; the `sg` setting defined above is passed through instead of
# a hard-coded 1, so changing that variable actually switches the architecture
model = Word2Vec(corpus,sg=sg,window=window_size,vector_size=size,min_count=min_count,workers=workers,epochs=epochs)

In [27]:
# persist the trained model to disk
model.save('/content/word2vec.model')

In [35]:
# reload the saved model from disk
model = Word2Vec.load('/content/word2vec.model')

In [36]:
#Evaluating the embeddings: nearest neighbours of 'san_diego' in the vector space
model.wv.most_similar('san_diego')

[('san_antonio', 0.7938194870948792),
 ('san_francisco', 0.7419612407684326),
 ('phoenix', 0.7353970408439636),
 ('austin', 0.7346410155296326),
 ('dallas', 0.7320181131362915),
 ('memphis', 0.7246792316436768),
 ('indianapolis', 0.723918080329895),
 ('seattle', 0.7199681401252747),
 ('boston', 0.7106994390487671),
 ('sf', 0.7094084620475769)]

In [37]:
# analogy query (king - man + woman), keeping only the best match
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

[('queen', 0.6972159743309021)]

In [39]:
# ask the model which word fits least with the others in the list
text = ['los_angeles','indianapolis', 'holiday', 'san_antonio','new_york']
model.wv.doesnt_match(text)

'holiday'

# Visualizing word embeddings in TensorBoard

In [68]:
import tensorflow.compat.v1 as tf
from tensorboard.plugins import projector
import numpy as np

import gensim
import os

In [45]:
file_name = "/content/word2vec.model"
# The file was written by Word2Vec.save() and the following cells use
# model.wv and model.layer1_size, so load it as a full Word2Vec model rather
# than through the KeyedVectors loader.
model = gensim.models.Word2Vec.load(file_name)

In [47]:
# number of words exported to the projector (also used as the checkpoint step)
# NOTE(review): the -1 leaves the last vocabulary entry out of the export;
# confirm this is intended
max_size = len(model.wv)-1

In [50]:
# matrix with one row per exported word; the rows are filled in below
w2v = np.zeros((max_size,model.layer1_size))

In [69]:
# exist_ok avoids the check-then-create race and makes re-running the cell safe
os.makedirs('projections', exist_ok=True)
# write-only mode is sufficient; the explicit encoding keeps the metadata file
# independent of the platform default
with open("projections/metadata.tsv", 'w', encoding='utf-8') as file_metadata:
  for i, word in enumerate(model.wv.index_to_key[:max_size]):
    #store the embeddings of the word
    w2v[i] = model.wv[word]
    #write the word to a file, one word per line in the same order as the rows of w2v
    file_metadata.write(word + '\n')

In [70]:
# The TF1-style session, variable initialiser and Saver used below need graph
# mode, so eager execution is switched off before the session (and any
# variable) is created. Calling it again later is harmless.
tf.disable_eager_execution()
sess = tf.InteractiveSession()

In [77]:
# wrap the embedding matrix in a non-trainable variable; the name 'embedding'
# is the one the projector config refers to
embedding = tf.Variable(w2v, trainable=False, name='embedding')

In [78]:
# switch TensorFlow to graph mode
# NOTE(review): this cell sits after the cells that create the session and the
# variable; confirm it is executed before them when running top to bottom
tf.disable_eager_execution()

In [79]:
# run the initialiser op for all global variables
tf.global_variables_initializer().run()

In [80]:
# saver used below to write the variables to a checkpoint
saver = tf.train.Saver()

In [81]:
# event-file writer for the 'projections' directory; it is also given the session graph
writer = tf.summary.FileWriter('projections', sess.graph)

In [82]:
# projector configuration with a single embedding entry
config = projector.ProjectorConfig()
embed = config.embeddings.add()

In [83]:
# point the entry at the variable named 'embedding' and at the word list
# NOTE(review): the metadata path is presumably resolved relative to the log
# directory ('projections'), where metadata.tsv was written; confirm
embed.tensor_name = 'embedding'
embed.metadata_path = 'metadata.tsv'

In [84]:
# store the projector config for the log directory, then save the checkpoint
# holding the embedding variable (max_size is used as the checkpoint step)
projector.visualize_embeddings(writer, config)
saver.save(sess, 'projections/model.ckpt', global_step=max_size)

'projections/model.ckpt-28070'

In [None]:
%load_ext tensorboard
%tensorboard --logdir=projections --port=8000

# Doc2vec

In [87]:
from gensim.models.doc2vec import TaggedDocument
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
# tokenizer that matches runs of word characters
# NOTE(review): not used anywhere in the visible part of the notebook
tokenizer = RegexpTokenizer(r'\w+')
# English stop words as a set (rebinds the earlier stopWords)
stopWords = set(stopwords.words('english'))

In [None]:
!unzip /content/news_dataset.zip

In [95]:
import os

# names of the .txt files in the dataset folder; they are used as document labels
docLabels = [name for name in os.listdir('/content/news_dataset') if name.endswith('.txt')]

# raw contents of every document, in the same order as docLabels
data = []
for label in docLabels:
    with open('/content/news_dataset/' + label, 'r', encoding='latin-1') as handle:
        data.append(handle.read())


In [96]:
# peek at the first five document labels
docLabels[:5]

['Electronics_313.txt',
 'Science_552.txt',
 'Electronics_513.txt',
 'Electronics_662.txt',
 'Sports_659.txt']

In [97]:
class DocIterator(object):
  """Re-iterable stream of tagged documents.

  Each iteration walks ``doc_list`` from the start and yields one
  ``TaggedDocument`` per entry: the whitespace-split text as ``words`` and
  the entry of ``labels_list`` at the same position as its only tag.
  """

  def __init__(self, doc_list, labels_list):
    self.doc_list = doc_list
    self.labels_list = labels_list

  def __iter__(self):
    for position, text in enumerate(self.doc_list):
      tokens = text.split()
      tag = self.labels_list[position]
      yield TaggedDocument(words=tokens, tags=[tag])

In [98]:
# corpus of tagged documents: each text paired with its file name as the tag
it = DocIterator(data, docLabels)

In [99]:
# hyperparameters for the Doc2Vec model built in the next cell
# dimensionality of the document vectors (passed as vector_size)
size = 100
# initial learning rate
alpha = 0.025
# final learning rate; set equal to alpha here, the training cell lowers both by hand
min_alpha = 0.025
# passed as dm; in gensim, dm=1 selects the distributed-memory (PV-DM) algorithm
dm = 1
# minimum word frequency (passed as min_count)
min_count = 1
# number of epochs per train() call
epochs = 100

In [117]:
# create the Doc2Vec model with the settings above and build its vocabulary
# from the tagged corpus
model = gensim.models.Doc2Vec(vector_size=size, min_count=min_count, alpha=alpha, min_alpha=min_alpha, dm=dm, epochs=epochs)
model.build_vocab(it)

In [118]:
# NOTE(review): gensim's documentation advises a single train() call and
# letting the library decay alpha; the manual schedule is kept here so the
# training regime is unchanged.
for epoch in range(10):
  # corpus_count is recorded by build_vocab(), so it always matches the
  # number of documents instead of relying on a hard-coded 120
  model.train(it,total_examples=model.corpus_count,epochs=model.epochs)
  # lower the learning rate by hand and keep it constant within the next call
  model.alpha -= 0.002
  model.min_alpha = model.alpha



In [120]:
# persist the trained Doc2Vec model to disk
model.save('/content/doc2vec.model')

In [121]:
# reload the saved Doc2Vec model from disk
d2v_model = gensim.models.doc2vec.Doc2Vec.load('/content/doc2vec.model')

In [122]:
# documents most similar to 'Sports_1.txt'; `dv` is the gensim 4 name of the
# document-vector store (`docvecs` is the deprecated gensim 3 spelling)
d2v_model.dv.most_similar('Sports_1.txt')

[('Politics_994.txt', 0.6021988391876221),
 ('Sports_360.txt', 0.592275857925415),
 ('Sports_555.txt', 0.5828407406806946),
 ('Sports_407.txt', 0.5800635814666748),
 ('Sports_324.txt', 0.5778862237930298),
 ('Sports_723.txt', 0.5773323774337769),
 ('Sports_323.txt', 0.5717895030975342),
 ('Sports_480.txt', 0.5717713236808777),
 ('Sports_918.txt', 0.5717670321464539),
 ('Sports_424.txt', 0.566116213798523)]