# Word2vec : Skipgram with Negative Sampling

In [1]:
import io
import os
import re
import time
import warnings
import string
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tqdm

import spacy
import nltk

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_datasets as tfds

warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johanattia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
tqdm.notebook.tqdm().pandas()

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

# Load Data

In [4]:
train_dataset = pd.read_json('imdb_train_dataset.json')

print(train_dataset.shape)
train_dataset.head(2)


(25000, 6)


Unnamed: 0,review_id,review,rating,dataset,sentiment,review_tokens
0,3891,"I think that my favorite part of this movie, t...",3,train,0,"[think, favorite, movie, exemplify, sheer, poi..."
1,11091,Denzel is about the only thing that is right i...,2,train,0,"[denzel, thing, right, movie, maybe, early, st..."


In [5]:
test_dataset = pd.read_json('imdb_test_dataset.json')

print(test_dataset.shape)
test_dataset.head(2)

(25000, 6)


Unnamed: 0,review_id,review,rating,dataset,sentiment,review_tokens
0,3684,The gate to Hell has opened up under Moscow. A...,3,test,0,"[gate, hell, open, moscow, priest, play, vince..."
1,6181,"Ok, so I saw this movie at this year's Sundanc...",4,test,0,"[ok, see, movie, year, sundance, sorely, unimp..."


# Prepare word2vec dataset
Cleaning

In [6]:
def clean_text(text):
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'[-\(\)\"#\/@<>\{\}\-=~|]', ' ', text) # r'[-\(\)\"#\/@;:<>\{\}\-=~|\.\?]'
    text = re.sub(r'(\s)+', ' ', text)
    return text

In [7]:
train_dataset['review'][1]

'Denzel is about the only thing that is right in this movie.<br /><br />Maybe once in an early stage this was a better movie. Someone decided to cut some action and plot points into the beginning of the movie, giving away most of the story line in about the first 5 minutes. That and ruining whatever build up in pace and rhythm the movie might have had before.<br /><br />So first it confuses you and then it puts you off. The dramatization pushes beyond suspension of disbelieve.<br /><br />Of course there is that feeling of great injustice and anger that movies like this potentially manage to instill in viewers. Granted, it does that so if you are looking for that ... knock yourself out.'

In [8]:
clean_text(train_dataset['review'][1])

'Denzel is about the only thing that is right in this movie. Maybe once in an early stage this was a better movie. Someone decided to cut some action and plot points into the beginning of the movie, giving away most of the story line in about the first 5 minutes. That and ruining whatever build up in pace and rhythm the movie might have had before. So first it confuses you and then it puts you off. The dramatization pushes beyond suspension of disbelieve. Of course there is that feeling of great injustice and anger that movies like this potentially manage to instill in viewers. Granted, it does that so if you are looking for that ... knock yourself out.'

In [9]:
train_dataset['cleaned_review'] = train_dataset['review'].progress_apply(clean_text)
test_dataset['cleaned_review'] = test_dataset['review'].progress_apply(clean_text)

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




Tokenization & sentences

In [10]:
tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()

nltk.tokenize.sent_tokenize(train_dataset['cleaned_review'][1])

['Denzel is about the only thing that is right in this movie.',
 'Maybe once in an early stage this was a better movie.',
 'Someone decided to cut some action and plot points into the beginning of the movie, giving away most of the story line in about the first 5 minutes.',
 'That and ruining whatever build up in pace and rhythm the movie might have had before.',
 'So first it confuses you and then it puts you off.',
 'The dramatization pushes beyond suspension of disbelieve.',
 'Of course there is that feeling of great injustice and anger that movies like this potentially manage to instill in viewers.',
 'Granted, it does that so if you are looking for that ... knock yourself out.']

In [11]:
all_sentences = []

for review in tqdm.notebook.tqdm(train_dataset['cleaned_review']):
    
    tokens = tokenizer.tokenize(review)
    vocabulary_set.update(tokens)
    
    sentences = nltk.tokenize.sent_tokenize(review)
    all_sentences.extend(sentences)
    
for review in tqdm.notebook.tqdm(test_dataset['cleaned_review']):
    
    tokens = tokenizer.tokenize(review)
    vocabulary_set.update(tokens)
    
    sentences = nltk.tokenize.sent_tokenize(review)
    all_sentences.extend(sentences)

HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))




In [12]:
vocab_size = len(vocabulary_set)
token_encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

token_encoder.encode(train_dataset['cleaned_review'][1])

[95842,
 54367,
 27710,
 73834,
 114512,
 40889,
 129446,
 54367,
 114572,
 129415,
 120333,
 22225,
 110952,
 47243,
 129415,
 50043,
 82371,
 39922,
 120333,
 105262,
 26266,
 91186,
 22225,
 68382,
 105747,
 8823,
 31135,
 81563,
 10258,
 75625,
 31949,
 34401,
 5588,
 73834,
 116293,
 16060,
 73834,
 22225,
 41724,
 3426,
 29239,
 16060,
 73834,
 1786,
 67499,
 129415,
 27710,
 73834,
 113332,
 19474,
 114844,
 88131,
 75625,
 79529,
 41778,
 67676,
 80858,
 129415,
 84903,
 75625,
 45129,
 73834,
 22225,
 31844,
 40975,
 12581,
 450,
 91592,
 113332,
 27358,
 98660,
 41048,
 75625,
 62691,
 27358,
 124004,
 41048,
 61495,
 24603,
 119103,
 121237,
 79229,
 24597,
 16060,
 13542,
 100071,
 88546,
 35028,
 54367,
 129446,
 100319,
 16060,
 115750,
 501,
 75625,
 109413,
 129446,
 15509,
 15726,
 120333,
 79504,
 123378,
 8823,
 102996,
 129415,
 124890,
 100598,
 27358,
 71480,
 129446,
 96925,
 38101,
 41048,
 13354,
 117312,
 31535,
 129446,
 7319,
 94514,
 60173]

In [13]:
word_indices = token_encoder.encode(train_dataset['cleaned_review'][1])

sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=token_encoder.vocab_size, sampling_factor=1e-05)
tf.keras.preprocessing.sequence.skipgrams(
    word_indices, 
    token_encoder.vocab_size, 
    window_size=4, 
    negative_samples=3,
    shuffle=True,
    sampling_table=sampling_table
)

([[62691, 39938],
  [75625, 123940],
  [119103, 90200],
  [129415, 93360],
  [75625, 90652],
  [120333, 22225],
  [13354, 22988],
  [102996, 24459],
  [123378, 116173],
  [79529, 13847],
  [117312, 46629],
  [109413, 107118],
  [129446, 19222],
  [27358, 96925],
  [41048, 78281],
  [24603, 16226],
  [27710, 114512],
  [41048, 2747],
  [110952, 77462],
  [62691, 41048],
  [84903, 36684],
  [31135, 105747],
  [120333, 102996],
  [13542, 24597],
  [117312, 15576],
  [3426, 1786],
  [75625, 123256],
  [75625, 23464],
  [73834, 114512],
  [113332, 98660],
  [22225, 22226],
  [8823, 45712],
  [41778, 42282],
  [16060, 100319],
  [13354, 77120],
  [105747, 58416],
  [60173, 81183],
  [40975, 69981],
  [41778, 9320],
  [121237, 119103],
  [22225, 24139],
  [40975, 19673],
  [12581, 19548],
  [129415, 88729],
  [113332, 105181],
  [73834, 3426],
  [120333, 13273],
  [79229, 88703],
  [54367, 111697],
  [124890, 36822],
  [15509, 53031],
  [80858, 79529],
  [129415, 76883],
  [15509, 40964],
  [

In [14]:
all_pairs, all_labels = [], []
for sentence in tqdm.notebook.tqdm(all_sentences):
    
    word_indices = token_encoder.encode(sentence)
    pairs, labels = tf.keras.preprocessing.sequence.skipgrams(
        word_indices, 
        token_encoder.vocab_size, 
        window_size=4, 
        negative_samples=1,
        shuffle=True,
        sampling_table=sampling_table
    )
    all_pairs.extend(pairs)
    all_labels.extend(labels)

HBox(children=(FloatProgress(value=0.0, max=618066.0), HTML(value='')))




In [15]:
assert len(all_pairs) == len(all_labels)
len(all_pairs)

160103836

In [16]:
BUFFER_SIZE = 10000
BATCH_SIZE = 2048

X, y = tf.convert_to_tensor(all_pairs, dtype=tf.int32), tf.convert_to_tensor(all_labels, dtype=tf.int32)
word2vec_dataset = tf.data.Dataset.from_tensor_slices((X, y)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

# Word2vec Skipgram with Negative Sampling
Batch

In [17]:
words_pairs_batch, y_batch = next(iter(word2vec_dataset))
print(words_pairs_batch.shape, y_batch.shape, sep='\n')

(2048, 2)
(2048,)


In [18]:
words_pairs_batch

<tf.Tensor: shape=(2048, 2), dtype=int32, numpy=
array([[ 73834,  96094],
       [117312, 105968],
       [ 24603,  68430],
       ...,
       [ 64687,  38067],
       [ 54367,  27710],
       [ 75627, 129415]], dtype=int32)>

In [19]:
words_pairs_batch[:,0]

<tf.Tensor: shape=(2048,), dtype=int32, numpy=array([ 73834, 117312,  24603, ...,  64687,  54367,  75627], dtype=int32)>

Skipgram network

In [20]:
class SkipGram(tf.keras.Model):
    
    def __init__(self, d_model, token_vocab_size):
        super(SkipGram, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = token_vocab_size
        
        self.input_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='input_embedding')
        self.output_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='output_embedding')
        
    def call(self, words_pairs):
        center_word, context_word = tf.unstack(words_pairs_batch, axis=1)
        
        center_vector = self.input_embedding(center_word)
        context_vector = self.output_embedding(context_word)
        
        dot_product = tf.math.reduce_sum(tf.multiply(center_vector, context_vector), axis=1)
        logit = tf.expand_dims(dot_product, axis=1)
        
        return tf.nn.sigmoid(logit)
    
    def predict_step(self, words_indices):
        word_vectors = self.input_embedding(words_indices)
        average_vector = tf.reduce_mean(word_vectors, axis=1)
        
        return average_vector, word_vectors

In [21]:
skipgram = SkipGram(96, token_encoder.vocab_size)
assert skipgram(words_pairs_batch).shape[0] == BATCH_SIZE

In [22]:
average_vector, word_vectors = skipgram.predict(words_pairs_batch[:,0])
tf.print(average_vector.shape, word_vectors.shape)

(2048,) (2048, 96)


In [None]:
skipgram.compile(loss='binary_crossentropy', optimizer='adam', metrics= 'accuracy')
skipgram_history = skipgram.fit(
    word2vec_dataset, 
    epochs=2,
)

# Embedding Visualization

In [None]:
skipgram.summary()

In [None]:
weights = skipgram.get_layer('input_embedding').get_weights()[0]
print(weights, weights.shape, sep='\n')

In [None]:
out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for idx, word in enumerate(token_encoder.tokens):
    
    vec = weights[idx+1] # skip 0, it's padding.
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    
out_v.close()
out_m.close()

# How to use pre-trained words vectors for text classification ?

In [None]:
projection_dim = 
final_dim =

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(token_encoder.vocab_size, 300, mask_zero=True, weights=[weights], trainable=True),
    tf.keras.layers.Dense(projection_dim),
    tf.keras.layers.GlobalAveragePooling1D(data_format='channels_last'),
    tf.keras.layers.Dense(final_dim)
])

# To go further

In [None]:
class SkipGram(tf.keras.Model):
    
    def __init__(self, d_model, token_vocab_size, token_encoder):
        super(SkipGram, self).__init__()
        
        self.d_model = d_model
        self.vocab_size = token_vocab_size
        self.token_encoder = token_encoder
        
        self.input_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='input_embedding')
        self.output_embedding = tf.keras.layers.Embedding(self.vocab_size, self.d_model, name='output_embedding')
        
    def call(self, words_pairs):
        center_word, context_word = tf.unstack(words_pairs_batch, axis=1)
        
        center_vector = self.input_embedding(center_word)
        context_vector = self.output_embedding(context_word)
        
        dot_product = tf.math.reduce_sum(tf.multiply(center_vector, context_vector), axis=1)
        logit = tf.expand_dims(dot_product, axis=1)
        
        return tf.nn.sigmoid(logit)
    
    def predict_step(self, words_indices):
        word_vectors = self.input_embedding(words_indices)
        average_vector = tf.reduce_mean(word_vectors, axis=1)
        
        return average_vector, word_vectors
    
    def word_similarity(self, word1, word2):
        return
    
    def sentence_similarity(self, sent1, sent2)
        return
    
    def most_similar(self, word, top=10):
        return