In [2]:
import collections
import math
import os
import errno
import random
import zipfile
import numpy as np
from six.moves import urllib
from six.moves import xrange
import tensorflow as tf

  from ._conv import register_converters as _register_converters


In [4]:
data_dir = "word2vec_data/words"
data_url = "http://mattmahoney.net/dc/text8.zip"

In [5]:
def fetch_words_data(url=data_url, words_data=data_dir, zip_file_name='words.zip'):
    os.makedirs(words_data, exist_ok=True)
    zip_path = os.path.join(words_data, zip_file_name)
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
    
    with zipfile.ZipFile(zip_path) as f:
        data = f.read(f.namelist()[0])
    
    #return a list of all the words in the data source
    return data.decode('ascii').split()

In [6]:
words = fetch_words_data()

In [9]:
len(words)

17005207

In [10]:
words[9000:9040]

['feelings',
 'and',
 'the',
 'auditory',
 'system',
 'of',
 'a',
 'person',
 'without',
 'autism',
 'often',
 'cannot',
 'sense',
 'the',
 'fluctuations',
 'what',
 'seems',
 'to',
 'non',
 'autistic',
 'people',
 'like',
 'a',
 'high',
 'pitched',
 'sing',
 'song',
 'or',
 'flat',
 'robot',
 'like',
 'voice',
 'is',
 'common',
 'in',
 'autistic',
 'children',
 'some',
 'autistic',
 'children']

In [11]:
for w in words[9000:9040]:
    print(w, end=' ')

feelings and the auditory system of a person without autism often cannot sense the fluctuations what seems to non autistic people like a high pitched sing song or flat robot like voice is common in autistic children some autistic children 

In [13]:
from collections import Counter

In [14]:
#counter sample
my_list = ['one', 'two', 'two']
Counter(my_list)

Counter({'one': 1, 'two': 2})

In [24]:
x = Counter(my_list).most_common(2)

In [32]:
def create_counts(vocab_size=50000, _words=words):
    vocab = Counter(_words).most_common(vocab_size)
    vocab = np.array([word for word, _ in vocab])
    vocab_lookup = {word:index for index, word in enumerate(vocab)}
    data = np.array([vocab_lookup.get(word) for word in words])
    return data, vocab

In [71]:
data, vocabulary = create_counts()

In [35]:
data.shape

(17005207,)

In [37]:
vocabulary.shape

(50000,)

In [38]:
words[100]

'interpretations'

In [39]:
data[100]

4191

In [40]:
vocabulary[4191]

'interpretations'

In [72]:
def generate_batch(batch_size, num_skips, window_size):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * window_size
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size,1), dtype=np.int32)
    span = 2 * window_size + 1 # [window_size target window_size]
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index+span])
    data_index += span
    for i in range(batch_size // num_skips):
        target = window_size # target label at the center of the buffer
        targets_to_avoid = [window_size]
        for j in range(num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span-1)
            targets_to_avoid.append(target)
            batch[i*num_skips + j] = buffer[window_size]
            labels[i*num_skips+j,0] = buffer[target]
    if data_index == len(data):
        buffer[:] = data[:span]
        data_index = span
    else:
        buffer.append(data[data_index])
        data_index +=1
    #backtrack a little bit to avoid skipping words in the end of a batch
    data_index = (data_index + len(data)-span)%len(data)
    return batch, labels

### Constants

In [73]:
batch_size = 32
embedding_size = 150 #How many dimensions we want from the model, dense of the information
window_size = 1 #how many words to consider
num_skips = 2 # how many times to reuse an input to generate a label

In [74]:
valid_size = 16 # random set of words wo evaluate similarity on and thenwe are going to pick samples

In [75]:
valid_window = 100 
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# We were actually limiting the validation samples

In [76]:
num_sampled = 64
learning_rate = 0.01
vocab_size = 50000

### Placeholders

In [77]:
tf.reset_default_graph()
train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[batch_size,1])


### Constants and Variables

In [78]:
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
init_embeds = tf.random_uniform([vocab_size, embedding_size], minval=-1.0, maxval=1.0)
embeddings = tf.Variable(init_embeds)
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

### Loss Function

In [79]:
nce_weights = tf.Variable(tf.truncated_normal([vocab_size, embedding_size], stddev=1.0/np.sqrt(embedding_size)))

In [80]:
nce_biases = tf.Variable(tf.zeros(vocab_size))

In [81]:
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights,nce_biases,train_labels, embed, num_sampled, vocab_size))

### Train - Optimizer

In [82]:
optimizer = tf.train.AdamOptimizer(learning_rate=1.0)
trainer = optimizer.minimize(loss)

### Cosine Similarity

In [83]:
# Compute the cosine similarity between minibatch examples and all embeddings
norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), axis=1, keep_dims=True))
normalized_embeddings = embeddings/norm
valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

In [84]:
#global_var
data_index = 0

In [85]:
num_steps = 5000
init = tf.global_variables_initializer()
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.85)
with tf.Session(config=tf.ConfigProto(gpu_options =gpu_options)) as sess:
    sess.run(init)
    average_loss = 0
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, window_size)
        feed_dict = {train_inputs: batch_inputs,
                    train_labels:batch_labels}
        
        _, loss_val = sess.run([trainer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        
        if step % 1000 == 0:
            if step > 0:
                average_loss = average_loss/1000
            
            print('Average Loss at step {} is {} loss'.format(step, average_loss))
            average_loss = 0
        final_embeddings = normalized_embeddings.eval()

Average Loss at step 0 is 283.92822265625 loss


TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [68]:
num_skips

2

In [69]:
window_size

1

In [111]:
generate_batch(batch_size, num_skips, window_size)

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32), array([[371],
        [708],
        [708],
        [371],
        [371],
        [708],
        [708],
        [371],
        [371],
        [708],
        [371],
        [708],
        [371],
        [708],
        [371],
        [708],
        [371],
        [708],
        [371],
        [708],
        [371],
        [708],
        [708],
        [371],
        [371],
        [708],
        [371],
        [708],
        [708],
        [371],
        [371],
        [708]], dtype=int32))