In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import math
import os
import random
from tempfile import gettempdir

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

# Step 1: Read data
filename = 'orginal_corpus.txt'
# Read the data into a list of strings.
def read_data(filename):
    with open(filename, 'r') as f:
        data = f.read().split('\n')
    return data

all_words = read_data(filename)
vocabulary = all_words[0].split()
print('Data size', len(vocabulary))


Data size 17005207


In [2]:
# Step 2: Build the dictionary and replace rare words with UNK token.
vocabulary_size = 5000000


def build_dataset(words, n_words):
    """Process raw inputs into a dataset."""
#     count = [['UNK', -1]]
    count = []
    count.extend(collections.Counter(words).most_common(n_words - 1))
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
#     data = list()
#     unk_count = 0
#     for word in words:
#         index = dictionary.get(word, 0)
#         if index == 0:  # dictionary['UNK']
#             unk_count += 1
#         data.append(index)
#     count[0][1] = unk_count
    reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return count, dictionary, reversed_dictionary

# Filling 4 global variables:
# data - list of codes (integers from 0 to vocabulary_size-1).
#   This is the original text but words are replaced by their codes
# count - map of words(strings) to count of occurrences
# dictionary - map of words(strings) to their codes(integers)
# reverse_dictionary - maps codes(integers) to words(strings)
count, dictionary, reverse_dictionary = build_dataset(vocabulary,
                                                            vocabulary_size)

In [9]:
# extract the sentences that include the target word

def tag_word(words, target_word, skip_window):
#     i = 1
    count = 0
    new_vocabulary = list()

    for word in words:
        if word == target_word:
#             word = word + str(i)
#             i+=1
            f.write(' '.join(words[count-skip_window:count]))
            f.write(' ')
            f.write(' '.join(words[count+1:count+1+skip_window]))
            f.write(' ')
        new_vocabulary.append(word)
        count+=1
    return new_vocabulary

with open('sentence_woman.txt', 'w') as f:
    new_vocabulary = tag_word(vocabulary, 'woman', 5)
    
del new_vocabulary

In [12]:
# del vocabulary  # Hint to reduce memory.
print('Most common words (+UNK)', count[:500])
# print('Sample data', [reverse_dictionary[i] for i in data[:10]])

Most common words (+UNK) [('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430), ('two', 192644), ('is', 183153), ('as', 131815), ('eight', 125285), ('for', 118445), ('s', 116710), ('five', 115789), ('three', 114775), ('was', 112807), ('by', 111831), ('that', 109510), ('four', 108182), ('six', 102145), ('seven', 99683), ('with', 95603), ('on', 91250), ('are', 76527), ('it', 73334), ('from', 72871), ('or', 68945), ('his', 62603), ('an', 61925), ('be', 61281), ('this', 58832), ('which', 54788), ('at', 54576), ('he', 53573), ('also', 44358), ('not', 44033), ('have', 39712), ('were', 39086), ('has', 37866), ('but', 35358), ('other', 32433), ('their', 31523), ('its', 29567), ('first', 28810), ('they', 28553), ('some', 28161), ('had', 28100), ('all', 26229), ('more', 26223), ('most', 25563), ('can', 25519), ('been', 25383), ('such', 24413), ('many', 24096), ('who', 23997), ('new', 23770), ('used',

In [10]:

train_data = read_data("sentence_woman.txt")
train_data = train_data[0].split()
trained_word = 'woman'

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(train_data, trained_word):
    all_input = np.ndarray(shape=(len(train_data)), dtype=np.int32)
    labels = np.ndarray(shape=(len(train_data), 1), dtype=np.int32)
    all_input.fill(dictionary[trained_word])
    for i, word in enumerate(train_data):
        labels[i, 0] = dictionary[word]
    return all_input, labels

# batch, labels = generate_batch(train_data, trained_word, 8)
# for i in range(8):
#     print(batch[i], reverse_dictionary[batch[i]],
#         '->', labels[i, 0], reverse_dictionary[labels[i, 0]])


In [11]:
# Step 4: Build and train a skip-gram model.

batch_size = 100
embedding_size = 128  # Dimension of the embedding vector.
skip_window = 5       # How many words to consider left and right.
num_skips = 2         # How many times to reuse an input to generate a label.
num_sampled = 64      # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 2     # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_tests = np.random.choice(valid_window, valid_size, replace=False)
valid_examples = np.append(valid_tests, dictionary['woman'])
print(valid_examples[2])

graph = tf.Graph()

with graph.as_default():

    # Input data.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
    # Look up embeddings for inputs.
        embeddings = tf.Variable(
            tf.random_uniform([len(dictionary), embedding_size], -1.0, 1.0))
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                     biases=nce_biases,
                     labels=train_labels,
                     inputs=embed,
                     num_sampled=num_sampled,
                     num_classes=vocabulary_size))

    # Construct the SGD optimizer using a learning rate of 1.0.
    optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    # Add variable initializer.
    init = tf.global_variables_initializer()

1013


In [13]:
# Step 5: Begin training.
num_steps = 10001

with tf.Session(graph=graph) as session:
    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    inputs, labels = generate_batch(train_data, trained_word)
    for step in xrange(num_steps):
        batch_inputs = np.ndarray(shape=(batch_size), dtype=np.int32)
#         batch_labels = np.zeros((batch_size, n_classes))
        batch_labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        for batch_num in xrange(int(len(inputs)/batch_size)+1):
            if (batch_num+1)*batch_size <= len(inputs):
                batch_inputs = inputs[batch_num*batch_size:(batch_num+1)*batch_size]
                batch_labels[:, 0] = labels[batch_num*batch_size:(batch_num+1)*batch_size, 0]
#                 batch_labels[range(batch_size), labels[batch_num*batch_size:(batch_num+1)*batch_size, 0]] = 1
            else:
                batch_inputs = inputs[-batch_size:]
                batch_labels[:, 0] = labels[-batch_size:, 0]
#                 batch_labels[range(batch_size), labels[-batch_size:, 0]] = 1
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
#             print(loss_val)
            average_loss += loss_val
        
#         for i in xrange(valid_size+1):
#             check_embeddings = valid_embeddings.eval()
#             valid_word = reverse_dictionary[valid_examples[i]]
#             print(check_embeddings[i])
#             print(valid_word)
        
        if step % 200 == 0:
            if step > 0:
                average_loss /= (200*(int(len(inputs)/batch_size)+1))
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

#         # Note that this is expensive (~20% slowdown if computed every 500 steps)
#         if step % 10000 == 0:
#             sim = similarity.eval()
#             for i in xrange(valid_size):
#                 valid_word = reverse_dictionary[valid_examples[i]]
#                 top_k = 8  # number of nearest neighbors
#                 nearest = (-sim[i, :]).argsort()[1:top_k + 1]
#                 log_str = 'Nearest to %s:' % valid_word
#             for k in xrange(top_k):
#                 close_word = reverse_dictionary[nearest[k]]
#                 log_str = '%s %s,' % (log_str, close_word)
#             print(log_str)
    final_embeddings = normalized_embeddings.eval()

Initialized
Average loss at step  0 :  67926.5405884
Average loss at step  200 :  209.290919073
Average loss at step  400 :  147.076715871
Average loss at step  600 :  122.053367651
Average loss at step  800 :  104.809137571
Average loss at step  1000 :  92.8199263941
Average loss at step  1200 :  83.2380847703
Average loss at step  1400 :  75.4570656467
Average loss at step  1600 :  68.7821472524
Average loss at step  1800 :  63.2852490649
Average loss at step  2000 :  58.6979804171
Average loss at step  2200 :  54.6283764818
Average loss at step  2400 :  50.8451898168
Average loss at step  2600 :  47.4702152351
Average loss at step  2800 :  44.1339972874
Average loss at step  3000 :  41.6467426927
Average loss at step  3200 :  39.241179752
Average loss at step  3400 :  37.0988287364
Average loss at step  3600 :  35.0281831554
Average loss at step  3800 :  33.1837953808
Average loss at step  4000 :  31.6068613662
Average loss at step  4200 :  29.8547594323
Average loss at step  4400 :

In [14]:
with open('woman_vector.txt', 'w') as f:
    f.write(str(final_embeddings[dictionary['woman']]))

In [38]:
print(final_embeddings[dictionary['woman']])
woman = final_embeddings[dictionary['woman']]
woman = np.array(woman)

[ 0.0357807   0.1857841   0.01946772  0.0205264  -0.01824022  0.01648284
 -0.10532656  0.10455012 -0.0506693  -0.02614136 -0.06788473  0.067092
 -0.07097854 -0.01832454  0.17562512 -0.09220994 -0.08865318 -0.01852003
 -0.1891235  -0.08961862  0.01966646  0.05443218  0.03370234 -0.09657079
 -0.11356673  0.01233688 -0.00444836 -0.06135167  0.04103355  0.0118425
  0.03579991 -0.1657719  -0.00836371  0.0686646  -0.00647543  0.04998835
 -0.11685289 -0.03666879  0.00310926 -0.0009377   0.08625925 -0.03770928
 -0.13039523  0.0562083  -0.07692081 -0.08389106 -0.08173437 -0.10759465
 -0.07719949 -0.09055722  0.16130936  0.04092021 -0.0648022  -0.01815654
  0.11379946 -0.0024388  -0.00265408 -0.11260857 -0.0980937   0.00605923
  0.05985097  0.07598352  0.06128407 -0.05992256 -0.05869611 -0.02000132
  0.08525773 -0.0696595  -0.00175977 -0.01509145  0.0749114  -0.00337901
  0.08225035  0.09047309 -0.09357306  0.04469706 -0.01230907 -0.11616122
  0.02150053  0.00456217 -0.15452112 -0.04630809  0.12

In [39]:
king = '0.06426143 -0.00837654  0.1980134  -0.09742513 -0.10437928 -0.00150868\
  0.09878582  0.06168188 -0.07201536 -0.11429717 -0.07504471  0.02948793\
  0.07345533  0.04259725  0.11041805 -0.00494413  0.07051653 -0.11891647\
 -0.01062964  0.00798341 -0.02633937 -0.07363377  0.15297718  0.1345118\
  0.03939788  0.12761478 -0.03470821  0.12602098  0.00810388  0.0877671\
  0.00168014 -0.03119589  0.17897098  0.10663807 -0.06820287  0.14468075\
 -0.020985   -0.00459365 -0.10359303 -0.05125836  0.09031818 -0.13061424\
  0.1480469  -0.1074915  -0.01883516  0.0162647   0.02106079 -0.00410639\
 -0.11259528 -0.1454927  -0.06305808  0.0483107   0.06163573 -0.08252724\
  0.12252723 -0.11078162 -0.02730912 -0.0281108  -0.04359945  0.13812022\
 -0.02614157  0.09190316 -0.0052021  -0.04769276  0.09219858  0.07776697\
 -0.04105809 -0.06728956  0.06101124 -0.15189616 -0.04483667 -0.09586494\
 -0.05187369 -0.02582284  0.11297669 -0.04946613 -0.14020644 -0.08537365\
 -0.13278346  0.05018449  0.07249071  0.07443805 -0.14362812  0.15985423\
  0.02290909 -0.08001291  0.1339239  -0.01899522  0.0529473   0.12080525\
 -0.13830988  0.00823423  0.03806611  0.00463596  0.08813316  0.03474797\
 -0.04699161 -0.02391873 -0.07227785  0.16362631  0.02726053 -0.03693298\
 -0.01087115 -0.06992907 -0.03544131 -0.06004997 -0.07407713 -0.20325746\
  0.03689325 -0.01923681 -0.13377549 -0.01261471  0.01282861  0.06298161\
  0.13330215 -0.09261915 -0.03571277 -0.13735898 -0.07306235 -0.05227819\
  0.15543039  0.12054769 -0.06806238  0.1012755  -0.13383301  0.06115448\
  0.04878619 -0.10003781'
king = king.split()
king = map(float, king)
king = np.array(king)
print(king)

man = '-0.14401263  0.03133213  0.10876009  0.16138153  0.01993224  0.04053933\
  0.02407939  0.11829058 -0.09660213  0.05322566  0.15185139 -0.06808496\
 -0.08030491  0.00205176 -0.05553271  0.13937525 -0.04895727 -0.0200414\
 -0.06129832  0.11245506 -0.02979986  0.05640989  0.00764067  0.01651493\
  0.00163749 -0.06755417 -0.06297332  0.13367759  0.02575959 -0.06324068\
 -0.02061389 -0.12079421 -0.04861513 -0.06412487  0.22672339 -0.14655454\
  0.04374257  0.08781171 -0.00547297  0.05237079 -0.17510109  0.01255522\
 -0.16211067  0.14800885  0.04263682 -0.01509207 -0.03117405  0.10070733\
  0.05270626 -0.07679772  0.10136274 -0.03757891  0.08042792 -0.08155028\
 -0.15516576 -0.0857062  -0.05004138 -0.15370208 -0.08123562  0.0412583\
  0.00415061 -0.09087764  0.08707889 -0.05980315  0.1635002   0.10163325\
  0.00823487  0.07759993  0.04444237 -0.11905646 -0.04587984  0.02590295\
  0.02649482  0.07092221 -0.15362558  0.08152816  0.18418971 -0.05635915\
  0.02716513  0.00310327 -0.16568978  0.00091071 -0.19549216  0.0342464\
  0.06616123 -0.01854955  0.01579936  0.04912784 -0.04897791 -0.02565535\
  0.05528686  0.0444776  -0.01629809 -0.08517649  0.04435289 -0.00282378\
 -0.06897477 -0.06155508 -0.11135564 -0.13872746  0.05074596 -0.17721076\
  0.08092945  0.02020106  0.07460609 -0.05015494 -0.1300696   0.05390926\
  0.00175319  0.09611303  0.19114007 -0.05178848  0.05972478 -0.12104108\
 -0.03457611 -0.1263997   0.11070186  0.05045658 -0.08912141 -0.03569847\
 -0.11920138  0.02610456 -0.01187746  0.02057748 -0.0062471  -0.02288416\
  0.03060456 -0.14499602'
man = man.split()
man = map(float, man)
man = np.array(man)
print(man)


queen = '0.10719869 -0.10232529 -0.02304027 -0.02485728  0.17746936 -0.16133316\
 -0.09617456 -0.08252326  0.03485161 -0.06539463 -0.05119734 -0.0071219\
  0.04664997  0.07755207 -0.0376812  -0.05126946  0.04974871 -0.09130801\
  0.02200206 -0.11675026  0.05019299  0.03069795  0.04541247 -0.06323202\
  0.00760292  0.00975615  0.06263665 -0.15049373  0.10783202 -0.14120691\
  0.12385137  0.15252414 -0.04066575 -0.01268648 -0.18778183 -0.09001921\
  0.10144432  0.03953693 -0.05206658  0.08898162 -0.06332143 -0.03476854\
  0.03007286  0.15931943  0.00775215  0.03130399  0.05636695 -0.04932799\
  0.00386375 -0.03799838  0.07498509  0.05205622  0.14855604 -0.15902062\
  0.01181984  0.1059057  -0.05600962  0.00615451 -0.12136336  0.05390648\
  0.06225348 -0.05576621  0.06548391  0.117924    0.04091459 -0.09727937\
 -0.12719241  0.07945643  0.00403882 -0.08504421  0.01977304  0.01947873\
  0.02801108  0.04144734  0.06505365  0.04846439 -0.03233812  0.0376178\
 -0.03111132 -0.14116828  0.04352442  0.03442268  0.09748826 -0.10943903\
 -0.02274107 -0.03172036 -0.04356686 -0.02012795  0.14008713 -0.1183772\
 -0.08421071 -0.03655564  0.17045356 -0.02569478 -0.15899356 -0.21644557\
 -0.01376982 -0.06579022 -0.15950891 -0.20833515 -0.13493483 -0.02202897\
 -0.05435896 -0.08009836 -0.05089762 -0.12733521  0.09836871  0.05449677\
  0.10958663 -0.00039016 -0.05642409 -0.14433326  0.0749191   0.07995714\
  0.05160326  0.09664808 -0.01128155 -0.05913845  0.02192012 -0.14660437\
  0.10578209 -0.04522621  0.1281876   0.10627382  0.02222264 -0.15014711\
 -0.03498883  0.03361606'
queen = queen.split()
queen = map(float, queen)
queen = np.array(queen)
print(queen)
from numpy import linalg as LA
print(LA.norm(queen))

[ 0.06426143 -0.00837654  0.1980134  -0.09742513 -0.10437928 -0.00150868
  0.09878582  0.06168188 -0.07201536 -0.11429717 -0.07504471  0.02948793
  0.07345533  0.04259725  0.11041805 -0.00494413  0.07051653 -0.11891647
 -0.01062964  0.00798341 -0.02633937 -0.07363377  0.15297718  0.1345118
  0.03939788  0.12761478 -0.03470821  0.12602098  0.00810388  0.0877671
  0.00168014 -0.03119589  0.17897098  0.10663807 -0.06820287  0.14468075
 -0.020985   -0.00459365 -0.10359303 -0.05125836  0.09031818 -0.13061424
  0.1480469  -0.1074915  -0.01883516  0.0162647   0.02106079 -0.00410639
 -0.11259528 -0.1454927  -0.06305808  0.0483107   0.06163573 -0.08252724
  0.12252723 -0.11078162 -0.02730912 -0.0281108  -0.04359945  0.13812022
 -0.02614157  0.09190316 -0.0052021  -0.04769276  0.09219858  0.07776697
 -0.04105809 -0.06728956  0.06101124 -0.15189616 -0.04483667 -0.09586494
 -0.05187369 -0.02582284  0.11297669 -0.04946613 -0.14020644 -0.08537365
 -0.13278346  0.05018449  0.07249071  0.07443805 -0.1

In [45]:
np.dot(king, woman)

-0.018674117491851477