### What is this?
In this notebook, I'm playing around Word Embedding by reading [Deep Language Modeling for Question Answering using Keras](http://ben.bolte.cc/blog/2016/keras-language-modeling.html).

In [3]:
### Run sample code in the URL and add some comments
import itertools
import numpy as np

sentences = '''
sam is red
hannah not red
hannah is green
bob is green
bob not red
sam not green
sarah is red
sarah not green'''.strip().split('\n')
print("sentences", sentences)

# indexes for green ones
is_green = np.asarray([[0, 1, 1, 1, 1, 0, 0, 0]], dtype='int32').T
print("is_green", is_green)

sentences ['sam is red', 'hannah not red', 'hannah is green', 'bob is green', 'bob not red', 'sam not green', 'sarah is red', 'sarah not green']
is_green [[0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]]


In [5]:
lemma = lambda x: x.strip().lower().split(' ')
sentences_lemmatized = [lemma(sentence) for sentence in sentences]
print("sentences_lemmatized", sentences_lemmatized)
words = set(itertools.chain(*sentences_lemmatized))
print("words", words)
# set(['boy', 'fed', 'ate', 'cat', 'kicked', 'hat'])

sentences_lemmatized [['sam', 'is', 'red'], ['hannah', 'not', 'red'], ['hannah', 'is', 'green'], ['bob', 'is', 'green'], ['bob', 'not', 'red'], ['sam', 'not', 'green'], ['sarah', 'is', 'red'], ['sarah', 'not', 'green']]
words {'sam', 'sarah', 'red', 'is', 'bob', 'green', 'not', 'hannah'}


In [7]:
# dictionaries for converting words to integers and vice versa
word2idx = dict((v, i) for i, v in enumerate(words))
print("word2idx", word2idx)

idx2word = list(words)
print("idx2word", idx2word)


word2idx {'sarah': 1, 'is': 3, 'sam': 0, 'bob': 4, 'not': 6, 'green': 5, 'red': 2, 'hannah': 7}
idx2word ['sam', 'sarah', 'red', 'is', 'bob', 'green', 'not', 'hannah']


In [9]:
# convert the sentences a numpy array
to_idx = lambda x: [word2idx[word] for word in x]
sentences_idx = [to_idx(sentence) for sentence in sentences_lemmatized]
print("sentences_idx", sentences_idx)
sentences_array = np.asarray(sentences_idx, dtype='int32')
print("sentences_array", sentences_array)

sentences_idx [[0, 3, 2], [7, 6, 2], [7, 3, 5], [4, 3, 5], [4, 6, 2], [0, 6, 5], [1, 3, 2], [1, 6, 5]]
sentences_array [[0 3 2]
 [7 6 2]
 [7 3 5]
 [4 3 5]
 [4 6 2]
 [0 6 5]
 [1 3 2]
 [1 6 5]]


In [11]:
# parameters for the model
sentence_maxlen = 3
n_words = len(words)
n_embed_dims = 3

In [12]:
# put together a model to predict 
from keras.layers import Input, Embedding, merge, Flatten, SimpleRNN
from keras.models import Model

Using TensorFlow backend.


In [15]:
input_sentence = Input(shape=(sentence_maxlen,), dtype='int32')

# what is n_embed_dims for?
input_embedding = Embedding(n_words, n_embed_dims)(input_sentence)
color_prediction = SimpleRNN(1)(input_embedding)

In [22]:
predict_green = Model(input=[input_sentence], output=[color_prediction])
predict_green.compile(optimizer='sgd', loss='binary_crossentropy')
print("compiled")

compiled


In [25]:
# fit the model to predict what color each person is
predict_green.fit([sentences_array], [is_green], nb_epoch=500, verbose=0)
embeddings = predict_green.layers[1].W.value()
print(embeddings)

Tensor("embedding_4_W/read:0", shape=(8, 3), dtype=float32)


In [31]:
# print out the embedding vector associated with each word
for i in range(n_words):
	print('{}: {}'.format(idx2word[i], embeddings[i]))

sam: Tensor("strided_slice_15:0", shape=(3,), dtype=float32)
sarah: Tensor("strided_slice_16:0", shape=(3,), dtype=float32)
red: Tensor("strided_slice_17:0", shape=(3,), dtype=float32)
is: Tensor("strided_slice_18:0", shape=(3,), dtype=float32)
bob: Tensor("strided_slice_19:0", shape=(3,), dtype=float32)
green: Tensor("strided_slice_20:0", shape=(3,), dtype=float32)
not: Tensor("strided_slice_21:0", shape=(3,), dtype=float32)
hannah: Tensor("strided_slice_22:0", shape=(3,), dtype=float32)


![hoge](http://ben.bolte.cc/resources/attention_rnn/word_vectors.png)