In [0]:
!mkdir data
!wget http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt -O ./data/alice_in_wonderland.txt

--2018-11-03 01:57:53--  http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt
Resolving www.umich.edu (www.umich.edu)... 141.211.243.251, 2607:f018:1:1::1
Connecting to www.umich.edu (www.umich.edu)|141.211.243.251|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 148545 (145K) [text/plain]
Saving to: ‘./data/alice_in_wonderland.txt’


2018-11-03 01:57:53 (1.08 MB/s) - ‘./data/alice_in_wonderland.txt’ saved [148545/148545]



In [1]:
# -*- coding: utf-8 -*-
from __future__ import print_function
import operator

import nltk
import numpy as np
from keras.callbacks import TensorBoard
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer, one_hot
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import codecs

np.random.seed(42)

LOG_DIR = './logs'
BATCH_SIZE = 128
NUM_EPOCHS = 20

with codecs.open("./data/alice_in_wonderland.txt", "r", encoding="utf-8") as f:
    lines = [line.strip() for line in f if len(line) != 0]

try:
    sents = nltk.sent_tokenize(" ".join(lines))
except LookupError:
    print("Englisth tokenize does not downloaded. So download it.")
    nltk.download("punkt")
    sents = nltk.sent_tokenize(" ".join(lines))


tokenizer = Tokenizer(5000)  # use top 5000 words only
tokens = tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1

xs = []
ys = []
for sent in sents:
    embedding = one_hot(sent, vocab_size)
    triples = list(nltk.trigrams(embedding))
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)

ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

Using TensorFlow backend.


(34402, 2653) (14744, 2653) (34402, 2653) (14744, 2653)


In [2]:
model = Sequential()
model.add(Dense(300, input_shape=(Xtrain.shape[1],)))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(Ytrain.shape[1]))
model.add(Activation("softmax"))
model.summary()

model.compile(optimizer="rmsprop", loss="categorical_crossentropy", 
              metrics=["accuracy"])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 300)               796200    
_________________________________________________________________
activation_1 (Activation)    (None, 300)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2653)              798553    
_________________________________________________________________
activation_2 (Activation)    (None, 2653)              0         
Total params: 1,594,753
Trainable params: 1,594,753
Non-trainable params: 0
_________________________________________________________________


In [3]:
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE, 
                    epochs=NUM_EPOCHS, verbose=1,
                    callbacks=[TensorBoard(LOG_DIR)],
                    validation_data=(Xtest, Ytest))

Train on 34402 samples, validate on 14744 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [4]:
# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

Test score: 5.493, accuracy: 0.110


In [5]:
# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v: k for k, v in word2idx.items()}

# retrieve the weights from the first dense layer. This will convert
# the input vector from a one-hot sum of two words to a dense 300 
# dimensional representation
W, b = model.layers[0].get_weights()

idx2emb = {}    
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.fit_transform(np.array(wid)).todense()
    vec_emb = np.dot(vec_in, W)
    idx2emb[wid] = vec_emb

for word in ["stupid", "alice", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(((wid, i), 
                         cosine_distances(source_emb, target_emb)))
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))

stupid => here, tale, slowly, hurrying, quarrelled, forgetting, crowded, carried, lefthand, enormous
alice => her, speed, trials, happening, low, she, eyelids, geography, by, quiver
succeeded => doors, murder, irritated, memory, struck, respect, conquest, branches, leaders, she'll
