In [18]:
import tensorflow as tf
import numpy as np
from nltk.tokenize import word_tokenize
from itertools import chain
import functools
import pickle

In [19]:
def save_pickle(d, path):
    with open(path, mode='wb') as f:
        pickle.dump(d, f)
def load_pickle(path):
    with open(path, mode='rb') as f:
        return pickle.load(f)

In [20]:
with open('data/movie_dialog_dataset/task1_qa/task1_qa_pipe_train.txt', 'r') as f:
    lines = f.readlines()
    data = []
    for l in lines:
        l = l.rstrip()
        turn, left = l.split(' ', 1)
        q, a = left.split('\t', 1)
        data.append((word_tokenize(q), a.split('|')))

In [24]:
max_query_size = max(map(len, chain.from_iterable(q for q, _ in data)))
vocab = sorted(functools.reduce(lambda x, y: x | y, (set(q + a) for q, a in data)))
print(len(vocab))
vocab_size = len(vocab) + 1 # +1 for nil word
w2i = dict((c, i+1) for i, c in enumerate(vocab))
w2i['__NIL__'] = 0
i2w = dict((i+1, c) for i, c in enumerate(vocab))
i2w[0] = '__NIL__'

65737


In [25]:
save_pickle(vocab, 'vocab.pickle')

In [23]:
save_pickle(data, 'data.pickle')

In [8]:
data[0]

(['what', 'movies', 'are', 'about', 'ginger', 'rogers', '?'],
 ['The Barkleys of Broadway', 'Kitty Foyle', 'Top Hat'])

In [26]:
def vectorize(data):
    Q, A = [], []
    for question, answer in data:
        q = [w2i[w] for w in question] + [0] * max(0, max_query_size - len(question)) # padding
        
        a = np.zeros(len(w2i))
#         print(answer)
        for ans in answer:
            a[w2i[ans]] = 1
        
        Q.append(q)
        A.append(a)
    
    return np.array(Q), np.array(A)

Q, A = vectorize(data[:10000])

In [27]:
Q[0]

array([65461, 62795, 58037, 57657, 60422, 63960,   481,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0])

In [None]:
save_pickle(Q, 'Q.pickle')
save_pickle(A, 'A.pickle')

## MLP w/ Tensorflow

In [12]:
# Parameters
learning_rate = 0.001
training_epochs = 15
batch_size = 128
display_step = 1

# Network Parameters
n_hidden_1 = 256 # 1st layer number of neurons
n_hidden_2 = 256 # 2nd layer number of neurons
# n_input = 784 # MNIST data input (img shape: 28*28)
n_input = max_query_size
# n_classes = 10 # MNIST total classes (0-9 digits)
n_classes = vocab_size

# tf Graph input
X = tf.placeholder("float", [None, n_input], name="X")
Y = tf.placeholder("float", [None, n_classes], name="Y")

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Create model
def multilayer_perceptron(x):
    # Hidden fully connected layer with 256 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Hidden fully connected layer with 256 neurons
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer


# Construct model
logits = multilayer_perceptron(X)


# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Initializing the variables
init = tf.global_variables_initializer()

In [13]:
print(len(Q), len(A))
print(X, Y)

1000 1000
Tensor("X:0", shape=(?, 23), dtype=float32) Tensor("Y:0", shape=(?, 65738), dtype=float32)


In [15]:
with tf.Session() as sess:
    sess.run(init)
    n_train = len(Q)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_train/batch_size)
        # Loop over all batches
#         for i in range(total_batch):
        for start in range(0, n_train, batch_size):
            end = start + batch_size
#             batch_x, batch_y = mnist.train.next_batch(batch_size)
            batch_x, batch_y = Q[start:end], A[start:end]
    
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
            
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(avg_cost))
    print("Optimization Finished!")

    # Test model
    pred = tf.nn.softmax(logits)  # Apply softmax to logits
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
#     print("Accuracy:", accuracy.eval({X: mnist.test.images, Y: mnist.test.labels}))

Epoch: 0001 cost=725916918.857142806
Epoch: 0002 cost=644267483.428571463
Epoch: 0003 cost=582855012.571428537
Epoch: 0004 cost=531284374.857142806
Epoch: 0005 cost=488363318.857142925
Epoch: 0006 cost=450335094.857142866
Epoch: 0007 cost=417231867.428571463
Epoch: 0008 cost=388206486.857142866
Epoch: 0009 cost=362930491.428571463
Epoch: 0010 cost=340996955.428571463
Epoch: 0011 cost=322497364.571428597
Epoch: 0012 cost=307139069.714285731
Epoch: 0013 cost=294625241.142857134
Epoch: 0014 cost=284197760.000000000
Epoch: 0015 cost=275826070.857142866
Optimization Finished!


In [None]:
start=0
end=3
batch_x, batch_y = Q[start:end], A[start:end]
print(batch_y.shape)