In [47]:
import numpy as np
import pickle
import utils
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline

In [23]:
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes instead of being left for the garbage collector.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


emoji_vectors = _load_pickle('data/emoji_vectors.p')
moby_dick_vectors = _load_pickle('data/moby_dick_vectors.p')
moby_dick_sents = _load_pickle('data/moby_dick_sents.p')
raw_train = _load_pickle('models/raw_train.p')
raw_test = _load_pickle('models/raw_test.p')

# Stack the emoji vectors into one array. Rows follow the iteration order of
# `emoji_vectors`, which is the same order `list(emoji_vectors.keys())` yields
# when predictions are decoded later in the notebook.
emoji_embedding = np.array(list(emoji_vectors.values()))

In [31]:
# Close the session left over from a previous run of this cell *before*
# resetting the graph: TensorFlow documents resetting the default graph while
# a session is still active as undefined behaviour.
try:
    sess.close()
except NameError:
    # Fresh kernel: no session has been created yet, nothing to close.
    pass

tf.reset_default_graph()
sess = tf.InteractiveSession()

In [32]:
# Hyperparameters.
# batch_size   - mini-batch size handed to the batch generator in the training cell.
# nodes        - number of units in each LSTM cell.
# embed_size   - length of every input word vector (last axis of `inputs`).
# x_seq_length - fixed number of timesteps per sentence (axis 1 of `inputs`).
batch_size = 128
nodes = 300
embed_size = 300
x_seq_length = 32

# float32 placeholder of shape (batch, x_seq_length, embed_size).
inputs = tf.placeholder(tf.float32, (None, x_seq_length, embed_size), 'inputs')
# Average the word vectors over the time axis, then L2-normalise each row.
input_mean = tf.nn.l2_normalize(tf.reduce_mean(inputs, axis=1), axis=1, name='input_mean')

# Emoji vectors as a graph constant (array built from `emoji_vectors.values()`).
output_embedding = tf.constant(emoji_embedding, name='output_embedding')

with tf.name_scope('network'):
    # Encoder: the per-step outputs are discarded; only the second return value
    # of dynamic_rnn (the final state) is kept.
    lstm_encoder = tf.contrib.rnn.LSTMCell(nodes, name='lstm_encoder')
    _, encoding = tf.nn.dynamic_rnn(lstm_encoder, inputs=inputs, dtype=tf.float32)
    
    # Decoder: fed the same `inputs` again, starting from the encoder's state.
    lstm_decoder = tf.contrib.rnn.LSTMCell(nodes, name='lstm_decoder')
    lstm_outputs, _ = tf.nn.dynamic_rnn(lstm_decoder, inputs=inputs, initial_state=encoding, dtype=tf.float32)

    # One unit per emoji at every timestep. Because the layer applies a softmax
    # activation, `logits` holds softmax outputs rather than raw logits.
    logits = tf.layers.dense(lstm_outputs, units=len(emoji_vectors), activation='softmax', name='dense') 
    # NOTE(review): utils.matmul3d is not visible here; presumably it multiplies
    # each timestep's distribution by the emoji embedding matrix - confirm in utils.
    outputs = utils.matmul3d(logits, output_embedding)

    # Same reduction as `input_mean`: mean over axis 1, then L2-normalise.
    output_mean = tf.nn.l2_normalize(tf.reduce_mean(outputs, axis=1), axis=1)

with tf.name_scope("optimization"):
    # Cosine distance between the normalised input mean (passed as labels) and
    # the normalised output mean (passed as predictions).
    loss = tf.losses.cosine_distance(input_mean, output_mean, axis=1)
    # `optimizer` is the training op returned by minimize(), not the optimizer object.
    optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
    
# Scalar summary of the loss, plus a writer that also records the session graph.
tf.summary.scalar('loss', loss)
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('models/seq2seq_moby_dick/2', sess.graph)

In [6]:
# Hold out part of the sentences for evaluation; the fixed random_state makes the
# split reproducible. No test_size is passed, so scikit-learn's default applies.
train_sents, test_sents = train_test_split(moby_dick_sents, random_state=42)

In [7]:
# Swap every token for its entry in `moby_dick_vectors`, keeping the
# sentence -> token nesting of the split lists.
X_train = [
    [moby_dick_vectors[token] for token in sentence]
    for sentence in train_sents
]
X_test = [
    [moby_dick_vectors[token] for token in sentence]
    for sentence in test_sents
]

In [19]:
def batch_generator(X, batch_size):
    """Yield consecutive slices of ``X`` holding at most ``batch_size`` items.

    Args:
        X: A sliceable sequence that supports ``len()``.
        batch_size: Positive number of items per batch. The final batch is
            shorter when ``len(X)`` is not a multiple of ``batch_size``.

    Yields:
        ``X[start:start + batch_size]`` for ``start`` = 0, ``batch_size``,
        ``2 * batch_size``, ... while ``start < len(X)``.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A non-positive step would never advance past the end of X, so the
    # generator would never terminate; fail loudly instead.
    if batch_size <= 0:
        raise ValueError('batch_size must be positive, got {!r}'.format(batch_size))
    for start in range(0, len(X), batch_size):
        yield X[start:start + batch_size]

In [None]:
# --- Training -------------------------------------------------------------
sess.run(tf.global_variables_initializer())
epochs = 100
start = time.time()

for i in range(epochs):
    t = time.time()
    losses = []
    summary = None  # stays None if the generator yields no batches
    # NOTE(review): this notebook also defines a local `batch_generator`;
    # confirm `utils.batch_generator` behaves the same.
    for X in utils.batch_generator(X_train, batch_size):
        _, l, summary = sess.run([optimizer, loss, merged], feed_dict={inputs: X})
        losses.append(l)
    # Only the summary of the epoch's last batch is written to the event file.
    if summary is not None:
        writer.add_summary(summary, global_step=i)
    print('Epoch {:3} Average Loss: {:>6.3f} Epoch duration: {:>6.3f}s'.format(i, np.mean(losses, axis=-1), time.time() - t))

# Push buffered summaries to disk so the event file is complete.
writer.flush()

saver = tf.train.Saver()
saver.save(sess, 'models/seq2seq_moby_dick/model2')
print('Total training time:', time.time()-start)

# --- Evaluation: one test sentence per run, so `loss` is per sentence -------
predictions = []
losses = []
emoji_keys = list(emoji_vectors.keys())
for x in tqdm(X_test):
    # Use the graph's own dimensions instead of repeating the literals 32 / 300.
    lo, l = sess.run(
        [logits, loss],
        feed_dict={inputs: np.array(x).reshape(-1, x_seq_length, embed_size)},
    )
    # Index of the highest-scoring emoji at each timestep.
    pred = np.argmax(lo, axis=2).reshape(x_seq_length,)
    predictions.append([emoji_keys[i] for i in pred])
    losses.append(l)

print('Average test loss:', np.mean(losses, axis=-1))
print()

# Preview at most ten sentences; the cap avoids an IndexError on small test sets.
for i in range(min(10, len(predictions))):
    print('Test sentence:', ' '.join(w for w in test_sents[i] if w))
    print('Prediction:', set(predictions[i]))
    print('Cosine distance:', losses[i])
    print()

Epoch   0 Average Loss:  0.406 Epoch duration: 49.880s
Epoch   1 Average Loss:  0.381 Epoch duration: 49.045s
Epoch   2 Average Loss:  0.379 Epoch duration: 49.211s
Epoch   3 Average Loss:  0.373 Epoch duration: 48.955s
Epoch   4 Average Loss:  0.361 Epoch duration: 48.973s


In [38]:
# Create a Saver for the current graph and load saved weights into the live session.
# NOTE(review): this restores checkpoint 'model1', whereas the training cell saves
# to 'model2' - confirm this is the intended checkpoint.
saver = tf.train.Saver()
saver.restore(sess, 'models/seq2seq_moby_dick/model1')

INFO:tensorflow:Restoring parameters from models/seq2seq_moby_dick/model1


In [39]:
# Evaluate the test set one sentence at a time so that `loss` is a
# per-sentence value rather than a batch aggregate.
predictions = []
losses = []
# Key order matches the row order used to build the emoji embedding array.
emoji_keys = list(emoji_vectors.keys())
for x in tqdm(X_test):
    # Reuse the graph's dimensions rather than repeating the literals 32 / 300,
    # so this cell keeps working if the hyperparameters change.
    lo, l = sess.run(
        [logits, loss],
        feed_dict={inputs: np.array(x).reshape(-1, x_seq_length, embed_size)},
    )
    # Index of the highest-scoring emoji at each timestep.
    pred = np.argmax(lo, axis=2).reshape(x_seq_length,)
    predictions.append([emoji_keys[i] for i in pred])
    losses.append(l)



  0%|          | 0/2515 [00:00<?, ?it/s][A[A

  0%|          | 5/2515 [00:00<00:56, 44.40it/s][A[A

  0%|          | 10/2515 [00:00<00:56, 44.48it/s][A[A

  1%|          | 15/2515 [00:00<00:55, 44.80it/s][A[A

  1%|          | 20/2515 [00:00<00:55, 45.16it/s][A[A

  1%|          | 25/2515 [00:00<00:54, 45.91it/s][A[A

  1%|          | 30/2515 [00:00<00:53, 46.04it/s][A[A

  1%|▏         | 35/2515 [00:00<00:53, 46.07it/s][A[A

  2%|▏         | 40/2515 [00:00<00:53, 46.22it/s][A[A

  2%|▏         | 45/2515 [00:00<00:53, 46.30it/s][A[A

  2%|▏         | 50/2515 [00:01<00:53, 46.36it/s][A[A

  2%|▏         | 55/2515 [00:01<00:53, 46.34it/s][A[A

  2%|▏         | 60/2515 [00:01<00:53, 46.06it/s][A[A

  3%|▎         | 65/2515 [00:01<00:53, 46.01it/s][A[A

  3%|▎         | 70/2515 [00:01<00:53, 45.84it/s][A[A

  3%|▎         | 75/2515 [00:01<00:53, 45.90it/s][A[A

  3%|▎         | 80/2515 [00:01<00:52, 46.52it/s][A[A

  3%|▎         | 85/2515 [00:01<00:52, 

In [40]:
# Mean of the per-sentence cosine-distance losses collected by the evaluation loop.
np.mean(losses)

0.30906993

In [41]:
# Print the first ten test sentences with their predicted emoji set and loss.
for idx in range(10):
    # Empty tokens are dropped before the sentence is joined.
    words = filter(None, test_sents[idx])
    print('Test sentence: ' + ' '.join(words))
    print('Prediction:', set(predictions[idx]))
    print('Cosine distance:', losses[idx])
    print()

Test sentence: an old pike head sir there were seams dents in it
Prediction: {'🈁', '🐖', '🤕'}
Cosine distance: 0.22554648

Test sentence: this one poor hunt then the best lance out all surely he will not hang back when every foremast hand has clutched whetstone
Prediction: {'🈁', '😰', '👧', '🤼', '🔂', '🐖', '⚜', '🆙', '⛵'}
Cosine distance: 0.13118178

Test sentence: drop them over fore aft
Prediction: {'🐖', '⚜', '🆙'}
Cosine distance: 0.37479955

Test sentence: in the infancy the first settlement the emigrants were several times saved from starvation by the benevolent biscuit the whale ship luckily dropping an anchor in their waters
Prediction: {'🕦', '👧', '🐋', '〽', '🐖', '🎛', '🍠', '🛬', '⚰', '⛵'}
Cosine distance: 0.16954982

Test sentence: mighty whales which swim in sea water have sea oil swimming in them
Prediction: {'🐋', '🤼', '🐖', '🥘', '⛵'}
Cosine distance: 0.18382663

Test sentence: round round the fish s back pinioned in the turns upon turns in which during the past night the whale had ree

In [42]:
# Test-sentence indices ordered from lowest to highest loss.
sorted_losses = np.argsort(losses)

# Keep only sentences with fewer than 10 raw tokens. Each record is
# [raw text, processed text, set of predicted emoji, loss].
examples = []
for idx in sorted_losses:
    if len(raw_test[idx]) >= 10:
        continue
    raw_text = ' '.join(filter(None, raw_test[idx]))
    processed_text = ' '.join(filter(None, test_sents[idx]))
    examples.append([raw_text, processed_text, set(predictions[idx]), losses[idx]])

In [59]:
# `examples` was filled in ascending-loss order, so this reversed slice shows up to
# nine of the highest-loss short sentences, worst first.
examples[-1:-10:-1]

[["' Adios , Senor !'", '', {'🐖'}, 1.0],
 ['-- SCORESBY .', '', {'🐖'}, 1.0],
 ['Apoplexy !', '', {'🐖'}, 1.0],
 ['-- RAPE OF THE LOCK .', '', {'🐖'}, 1.0],
 ['Caw !', '', {'🐖'}, 1.0],
 ['Lo !', '', {'🐖'}, 1.0],
 ['CHAPTER 64', '', {'🐖'}, 1.0],
 ['" BLOODY BATTLE IN AFFGHANISTAN ."', '', {'🐖'}, 1.0],
 ['Halloa !', '', {'🐖'}, 1.0]]

In [48]:
# For every emoji, count the test sentences whose prediction contains it.
# Converting to a set first means an emoji counts at most once per sentence.
counter = Counter()
for predicted in predictions:
    counter.update(set(predicted))

In [55]:
# (emoji, count) pairs from most to least frequent; ties keep insertion order.
counter_items = counter.most_common()

In [56]:
# Split the ten most frequent entries into emoji (x) and counts (y).
x = [emoji for emoji, _count in counter_items[:10]]
y = [count for _emoji, count in counter_items[:10]]

In [62]:
# Fraction of test sentences whose predicted emoji set includes the pig emoji.
counter['🐖'] / len(predictions)

0.9984095427435388