# Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle as pk
import numpy as np
import sklearn.manifold as man
from tensorflow.python.framework import ops

from model import Emoji2Vec, ModelParams
from phrase2vec import Phrase2Vec
from utils import build_kb, get_examples_from_kb, generate_embeddings, get_metrics

  return f(*args, **kwds)


# Initializations
This step takes a while to execute; wait until 'DONE' is printed.

## Constants and Hyperparameters

In [58]:
# File locations
word2vec_path = './data/word2vec/GoogleNews-vectors-negative300.bin'
mapping_path = 'emoji_mapping.p'
data_folder = './data/training/'
embeddings_file = 'generated_embeddings.p'

# Hyperparameters handed to ModelParams below
in_dim = 300   # Length of word2vec vectors
out_dim = 300  # Desired dimension of output vectors
pos_ex = 4
neg_ratio = 1
max_epochs = 40
dropout = 0.0

params = ModelParams(
    in_dim=in_dim,
    out_dim=out_dim,
    pos_ex=pos_ex,
    max_epochs=max_epochs,
    neg_ratio=neg_ratio,
    learning_rate=0.001,
    dropout=dropout,
    class_threshold=0.5,
)

# The checkpoint and the emoji2vec binary share the folder derived from params
model_dir = params.model_folder('unicode')
ckpt_path = model_dir + '/model.ckpt'
e2v_path = model_dir + '/emoji2vec.bin'
print(e2v_path)

./results/unicode/k-300_pos-4_rat-1_ep-120_dr-0/emoji2vec.bin


## Build Knowledge Base

In [59]:
print('reading training data from: ' + data_folder)
train_kb, ind2phr, ind2emoj = build_kb(data_folder)

# Persist the index -> emoji mapping so it can be reloaded from mapping_path.
# The context manager guarantees the file is flushed and closed, which the
# previous bare open() call did not.
with open(mapping_path, 'wb') as mapping_file:
    pk.dump(ind2emoj, mapping_file)

reading training data from: ./data/training/


## Read or Generate Embeddings

In [60]:
# Embeddings for the knowledge-base phrases; generate_embeddings receives the
# embeddings file name and the word2vec binary path.
embeddings_array = generate_embeddings(
    ind2phr=ind2phr,
    kb=train_kb,
    embeddings_file=embeddings_file,
    word2vec_file=word2vec_path,
)
print('DONE')

loading embeddings...
DONE


## Initialize models and mappings

In [62]:
print('Initializing: reading embedding data from: {}'.format(word2vec_path))
# phraseVecModel[phrase] is used further down to get the vector for a phrase
phraseVecModel = Phrase2Vec.from_word2vec_paths(
    params.in_dim,
    word2vec_path,
    e2v_path,
)
print('DONE')

Initializing: reading embedding data from: ./data/word2vec/GoogleNews-vectors-negative300.bin


FileNotFoundError: [Errno 2] No such file or directory: './results/unicode/k-300_pos-4_rat-1_ep-120_dr-0/emoji2vec.bin'

In [None]:
ops.reset_default_graph()

# mapping from id to emoji
# NOTE: pickle.load can execute arbitrary code, so only load a mapping file
# that this notebook wrote itself. The context manager closes the file handle,
# which the previous bare open() call leaked.
with open(mapping_path, 'rb') as mapping_file:
    mapping = pk.load(mapping_file)
# mapping from emoji to id
inv_map = {emoji: idx for idx, emoji in mapping.items()}

# tensorflow model
model = Emoji2Vec(params, len(mapping), embeddings_array=embeddings_array)
print('DONE')

In [None]:
# Build the saver, open a TensorFlow session, and restore the checkpoint
# stored at ckpt_path into that session.
saver = tf.train.Saver()
session = tf.Session()
saver.restore(session, ckpt_path)

# Performance Measures
Check the accuracy, F1 score, AUC, and the ROC curve.

In [None]:
def measures(example_type):
    """Print classification metrics for one example set and plot its ROC curve.

    Relies on the notebook globals `data_folder`, `model` and `session`.

    Args:
        example_type: Selector forwarded to `get_examples_from_kb`; the
            notebook calls this with 'train', 'dev' and 'test'.

    Prints the accuracy at a fixed threshold and the F1 score. It then tries to
    print the AUC and draw the ROC curve; if that step fails, the reason is
    printed instead of being raised, so the notebook keeps running.
    """
    # Only the knowledge base itself is needed here; the two index mappings
    # returned alongside it are ignored.
    kb, _, _ = build_kb(data_folder)
    ex_set = get_examples_from_kb(kb=kb, example_type=example_type)

    # evaluate the accuracy using this as the threshold
    thresh = 0.5

    acc = model.accuracy(session=session, dset=ex_set, threshold=thresh)
    f1 = model.f1_score(session=session, dset=ex_set)
    # Report the accuracy value itself (the F1 score was printed here before).
    print(str.format('Accuracy at thresh={}: {}', thresh, acc))
    print(str.format('F1 score: {}', f1))

    try:
        auc = model.auc(session=session, dset=ex_set)
        print(str.format('AUC score: {}', auc))

        fpr, tpr, _ = model.roc_vals(session=session, dset=ex_set)

        fig, ax = plt.subplots()
        ax.plot(fpr, tpr)
        ax.set_title("ROC Curve for learned emoji")
        ax.set_xlabel("false positive rate")
        ax.set_ylabel("true positive rate")
        ax.grid()
        plt.show()
    except Exception as exc:
        # Best-effort section: keep going, but say why it failed. Catching
        # Exception (not a bare except) lets KeyboardInterrupt through.
        print('Can\'t compute AUC or ROC: {}'.format(exc))

In [None]:
# Report metrics for the 'train' examples (the string is passed to measures()
# as example_type).
print('Train Set')
measures('train')

In [None]:
# Report metrics for the 'dev' examples (the string is passed to measures()
# as example_type).
print('Dev Set')
measures('dev')

In [None]:
# Report metrics for the 'test' examples (the string is passed to measures()
# as example_type).
print('Test Set')
measures('test')

In [None]:
# RESET THE GRAPH
# Close the session opened earlier before touching the graph: TensorFlow 1.x
# documents resetting the default graph while a session is active as undefined
# behaviour, and an unclosed session keeps holding its resources.
session.close()
ops.reset_default_graph()

# Rebuild the model without the phrase embeddings array.
model = Emoji2Vec(params, len(mapping), embeddings_array=None, use_embeddings=False)

# Open a fresh session on the new graph and restore the checkpoint into it.
session = tf.Session()
saver = tf.train.Saver()
saver.restore(session, ckpt_path)

# Top Emoji Query
Set `phr` as a phrase, and get the top `N` emojis correlating to that phrase.

In [None]:
phr = 'happy face'
N = 5

# get the vector representation
vec = phraseVecModel[phr]

# query the tensorflow model once per emoji index, feeding the same phrase vector
query = np.array([vec])
res = [
    session.run(model.prob, feed_dict={
        model.col: np.array([emoji_ix]),
        model.orig_vec: query
    })
    for emoji_ix in range(len(mapping))
]

# print the top N emoji
ranked = sorted(range(len(res)), key=res.__getitem__, reverse=True)
for ind in ranked[:N]:
    print(mapping[ind], res[ind])

In [None]:
# NOTE(review): exploratory introspection of phraseVecModel.wordVecModel; the
# result is only displayed and never used by later cells, so this cell looks
# like a debugging leftover that can be removed.
dir(phraseVecModel.wordVecModel)

# Top Phrase Query
Set `em` as an emoji, and get the top `N` phrases correlating to that emoji.

In [None]:
# input
em = '👑'
N = 10

# fetch the emoji vectors from tensorflow and pick the row for `em`
emoji_vecs = session.run(model.V)
vec = emoji_vecs[inv_map[em]]

# print top N phrases
top_phrases = phraseVecModel.from_emoji([vec], top_n=N)
for phrase, score in top_phrases:
    print('{}\t{}'.format(phrase, score))

# Analogy Task
Set `base` as a base emoji, `minus` as an emoji to subtract from the base, `plus` as an emoji to add, and get the top `N` correlating phrases and emojis relating to this analogy. 

In [None]:
def print_analogy_result(base, minus, plus, top_n=5):
    """Print the emoji ranked highest for the analogy `base - minus + plus`.

    The three inputs are looked up in the notebook-global `phraseVecModel`,
    combined, divided by their L2 norm, and scored against every emoji index
    in `mapping` through `model.prob` using the global `session`.

    Args:
        base: Key for `phraseVecModel[...]` that the analogy starts from
            (the notebook passes emoji strings).
        minus: Key whose vector is subtracted from `base`.
        plus: Key whose vector is added to the result.
        top_n: Number of emoji to print. Defaults to 5, the value that was
            previously hard-coded.
    """
    total = phraseVecModel[base] - phraseVecModel[minus] + phraseVecModel[plus]
    # The query vector is identical for every emoji, so normalise it once
    # instead of on every loop iteration.
    query = np.array([total / np.linalg.norm(total)])

    res = [
        session.run(model.prob, feed_dict={
            model.col: np.array([col_ix]),
            model.orig_vec: query
        })
        for col_ix in range(len(mapping))
    ]

    ems = sorted(range(len(res)), key=lambda i: res[i], reverse=True)[:top_n]
    print(str.format('{} - {} + {} = {}', base, minus, plus, [mapping[em] for em in ems]))

In [None]:
# (base, minus, plus) triples, evaluated in order
analogy_examples = [
    ('👑', '🚹', '🚺'),
    ('💵', '🇺🇸', '🇬🇧'),
    ('💵', '🇺🇸', '🇪🇺'),
    ('👦', '👨', '👩'),
    ('👪', '👦', '👧'),
    ('🕶', '☀️', '⛈'),
]
for example in analogy_examples:
    print_analogy_result(*example)

In [None]:
# input
base = '👑'
# base = '👨'
minus = '🚹'
plus = '🚺'
N = 10

# get the relevant vectors from tensorflow
emoji_vecs = session.run(model.V)
# Analogy vector: base - minus + plus, taken from the learned emoji vectors.
# (A leftover debug assignment used to overwrite this with the vector of a
# fixed phrase, so the analogy was never actually evaluated.)
total = emoji_vecs[inv_map[base]] - emoji_vecs[inv_map[minus]] + emoji_vecs[inv_map[plus]]

# print the top N phrases
print(str.format('Top {} matching phrases:', N))
print()
for word, score in phraseVecModel.from_emoji([total], top_n=N):
    print(str.format("{}\t{}", word, score))

# query the tensorflow model; the normalised query is the same for every
# emoji, so build it once outside the loop
query = np.array([total / np.linalg.norm(total)])
res = list()
for colIx in range(0, len(mapping)):
    predict = session.run(model.prob, feed_dict={
        model.col: np.array([colIx]),
        model.orig_vec: query
    })
    # x**5 is strictly increasing, so this only rescales the printed score
    # and leaves the ranking unchanged.
    # NOTE(review): purpose of the rescaling is not evident here — confirm.
    predict[0] = predict[0]**5
    res.append(predict)

# print the top N emoji
print()
print(str.format('Top {} matching emoji:', N))
print()
for ind in sorted(range(len(res)), key=lambda i: res[i], reverse=True)[:N]:
    print(mapping[ind], res[ind])

# Visualize Emoji Vector Space
2-D projection of the Emoji vector space, using t-SNE.

Jupyter won't plot emoji. Use visualize.py to see a clearer picture.

In [None]:
# Fetch the emoji vectors and project them to 2-D with t-SNE
V = session.run(model.V)

fig, ax = plt.subplots()
tsne = man.TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
trans = tsne.fit_transform(V)
x, y = trans[:, 0], trans[:, 1]
ax.scatter(x, y)

# Label every projected point with the emoji stored at the same index
for i, point in enumerate(trans):
    ax.annotate(mapping[i], xy=point, textcoords='data')

ax.grid()
plt.show()