In [17]:
import numpy as np
import pandas as pd
import json
import spacy
from tqdm import tqdm
import gensim
# import warnings; warnings.simplefilter('ignore')
import pickle

import utils


In [2]:
# Load the emoji metadata table (one row per emoji), then report its
# dimensions and preview the first rows.
df = pd.read_json('data/emojis.json')
print(df.shape)
df.head()

(2389, 7)


Unnamed: 0,category,definition,keywords,name,senses,shortcode,unicode
0,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Gesturing Not OK, Type-6 emoji is a ...","[dark skin tone, hand, forbidden, gesture, wom...",woman gesturing NO: dark skin tone,{'adjectives': [{'bn:00104562a': ['Contrary to...,,U+1F645 U+1F3FF U+200D U+2640 U+FE0F
1,Miscellaneous Symbols And Pictographs -> Emoji...,"The Female Guard, Type-6 emoji is a sequence o...","[dark skin tone, woman, guard]",woman guard: dark skin tone,"{'adjectives': [], 'verbs': [{'bn:00090041v': ...",,U+1F482 U+1F3FF U+200D U+2640 U+FE0F
2,,The female version of the ?? Runner emoji. The...,"[racing, running, woman, marathon]",woman running,{'adjectives': [{'bn:00109994a': ['Of advancin...,,U+1F3C3 U+200D U+2640 U+FE0F
3,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Doing Cartwheel, Type-3 emoji is a s...","[gymnastics, medium-light skin tone, woman, ca...",woman cartwheeling: medium-light skin tone,"{'adjectives': [], 'verbs': [{'bn:00084605v': ...",,U+1F938 U+1F3FC U+200D U+2640 U+FE0F
4,,The female version of the ?? Golfer emoji. The...,"[woman, golf]",woman golfing,"{'adjectives': [], 'verbs': [{'bn:00088979v': ...",,U+1F3CC U+FE0F U+200D U+2640 U+FE0F


In [4]:
# Walk through the space-separated code points of the first emoji's
# sequence, printing each code and rendering it on its own.
first_sequence = df.unicode.iloc[0]
for code_point in first_sequence.split(' '):
    print(code_point)
    utils.display_emoji(code_point)
    print()

U+1F645



U+1F3FF



U+200D



U+2640



U+FE0F





In [3]:
# With Word2vec

# Keep the vocabulary reasonably small: the similarity lookups further down
# search over the entire vocab.
vocab_size = 30000
model = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=vocab_size,
)

# Start from a blank English spaCy pipeline
nlp = spacy.blank('en')

# Register every word with its vector and remember the words in model order,
# so that `keys` lines up row-for-row with the embedding matrix.
keys = []
for idx in tqdm(range(vocab_size)):
    token = model.index2word[idx]
    nlp.vocab.set_vector(token, model.vectors[idx])
    keys.append(token)

# Swap in a vector table built directly from the word2vec matrix.
# NOTE(review): this replaces the table filled by the loop above; the loop may
# still be needed for registering the words with the vocab -- confirm before
# removing either step.
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.vectors, keys=keys)

100%|██████████| 30000/30000 [00:00<00:00, 74538.55it/s]


In [27]:
# '<unk>' is not in the loaded word2vec vocabulary, so this lookup raises
# KeyError. Catch it and print the message instead of letting the exception
# abort a "Restart & Run All"; if the token ever exists, its vector is shown.
try:
    unk_vector = model.word_vec('<unk>')
except KeyError as err:
    unk_vector = None
    print(err)
unk_vector

KeyError: "word '<unk>' not in vocabulary"

In [55]:
# Compare cosine similarities against the sum and the average of two vectors.
woman_vec = model['woman']
golf_vec = model['golf']
combined = woman_vec + golf_vec

print(woman_vec.shape)
print(golf_vec.shape)
print(combined.shape)
print(utils.cosine_similarity(woman_vec, golf_vec))
print(utils.cosine_similarity(woman_vec, combined))
print(utils.cosine_similarity(golf_vec, combined))

# Element-wise mean: place the two vectors side by side as columns and
# average across the columns.
avg = np.stack((woman_vec, golf_vec), axis=-1).mean(axis=1)
print(utils.cosine_similarity(woman_vec, avg))
print(utils.cosine_similarity(golf_vec, avg))

(300,)
(300,)
(300,)
0.11305464059114456
0.695676326751709
0.7923993468284607
0.695676326751709
0.7923993468284607


In [21]:
# Printing the StringStore only shows its object repr, not its contents;
# iterate over it to see the actual strings.
print(nlp.vocab.strings)

<spacy.strings.StringStore object at 0x7ff36c6bc9d8>


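### Interesting: the cosine similarity between a word and the sum of two word vectors is the same as its similarity with the average of those two vectors. This is expected: the average is just the sum scaled by 1/2, and cosine similarity does not change when a vector is multiplied by a positive constant.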

In [13]:
# Peek at the first entry the StringStore yields, then stop iterating.
for word in nlp.vocab.strings:
    print(word)
    break

""


In [58]:
# With Glove.  Let's use word2vec for now.

# nlp = spacy.load('en')

# with open('data/glove.6B.100d.txt', 'r') as f:
#     for line in tqdm(f, total=400000):
#         parts = line.split()
#         word = parts[0]
#         vec = np.array([float(v) for v in parts[1:]], dtype='f')
#         nlp.vocab.set_vector(word, vec)

In [4]:
# Join each emoji's keyword list into a single space-separated string and run
# it through the pipeline; docs[i] corresponds to row i of df.
keyword_text = df.keywords.str.join(' ')
docs = [nlp(text) for text in tqdm(keyword_text)]

100%|██████████| 2389/2389 [00:00<00:00, 5933.77it/s]


In [5]:
# Keep only the first code point of every emoji's space-separated sequence.
# NOTE(review): emoji whose sequences start with the same code point become
# indistinguishable from here on.
code_point_lists = df.unicode.str.split(' ')
primary_unicodes = [sequence[0] for sequence in code_point_lists]

In [6]:
# Show the name and the primary code point's glyph for the first five rows.
for row in range(5):
    print(df['name'].iloc[row])
    utils.display_emoji(primary_unicodes[row])
    print()

woman gesturing NO: dark skin tone



woman guard: dark skin tone



woman running



woman cartwheeling: medium-light skin tone



woman golfing





In [7]:
# Map each emoji character (decoded from its 'U+XXXX' primary code point) to
# the vector of the doc built from that row's keywords.
# NOTE(review): rows sharing a primary code point share a key, so the last
# such row's vector is the one that is kept.
emoji_vectors = {
    chr(int(code[2:], 16)): doc.vector
    for code, doc in zip(primary_unicodes, docs)
}

In [14]:
# Same mapping as emoji_vectors, but keyed by the hex code point with the
# leading 'U+' stripped.
unicode_vectors = {
    code[2:]: doc.vector
    for code, doc in zip(primary_unicodes, docs)
}

In [18]:
# Persist the emoji -> vector mapping. The context manager guarantees the
# file is flushed and closed as soon as the dump finishes.
with open('data/emoji_vectors.p', 'wb') as fh:
    pickle.dump(emoji_vectors, fh)

In [19]:
# Persist the code point -> vector mapping. The context manager guarantees
# the file is flushed and closed as soon as the dump finishes.
with open('data/unicode_vectors.p', 'wb') as fh:
    pickle.dump(unicode_vectors, fh)

In [20]:
# Read the emoji vectors back from disk as a round-trip check, closing the
# file once loaded.
# pickle.load can execute arbitrary code: only load files you produced yourself.
with open('data/emoji_vectors.p', 'rb') as fh:
    x = pickle.load(fh)

In [23]:
# Show the first (emoji, vector) pair of the reloaded dict.
list(x.items())[0]

('🙅', array([ 0.12855114,  0.00623113,  0.06112393, -0.03385787, -0.15304288,
        -0.02721058,  0.05570845, -0.08369029,  0.10476962,  0.03382388,
        -0.00608132, -0.17169744, -0.0610407 ,  0.01689009, -0.15957919,
         0.09194946, -0.01931208,  0.11918502, -0.01453746, -0.09363902,
         0.09375   , -0.03377325, -0.02470259, -0.0533475 , -0.00224998,
        -0.1459961 , -0.0098655 ,  0.108748  ,  0.06298828,  0.04016113,
        -0.10621227, -0.09569203, -0.06030273, -0.01551316, -0.06245006,
        -0.03633256,  0.05515914, -0.01250666,  0.00125954, -0.00770153,
         0.12029474,  0.01282198,  0.09190785,  0.00502153,  0.06919167,
        -0.04287442, -0.04181463,  0.01015403, -0.04398138, -0.04058838,
        -0.01980036,  0.06725519, -0.02463601, -0.02156761,  0.03394708,
        -0.02216131, -0.01426003, -0.04017223, -0.0145014 , -0.07403564,
        -0.05496493,  0.0254794 , -0.00519354,  0.0168568 , -0.06871449,
         0.07390802, -0.09547008,  0.10748846,

In [64]:
def get_similar_emoji(word_vector, n=5):
    '''
    Rank the emoji docs by cosine similarity to a word vector.
    args:
        word_vector: vector embedding of word
        n: how many emoji to return (default 5)
    returns:
        int index into docs of the most similar emoji if n == 1
        (array) indexes into docs of the n most similar emoji, best first, otherwise
    '''
    # NOTE(review): the RuntimeWarning visible in the cell outputs suggests some
    # similarities are undefined (NaN); numpy sorts NaN last, so such docs
    # would rank at the bottom -- confirm against utils.cosine_similarity.
    scores = np.array([utils.cosine_similarity(word_vector, doc.vector) for doc in docs])
    # Negate so that ascending argsort lists the highest similarity first.
    ranked = np.argsort(-scores)
    if n == 1:
        return ranked[0]
    return ranked[:n]

In [65]:
# Emoji closest to the word "star".
query = nlp(u'star').vector
for match in get_similar_emoji(query):
    print(df['name'].iloc[match])
    utils.display_emoji(primary_unicodes[match])
    print()

eight-pointed star


  dist = 1.0 - uv / np.sqrt(uu * vv)



sparkles



white medium star



night with stars



dotted six-pointed star





In [66]:
# Emoji closest to the sum of the "star" and "movie" vectors.
star_vec = nlp(u'star').vector
movie_vec = nlp(u'movie').vector
query = star_vec + movie_vec
for match in get_similar_emoji(query):
    print(df['name'].iloc[match])
    utils.display_emoji(primary_unicodes[match])
    print()

clapper board


  dist = 1.0 - uv / np.sqrt(uu * vv)



white medium star



sparkles



eight-pointed star



dizzy





In [67]:
# Snapshot the strings currently in the vocab's StringStore as a list, so
# words can be addressed by position (get_similar_words returns indexes into
# this list).
vocab = list(nlp.vocab.strings)

In [68]:
def get_similar_words(emoji_vector, n=5):
    '''
    Rank the vocabulary words by cosine similarity to an emoji vector.
    args:
        emoji_vector: vector embedding of emoji
        n: how many words to return (default 5)
    returns:
        int index into vocab of the most similar word if n == 1
        (array) indexes into vocab of the n most similar words, best first, otherwise
    '''
    # Every vocab string is run through nlp on each call, so a single lookup
    # costs one pipeline pass per vocabulary entry.
    similarities = np.array([utils.cosine_similarity(emoji_vector, nlp(word).vector) for word in vocab])
    # Negate so that ascending argsort lists the highest similarity first.
    ranked = np.argsort(-similarities)
    # Slice once, mirroring get_similar_emoji (the result used to be sliced twice).
    return ranked[0] if n == 1 else ranked[:n]

In [69]:
# Compare the word "star" with the vector stored for the star emoji.
v1 = nlp(u'star').vector
v2 = emoji_vectors['⭐']
print(utils.cosine_similarity(v1, v2))

1.0


In [70]:
# Baseline: an unrelated word against the same star emoji vector.
v1 = nlp(u'hello').vector
v2 = emoji_vectors['⭐']
print(utils.cosine_similarity(v1, v2))

0.12529227137565613


In [71]:
# Same comparison as before, but reading the "star" vector straight from the
# vocab entry instead of a processed doc. Relies on v2 (the star emoji vector)
# assigned in an earlier cell.
print(utils.cosine_similarity(v2, nlp.vocab[u'star'].vector))

1.0


In [72]:
# Vocabulary words closest to the star emoji: print each index and its word,
# followed by a blank line.
star_emoji_vec = emoji_vectors['⭐']
for word_index in tqdm(get_similar_words(star_emoji_vec)):
    print(word_index, vocab[word_index], '', sep='\n')

  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 5/5 [00:00<00:00, 1502.37it/s]

1795
star

2910
stars

10017
superstar

19634
superstars

9351
standout






In [73]:
def encode(text):
    '''
    encodes text into emoji, one emoji per whitespace-separated word
    args:
        (str) text: words separated by whitespace
    returns:
        (list) unicodes of emoji ('U+XXXX' entries of primary_unicodes), one per word
    '''
    # split() without an argument discards the empty strings that split(' ')
    # produces for repeated, leading or trailing spaces (and for empty input);
    # each of those would otherwise be encoded as an emoji of its own.
    vectors = [nlp(word).vector for word in text.split()]
    return [primary_unicodes[get_similar_emoji(v, n=1)] for v in tqdm(vectors)]

def decode(emoji):
    '''
    maps each emoji back to its closest vocabulary word
    args:
        (list) emoji: emoji characters, each a key of emoji_vectors
    returns:
        (list) words: one vocab entry per emoji, in input order
    '''
    # Resolve every vector up front so an unknown emoji fails before any search runs.
    lookup = [emoji_vectors[symbol] for symbol in emoji]
    words = []
    for vec in tqdm(lookup):
        words.append(vocab[get_similar_words(vec, n=1)])
    return words

In [78]:
# Encode a short phrase, turn the returned 'U+XXXX' codes into characters and
# print them back to back on one line.
text = 'Woman golfing'
encoding = encode(text)
emoji = [chr(int(code[2:], 16)) for code in encoding]
print(''.join(emoji), end='')

  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 2/2 [00:00<00:00,  7.20it/s]

♀🏌




In [79]:
# Map the emoji produced by the encode cell back to words (uses `emoji` from that cell).
decoding = decode(emoji)

  dist = 1.0 - uv / np.sqrt(uu * vv)
100%|██████████| 2/2 [00:09<00:00,  4.76s/it]


In [80]:
# Show the words recovered from the emoji.
print(decoding)

['woman', 'skin']


In [76]:
# Round-trip quality: similarity between the original text and the decoded
# words joined back into one string (uses `text` and `decoding` from earlier cells).
utils.cosine_similarity(nlp(text).vector, nlp(' '.join(decoding)).vector)

0.19798541069030762

In [77]:
# Emoji closest to the word "Berkeley".
berkeley_vec = nlp('Berkeley').vector
for match in get_similar_emoji(berkeley_vec):
    print(df['name'].iloc[match])
    utils.display_emoji(primary_unicodes[match])
    print()

woman scientist


  dist = 1.0 - uv / np.sqrt(uu * vv)



man scientist



pot of food



woman scientist: dark skin tone



woman scientist: light skin tone



