In [5]:
import numpy as np
import pandas as pd
import json
import spacy
from tqdm import tqdm
import gensim
import warnings; warnings.simplefilter('ignore')

import utils


In [2]:
df = pd.read_json('data/emojis.json')
df.head()

Unnamed: 0,category,definition,keywords,name,senses,shortcode,unicode
0,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Gesturing Not OK, Type-6 emoji is a ...","[dark skin tone, hand, forbidden, gesture, wom...",woman gesturing NO: dark skin tone,{'adjectives': [{'bn:00104562a': ['Contrary to...,,U+1F645 U+1F3FF U+200D U+2640 U+FE0F
1,Miscellaneous Symbols And Pictographs -> Emoji...,"The Female Guard, Type-6 emoji is a sequence o...","[dark skin tone, woman, guard]",woman guard: dark skin tone,"{'adjectives': [], 'verbs': [{'bn:00090041v': ...",,U+1F482 U+1F3FF U+200D U+2640 U+FE0F
2,,The female version of the ??¬†Runner emoji. The...,"[racing, running, woman, marathon]",woman running,{'adjectives': [{'bn:00109994a': ['Of advancin...,,U+1F3C3 U+200D U+2640 U+FE0F
3,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Doing Cartwheel, Type-3 emoji is a s...","[gymnastics, medium-light skin tone, woman, ca...",woman cartwheeling: medium-light skin tone,"{'adjectives': [], 'verbs': [{'bn:00084605v': ...",,U+1F938 U+1F3FC U+200D U+2640 U+FE0F
4,,The female version of the ??¬†Golfer¬†emoji. The...,"[woman, golf]",woman golfing,"{'adjectives': [], 'verbs': [{'bn:00088979v': ...",,U+1F3CC U+FE0F U+200D U+2640 U+FE0F


In [3]:
for u in df.iloc[0].unicode.split(' '):
    print(u)
    utils.display_emoji(u)
    print()

U+1F645



U+1F3FF



U+200D



U+2640



U+FE0F





In [58]:
# With Word2vec

vocab_size = 100000 # keep this reasonbly small for now, we're going to be doing a search over the entire vocab
model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True, limit=vocab_size)

# Init blank english spacy nlp object
nlp = spacy.blank('en')

# Loop through range of all indexes, get words associated with each index.
# The words in the keys list will correspond to the order of the google embed matrix
keys = [model.index2word[i] for i in range(vocab_size)]

# Set the vectors for our nlp object to the google news vectors
nlp.vocab.vectors = spacy.vocab.Vectors(data=model.syn0, keys=keys)

In [None]:
# With Glove.  Let's use word2vec for now.

# nlp = spacy.load('en')

# with open('data/glove.6B.100d.txt', 'r') as f:
#     for line in tqdm(f, total=400000):
#         parts = line.split()
#         word = parts[0]
#         vec = np.array([float(v) for v in parts[1:]], dtype='f')
#         nlp.vocab.set_vector(word, vec)

In [59]:
for u in df.unicode:
    print(u)
    break

U+1F645 U+1F3FF U+200D U+2640 U+FE0F


In [57]:
emoji_vectors = {}
for row in tqdm(df.unicode):
    emoji
    if i < 10: print(chr(int(row['unicode'].split(' ')[0][2:], 16)))

2389it [00:00, 19102.31it/s]

üôÖ
üíÇ
üèÉ
ü§∏
üèå
0
üèÉ
üë®
üëè
üëè





In [13]:
emoji = []

docs = [nlp(x) for x in tqdm(df.keywords.str.join(' '))]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2389/2389 [00:00<00:00, 5435.43it/s]


In [14]:
primary_unicodes = [u[0] for u in df.unicode.str.split(' ')]

In [15]:
for i in range(5):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

woman gesturing NO: dark skin tone



woman guard: dark skin tone



woman running



woman cartwheeling: medium-light skin tone



woman golfing





In [62]:
emoji_vectors = {}
for i,u in enumerate(primary_unicodes):
    emoji = chr(int(u[2:], 16))
    emoji_vectors[emoji] = docs[i].vector

In [16]:
def get_similar_emoji(word_vector, n=5):
    '''
    returns doc index of most n most similar emoji
    args:
        word_vector: vector embedding of word
    returns:
        (list) index of most similar emoji
    '''
    return np.argsort([-utils.cosine_similarity(word_vector, emoji.vector) for emoji in docs])[:n]

In [17]:
v = nlp('star').vector
for i in get_similar_emoji(v):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

eight-pointed star



white medium star



night with stars



sparkles



dotted six-pointed star





In [18]:
v = nlp(u'star').vector + nlp(u'movie').vector
for i in get_similar_emoji(v):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

clapper board



eight-pointed star



white medium star



dizzy



night with stars





In [66]:
def get_similar_words(emoji_vector, n=5):
    '''
    returns doc index of most n most similar emoji
    args:
        word_vector: vector embedding of word
    returns:
        (list) index of most similar emoji
    '''
    return np.argsort([-utils.cosine_similarity(emoji_vector, word.vector) for word in nlp.vocab])[:n]

In [70]:
v = emoji_vectors['üé¨']
for i in get_similar_words(v):
    print(nlp.vocab[i].text)

appos
FLAG21
X
InfForm_three
Case_del
