In [84]:
import numpy as np
import pandas as pd
import json
import spacy
from tqdm import tqdm
from IPython.core.display import display, HTML
from sklearn.metrics.pairwise import cosine_similarity

import utils


In [85]:
df = pd.read_json('data/emojis.json')
df.head()

Unnamed: 0,category,definition,keywords,name,senses,shortcode,unicode
0,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Gesturing Not OK, Type-6 emoji is a ...","[dark skin tone, hand, forbidden, gesture, wom...",woman gesturing NO: dark skin tone,{'adjectives': [{'bn:00104562a': ['Contrary to...,,U+1F645 U+1F3FF U+200D U+2640 U+FE0F
1,Miscellaneous Symbols And Pictographs -> Emoji...,"The Female Guard, Type-6 emoji is a sequence o...","[dark skin tone, woman, guard]",woman guard: dark skin tone,"{'adjectives': [], 'verbs': [{'bn:00090041v': ...",,U+1F482 U+1F3FF U+200D U+2640 U+FE0F
2,,The female version of the ?? Runner emoji. The...,"[racing, running, woman, marathon]",woman running,{'adjectives': [{'bn:00109994a': ['Of advancin...,,U+1F3C3 U+200D U+2640 U+FE0F
3,Miscellaneous Symbols And Pictographs -> Emoji...,"The Woman Doing Cartwheel, Type-3 emoji is a s...","[gymnastics, medium-light skin tone, woman, ca...",woman cartwheeling: medium-light skin tone,"{'adjectives': [], 'verbs': [{'bn:00084605v': ...",,U+1F938 U+1F3FC U+200D U+2640 U+FE0F
4,,The female version of the ?? Golfer emoji. The...,"[woman, golf]",woman golfing,"{'adjectives': [], 'verbs': [{'bn:00088979v': ...",,U+1F3CC U+FE0F U+200D U+2640 U+FE0F


In [86]:
reload(utils)
for u in df.iloc[0].unicode.split(' '):
    print(u)
    utils.display_emoji(u)
    print()

U+1F645



U+1F3FF



U+200D



U+2640



U+FE0F





In [87]:
nlp = spacy.load('en')

with open('data/glove.6B.100d.txt', 'r') as f:
    for line in tqdm(f, total=400000):
        parts = line.split()
        word = parts[0]
        vec = np.array([float(v) for v in parts[1:]], dtype='f')
        nlp.vocab.set_vector(word, vec)

100%|██████████| 400000/400000 [00:20<00:00, 19111.30it/s]


In [88]:
docs = [nlp(x) for x in tqdm(df.keywords.str.join(' '))]

100%|██████████| 2389/2389 [00:29<00:00, 80.39it/s]


In [89]:
primary_unicodes = [u[0] for u in df.unicode.str.split(' ')]

In [90]:
for i in range(5):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

woman gesturing NO: dark skin tone



woman guard: dark skin tone



woman running



woman cartwheeling: medium-light skin tone



woman golfing





In [91]:
def get_similar_emoji(word_vector, n=5):
    '''
    returns doc index of most n most similar emoji
    args:
        word_vector: vector embedding of word
    returns:
        (list) index of most similar emoji
    '''
    return np.argsort([-utils.cosine_similarity(word_vector, emoji.vector) for emoji in docs])[:n]

In [92]:
v = nlp('king').vector
for i in get_similar_emoji(v):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

  dist = 1.0 - uv / np.sqrt(uu * vv)


crown



prince



coffin



triangular ruler



family: man, man, boy





In [93]:
v = nlp('queen').vector
for i in get_similar_emoji(v):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

crown


  dist = 1.0 - uv / np.sqrt(uu * vv)



prince



Mrs. Claus



Mrs. Claus: light skin tone



Mrs. Claus: dark skin tone





In [97]:
v = nlp('money').vector
for i in get_similar_emoji(v):
    print(df.iloc[i]['name'])
    utils.display_emoji(primary_unicodes[i])
    print()

credit card


  dist = 1.0 - uv / np.sqrt(uu * vv)



heavy dollar sign



money bag



currency exchange



money-mouth face



