In [None]:
%load_ext lab_black

In [None]:
import gensim.downloader as api
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import re
import ast

#### load merged and partly processed emoji data

In [None]:
# Read the merged emoji description table.
df = pd.read_csv("data/processed/emoji_descriptions.csv")

# String cells of the alias column are parsed as Python literals (presumably
# lists of alias strings) and joined into one space-separated string; any
# non-string cell (e.g. NaN) becomes the empty string.
df["emjpd_aliases"] = df["emjpd_aliases"].apply(
    lambda raw: " ".join(ast.literal_eval(raw) if isinstance(raw, str) else [])
)

# Lower-case all free-text columns.
text_columns = [
    "emjpd_emoji_name_og",
    "hemj_emoji_name_og",
    "emjpd_aliases",
    "emjpd_full_description",
    "emjpd_description_main",
    "emjpd_description_side",
    "hemj_emoji_description",
    "emjpd_usage_info",
]
for col in text_columns:
    df[col] = df[col].str.lower()

# Combine the two name columns into one; missing names count as "".
df["emoji_name_og"] = (
    df["emjpd_emoji_name_og"].fillna("") + " " + df["hemj_emoji_name_og"].fillna("")
)

### load embedding model

In [None]:
def load_embedding_model(model):
    """Load pretrained word vectors through the Gensim downloader.

    Params:
        - model {str}: name of the pretrained model passed to ``api.load``, e.g.:
            - glove-wiki-gigaword-200
            - glove-twitter-200
            - word2vec-google-news-300
            - glove-wiki-gigaword-300
    Return:
        - {gensim.models.keyedvectors.KeyedVectors}: the loaded word vectors

    Prints the vocabulary size once loading has finished.
    """
    vectors = api.load(model)
    vocab_size = len(list(vectors.index_to_key))
    print("Loaded vocab size %i" % vocab_size)
    return vectors

In [None]:
# Name of the pretrained Gensim model to load.
model = "glove-twitter-200"
emb = load_embedding_model(model)
# A set makes the per-word membership checks cheap.
emb_vocabulary = set(emb.index_to_key)
# All-zero vector with the model's dimensionality. The size comes from the
# model itself rather than from looking up a sample word, so this also works
# for models whose vocabulary lacks that word.
default = np.zeros(emb.vector_size, dtype=np.float32)

In [None]:
def getWords(text):
    """Return the list of all words (and numbers) in a given text.

    A word is a maximal run of ``\\w`` characters, so punctuation and other
    special characters act as separators and are dropped.

    Params:
        - text {str}: the text to split

    Returns:
        - {list of str}: the words in order of appearance (empty for "")
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    return re.findall(r"\w+", text)

In [None]:
def get_vector(text, default=None):
    """Average word vector of a text.

    Given a string or a list of words (in lower case), the mean of the word
    vectors of all words found in the embedding vocabulary is returned. Words
    that are not in the vocabulary are left out of the average.

    Params:
        - text {str or list}: a string (split into ``\\w+`` tokens) or a list
          of words; any other value (e.g. NaN / None) is treated as having no words
        - default {np.array or None}: value returned when no word is in the
          vocabulary. If None, a fresh all-zero float32 vector with the
          dimensionality of the loaded model (``emb.vector_size``) is created.

    Returns: {np.array}
    """
    if isinstance(text, str):
        words = re.findall(r"\w+", text)
    elif isinstance(text, list):
        words = text
    else:
        words = []

    known = [w for w in words if w in emb_vocabulary]
    if known:
        return np.mean([emb.get_vector(w) for w in known], axis=0)

    if default is None:
        # Built per call: a shared default array could be mutated by a caller,
        # and a hard-coded length would not match models of another size.
        return np.zeros(emb.vector_size, dtype=np.float32)
    return default

In [None]:
def get_embedding(weighting, df):
    """Compute one weighted-average word embedding per row of ``df``.

    For each column named in ``weighting`` the text of every row is converted
    to a vector with ``get_vector``. The vectors of a row are combined as a
    weighted mean. A column's weight only enters the denominator for rows in
    which that column produced a non-zero vector, so columns without usable
    text do not dilute the average. Weights don't have to sum to 1.

    Params:
        - weighting {dictionary}: maps column name -> numeric weight
        - df {pd.DataFrame}: one emoji per row; must contain every column
                    named in weighting

    Returns:
        - {pd.Series}: indexed like df, each element an np.array holding the
                    embedding of that row. Rows for which no column produced a
                    vector get an all-zero vector.

    Raises:
        - ValueError: if weighting is empty
    """
    if not weighting:
        raise ValueError("weighting must contain at least one column")

    n_rows = len(df)
    if n_rows == 0:
        return pd.Series([], index=df.index, dtype=object)

    weighted_sum = None
    weight_total = np.zeros(n_rows, dtype=np.float64)
    for col, weight in weighting.items():
        # (n_rows, dim) matrix of the per-row vectors of this column
        matrix = np.stack([np.asarray(get_vector(text)) for text in df[col]])
        # A row counts as "has a vector" if any component is non-zero; testing
        # the component sum would miss vectors whose entries cancel out.
        has_vector = np.any(matrix != 0, axis=1)

        contribution = weight * matrix
        weighted_sum = (
            contribution if weighted_sum is None else weighted_sum + contribution
        )
        weight_total += weight * has_vector

    # Divide only where some weight was collected; other rows stay zero
    # instead of turning into NaN through 0 / 0.
    denominator = weight_total[:, None]
    averaged = np.zeros(
        weighted_sum.shape, dtype=np.result_type(weighted_sum.dtype, np.float32)
    )
    np.divide(weighted_sum, denominator, out=averaged, where=denominator != 0)

    # Store one 1-D array per row in an object array so pandas keeps each
    # embedding as a single cell.
    packed = np.empty(n_rows, dtype=object)
    for i in range(n_rows):
        packed[i] = averaged[i]
    return pd.Series(packed, index=df.index)

In [None]:
# Relative weight of each description column in the averaged embedding
# (the values happen to add up to 100).
weighting = dict(
    emoji_name_og=30,
    emjpd_aliases=15,
    emjpd_description_main=35,
    emjpd_description_side=5,
    hemj_emoji_description=15,
)

# One embedding per row of df.
embeddings = get_embedding(weighting, df)

### plausibility check

Check whether the cosine similarity between similar emojis is indeed higher than between unrelated emojis.

In [None]:
# Analysis frame: the emoji character next to its embedding (aligned on index).
ana = df[["emoji_char"]].assign(embeddings=embeddings)

In [None]:
def get_embedding(ana, idx):
    """Return the embedding at positional row ``idx`` of ``ana`` as a torch tensor.

    NOTE: this reuses the name of the ``get_embedding(weighting, df)`` function
    defined earlier in the notebook and replaces it once this cell has run.
    """
    row = ana.iloc[idx]
    return torch.Tensor(row.embeddings)

In [None]:
# Embedding tensors of three hand-picked rows (positional indices into ana).
a, b, c = (get_embedding(ana, row) for row in (1866, 1638, 1498))

In [None]:
# Cosine similarity taken along dim 0 of its two inputs
# (presumably 1-D embedding tensors - confirm).
sim = nn.CosineSimilarity(dim=0)

In [None]:
# Similarity between the embeddings of rows 1866 (a) and 1498 (c);
# b (row 1638) is not compared in this cell.
sim(a, c)

In [None]:
# Show 20 randomly chosen rows; no random_state is passed, so the selection
# differs from run to run.
ana.sample(20)