This notebook provides exploratory code to find out whether any words in a text can be represented directly by an emoji. If such an emoji was used in the text, it could be filtered out as a literal use. This type of label splitting would give us one dataset where emojis are used in a more subtle way and one dataset where emojis are used only in a literal way.

In [None]:
%load_ext lab_black

In [None]:
import torch
import re
import pandas as pd
import torch.nn.functional as F

from sentence_transformers import SentenceTransformer

In [None]:
%%time
# Choose the sentence-transformers checkpoint and load it. The commented-out
# names are alternative checkpoints kept for quick experimentation; only one
# `model_name` assignment should be active at a time.
# model_name = "all-MiniLM-L6-v2"
model_name = "sentence-transformers/paraphrase-MiniLM-L3-v2"
# model_name = "average_word_embeddings_glove.6B.300d"
model = SentenceTransformer(model_name)

### emoji names

In [None]:
# Read only the `emoji_id` and `emjpd_emoji_name_og` columns of the emoji
# description table; the other columns are not used here.
emoji_csv_path = "../emoji_embedding/data/processed/emoji_descriptions.csv"
emoji_columns = ["emoji_id", "emjpd_emoji_name_og"]
emoji_names = pd.read_csv(emoji_csv_path, usecols=emoji_columns)

In [None]:
# Embed every emoji name once, up front. The embeddings are normalised, so a
# plain dot product against other normalised embeddings is a cosine similarity.
emoji_name_list = emoji_names["emjpd_emoji_name_og"].tolist()
emoji_tensors = model.encode(
    emoji_name_list,
    convert_to_tensor=True,
    normalize_embeddings=True,
    show_progress_bar=True,
)

### get twitter data

In [None]:
# Load the first 1000 rows of the validation split, keeping only the columns
# this notebook needs.
tweets_csv_path = "../twemoji/data/twemoji_valid_v2.csv"
data = pd.read_csv(
    tweets_csv_path,
    usecols=["text_no_emojis", "emoji_ids", "raw_text"],
    nrows=1000,
)
# `emoji_ids` arrives as a string: drop its first and last character
# (presumably the surrounding brackets -- verify against the CSV) and parse the
# comma-separated remainder into a list of ints.
data["emoji_ids"] = (
    data["emoji_ids"].str[1:-1].apply(lambda raw: list(map(int, raw.split(","))))
)

### do processing

In [None]:
def literal_emoji_wrapper(threshold):
    """Build a row-wise detector for emojis that are used literally in a tweet.

    An emoji counts as "literal" when at least one word of the tweet has an
    embedding whose similarity to the emoji-name embedding exceeds
    ``threshold``. Both sides are encoded with ``normalize_embeddings=True``,
    so the dot product used below is a cosine similarity.

    Relies on the notebook-level ``model`` (sentence encoder) and
    ``emoji_tensors`` (pre-computed emoji-name embeddings).

    Args:
        threshold: Similarity a word/emoji pair must strictly exceed for the
            emoji to be reported.

    Returns:
        A function ``literal_emoji(row)`` suitable for
        ``DataFrame.apply(..., axis=1)``. ``row`` must expose
        ``text_no_emojis`` and ``emoji_ids``; the function returns the subset
        of ``row.emoji_ids`` (in order, duplicates kept) judged literal, or an
        empty list when the row has no usable text or no emojis.
    """

    def literal_emoji(row):
        text = row.text_no_emojis
        # An empty text field is read back from CSV as NaN (a float), which
        # has no `.split`; such a row cannot contain a literal match.
        if not isinstance(text, str) or not row.emoji_ids:
            return []

        # `split()` without an argument splits on any whitespace run and never
        # yields empty tokens, unlike `split(" ")`, so no blank "words" are
        # embedded and compared against the emoji names.
        words = text.split()
        if not words:
            return []

        twitter_word_tensors = model.encode(
            words,
            normalize_embeddings=True,
            convert_to_tensor=True,
        )
        # NOTE(review): this indexes `emoji_tensors` by emoji id, i.e. it
        # assumes an emoji's id equals its row position in the emoji name
        # table -- confirm against the CSV.
        em_tensors = emoji_tensors[row.emoji_ids]
        # (n_words, n_emojis) similarity matrix; reduce over the word axis to
        # get, for each emoji, the similarity of its best-matching word.
        m_v, _ = (twitter_word_tensors @ em_tensors.transpose(1, 0)).max(dim=0)
        idx = (m_v > threshold).nonzero().flatten().tolist()
        return [row.emoji_ids[i] for i in idx]

    return literal_emoji

In [None]:
%%time
# Build the detector with a similarity threshold of 0.7 and run it on every
# row; the resulting list of emoji ids is stored in a new column.
literal_emoji = literal_emoji_wrapper(threshold=0.7)
data["literal_emoji"] = data.apply(literal_emoji, axis=1)

In [None]:
data

In [None]:
# (rows, columns) of the tweets for which at least one emoji was flagged.
data[data["literal_emoji"].map(len) > 0].shape

In [None]:
# Print, separated by blank lines, the raw text of every tweet in which at
# least one emoji was flagged as literal.
has_literal = data.literal_emoji.apply(len) > 0
for tweet in data.loc[has_literal, "raw_text"]:
    print(tweet, end="\n\n")

In [None]:
# Print, separated by blank lines, a random sample of 20 tweets in which no
# emoji was flagged as literal (the "subtle" uses).
no_literal = data.literal_emoji.apply(len) == 0
for tweet in data.loc[no_literal, "raw_text"].sample(20):
    print(tweet, end="\n\n")