In [22]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
text8_path = kagglehub.dataset_download("gupta24789/text8-word-embedding")
text8_ds = open(f"{text8_path}/text8", "r")



In [24]:
emojis_dataset = pd.read_csv("./emojis.csv")
emojis_dataset

Unnamed: 0,Group,Subgroup,CodePoint,Status,Representation,Name,Section
0,Activities,event,1F383,fully-qualified,🎃,jack-o-lantern,E0.6
1,Activities,event,1F384,fully-qualified,🎄,Christmas tree,E0.6
2,Activities,event,1F386,fully-qualified,🎆,fireworks,E0.6
3,Activities,event,1F387,fully-qualified,🎇,sparkler,E0.6
4,Activities,event,1F9E8,fully-qualified,🧨,firecracker,E11.0
...,...,...,...,...,...,...,...
4585,Travel-Places,sky-weather,2604 FE0F,fully-qualified,☄️,comet,E1.0
4586,Travel-Places,sky-weather,2604,unqualified,☄,comet,E1.0
4587,Travel-Places,sky-weather,1F525,fully-qualified,🔥,fire,E0.6
4588,Travel-Places,sky-weather,1F4A7,fully-qualified,💧,droplet,E0.6


In [25]:
def pre_process(sentence) -> list[str]:
  sentence = sentence.lower()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  tokens = [t for t in tokens if t not in stop_words]
  return tokens

In [26]:
emojis_dataset = emojis_dataset[["Representation", "Name"]]
emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)
emojis_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)


Unnamed: 0,Representation,Name,cleaned_text
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[christmas, tree]"
2,🎆,fireworks,[fireworks]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecracker]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [27]:
# Load pre-trained Google News Word2Vec model
# google_news_model = api.load("word2vec-google-news-300")

tokenized_text8 = [pre_process(line) for line in text8_ds.readlines()]
cbow_model = Word2Vec(sentences=tokenized_text8, vector_size=100, window=5, min_count=2, sg=0)

In [28]:
def get_embedding(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

In [29]:
def replace_with_emoji(sentence: str, model: Word2Vec, emoji_embeddings: dict, threshold: float = 0.6) -> tuple[str, dict]:
    tokens = word_tokenize(sentence)
    modified_tokens = []
    similarities = {}

    for token in tokens:
        preprocessed_tokens = pre_process(token)

        if not preprocessed_tokens: # Skip if token is empty
            modified_tokens.append(token)
            continue

        preprocessed_word = preprocessed_tokens[0]

        if preprocessed_word in model.wv:  # Ensure word exists in Word2Vec model
            word_embedding = model.wv[preprocessed_word] # Get the word embedding
            best_match = None
            best_similarity = 0

            for emoji, emoji_embedding in emoji_embeddings.items():
                similarity = cosine_similarity([word_embedding], [emoji_embedding])[0][0] 

                if similarity > best_similarity:
                    best_match = emoji
                    best_similarity = similarity

            similarities[preprocessed_word] = best_similarity

            if best_similarity >= threshold:
                modified_tokens.append(best_match)
            else:
                modified_tokens.append(token)
        else:
            modified_tokens.append(token) 

    modified_sentence = " ".join(modified_tokens)
    return modified_sentence, similarities

In [30]:
emoji_embeddings = {
    row["Representation"]: get_embedding(row["cleaned_text"], cbow_model)
    for _, row in emojis_dataset.iterrows()
}

emoji_embeddings

{'🎃': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '🎄': array([-1.1266328e-03, -8.6971745e-03, -2.6435968e-03,  4.5867902e-03,
         4.9102325e-03, -1.9074222e-03, -2.6144870e-03, -1.1089193e-03,
        -2.4966081e-05,  4.2585605e-03, -1.5943387e-04,  2.8292264e-03,
         1.2760025e-03, -4.4104401e-03, -1.0714128e-03,  9.4857416e-04,
         9.7933598e-04,  5.7595819e-03, -6.3751820e-03, -6.8464517e-03,
        -5.3329384e-03, -2.1578271e-04, -4.5518959e-03,  2.5884565e-03,
         4.5891898e-03,  2.7986327e-03, -7.1707666e-03, -6.0235676e-03,
        -5.9951739e-03,  9.0180471e-

In [31]:
test_sentences = [
    "I want pizza and a movie night.",
    "This is such a sad day.",
    "Happy birthday to you!",
    "I need a vacation by the beach.",
    "coffee"
]

for sent in test_sentences:
    modified_sentence, similarities = replace_with_emoji(sent, cbow_model, emoji_embeddings)
    print(f"Input: {sent}")
    print(f"Modified Sentence: {modified_sentence}")
    print(f"Similarities: {similarities}\n")

Input: I want pizza and a movie night.
Modified Sentence: I want 🍕 and a 🎥 🌃 .
Similarities: {'want': 0.31245157, 'pizza': 0.99999994, 'movie': 0.69561756, 'night': 0.7608175}

Input: This is such a sad day.
Modified Sentence: This is such a sad day .
Similarities: {'sad': 0.5488049, 'day': 0.31487155}

Input: Happy birthday to you!
Modified Sentence: Happy 🎂 to you !
Similarities: {'happy': 0.36334664, 'birthday': 0.70465225}

Input: I need a vacation by the beach.
Modified Sentence: I need a vacation by the 🏖️ .
Similarities: {'need': 0.41794932, 'vacation': 0.28270304, 'beach': 0.7251611}

Input: coffee
Modified Sentence: coffee
Similarities: {'coffee': 0.41339368}

