In [250]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub

# Fetch the NLTK data packages this notebook relies on (each call is a no-op
# apart from a log line when the package is already up to date).
for _resource in ("stopwords", "punkt_tab", "wordnet"):
    nltk.download(_resource)

# English stop words as a set for fast membership tests in pre_process().
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [251]:
# Load the emoji table and keep only the glyph ("Representation") and its
# textual name ("Name"); every other column in the CSV is dropped.
emojis_dataset = pd.read_csv("./emojis.csv").loc[:, ["Representation", "Name"]]
emojis_dataset

Unnamed: 0,Representation,Name
0,🎃,jack-o-lantern
1,🎄,Christmas tree
2,🎆,fireworks
3,🎇,sparkler
4,🧨,firecracker
...,...,...
4585,☄️,comet
4586,☄,comet
4587,🔥,fire
4588,💧,droplet


In [252]:
def pre_process(sentence) -> list[str]:
    """Normalise a piece of text into a list of non-stop-word tokens.

    Steps, in order:
      1. lower-case the text;
      2. delete every character that is neither a word character nor
         whitespace (so "jack-o-lantern" collapses to "jackolantern");
      3. tokenise the result with ``word_tokenize``;
      4. drop tokens that appear in the module-level ``stop_words`` set.

    Args:
        sentence: The text to normalise.

    Returns:
        The remaining tokens, in their original order (possibly empty).
    """
    without_punctuation = re.sub(r'[^\w\s]', '', sentence.lower())
    return [
        token
        for token in word_tokenize(without_punctuation)
        if token not in stop_words
    ]

In [253]:
# Start again from the two source columns (so re-running this cell gives the
# same frame) and add the tokenised form of each emoji name.
emojis_dataset = emojis_dataset[["Representation", "Name"]].assign(
    cleaned_text=lambda frame: frame["Name"].apply(pre_process)
)
emojis_dataset

Unnamed: 0,Representation,Name,cleaned_text
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[christmas, tree]"
2,🎆,fireworks,[fireworks]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecracker]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [254]:
def get_hypernyms(word) -> list[str]:
    """Return the lemma names of the direct hypernyms of every sense of ``word``.

    For each WordNet synset of ``word``, each of its direct hypernym synsets
    is visited and the names of that hypernym's lemmas are collected.

    Args:
        word: The word to look up in WordNet.

    Returns:
        A list of unique lemma names in first-seen order; empty when WordNet
        has no synsets (or no hypernyms) for ``word``.
    """
    hypernym_names = (
        lemma.name()
        for syn in wordnet.synsets(word)
        for hypernym in syn.hypernyms()
        for lemma in hypernym.lemmas()
    )
    # dict.fromkeys removes duplicates while keeping insertion order. A set
    # would iterate strings in an order that changes between interpreter
    # runs, which makes "first matching hypernym" logic irreproducible.
    return list(dict.fromkeys(hypernym_names))

In [255]:
def find_synonym_words(wordx):
    """Return the head word of every WordNet synset that ``wordx`` belongs to.

    A synset name has the form ``<lemma>.<pos>.<number>`` (for example
    ``"dog.n.01"``); the ``<lemma>`` part of each synset is collected.

    Args:
        wordx: The word to look up in WordNet (English).

    Returns:
        A list of unique head words in the order WordNet returned the
        synsets. It may include ``wordx`` itself and is empty when WordNet
        has no synsets for the word.
    """
    head_words = (
        # Split from the right so only the trailing ".<pos>.<number>" is
        # removed; a lemma that itself contains a period (an abbreviation,
        # say) is then kept whole instead of being cut at its first dot.
        synset.name().rsplit(".", 2)[0]
        for synset in wordnet.synsets(wordx, lang='eng')
    )
    # dict.fromkeys: order-preserving de-duplication.
    return list(dict.fromkeys(head_words))

In [256]:
def get_best_matching_emoji(word, emoji_dataset, threshold=0.6):
    """Find the emoji whose name is closest to ``word`` by WordNet path similarity.

    Only the first synset WordNet returns for ``word`` is compared, against
    the first synset of each token in every emoji's ``cleaned_text``. The
    emoji with the highest similarity wins; on a tie the one encountered
    first is kept.

    Args:
        word: The word to match.
        emoji_dataset: DataFrame with a ``"Representation"`` column (the
            emoji) and a ``"cleaned_text"`` column (iterable of name tokens).
        threshold: Minimum similarity required to accept a match.

    Returns:
        ``(emoji, similarity)`` when the best similarity is at least
        ``threshold``; otherwise ``(None, best_similarity)``. Returns
        ``(None, 0)`` when WordNet has no synsets for ``word``.
    """
    word_synsets = wordnet.synsets(word)
    if not word_synsets:
        return None, 0

    # The query synset never changes, so look it up once, outside the loops.
    w_synset = word_synsets[0]

    # Same mapping the row-by-row version built (first-seen key order, last
    # value wins for a repeated representation) without the cost of
    # materialising a Series per row.
    emoji_dict = dict(zip(emoji_dataset["Representation"], emoji_dataset["cleaned_text"]))

    # Many emoji names share tokens; remember each token's first synset
    # (or None when WordNet does not know the token) for this call.
    first_synset_by_name = {}

    best_emoji = None
    best_similarity = 0

    for emoji, emoji_names in emoji_dict.items():
        for emoji_name in emoji_names:
            if emoji_name not in first_synset_by_name:
                name_synsets = wordnet.synsets(emoji_name)
                first_synset_by_name[emoji_name] = name_synsets[0] if name_synsets else None

            e_synset = first_synset_by_name[emoji_name]
            if e_synset is None:
                continue

            # path_similarity may return None (no connecting path); the
            # truthiness check skips that case.
            similarity = w_synset.path_similarity(e_synset)
            if similarity and similarity > best_similarity:
                best_similarity = similarity
                best_emoji = emoji
                if best_similarity >= 1.0:
                    break

        # Path similarity tops out at 1.0 and only a strictly higher score
        # replaces the current best, so nothing later can change the result.
        if best_similarity >= 1.0:
            break

    return (best_emoji, best_similarity) if best_similarity >= threshold else (None, best_similarity)


In [257]:
def _candidate_words(word):
    """Lazily yield ``word``, then its synonym head words, then its hypernyms."""
    yield word
    yield from find_synonym_words(word)
    yield from get_hypernyms(word)


def replace_with_emoji(sentence, emoji_dataset, threshold = 0.6) -> tuple[str, dict]:
    """Replace the words of ``sentence`` with matching emojis where possible.

    Each token is normalised with ``pre_process``; tokens that normalise to
    nothing (stop words, punctuation) are kept unchanged and get no entry in
    the similarity report. For the others, candidates are tried in order:
    the word itself, the words from ``find_synonym_words``, then the words
    from ``get_hypernyms``. The first candidate for which
    ``get_best_matching_emoji`` returns an emoji wins.

    Args:
        sentence: The text to convert.
        emoji_dataset: Emoji table passed through to
            ``get_best_matching_emoji``.
        threshold: Minimum similarity for a replacement.

    Returns:
        ``(modified_sentence, similarities)``. ``modified_sentence`` is the
        tokens joined by single spaces. ``similarities`` maps each original
        token that was looked up to the highest similarity found among the
        candidates tried for it.
    """
    modified_tokens = []
    similarities = {}

    for token in word_tokenize(sentence):
        pre_processed_token = pre_process(token)

        if not pre_processed_token:
            modified_tokens.append(token)
            continue

        best_emoji = None
        best_similarity = 0
        tried = set()

        # The generator is lazy: synonyms and hypernyms are only computed
        # when every earlier candidate has failed.
        for candidate in _candidate_words(pre_processed_token[0]):
            # The synonym list usually contains the word itself; a repeat
            # would rescan the whole emoji table for the same answer.
            if candidate in tried:
                continue
            tried.add(candidate)

            emoji, similarity = get_best_matching_emoji(candidate, emoji_dataset, threshold)
            # Report the best score seen, not whichever candidate happened
            # to be tried last.
            best_similarity = max(best_similarity, similarity)
            if emoji:
                best_emoji = emoji
                break

        modified_tokens.append(best_emoji if best_emoji else token)
        similarities[token] = best_similarity

    modified_sentence = " ".join(modified_tokens)

    return modified_sentence, similarities

In [258]:
# Small smoke test: a few everyday sentences plus a single bare word.
test_sentences = [
    "I want pizza and a movie night.",
    "This is such a sad day.",
    "Happy birthday to you!",
    "I need a vacation by the beach.",
    "coffee",
]

# Show each input next to its emoji version and the per-token similarity
# scores, with a blank line between sentences.
for sent in test_sentences:
    modified_sentence, similarity = replace_with_emoji(sent, emojis_dataset)
    print(
        f"Input: {sent}",
        f"Modified Sentence: {modified_sentence}",
        f"Similarities: {similarity}\n",
        sep="\n",
    )

Input: I want pizza and a movie night.
Modified Sentence: I 👧 🍕 and a 🎥 🌃 .
Similarities: {'want': 1.0, 'pizza': 1.0, 'movie': 1.0, 'night': 1.0}

Input: This is such a sad day.
Modified Sentence: This is such a 😥 day .
Similarities: {'sad': 1.0, 'day': 0.3333333333333333}

Input: Happy birthday to you!
Modified Sentence: Happy 🎂 to you !
Similarities: {'Happy': 0.2, 'birthday': 1.0}

Input: I need a vacation by the beach.
Modified Sentence: I need a vacation by the 🏖️ .
Similarities: {'need': 0.5, 'vacation': 0.25, 'beach': 1.0}

Input: coffee
Modified Sentence: 🍫
Similarities: {'coffee': 1.0}

