In [583]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import re
import pandas as pd
from nltk import pos_tag

# Fetch the NLTK resources used by the cells below. Re-running is cheap: NLTK
# reports a package as already up to date instead of downloading it again.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# NLTK releases that tokenize with 'punkt_tab' load the English POS tagger from
# the '_eng' package; without it pos_tag() raises LookupError on a fresh machine.
nltk.download('averaged_perceptron_tagger_eng')

# English stop words, stored as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [584]:
# Load the emoji table and keep only the glyph column and its name column.
emojis_dataset = pd.read_csv("./emojis.csv")[["Representation", "Name"]]
emojis_dataset

Unnamed: 0,Representation,Name
0,🎃,jack-o-lantern
1,🎄,Christmas tree
2,🎆,fireworks
3,🎇,sparkler
4,🧨,firecracker
...,...,...
4585,☄️,comet
4586,☄,comet
4587,🔥,fire
4588,💧,droplet


In [585]:
def pre_process(sentence) -> list[str]:
  """Normalise a piece of text into a list of lemmatised tokens.

  Steps, in order: lower-case the text, delete every character that is
  neither a word character nor whitespace (so "jack-o-lantern" becomes
  "jackolantern"), tokenize, drop tokens found in ``stop_words`` and
  lemmatise what is left with ``WordNetLemmatizer``.

  Args:
    sentence: The text to normalise.

  Returns:
    The remaining tokens, lemmatised, in their original order.
  """
  without_punctuation = re.sub(r'[^\w\s]', '', sentence.lower())
  lemmatize = WordNetLemmatizer().lemmatize
  return [
    lemmatize(word)
    for word in word_tokenize(without_punctuation)
    if word not in stop_words
  ]

In [586]:
# Tokenise every emoji name with pre_process; assign() returns a new frame
# holding the two original columns plus the token lists.
emojis_dataset = emojis_dataset[["Representation", "Name"]].assign(
    cleaned_text=lambda frame: frame["Name"].apply(pre_process)
)
emojis_dataset

Unnamed: 0,Representation,Name,cleaned_text
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[christmas, tree]"
2,🎆,fireworks,[firework]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecracker]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [587]:
def get_hypernyms(word, pos_tagged):
    """Collect the direct hypernyms of every WordNet sense of ``word``.

    Args:
        word: The word to look up in WordNet.
        pos_tagged: Part-of-speech filter forwarded to ``wordnet.synsets``
            as its ``pos`` argument.

    Returns:
        A set with the hypernym synsets of all matching senses; empty when
        the word has no senses or none of them has a hypernym.
    """
    return {
        parent
        for sense in wordnet.synsets(word, pos=pos_tagged)
        for parent in sense.hypernyms()
    }

hypernym – дума с по-широко значение, която обозначава категория, в която попадат думите с по-специфични значения. Например „color“ е hypernym на „red“.

In [588]:
def find_synonym_words(word, pos_tagged):
    """Return the WordNet synsets of ``word``, one per distinct head lemma.

    Several senses of a word often share the same head lemma (for example
    ``dog.n.01`` and ``dog.n.03``). Only the first synset for each head lemma
    is kept, in the order WordNet returns them.

    Args:
        word: The word to look up in WordNet.
        pos_tagged: Part-of-speech filter forwarded to ``wordnet.synsets``
            as its ``pos`` argument.

    Returns:
        A list of synsets whose head lemmas are pairwise distinct.
    """
    seen_heads = set()
    unique_synsets = []
    for synset in wordnet.synsets(word, pos=pos_tagged):
        # A synset name is "<lemma>.<pos>.<nn>" and the lemma itself may
        # contain dots, so strip the two trailing fields from the right
        # instead of cutting at the first dot.
        head = synset.name().rsplit(".", 2)[0]
        if head not in seen_heads:
            seen_heads.add(head)
            unique_synsets.append(synset)

    return unique_synsets

In [589]:
def get_emoji_exact_synset(emoji_name, emoji_synsets):
    """Pick the synset that lists ``emoji_name`` among its lemma names.

    An exact (case-sensitive) match always wins and is returned as soon as it
    is found. If no synset matches exactly, the first synset with a lemma that
    equals ``emoji_name`` ignoring case is returned. The fallback matters when
    the name was lower-cased beforehand: a lemma such as "Christmas" would
    otherwise never equal "christmas".

    Args:
        emoji_name: The (single-token) emoji name to look for.
        emoji_synsets: Candidate synsets, in priority order.

    Returns:
        The matching synset, or None when no lemma matches even ignoring case.
    """
    folded_name = emoji_name.casefold()
    case_insensitive_match = None
    for synset in emoji_synsets:
        lemma_names = synset.lemma_names()
        if emoji_name in lemma_names:
            return synset
        # Remember only the first loose match; keep scanning for an exact one.
        if case_insensitive_match is None and any(
            lemma.casefold() == folded_name for lemma in lemma_names
        ):
            case_insensitive_match = synset
    return case_insensitive_match

In [590]:
def penn_to_wn(tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS.

    Only the first letter of the tag matters: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Any other tag yields None.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    return None

In [591]:
def get_best_matching_emoji(word_synset, emoji_dataset, threshold):
    """Find the emoji whose name is closest to ``word_synset`` in WordNet.

    Every token of every emoji name is resolved to a synset (via
    ``wordnet.synsets`` and ``get_emoji_exact_synset``) and compared with
    ``word_synset`` using path similarity. The first emoji that reaches the
    highest similarity wins.

    Args:
        word_synset: Synset of the word to replace; a falsy value
            short-circuits to ``(None, 0)``.
        emoji_dataset: Frame with a "Representation" column (the emoji) and a
            "cleaned_text" column (iterable of name tokens per emoji).
        threshold: Minimum similarity required to return an emoji.

    Returns:
        ``(emoji, similarity)`` when the best similarity is at least
        ``threshold``, otherwise ``(None, similarity)``. ``similarity`` is the
        best value seen (0 when nothing was comparable).
    """
    if not word_synset:
        return None, 0

    best_emoji = None
    best_similarity = 0

    # Zipping the two columns builds the same mapping as iterating rows, but
    # without materialising a Series per row.
    emoji_dict = dict(zip(emoji_dataset["Representation"], emoji_dataset["cleaned_text"]))

    # The similarity depends only on the name token, and the same tokens recur
    # across many emojis, so each token is looked up once per call.
    similarity_by_name = {}

    for emoji, emoji_names in emoji_dict.items():
        for emoji_name in emoji_names:
            if emoji_name in similarity_by_name:
                similarity = similarity_by_name[emoji_name]
            else:
                similarity = None
                emoji_name_synsets = wordnet.synsets(emoji_name)
                if emoji_name_synsets:
                    emoji_synset = get_emoji_exact_synset(emoji_name, emoji_name_synsets)
                    if emoji_synset:
                        similarity = word_synset.path_similarity(emoji_synset)
                similarity_by_name[emoji_name] = similarity

            # path_similarity may return None for incomparable synsets.
            if similarity and similarity > best_similarity:
                best_similarity = similarity
                best_emoji = emoji
                if best_similarity >= 1.0:
                    break

        # Path similarity never exceeds 1.0 and only a strictly larger value
        # replaces the current best, so nothing later can change the result.
        if best_similarity >= 1.0:
            break

    return (best_emoji, best_similarity) if best_similarity >= threshold else (None, best_similarity)


In [592]:
def _first_emoji_match(candidate_synsets, emoji_dataset, threshold):
    """Return the first emoji match among ``candidate_synsets``.

    Candidates are tried in order. The first one for which
    ``get_best_matching_emoji`` yields an emoji is returned together with its
    similarity; otherwise ``(None, best similarity seen)`` is returned.
    """
    best_seen = 0
    for synset in candidate_synsets:
        emoji, similarity = get_best_matching_emoji(synset, emoji_dataset, threshold)
        if emoji:
            return emoji, similarity
        best_seen = max(best_seen, similarity)
    return None, best_seen


def replace_with_emoji_wordnet(sentence, emoji_dataset, threshold = 0.6):
    """Replace words of ``sentence`` with semantically matching emojis.

    Each token is POS-tagged and normalised with ``pre_process``. Tokens that
    normalise to nothing (stop words, punctuation) are kept unchanged. For the
    others, the word's synsets are tried first and, if none of them matches an
    emoji, the word's hypernyms are tried next.

    Args:
        sentence: The text to rewrite.
        emoji_dataset: Emoji frame forwarded to ``get_best_matching_emoji``.
        threshold: Minimum similarity for a replacement (default 0.6).

    Returns:
        A tuple ``(modified_sentence, similarities)``. ``modified_sentence``
        is the tokens (or their emojis) joined by single spaces.
        ``similarities`` maps each examined token to the similarity of its
        replacement, or to the best similarity seen when it was not replaced.
    """
    tagged_words = pos_tag(word_tokenize(sentence))
    modified_tokens = []
    similarities = {}

    for token, tag in tagged_words:
        pre_processed_token = pre_process(token)

        if not pre_processed_token:
            # Stop word or punctuation: keep the original token untouched.
            modified_tokens.append(token)
            continue

        wn_tag = penn_to_wn(tag)
        word = pre_processed_token[0]

        best_emoji, best_similarity = _first_emoji_match(
            find_synonym_words(word, wn_tag), emoji_dataset, threshold
        )

        if not best_emoji:
            # get_hypernyms returns a set, whose iteration order can differ
            # between kernel restarts; sort by name so the same sentence always
            # produces the same emoji.
            hypernyms = sorted(
                get_hypernyms(word, wn_tag), key=lambda synset: synset.name()
            )
            hypernym_emoji, hypernym_similarity = _first_emoji_match(
                hypernyms, emoji_dataset, threshold
            )
            if hypernym_emoji:
                best_emoji, best_similarity = hypernym_emoji, hypernym_similarity
            else:
                best_similarity = max(best_similarity, hypernym_similarity)

        modified_tokens.append(best_emoji if best_emoji else token)
        similarities[token] = best_similarity

    modified_sentence = " ".join(modified_tokens)

    return modified_sentence, similarities

In [593]:
def replace_with_emoji(sentence):
    """Rewrite ``sentence`` with emojis from ``emojis_dataset``.

    Thin wrapper around ``replace_with_emoji_wordnet`` that uses its default
    threshold and returns only the rewritten sentence.
    """
    return replace_with_emoji_wordnet(sentence, emojis_dataset)[0]

In [None]:
# Sample inputs for a manual smoke test of the emoji replacement. Several
# strings end with a space; they are passed to the function exactly as written.
test_sentences = [
    "I want pizza and a movie night.",
    "This is such a sad day.",
    "Happy birthday to you!",
    "I need a vacation by the beach.",
    "coffee",
    "star boy",
    "i love you ",
    "the pizza is great",
    "chicken lays eggs ",
    "i have scored hundred in maths ",
    "She is the queen of hearts ",
    "messi is the king of soccer ",
    "lets build a rocket "  
]

# Print, for every sample, the input, the rewritten sentence and the per-token
# similarity returned by replace_with_emoji_wordnet (default threshold).
for sent in test_sentences:
    modified_sentence, similarity = replace_with_emoji_wordnet(sent, emojis_dataset)
    print(f"Input: {sent}")
    print(f"Modified Sentence: {modified_sentence}")
    print(f"Similarities: {similarity}\n")



Input: I want pizza and a movie night.
Modified Sentence: I want 🍕 and a 🎥 🌃 .
Similarities: {'want': 0.3333333333333333, 'pizza': 1.0, 'movie': 1.0, 'night': 1.0}

Input: This is such a sad day.
Modified Sentence: This is such a 😥 day .
Similarities: {'sad': 1.0, 'day': 0.3333333333333333}

Input: Happy birthday to you!
Modified Sentence: Happy 🎂 to you !
Similarities: {'Happy': 0.3333333333333333, 'birthday': 1.0}

Input: I need a vacation by the beach.
Modified Sentence: I need a vacation by the 🏖️ .
Similarities: {'need': 0.3333333333333333, 'vacation': 0.25, 'beach': 1.0}

Input: coffee
Modified Sentence: ☕
Similarities: {'coffee': 1.0}

Input: star boy
Modified Sentence: ✡️ 👦
Similarities: {'star': 1.0, 'boy': 1.0}

Input: i love you 
Modified Sentence: i love you
Similarities: {'love': 0.3333333333333333}

Input: the pizza is great
Modified Sentence: the 🍕 is great
Similarities: {'pizza': 1.0, 'great': 0.3333333333333333}

Input: chicken lays eggs 
Modified Sentence: 🐔 lays 🥚
Si