In [71]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [72]:
twitter_path = kagglehub.dataset_download("hariharasudhanas/twitter-emoji-prediction")
twitter_ds = pd.read_csv(f"{twitter_path}/Train.csv")
twitter_ds.head()

text8_path = kagglehub.dataset_download("gupta24789/text8-word-embedding")
print("Path to dataset files:", text8_path)
text8_ds = open(f"{text8_path}/text8", "r")

Path to dataset files: /Users/anapetrova/.cache/kagglehub/datasets/gupta24789/text8-word-embedding/versions/1


In [73]:
emojis_dataset = pd.read_csv("./emojis.csv")
emojis_dataset

Unnamed: 0,Group,Subgroup,CodePoint,Status,Representation,Name,Section
0,Activities,event,1F383,fully-qualified,🎃,jack-o-lantern,E0.6
1,Activities,event,1F384,fully-qualified,🎄,Christmas tree,E0.6
2,Activities,event,1F386,fully-qualified,🎆,fireworks,E0.6
3,Activities,event,1F387,fully-qualified,🎇,sparkler,E0.6
4,Activities,event,1F9E8,fully-qualified,🧨,firecracker,E11.0
...,...,...,...,...,...,...,...
4585,Travel-Places,sky-weather,2604 FE0F,fully-qualified,☄️,comet,E1.0
4586,Travel-Places,sky-weather,2604,unqualified,☄,comet,E1.0
4587,Travel-Places,sky-weather,1F525,fully-qualified,🔥,fire,E0.6
4588,Travel-Places,sky-weather,1F4A7,fully-qualified,💧,droplet,E0.6


In [74]:
def pre_process(sentence: str) -> list[str]:
  ps = PorterStemmer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  stemmed_sent = [ps.stem(t) for t in tokens]
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

In [75]:
sent = "I love pizza and movies!"
tokens = pre_process(sent)

tokens

['love', 'pizza', 'movi']

In [76]:
emojis_dataset = emojis_dataset[["Representation", "Name"]]
emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)
emojis_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)


Unnamed: 0,Representation,Name,cleaned_text
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[christma, tree]"
2,🎆,fireworks,[firework]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecrack]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [77]:

tokenized_sentences = emojis_dataset['cleaned_text'].tolist()
model = Word2Vec(sentences=tokenized_sentences, vector_size=300, window=5, min_count=1, sg=1)  # sg=0 for CBOW

In [78]:
# Compute emoji embeddings
def get_embedding(tokens: list[str], model: Word2Vec) -> np.ndarray:
    """
    Compute the embedding for a list of tokens using the Word2Vec model.
    Args:
        tokens (list[str]): List of tokens.
        model (Word2Vec): Trained Word2Vec model.
    Returns:
        np.ndarray: Averaged embedding of the tokens.
    """
    embeddings = [model.wv[word] for word in tokens if word in model.wv] #wv is the word vector, Word2Vec model’s vocabulary (model.wv
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)

# Generate emoji embeddings
emoji_embeddings = {
    row["Representation"]: get_embedding(row["cleaned_text"], model)
    for _, row in emojis_dataset.iterrows()
}

emoji_embeddings


{'🎃': array([-8.7766703e-03,  7.0940149e-03,  9.3251439e-03,  1.1743259e-03,
         8.3689084e-03,  9.7873341e-03, -8.6324215e-03, -5.1278686e-03,
         9.5613310e-03, -5.4475223e-03,  2.1129595e-03, -7.8555109e-04,
         5.8920467e-03,  1.2091601e-03,  7.5866473e-03, -8.4117232e-03,
        -4.3800403e-03, -1.0638071e-03,  4.7407318e-03,  6.9928314e-03,
         2.5106908e-04, -2.2125496e-03, -7.2193695e-03, -7.0200203e-04,
        -3.6052323e-03, -5.6401859e-03, -5.6529877e-04, -4.3962537e-03,
        -8.5149314e-03,  1.8207311e-04, -7.0603504e-03,  9.3200775e-03,
         5.8994852e-03, -6.4044739e-03,  8.0881026e-03,  5.3984965e-03,
        -3.9825249e-03,  1.4058924e-03, -8.3551696e-03, -1.2849879e-03,
         6.5791691e-03, -9.3526086e-03, -7.5853826e-04,  4.5812558e-03,
         4.9851062e-03, -4.1456735e-03, -4.5514344e-03, -7.5143301e-03,
        -5.7018902e-03, -8.9607359e-04,  3.2303976e-03, -4.1547134e-03,
        -5.1907967e-03,  6.8453299e-03, -8.4373001e-03,  2.

In [79]:
def predict_emoji(sentence: str, model: Word2Vec, emoji_embeddings: dict) -> tuple[str, float]:
    tokens = pre_process(sentence)

    sentence_embedding = get_embedding(tokens, model)
    
    similarities = {
        emoji: cosine_similarity([sentence_embedding], [embedding])[0][0]
        for emoji, embedding in emoji_embeddings.items()
    }


    
    best_match = max(similarities, key=similarities.get)
    return best_match, similarities[best_match]


In [80]:
test_sentences = [
    "I want pizza and a movie night.",
    "This is such a sad day.",
    "Happy birthday to you!",
    "I need a vacation by the beach."
]

for sent in test_sentences:
    emoji, similarity = predict_emoji(sent, model, emoji_embeddings)
    print(f"Input: {sent}")
    print(f"Predicted Emoji: {emoji} (Similarity: {similarity:.2f})")

Input: I want pizza and a movie night.
Predicted Emoji: 🌉 (Similarity: 0.95)
Input: This is such a sad day.
Predicted Emoji: 😥 (Similarity: 0.89)
Input: Happy birthday to you!
Predicted Emoji: 🎂 (Similarity: 0.89)
Input: I need a vacation by the beach.
Predicted Emoji: 🏖️ (Similarity: 0.97)
