In [2]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/anapetrova/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
twitter_path = kagglehub.dataset_download("hariharasudhanas/twitter-emoji-prediction")
twitter_ds = pd.read_csv(f"{twitter_path}/Train.csv")
twitter_ds.head()

text8_path = kagglehub.dataset_download("gupta24789/text8-word-embedding")
print("Path to dataset files:", text8_path)
text8_ds = open(f"{text8_path}/text8", "r")
text8_ds

Path to dataset files: /Users/anapetrova/.cache/kagglehub/datasets/gupta24789/text8-word-embedding/versions/1


<_io.TextIOWrapper name='/Users/anapetrova/.cache/kagglehub/datasets/gupta24789/text8-word-embedding/versions/1/text8' mode='r' encoding='UTF-8'>

In [33]:
emojis_dataset = pd.read_csv("./emojis.csv")
emojis_dataset

Unnamed: 0,Group,Subgroup,CodePoint,Status,Representation,Name,Section
0,Activities,event,1F383,fully-qualified,🎃,jack-o-lantern,E0.6
1,Activities,event,1F384,fully-qualified,🎄,Christmas tree,E0.6
2,Activities,event,1F386,fully-qualified,🎆,fireworks,E0.6
3,Activities,event,1F387,fully-qualified,🎇,sparkler,E0.6
4,Activities,event,1F9E8,fully-qualified,🧨,firecracker,E11.0
...,...,...,...,...,...,...,...
4585,Travel-Places,sky-weather,2604 FE0F,fully-qualified,☄️,comet,E1.0
4586,Travel-Places,sky-weather,2604,unqualified,☄,comet,E1.0
4587,Travel-Places,sky-weather,1F525,fully-qualified,🔥,fire,E0.6
4588,Travel-Places,sky-weather,1F4A7,fully-qualified,💧,droplet,E0.6


In [34]:
def pre_process(sentence: str) -> list[str]:
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)  # Tokenize the sentence
  filtered_tokens = [t for t in tokens if t not in stop_words]  # Remove stop words
  return filtered_tokens

In [35]:
sent = "I want castle for Christmas"
tokens = pre_process(sent)

tokens

['I', 'want', 'castle', 'Christmas']

In [36]:
# Prepare training data
emojis_dataset = emojis_dataset[["Representation", "Name"]]
emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)
emojis_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emojis_dataset["cleaned_text"] = emojis_dataset["Name"].apply(pre_process)


Unnamed: 0,Representation,Name,cleaned_text
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[Christmas, tree]"
2,🎆,fireworks,[fireworks]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecracker]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [37]:
# Convert cleaned text to a list of tokenized sentences
tokenized_sentences = emojis_dataset['cleaned_text'].tolist()
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=2, min_count=1, sg=0)  # sg=0 for CBOW

In [None]:
# Compute emoji embeddings
def get_embedding(tokens: list[str], model: Word2Vec) -> np.ndarray:
    """
    Compute the embedding for a list of tokens using the Word2Vec model.
    Args:
        tokens (list[str]): List of tokens.
        model (Word2Vec): Trained Word2Vec model.
    Returns:
        np.ndarray: Averaged embedding of the tokens.
    """
    embeddings = [model.wv[word] for word in tokens if word in model.wv] #wv is the word vector, Word2Vec model’s vocabulary (model.wv
    if embeddings:
        return np.mean(embeddings, axis=0)
    return np.zeros(model.vector_size)

# Generate emoji embeddings
emoji_embeddings = {
    row["Representation"]: get_embedding(row["cleaned_text"], model)
    for _, row in emojis_dataset.iterrows()
}

emoji_embeddings


{'🎃': array([-0.00461436,  0.00549541,  0.00315122,  0.00443139,  0.00537437,
        -0.0085962 ,  0.00955436, -0.00799534,  0.00509125,  0.00094   ,
        -0.00304289,  0.00178874,  0.00494892, -0.00634124,  0.0056113 ,
        -0.00103229, -0.00303981, -0.00282892,  0.00721801, -0.00579899,
         0.003199  ,  0.00191105,  0.00027348, -0.00887997, -0.00315538,
        -0.00892946, -0.00352   , -0.00809442,  0.00469817,  0.00011124,
         0.00150915,  0.00726693,  0.00813528,  0.00147374,  0.0012838 ,
         0.0029683 , -0.00358258,  0.00409029, -0.00938605, -0.00196821,
         0.00967712,  0.0072264 ,  0.00542205,  0.00189584, -0.00087208,
        -0.00217982,  0.00462873,  0.00965878,  0.0092265 , -0.00322027,
        -0.00996592, -0.00791142,  0.00819277, -0.00897855,  0.00839094,
         0.00680707,  0.00081678, -0.00224299, -0.0038187 ,  0.00681882,
        -0.00338691,  0.0037531 , -0.00340574, -0.00688394,  0.00300862,
        -0.00208576,  0.00679749, -0.00899957,

In [39]:
# Predict the most likely emoji for a given sentence
def predict_emoji(sentence: str, model: Word2Vec, emoji_embeddings: dict) -> tuple[str, float]:
    tokens = pre_process(sentence)
    sentence_embedding = get_embedding(tokens, model)
    
    similarities = {
        emoji: cosine_similarity([sentence_embedding], [embedding])[0][0]
        for emoji, embedding in emoji_embeddings.items()
    }
    best_match = max(similarities, key=similarities.get)
    return best_match, similarities[best_match]

In [41]:
test_sentences = [
    "I love pizza and movies!",
    "It's such a sad day.",
    "Happy birthday to you!",
    "I need a vacation by the beach."
]

for sent in test_sentences:
    emoji, similarity = predict_emoji(sent, model, emoji_embeddings)
    print(f"Input: {sent}")
    print(f"Predicted Emoji: {emoji} (Similarity: {similarity:.2f})")

Input: I love pizza and movies!
Predicted Emoji: 🇸🇲 (Similarity: 0.93)
Input: It's such a sad day.
Predicted Emoji: 🌳 (Similarity: 0.53)
Input: Happy birthday to you!
Predicted Emoji: 🎂 (Similarity: 0.93)
Input: I need a vacation by the beach.
Predicted Emoji: 🏖️ (Similarity: 0.98)
