In [132]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
import re
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [58]:
twitter_path = kagglehub.dataset_download("hariharasudhanas/twitter-emoji-prediction")

twitter_ds = pd.read_csv(f"{twitter_path}/Train.csv")
twitter_ds.head()

Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,2,Been friends since 7th grade. Look at us now w...,2
3,3,This is what it looks like when someone loves ...,3
4,4,RT @user this white family was invited to a Bl...,3


In [4]:
emojis_dataset = pd.read_csv("./emojis.csv")
emojis_dataset

Unnamed: 0,Group,Subgroup,CodePoint,Status,Representation,Name,Section
0,Activities,event,1F383,fully-qualified,🎃,jack-o-lantern,E0.6
1,Activities,event,1F384,fully-qualified,🎄,Christmas tree,E0.6
2,Activities,event,1F386,fully-qualified,🎆,fireworks,E0.6
3,Activities,event,1F387,fully-qualified,🎇,sparkler,E0.6
4,Activities,event,1F9E8,fully-qualified,🧨,firecracker,E11.0
...,...,...,...,...,...,...,...
4585,Travel-Places,sky-weather,2604 FE0F,fully-qualified,☄️,comet,E1.0
4586,Travel-Places,sky-weather,2604,unqualified,☄,comet,E1.0
4587,Travel-Places,sky-weather,1F525,fully-qualified,🔥,fire,E0.6
4588,Travel-Places,sky-weather,1F4A7,fully-qualified,💧,droplet,E0.6


In [145]:
def pre_process(sentence: str) -> list[str]:
  ps = PorterStemmer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  stemmed_sent = [ps.stem(t) for t in tokens]
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

In [8]:
sent = "I want castle for Christmas"
tokens = pre_process(sent)

tokens

['want', 'castl', 'christma']

In [10]:
emojis_dataset = emojis_dataset[["Representation", "Name"]]
emojis_dataset["Stemmed"] = [pre_process(emoji_sent) for emoji_sent in emojis_dataset.Name]
emojis_dataset

Unnamed: 0,Representation,Name,Stemmed
0,🎃,jack-o-lantern,[jack-o-lantern]
1,🎄,Christmas tree,"[christma, tree]"
2,🎆,fireworks,[firework]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecrack]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [32]:
def emojify(sent: str) -> str:
  tokens = pre_process(sent)

  result = ""

  for token in tokens:
    matches = []

    for _, emoji in emojis_dataset.iterrows():
      if token in emoji.Stemmed:
        matches.append((emoji.Representation, emoji.Stemmed))

    if len(matches) > 0:
      result += matches[random.randint(0, len(matches) - 1)][0]
    else:
      result += token
    result += " "

  return result

In [148]:
def emojify_stop_words(sent: str) -> str:
  tokens = word_tokenize(sent)

  result = ""

  for token in tokens:
    processed = pre_process(token)

    if len(processed) == 0: # It is stop-word
      result += token + " "
      continue

    processed = processed[0]

    matches = []

    for _, emoji in emojis_dataset.iterrows():
      #for emoji_token in emoji.Stemmed:
      #  dist = edit_distance(processed, emoji_token)
      #  if dist < 2:
      #    matches.append((emoji.Representation, emoji.Stemmed))
      if processed in emoji.Stemmed:
        matches.append((emoji.Representation, emoji.Stemmed))

    if len(matches) > 0:
      # result += matches[random.randint(0, len(matches) - 1)][0]
      # result += choose_random_weighted(matches)[0]
      result += choose_shortest_len(matches)[0]
    else:
      result += token
    result += " "

  return result

In [151]:
def softmax(x):
  return np.exp(x) / sum(np.exp(x))

def choose_random_weighted(matches: list[tuple[str, list[str]]]) -> str:
  probs = softmax([1 / len(m[1]) for m in matches])
  idx = np.random.choice(range(len(matches)), 1, p=probs)
  return matches[idx[0]]

def choose_shortest_len(matches: list[tuple[str, list[str]]]) -> str:
  return sorted(matches, key=lambda x: len(x[1]))[0]

In [35]:
print(emojify("I like sandwich"))
print(emojify("I want castle for Christmas"))

like 🥪 
want 🏯 🇨🇽 


In [40]:
print(emojify_stop_words("I like sandwich"))
print(emojify_stop_words("I want castle for Christmas"))

I like 🥪 
I want 🏰 for 🇨🇽 


In [153]:
for _, tweet in twitter_ds.sample(10).iterrows():
  print('Original:', tweet.TEXT)
  #print('Emojified:', emojify(tweet.TEXT))
  print('Emojified:', emojify_stop_words(tweet.TEXT))
  print('---------------------------')

Original: This is how we spend our night...surrounded by the written word. #booklovers #bestfriend #loveher…

Emojified: This is how we spend our 🌃 ... surrounded by the written word . # booklovers # bestfriend # loveher… 
---------------------------
Original: Rocking around the #rockefellercenter #christmastree #makingspiritsbright #seagrill…

Emojified: 🪨 around the # rockefellercenter # christmastree # makingspiritsbright # seagrill… 
---------------------------
Original: weekend full of nothing but family, congratulations to Matt &amp; Jeanette …

Emojified: weekend 🌕 of nothing but 👪 , ㊗️ to Matt & amp ; Jeanette … 
---------------------------
Original: HAPPY LATE NIGHT EVERYONE! We got some Are you ready for our August 2017 "Sexy Summer Heat"…

Emojified: HAPPY LATE 🌃 EVERYONE ! We got some Are you ready for our August 2017 `` Sexy Summer Heat '' … 
---------------------------
Original: Come chill in my spot #music #dance #performance #performing #radflava #rideout #alternative…


In [92]:
tweet = "Flexin' in a bikini on national television- Things I never would have imagined for 500, Alex -- Did…"
print('Emojified:', emojify(tweet))
print('Emojified with stop words:', emojify_stop_words(tweet))

Emojified: flexin 👙 🏞️ 📺 thing never would imagin 500 alex 
Emojified with stop words: Flexin ' in a 👙 on 🏞 📺 Things I never would have imagined for 500 , Alex -- Did… 


In [118]:
tweet = "Confessions of a candy maker: I love ️ #toffee! #buttery crisphandmade toffee dipped in milk…"
print('Emojified:', emojify_stop_words(tweet))

Emojified: Confessions of a 🍬 maker : I 🧤 ️ # toffee ! # 🔋 crisphandmade toffee 💁🏽 in 🌌 


In [142]:
def pre_process_lemmatizer(sentence: str) -> list[str]:
  ps = WordNetLemmatizer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  print('Clean', clean_sent)
  tokens = word_tokenize(clean_sent)
  print('Tokenize', tokens)
  stemmed_sent = [ps.lemmatize(t) for t in tokens]
  print(stemmed_sent)
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

# We need embeddings, because "buttery" is also a noun: https://dictionary.cambridge.org/dictionary/english/buttery
print(pre_process_lemmatizer("Confessions of a candy I love ️buttery"))

Clean Confessions of a candy I love buttery
Tokenize ['Confessions', 'of', 'a', 'candy', 'I', 'love', 'buttery']
['Confessions', 'of', 'a', 'candy', 'I', 'love', 'buttery']
['Confessions', 'candy', 'I', 'love', 'buttery']


In [137]:
def pre_process_snowball(sentence: str) -> list[str]:
  ps = LancasterStemmer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  print('Tokenize', tokens)
  stemmed_sent = [ps.stem(t) for t in tokens]
  print(stemmed_sent)
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

print(pre_process_snowball("Confessions of a candy maker: I love ️ #toffee! #buttery crisphandmade toffee dipped in milk"))

Tokenize ['Confessions', 'of', 'a', 'candy', 'maker', 'I', 'love', 'toffee', 'buttery', 'crisphandmade', 'toffee', 'dipped', 'in', 'milk']
['confess', 'of', 'a', 'candy', 'mak', 'i', 'lov', 'toff', 'buttery', 'crisphandmad', 'toff', 'dip', 'in', 'milk']
['confess', 'candy', 'mak', 'lov', 'toff', 'buttery', 'crisphandmad', 'toff', 'dip', 'milk']


In [154]:
print(emojify_stop_words("star boy"))
print(emojify_stop_words("i love you "))
print(emojify_stop_words("the pizza is great"))
print(emojify_stop_words("chicken lays eggs "))
print(emojify_stop_words("i have scored hundred in maths "))
print(emojify_stop_words("She is the queen of hearts "))
print(emojify_stop_words("messi is the king of soccer "))
print(emojify_stop_words("lets build a rocket "))

⭐ 👦 
i 💌 you 
the 🍕 is great 
🐔 lays 🥚 
i have 🎼 💯 in maths 
She is the queen of ♥️ 
messi is the king of ⚽ 
lets 🏛️ a 🚀 
