In [1]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import stopwords
import re
from nltk.metrics import edit_distance
import numpy as np
import pandas as pd
import kagglehub

nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
twitter_path = kagglehub.dataset_download("hariharasudhanas/twitter-emoji-prediction")

twitter_ds = pd.read_csv(f"{twitter_path}/Train.csv")
twitter_ds.head()

Downloading from https://www.kaggle.com/api/v1/datasets/download/hariharasudhanas/twitter-emoji-prediction?dataset_version_number=1...


100%|██████████| 3.50M/3.50M [00:01<00:00, 2.90MB/s]

Extracting files...





Unnamed: 0.1,Unnamed: 0,TEXT,Label
0,0,Vacation wasted ! #vacation2017 #photobomb #ti...,0
1,1,"Oh Wynwood, you’re so funny! : @user #Wynwood ...",1
2,2,Been friends since 7th grade. Look at us now w...,2
3,3,This is what it looks like when someone loves ...,3
4,4,RT @user this white family was invited to a Bl...,3


In [4]:
emojis_dataset = pd.read_csv("./emojis.csv")
emojis_dataset

Unnamed: 0,Group,Subgroup,CodePoint,Status,Representation,Name,Section
0,Activities,event,1F383,fully-qualified,🎃,jack-o-lantern,E0.6
1,Activities,event,1F384,fully-qualified,🎄,Christmas tree,E0.6
2,Activities,event,1F386,fully-qualified,🎆,fireworks,E0.6
3,Activities,event,1F387,fully-qualified,🎇,sparkler,E0.6
4,Activities,event,1F9E8,fully-qualified,🧨,firecracker,E11.0
...,...,...,...,...,...,...,...
4585,Travel-Places,sky-weather,2604 FE0F,fully-qualified,☄️,comet,E1.0
4586,Travel-Places,sky-weather,2604,unqualified,☄,comet,E1.0
4587,Travel-Places,sky-weather,1F525,fully-qualified,🔥,fire,E0.6
4588,Travel-Places,sky-weather,1F4A7,fully-qualified,💧,droplet,E0.6


In [5]:
def pre_process(sentence: str) -> list[str]:
  ps = PorterStemmer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  stemmed_sent = [ps.stem(t) for t in tokens]
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

In [6]:
sent = "I want castle for Christmas"
tokens = pre_process(sent)

tokens

['want', 'castl', 'christma']

In [7]:
emojis_dataset = emojis_dataset[["Representation", "Name"]]
emojis_dataset["Stemmed"] = [pre_process(emoji_sent) for emoji_sent in emojis_dataset.Name]
emojis_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emojis_dataset["Stemmed"] = [pre_process(emoji_sent) for emoji_sent in emojis_dataset.Name]


Unnamed: 0,Representation,Name,Stemmed
0,🎃,jack-o-lantern,[jackolantern]
1,🎄,Christmas tree,"[christma, tree]"
2,🎆,fireworks,[firework]
3,🎇,sparkler,[sparkler]
4,🧨,firecracker,[firecrack]
...,...,...,...
4585,☄️,comet,[comet]
4586,☄,comet,[comet]
4587,🔥,fire,[fire]
4588,💧,droplet,[droplet]


In [8]:
def emojify(sent: str) -> str:
  tokens = pre_process(sent)

  result = ""

  for token in tokens:
    matches = []

    for _, emoji in emojis_dataset.iterrows():
      if token in emoji.Stemmed:
        matches.append((emoji.Representation, emoji.Stemmed))

    if len(matches) > 0:
      result += matches[random.randint(0, len(matches) - 1)][0]
    else:
      result += token
    result += " "

  return result

In [133]:
def emojify_stop_words(sent: str) -> str:
  tokens = word_tokenize(sent)

  result = ""

  for token in tokens:
    processed = pre_process(token)

    if len(processed) == 0: # It is stop-word
      result += token + " "
      continue

    processed = processed[0]

    matches = []

    for _, emoji in emojis_dataset.iterrows():
      #for emoji_token in emoji.Stemmed:
      #  dist = edit_distance(processed, emoji_token)
      #  if dist < 2:
      #    matches.append((emoji.Representation, emoji.Stemmed))
      if processed in emoji.Stemmed:
        matches.append((emoji.Representation, emoji.Stemmed))

    if len(matches) > 0:
      # result += matches[random.randint(0, len(matches) - 1)][0]
      # result += choose_random_weighted(matches)[0]
      result += choose_shortest_len(matches)[0]
    else:
      result += token
    result += " "

  return result

In [134]:
def softmax(x):
  return np.exp(x) / sum(np.exp(x))

def choose_random_weighted(matches: list[tuple[str, list[str]]]) -> str:
  probs = softmax([1 / len(m[1]) for m in matches])
  idx = np.random.choice(range(len(matches)), 1, p=probs)
  return matches[idx[0]]

def choose_shortest_len(matches: list[tuple[str, list[str]]]) -> str:
  return sorted(matches, key=lambda x: len(x[1]))[0]

# TODO: Implement "choose_closest_embedding"

In [11]:
print(emojify("I like sandwich"))
print(emojify("I want castle for Christmas"))

like 🇬🇸 
want 🏰 🎄 


In [12]:
print(emojify_stop_words("I like sandwich"))
print(emojify_stop_words("I want castle for Christmas"))

I like 🥪 
I want 🏰 for 🎄 


In [13]:
for _, tweet in twitter_ds.sample(10).iterrows():
  print('Original:', tweet.TEXT)
  #print('Emojified:', emojify(tweet.TEXT))
  print('Emojified:', emojify_stop_words(tweet.TEXT))
  print('---------------------------')

Original: REPOST!!! One of my favorite pumps that I got on camera @ LA Fitness - RIVERSIDE - MAGNOLIA AVE

Emojified: REPOST ! ! ! 🔞 of my favorite ⛽ that I got on 📷 @ LA Fitness - RIVERSIDE - MAGNOLIA AVE 
---------------------------
Original: My whole heart Thanks to everyone that donated we appreciate you! #bbfw16 #bbfw #bestbuddy…

Emojified: My whole ♥️ Thanks to everyone that donated we appreciate you ! # bbfw16 # bbfw # bestbuddy… 
---------------------------
Original: Full day at Gulf Shores yesterday with hubby ️ @ The Hangout Beach, Music and Arts Festival

Emojified: 🌕 day at Gulf Shores yesterday with hubby ️ @ The Hangout 🏖️ , 🎼 and 🎭 Festival 
---------------------------
Original: As if we didn't have enough group pics #verizonlounge seahawks #tayslastfling…

Emojified: As if we did n't have enough group pics # verizonlounge seahawks # tayslastfling… 
---------------------------
Original: #PleasureIsDope #ConsentIsDope ️ @ Sexual &amp; Health Wellness Studio

Emojified: #

In [14]:
tweet = "Flexin' in a bikini on national television- Things I never would have imagined for 500, Alex -- Did…"
print('Emojified:', emojify(tweet))
print('Emojified with stop words:', emojify_stop_words(tweet))

Emojified: flexin 👙 🇺🇳 📺 thing never would imagin 500 alex 
Emojified with stop words: Flexin ' in a 👙 on 🏞️ 📺 Things I never would have imagined for 500 , Alex -- Did… 


In [15]:
tweet = "Confessions of a candy maker: I love ️ #toffee! #buttery crisphandmade toffee dipped in milk…"
print('Emojified:', emojify_stop_words(tweet))

Emojified: Confessions of a 🍬 maker : I 💌 ️ # toffee ! # buttery crisphandmade toffee dipped in 🥛 


In [16]:
def pre_process_lemmatizer(sentence: str) -> list[str]:
  ps = WordNetLemmatizer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  print('Clean', clean_sent)
  tokens = word_tokenize(clean_sent)
  print('Tokenize', tokens)
  stemmed_sent = [ps.lemmatize(t) for t in tokens]
  print(stemmed_sent)
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

# We need embeddings, because "buttery" is also a noun: https://dictionary.cambridge.org/dictionary/english/buttery
print(pre_process_lemmatizer("Confessions of a candy I love ️buttery"))

Clean Confessions of a candy I love buttery
Tokenize ['Confessions', 'of', 'a', 'candy', 'I', 'love', 'buttery']
['Confessions', 'of', 'a', 'candy', 'I', 'love', 'buttery']
['Confessions', 'candy', 'I', 'love', 'buttery']


In [17]:
def pre_process_snowball(sentence: str) -> list[str]:
  ps = LancasterStemmer()
  clean_sent = re.sub(r'[^\w\s]', '', sentence)
  tokens = word_tokenize(clean_sent)
  print('Tokenize', tokens)
  stemmed_sent = [ps.stem(t) for t in tokens]
  print(stemmed_sent)
  tokens = [t for t in stemmed_sent if t not in stop_words]
  return tokens

print(pre_process_snowball("Confessions of a candy maker: I love ️ #toffee! #buttery crisphandmade toffee dipped in milk"))

Tokenize ['Confessions', 'of', 'a', 'candy', 'maker', 'I', 'love', 'toffee', 'buttery', 'crisphandmade', 'toffee', 'dipped', 'in', 'milk']
['confess', 'of', 'a', 'candy', 'mak', 'i', 'lov', 'toff', 'buttery', 'crisphandmad', 'toff', 'dip', 'in', 'milk']
['confess', 'candy', 'mak', 'lov', 'toff', 'buttery', 'crisphandmad', 'toff', 'dip', 'milk']


In [18]:
print(emojify_stop_words("star boy"))
print(emojify_stop_words("i love you "))
print(emojify_stop_words("the pizza is great"))
print(emojify_stop_words("chicken lays eggs "))
print(emojify_stop_words("i have scored hundred in maths "))
print(emojify_stop_words("She is the queen of hearts "))
print(emojify_stop_words("messi is the king of soccer "))
print(emojify_stop_words("lets build a rocket "))

⭐ 👦 
i 💌 you 
the 🍕 is great 
🐔 lays 🥚 
i have 🎼 💯 in maths 
She is the queen of ♥️ 
messi is the king of ⚽ 
lets 🏛️ a 🚀 


# How to evaluate performance on this

## Part 1: Is the meaning preserved?

Introduce an inverse function from "emojified text"
to normal text, by replacing emojis with their text meaning:

- Input sentence: "Chicken lays eggs"
- Pre-processed input: "Chicken lay egg"
- Output sentence: "🍕 lays 🥚"
- Inverse of output: "pizza lay egg"

Then compare "Pre-processed" and "Inverse" somehow.

## Part 2: Give positive score for number of used emojis

**Idea:** The more emojis are used, the bigger the score.

### Part 3: Combine 1 and 2 into a score

### Part 4: Reinforcement learning...

In [111]:
emoji_to_names = {}
for _, emoji in emojis_dataset.iterrows():
  emoji_to_names[emoji.Representation] = word_tokenize(emoji.Name)

In [20]:
def emojify_inverse(sent: str) -> str:
  tokens = word_tokenize(sent)

  results = [""]

  def add_to_all(l, token):
    for i in range(len(l)):
      l[i] += token + " "

  for token in tokens:
    if token not in emoji_to_names:
      add_to_all(results, token)
      continue

    new_results = []

    for emoji_name in emoji_to_names[token]:
      results_copy = results.copy()
      add_to_all(results_copy, emoji_name)
      new_results += results_copy

    results = new_results

  for i in range(len(results)):
    results[i] = results[i].strip()

  return results

In [21]:
def print_emoji_and_inverse(sent: str):
  emojified = emojify_stop_words(sent)
  print("Sentence: ", sent)
  print("Emojified: ", emojified)
  print("Inverse: ", emojify_inverse(emojified))
  print("-------------------------")

print_emoji_and_inverse("star boy")
print_emoji_and_inverse("i love you")
print_emoji_and_inverse("the pizza is great")
print_emoji_and_inverse("chicken lays eggs ")
print_emoji_and_inverse("i have scored hundred in maths ")
print_emoji_and_inverse("She is the queen of hearts ")
print_emoji_and_inverse("messi is the king of soccer ")
print_emoji_and_inverse("lets build a rocket ")

Sentence:  star boy
Emojified:  ⭐ 👦 
Inverse:  ['star boy']
-------------------------
Sentence:  i love you
Emojified:  i 💌 you 
Inverse:  ['i love you', 'i letter you']
-------------------------
Sentence:  the pizza is great
Emojified:  the 🍕 is great 
Inverse:  ['the pizza is great']
-------------------------
Sentence:  chicken lays eggs 
Emojified:  🐔 lays 🥚 
Inverse:  ['chicken lays egg']
-------------------------
Sentence:  i have scored hundred in maths 
Emojified:  i have 🎼 💯 in maths 
Inverse:  ['i have musical hundred in maths', 'i have score hundred in maths', 'i have musical points in maths', 'i have score points in maths']
-------------------------
Sentence:  She is the queen of hearts 
Emojified:  She is the queen of ♥️ 
Inverse:  ['She is the queen of heart', 'She is the queen of suit']
-------------------------
Sentence:  messi is the king of soccer 
Emojified:  messi is the king of ⚽ 
Inverse:  ['messi is the king of soccer', 'messi is the king of ball']
---------------

In [22]:
def similarity(sent, inverse):
  pre_processed = pre_process(sent)
  inverse_pre_processed = pre_process(inverse)

  assert len(pre_processed) == len(inverse_pre_processed)

  same_count = 0
  for i in range(len(pre_processed)):
    if pre_processed[i] == inverse_pre_processed[i]:
      same_count += 1

  return same_count / len(pre_processed)

print(similarity("She is the queen of hearts", "She is the queen of heart"))
print(similarity("i have musical hundred in maths", "i have scored hundred in maths"))

1.0
0.6666666666666666


In [23]:
def emojify_similarity(sent, emojify_func):
  emojified = emojify_func(sent)
  inverses = emojify_inverse(emojified)

  return max([similarity(sent, inverse) for inverse in inverses])

print(emojify_similarity("She is the queen of hearts", emojify_stop_words))
print(emojify_similarity("i have scored hundred in maths", emojify_stop_words))

1.0
1.0


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [emoji.Name for _, emoji in emojis_dataset.iterrows()]
print(len(corpus))

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print("Features: ", vectorizer.get_feature_names_out())
print("Features length: ", len(vectorizer.get_feature_names_out()))

4590
Features:  ['10' '1st' '2nd' ... 'zipper' 'zombie' 'zzz']
Features length:  1648


In [25]:
emoji = "🎼"

idx = -1
for i, e in emojis_dataset.iterrows():
  if e.Representation == emoji:
    idx = i
    break

#print(X.shape)
print(X[idx])
print(emojis_dataset.iloc[idx].Name, emojis_dataset.iloc[idx].Representation)

  (0, 955)	0.6669640401779303
  (0, 1259)	0.7450899067290686
musical score 🎼


In [139]:
from collections.abc import Callable

def evaluate_model(
  test_data: list[str],
  model: Callable[[str], str],
  word_similarity: Callable[[str, str], float]) -> float:

  score = 0
  for sent in test_data:
    score += evaluate_single_sentence(sent, model, word_similarity)
  return score / len(test_data)

def evaluate_single_sentence(
  sent: str,
  model: Callable[[str], str],
  word_similarity: Callable[[str, str], float]) -> float:

  emojified = model(sent)
  #print("Emojified is:", emojified)

  emoji_count = 0

  sent_tokens = word_tokenize(sent)
  emojified_tokens = word_tokenize(emojified)

  for i in range(len(sent_tokens)):
    if sent_tokens[i] != emojified_tokens[i]:
      emoji_count += 1

  #print("emoji count:", emoji_count)

  total_similarity = 0
  for i in range(len(sent_tokens)):
    #print("similarity between", sent_tokens[i], "and", emojified_tokens[i], "is", word_similarity(sent_tokens[i], emojified_tokens[i]))
    total_similarity += word_similarity(sent_tokens[i], emojified_tokens[i])

  #print("total sim", total_similarity)

  return np.sqrt(emoji_count / len(sent_tokens)) * (total_similarity / len(sent_tokens))


In [30]:
!python -m spacy download en_core_web_md

import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("This is a sentence.")

print([(w.text, w.pos_) for w in doc])

Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.




[('This', 'PRON'), ('is', 'AUX'), ('a', 'DET'), ('sentence', 'NOUN'), ('.', 'PUNCT')]


In [119]:
def embedding_similarity(word: str, emoji: str) -> float:
  doca = nlp(word)

  emoji_names = emoji_to_names.get(emoji, [])
  if len(emoji_names) == 0:
    # just a word
    docb = nlp(emoji)
    return doca.similarity(docb)

  docb = nlp(" ".join(emoji_names))

  return doca.similarity(docb)

In [149]:
chicken_emoji = None
for k in emoji_to_names.keys():
  if k == "🐔":
    chicken_emoji = k
    break

print(evaluate_single_sentence("heart", lambda x: "❤️", embedding_similarity))
print(evaluate_single_sentence("chicken", lambda x: "❤️", embedding_similarity))
print(evaluate_single_sentence("chicken", lambda x: chicken_emoji, embedding_similarity))
print(evaluate_single_sentence("hearts", lambda x: "💞", embedding_similarity))
print(evaluate_single_sentence("hearts", lambda x: "❤️", embedding_similarity))

0.6321227433757768
0.266291506621378
1.0
0.8505853670606688
0.5513423851021232


In [87]:
nltk.download('brown')


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


57340


TypeError: expected string or bytes-like object, got 'list'

In [141]:
from nltk.corpus import brown

dataset_size = 10
sentences = [' '.join(sent) for sent in brown.sents()[:dataset_size]]

print("Score for non-neural-network emojifier:", evaluate_model(sentences, emojify_stop_words, embedding_similarity))
print("Score for identity:", evaluate_model(sentences, lambda x: x, embedding_similarity))

Score for non-neural-network emojifier: 0.22504997100029883
Score for identity: 0.0
