### Import libraries

In [22]:
import re
import os
import numpy as np
import pandas as pd
import re
import torch
from transformers import AutoTokenizer
from transformers import AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Build required functions inside a class

In [23]:
class EmojiRecommender:
    """Embed text with a pretrained sentence encoder and compare it to emoji descriptions.

    The pipeline is: tokenize (``getTokens``) -> encode (``getEmbedding``) ->
    attention-masked mean pooling (``getMeanValue``). The intermediate results
    are kept on the instance as ``self.tokens``, ``self.embeddings`` and
    ``self.mean_pooled`` and are overwritten by every call to ``getMeanTokens``.
    """

    def __init__(self):
        self.getModel()

    def getModel(self):
        """Load the pretrained tokenizer and model into ``self.tokenizer`` / ``self.model``."""
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
        self.model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

    def getMeanTokensSentence(self, sentence):
        """Clean a single sentence and return its mean-pooled embedding.

        The sentence is lowercased, every run of characters outside ``a-z`` is
        replaced by a space, and English stop words are removed before embedding.

        Args:
            sentence: Raw input text.

        Returns:
            numpy array with one row (the embedding of the cleaned sentence).
        """
        sentence = re.sub('[^a-z]+', ' ', sentence.lower())
        stop_words = set(stopwords.words('english'))
        # The text is already lowercased, so tokens can be compared directly.
        kept_words = [w for w in word_tokenize(sentence) if w not in stop_words]
        # Use this instance rather than a notebook-level global so the method
        # works for any EmojiRecommender object.
        return self.getMeanTokens([' '.join(kept_words)])

    def getMeanTokensCsv(self, start=1800, stop=2000):
        """Embed the descriptions of a slice of ``data/emoji-data.csv``.

        Stores the full table in ``self.all_emoji_df`` and the selected rows
        (re-indexed from 0) in ``self.emoji_df``.

        Args:
            start: First row position to include (default 1800).
            stop: Row position to stop before (default 2000).

        Returns:
            numpy array with one mean-pooled embedding per selected row.
        """
        self.all_emoji_df = pd.read_csv("data/emoji-data.csv")
        # Re-index so positions in the returned embeddings match emoji_df labels.
        self.emoji_df = self.all_emoji_df.iloc[start:stop].reset_index(drop=True)
        return self.getMeanTokens(self.emoji_df['description'])

    def getMeanTokens(self, sentences):
        """Tokenize, encode and mean-pool ``sentences``; return the pooled numpy array."""
        self.getTokens(sentences)
        self.getEmbedding()
        return self.getMeanValue()

    def getTokens(self, sentences):
        """Tokenize each sentence to a fixed length of 128 and stack the results.

        Populates ``self.tokens`` with stacked ``input_ids`` and
        ``attention_mask`` tensors, one row per sentence.
        """
        input_ids = []
        attention_masks = []

        for sentence in sentences:
            encoded = self.tokenizer.encode_plus(sentence, max_length=128,
                                                 truncation=True, padding='max_length',
                                                 return_tensors='pt')
            # return_tensors='pt' yields a leading batch dimension of 1; drop it.
            input_ids.append(encoded['input_ids'][0])
            attention_masks.append(encoded['attention_mask'][0])

        self.tokens = {
            'input_ids': torch.stack(input_ids),
            'attention_mask': torch.stack(attention_masks),
        }

    def getEmbedding(self):
        """Run the model on ``self.tokens`` and keep ``last_hidden_state`` in ``self.embeddings``."""
        # Inference only: the result is detached to numpy later, so skip
        # building the autograd graph.
        with torch.no_grad():
            outputs = self.model(**self.tokens)
        self.embeddings = outputs.last_hidden_state

    def getMeanValue(self):
        """Average ``self.embeddings`` over dimension 1, ignoring masked-out positions.

        Returns:
            numpy array stored in ``self.mean_pooled``.
        """
        attention_mask = self.tokens['attention_mask']
        mask = attention_mask.unsqueeze(-1).expand(self.embeddings.size()).float()
        masked_embeddings = self.embeddings * mask
        summed = torch.sum(masked_embeddings, 1)
        # Clamp guards against division by zero when a row has no unmasked tokens.
        summed_mask = torch.clamp(mask.sum(1), min=1e-9)
        self.mean_pooled = (summed / summed_mask).detach().numpy()
        return self.mean_pooled

    def getSimilarity(self, sentence_tokens, mean_tokens):
        """Return the cosine similarity of one vector against every row of ``mean_tokens``.

        The result has a single row (one query) with one score per candidate row.
        """
        return cosine_similarity([sentence_tokens], mean_tokens)

    def build_emoji_csv(self):
        """Read ``data/raw-emoji-data.csv``, clean it and write ``data/emoji-data.csv``.

        Only columns 1 and 3 are read (no header row is assumed), rows with
        missing values are dropped, and then the first remaining row is dropped.
        """
        # `squeeze` was removed from read_csv in pandas 2.0; with two selected
        # columns it never had any effect, so it is simply omitted.
        df = pd.read_csv("data/raw-emoji-data.csv", usecols=[1, 3], header=None)
        df = df.dropna()
        # NOTE(review): presumably this discards the raw file's header line — confirm.
        df = df.iloc[1:, :]
        self.df_to_csv(df)

    def df_to_csv(self, df):
        """Write columns 1 and 3 of ``df`` as ``emoji`` / ``description`` to ``data/emoji-data.csv``."""
        df = pd.DataFrame({'emoji': df[1], 'description': df[3]})
        df.to_csv("data/emoji-data.csv", encoding='utf-8', index=False)

### Initialize class object

In [24]:
# Constructing the recommender calls getModel(), which loads the pretrained
# tokenizer and model. The name `obj` is also referenced by recommend_emoji below.
obj = EmojiRecommender()

### Read raw emoji data from "raw-emoji-data.csv", clean it & then store in "emoji-data.csv"

In [11]:
# Regenerate data/emoji-data.csv from data/raw-emoji-data.csv (replaces the existing file).
obj.build_emoji_csv()

### Test: Read a 200-emoji slice (rows 1800–1999) of "emoji-data.csv", embed the descriptions & find the top five recommended emojis for the example sentence

In [19]:
# Embed the descriptions of the emojis selected by getMeanTokensCsv;
# the resulting array has one row per selected emoji.
mean_tokens = obj.getMeanTokensCsv()
print(mean_tokens.shape)

(16, 768)


In [7]:
# Embed an example sentence and score it against the emoji embeddings in `mean_tokens`.
sentence = "Will you go on a date with me?"
sentence_token = obj.getMeanTokensSentence(sentence)
# getMeanTokensSentence embeds a one-element list, so row 0 is the query vector.
similarity = obj.getSimilarity(sentence_token[0], mean_tokens)

In [9]:
# Negating the scores makes argsort list the highest similarities first; keep the best five.
indices = (-similarity[0]).argsort()[:5]
for i in indices:
    # `i` is a position in obj.emoji_df, which getMeanTokensCsv re-indexed from 0.
    print(i, obj.emoji_df['emoji'][i], obj.emoji_df['description'][i])

62 🥳 partying face
166 🤟 love you gesture
119 💋 kiss mark
189 🤳 selfie
120 💌 love letter


### Loop through "emoji-data.csv" & store the mean tokens in a .pt file

In [20]:
# Example code (left commented out so the checkpoint is not overwritten on every run).
# recommend_emoji below loads 'checkpoint/token-all.pt' and uses the resulting row
# positions to index data/emoji-data.csv, so the saved array must hold one
# embedding per row of that CSV, in the same order.
# torch.save(mean_tokens, 'checkpoint/token-all.pt')

### 1. Build a function to load mean tokens for all emojis
### 2. Find recommended emojis based on cosine similarity between mean tokens of text & emoji descriptions

In [28]:
def recommend_emoji(sentence, top_k=5):
    """Print the emojis whose descriptions are most similar to ``sentence``.

    Loads the precomputed description embeddings from
    ``checkpoint/token-all.pt``, embeds ``sentence`` with the notebook-level
    ``obj`` recommender, ranks every stored embedding by cosine similarity and
    prints the emoji and description of the best matches, looked up by row
    position in ``data/emoji-data.csv``.

    Args:
        sentence: Text to find emojis for.
        top_k: Number of recommendations to print (default 5).

    Returns:
        None. The recommendations are printed, best match first.
    """
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code — only load checkpoints produced by this notebook.
    all_tokens = torch.load('checkpoint/token-all.pt')
    sentence_token = obj.getMeanTokensSentence(sentence)
    similarity = obj.getSimilarity(sentence_token[0], all_tokens)
    # Negate so argsort orders from most to least similar.
    indices = (-similarity[0]).argsort()[:top_k]
    emoji_df = pd.read_csv("data/emoji-data.csv")
    for j in indices:
        # argsort yields row positions, so index positionally.
        print(emoji_df['emoji'].iloc[j], emoji_df['description'].iloc[j])

In [29]:
# Example query; recommend_emoji reads checkpoint/token-all.pt and data/emoji-data.csv,
# so both files must already exist.
sentence = "I am going to the movies"
recommend_emoji(sentence)

🎥 movie camera
🎦 cinema
📽 film projector
📀 dvd
🎞 film frames
