In [None]:
# Loading Dataset
from datasets import load_dataset

# Fetch the "emoji" configuration of the TweetEval benchmark.
ds = load_dataset(path="cardiffnlp/tweet_eval", name="emoji")

# Show the class names attached to the training split's label feature.
print(ds["train"].features["label"].names)

['❤', '😍', '😂', '💕', '🔥', '😊', '😎', '✨', '💙', '😘', '📷', '🇺🇸', '☀', '💜', '😉', '💯', '😁', '🎄', '📸', '😜']


In [None]:
# Loading Facebook AI RoBERTa Base Model and Tokenizer
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Checkpoint name shared by the classifier and its tokenizer.
model_name = "roberta-base"

model = RobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=20,  # 20 emoji classes
)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [3]:
def preprocess(example):
    """Tokenize the ``"text"`` field of ``example`` with the global tokenizer.

    The tokenizer is asked to truncate and to pad to ``max_length`` with a
    maximum length of 128.

    Args:
        example: Mapping that provides a ``"text"`` entry (used with
            ``batched=True`` by the caller in this notebook).

    Returns:
        Whatever the tokenizer call returns for the given text.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **tokenizer_options)

# Tokenize every split in batches, then rename the target column from
# "label" to "labels".
encoded_dataset = (
    ds.map(preprocess, batched=True)
    .rename_column("label", "labels")
)

# Expose only the listed columns, formatted as torch tensors.
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)


In [4]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from an evaluation prediction pair.

    Args:
        eval_pred: Two-item iterable ``(logits, labels)``. The predicted class
            is the argmax of ``logits`` along axis 1.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"`` (F1 uses
        ``average="weighted"``).
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(gold, predicted)
    weighted_f1 = f1_score(gold, predicted, average="weighted")
    return {"accuracy": accuracy, "f1": weighted_f1}


In [35]:
# Setting Training Configurations
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Directory used by the trainer for its outputs.
    output_dir=r"D:/emoji-predictor",
    # Schedule length and optimisation settings.
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and save on the same cadence: once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    # Logging interval, in steps.
    logging_steps=10,
)

In [None]:
# Training the model
import os

from transformers import Trainer
from transformers.trainer_utils import get_last_checkpoint

# Wire the model, training configuration, data splits and metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    compute_metrics=compute_metrics,
)

# `resume_from_checkpoint=True` raises ValueError when the output directory
# holds no checkpoint (e.g. on the very first run, or after a clean
# Restart & Run All). Look the checkpoint up explicitly instead: resume from
# the most recent one if it exists, otherwise start training from scratch.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)

trainer.train(resume_from_checkpoint=last_checkpoint)


In [None]:
# Saving the model for future use

# Write the model and its tokenizer into the same folder so both can be
# reloaded together from that path.
model.save_pretrained(save_directory=r"D:/complete-emoji-model")
tokenizer.save_pretrained(save_directory=r"D:/complete-emoji-model")

In [3]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Restore the tokenizer and the sequence-classification model from the
# folder they were saved to.
tokenizer = AutoTokenizer.from_pretrained(
    r"D:/complete-emoji-model"
)
model = AutoModelForSequenceClassification.from_pretrained(
    r"D:/complete-emoji-model"
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Class index -> emoji character. The list is written in class-index order,
# so position i in the list becomes key i in the mapping (0 through 19).
id2emoji = dict(enumerate([
    '❤', '😍', '😂', '💕', '🔥',
    '😊', '😎', '✨', '💙', '😘',
    '📷', '🇺🇸', '☀', '💜', '😉',
    '💯', '😁', '🎄', '📸', '😜',
]))

In [7]:
# Text-classification pipeline built from the loaded model and tokenizer.
emoji_classifier = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
)


def predict_emoji(text):
    """Return the emoji character for the classifier's first result on ``text``.

    Args:
        text: Input passed straight to ``emoji_classifier``.

    Returns:
        The ``id2emoji`` entry for the predicted class index.

    Raises:
        ValueError: If the label is not an integer once ``"LABEL_"`` is removed.
        KeyError: If the parsed index is not a key of ``id2emoji``.
    """
    top_prediction = emoji_classifier(text)[0]
    # The label is expected to look like "LABEL_<n>"; drop the prefix to
    # recover the integer class index.
    class_id = int(top_prediction["label"].replace("LABEL_", ""))
    # id2emoji values are already strings, so no extra formatting is needed.
    return id2emoji[class_id]

Device set to use cpu


In [None]:
# Create User Interface
import gradio as gr

# One text box in, one text field out; predictions come from predict_emoji.
iface = gr.Interface(
    title="Emoji Predictor 🤖✨",
    description="Enter a sentence and get a predicted emoji!",
    fn=predict_emoji,
    inputs=gr.Textbox(placeholder="Type a message here...", lines=2),
    outputs="Text",
)

# Start the app.
iface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


