In [2]:
import pandas as pd
import numpy as np
RANDOM_SEED = 577
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW


# Apply RANDOM_SEED before any stochastic work (classifier-head initialisation,
# DataLoader shuffling, dropout) so that a Restart & Run All is repeatable.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# Prefer the GPU when one is visible; every later cell moves tensors to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [3]:
def preprocess_data(data, label_encoder, fit=None):
    """Extract texts and integer-encoded labels from a data frame.

    Args:
        data: frame-like object with a 'text' column and an 'emoji_id' column.
        label_encoder: encoder exposing ``fit_transform`` / ``transform`` and,
            once fitted, a ``classes_`` attribute (scikit-learn convention).
        fit: ``True`` to (re)fit the encoder on ``data``; ``False`` to reuse the
            existing fit; ``None`` (default) to fit only when the encoder has
            not been fitted yet.

    Returns:
        Tuple ``(texts, labels, label_encoder)`` where ``texts`` is a NumPy
        array built from the 'text' column, ``labels`` is the encoded
        'emoji_id' column and ``label_encoder`` is the same object passed in.
    """
    texts = data['text'].tolist()

    if fit is None:
        # Fitted encoders carry `classes_`; refitting on every split would give
        # each split its own class -> index mapping whenever a class is absent
        # from it, so only the first call is allowed to fit by default.
        fit = not hasattr(label_encoder, 'classes_')

    if fit:
        labels = label_encoder.fit_transform(data['emoji_id'])
    else:
        labels = label_encoder.transform(data['emoji_id'])

    return np.array(texts), labels, label_encoder

In [4]:
# Tokenizer and classifier are loaded from the same checkpoint; the
# classification head is created with 100 output labels.
BERTWEET_CHECKPOINT = "vinai/bertweet-base"

tokenizer = AutoTokenizer.from_pretrained(BERTWEET_CHECKPOINT)
bertweet = AutoModelForSequenceClassification.from_pretrained(
    BERTWEET_CHECKPOINT,
    num_labels=100,
)

class BERTweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: indexable collection of input strings.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: callable accepting ``(text, return_tensors, max_length,
            padding, truncation)`` and returning a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=25):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for example ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence axis when
        # max_length == 1 and hand the collate function 0-d tensors.
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initiali

In [5]:
# Start from a fresh encoder each time this cell runs.
label_encoder = LabelEncoder()

# Read the three CSV splits and drop rows containing missing values.
# The un-augmented training file is kept here for reference:
# train_data = pd.read_csv("dataset/train_data.csv")
train_data, validate_data, test_data = (
    pd.read_csv(csv_path).dropna()
    for csv_path in (
        "dataset/train_data_augmented.csv",
        "dataset/validate_data.csv",
        "dataset/test_data.csv",
    )
)

# Texts and encoded labels per split (processed in the order train, test, validate).
train_texts, train_labels, label_encoder = preprocess_data(train_data, label_encoder)
test_texts, test_labels, _ = preprocess_data(test_data, label_encoder)
validate_texts, validate_labels, _ = preprocess_data(validate_data, label_encoder)

# Wrap each split in a dataset that tokenizes on access.
train_dataset, test_dataset, validate_dataset = (
    BERTweetDataset(split_texts, split_labels, tokenizer)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
        (validate_texts, validate_labels),
    )
)

# Only the training loader shuffles; evaluation loaders keep file order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=128)
test_dataloader, validate_dataloader = (
    DataLoader(eval_split, shuffle=False, batch_size=128)
    for eval_split in (test_dataset, validate_dataset)
)

In [6]:
# Sanity-check split sizes (examples) and loader sizes (batches).
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")
# Report the validation split itself; this line previously printed len(test_dataset).
print(f"Validate dataset size: {len(validate_dataset)}")
print(f"Train DataLoader size: {len(train_dataloader)}")
print(f"Test DataLoader size: {len(test_dataloader)}")
print(f"Validate DataLoader size: {len(validate_dataloader)}")

Train dataset size: 33079
Test dataset size: 3041
Validate dataset size: 3041
Train DataLoader size: 259
Test DataLoader size: 24
Validate DataLoader size: 24


In [7]:
# Fine-tune the classifier: one optimisation pass over the training loader per
# epoch, followed by a gradient-free pass over the validation loader.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
bertweet.to(device)

num_epochs = 10
optimizer = AdamW(bertweet.parameters(), lr=1e-5)

for epoch in tqdm(range(num_epochs)):
    # ---- training pass ----
    bertweet.train()
    train_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = bertweet(input_ids, attention_mask=attention_mask, labels=labels).loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ---- validation pass ----
    bertweet.eval()
    val_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in validate_dataloader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = bertweet(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()

            # Highest-scoring class per row.
            predicted = outputs.logits.argmax(dim=1)
            correct_predictions += (predicted == labels).sum().item()

    # Accuracy is per example; the reported losses are means over batches.
    val_accuracy = correct_predictions / len(validate_dataset)

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss / len(train_dataloader):.4f}, Validation Loss: {val_loss / len(validate_dataloader):.4f}, Validation Accuracy: {val_accuracy:.4f}")



 10%|█         | 1/10 [00:59<08:56, 59.67s/it]

Epoch 1/10, Train Loss: 4.2334, Validation Loss: 3.6180, Validation Accuracy: 0.2493


 20%|██        | 2/10 [01:56<07:44, 58.03s/it]

Epoch 2/10, Train Loss: 3.5419, Validation Loss: 3.0803, Validation Accuracy: 0.3160


 30%|███       | 3/10 [02:53<06:43, 57.69s/it]

Epoch 3/10, Train Loss: 3.0291, Validation Loss: 2.7540, Validation Accuracy: 0.3591


 40%|████      | 4/10 [03:51<05:45, 57.56s/it]

Epoch 4/10, Train Loss: 2.6297, Validation Loss: 2.5438, Validation Accuracy: 0.4008


 50%|█████     | 5/10 [04:48<04:47, 57.52s/it]

Epoch 5/10, Train Loss: 2.2885, Validation Loss: 2.4176, Validation Accuracy: 0.4143


 60%|██████    | 6/10 [05:46<03:49, 57.50s/it]

Epoch 6/10, Train Loss: 1.9962, Validation Loss: 2.3061, Validation Accuracy: 0.4267


 70%|███████   | 7/10 [06:43<02:52, 57.50s/it]

Epoch 7/10, Train Loss: 1.7351, Validation Loss: 2.2605, Validation Accuracy: 0.4382


 80%|████████  | 8/10 [07:41<01:54, 57.47s/it]

Epoch 8/10, Train Loss: 1.5275, Validation Loss: 2.2202, Validation Accuracy: 0.4415


 90%|█████████ | 9/10 [08:38<00:57, 57.52s/it]

Epoch 9/10, Train Loss: 1.3407, Validation Loss: 2.2238, Validation Accuracy: 0.4442


100%|██████████| 10/10 [09:36<00:00, 57.61s/it]

Epoch 10/10, Train Loss: 1.1883, Validation Loss: 2.1767, Validation Accuracy: 0.4514





In [8]:
# Top-1 accuracy on the test split, computed without gradients.
bertweet.eval()
correct_predictions = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        logits = bertweet(input_ids, attention_mask=attention_mask).logits
        predicted = logits.argmax(dim=1)
        correct_predictions += (predicted == labels).sum().item()

test_accuracy = correct_predictions / len(test_dataset)
print(f"Test Accuracy: {test_accuracy:.4f}")

Test Accuracy: 0.4594


In [9]:
from sklearn.metrics import classification_report, f1_score

# Collect gold and predicted class indices over the whole test split.
bertweet.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for input_ids, attention_mask, labels in test_dataloader:
        # Only the model inputs need to be on the model's device; the labels
        # are consumed by scikit-learn on the host.
        input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
        outputs = bertweet(input_ids, attention_mask=attention_mask)
        predicted = outputs.logits.argmax(dim=1)

        true_labels.extend(labels.cpu().numpy())
        predicted_labels.extend(predicted.cpu().numpy())

# Compute and print the macro F1 and the per-class report.
# zero_division=0 scores classes that were never predicted as 0 explicitly,
# instead of emitting one undefined-metric warning per call.
f1_score_report = classification_report(true_labels, predicted_labels, zero_division=0)
print(f1_score(true_labels, predicted_labels, average="macro", zero_division=0))
print("F1 Score Report:")
print(f1_score_report)

0.3162586629243133
F1 Score Report:
              precision    recall  f1-score   support

           0       0.35      0.41      0.38        22
           1       0.59      0.67      0.63       380
           2       0.50      0.11      0.19        35
           3       0.36      0.44      0.39        32
           4       0.44      0.73      0.55        22
           5       0.65      0.72      0.68        43
           6       0.22      0.40      0.29         5
           7       0.44      0.59      0.50       182
           8       0.00      0.00      0.00        37
           9       0.35      0.73      0.47        75
          10       0.57      0.67      0.61        82
          11       0.50      0.20      0.29        10
          12       0.39      0.48      0.43        82
          13       0.00      0.00      0.00        11
          14       0.00      0.00      0.00        12
          15       0.69      0.62      0.65       117
          16       0.41      0.76      0.54  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
def test_tweet(tweet, bertweet_model, tokenizer):
    # Step 1: Tokenize and prepare the input tweet for the BERTweet model
    inputs = tokenizer(tweet, return_tensors="pt", max_length=128, padding="max_length", truncation=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Step 2: Feed the input to the BERTweet model and obtain the predictions
    with torch.no_grad():
        outputs = bertweet_model(input_ids, attention_mask=attention_mask)
        print(nn.functional.softmax(outputs.logits))
        _, predicted_idx = torch.max(outputs.logits, 1)

    predicted_idx = predicted_idx.item()
    
    return predicted_idx

# Try the fine-tuned model on a single hand-written example.
tweet = "Good night, my friend."
label = test_tweet(tweet=tweet, bertweet_model=bertweet, tokenizer=tokenizer)
print(f"Predicted label: {label}")


tensor([[3.6590e-04, 5.9816e-04, 7.2852e-04, 2.4061e-04, 2.4354e-03, 5.3358e-03,
         2.1377e-03, 1.3610e-03, 1.3374e-03, 3.2126e-04, 2.3399e-02, 7.1908e-04,
         1.5424e-03, 2.3722e-03, 1.6425e-03, 1.4648e-03, 2.5034e-03, 6.2619e-04,
         6.6121e-04, 1.9675e-03, 1.0300e-03, 1.6258e-03, 5.8667e-04, 2.1243e-03,
         2.4942e-03, 1.6727e-03, 1.0062e-02, 1.1106e-03, 7.1480e-04, 6.3631e-04,
         8.3600e-04, 1.4328e-04, 2.0256e-03, 8.8725e-04, 3.8393e-04, 2.0320e-04,
         6.6939e-04, 6.2245e-04, 6.0409e-03, 1.1874e-03, 8.7890e-04, 4.7415e-03,
         2.8010e-03, 3.1406e-03, 3.7967e-04, 2.2139e-03, 3.8184e-04, 1.1477e-01,
         5.2864e-04, 1.5080e-02, 4.4223e-04, 4.3471e-04, 1.9818e-03, 7.9152e-04,
         9.0659e-04, 1.2345e-03, 1.4133e-03, 9.1259e-04, 3.2011e-04, 6.5046e-01,
         4.1065e-03, 1.5103e-03, 9.8508e-03, 4.3015e-04, 9.9339e-04, 1.5640e-03,
         7.6500e-04, 8.3128e-03, 7.4763e-03, 8.0556e-04, 9.7631e-04, 2.0100e-03,
         5.1194e-04, 2.7488e

  print(nn.functional.softmax(outputs.logits))


In [21]:
# Map every emoji string in the training data to the emoji_id found on the
# first row where that emoji appears; later rows never overwrite an entry.
emoji_map = {}
for emoji_text, emoji_id in zip(train_data["emoji"], train_data["emoji_id"]):
    emoji_map.setdefault(emoji_text, emoji_id)

In [22]:
# Invert emoji_map so an emoji_id can be turned back into its emoji string.
reversed_emoji_map = {value: key for key, value in emoji_map.items()}

# NOTE(review): `label` is the model's predicted class index; this lookup
# assumes that index equals the original emoji_id — confirm against label_encoder.
# strip() removes the surrounding whitespace while keeping the whole emoji;
# indexing with [1] kept a single code point and silently cut multi-code-point emoji.
reversed_emoji_map[label].strip()

'🌙'

In [13]:
# Display the emoji -> emoji_id mapping built from the training data.
emoji_map

{' 😊': 25,
 ' 😱': 0,
 ' 😔': 7,
 ' 🌞': 26,
 ' 🙏': 5,
 ' 😎': 89,
 ' 💖': 96,
 ' 🎬': 91,
 ' 😂': 9,
 ' 😴': 10,
 ' 🙁': 55,
 ' 🤔': 1,
 ' 🤩': 31,
 ' 😢': 16,
 ' 🤞': 21,
 ' 🏠': 75,
 ' 🤬': 3,
 ' 😞': 53,
 ' 📝': 90,
 ' 😩': 12,
 ' 🎮': 82,
 ' 🤕': 17,
 ' ☕️': 62,
 ' 🙄': 70,
 ' 🤪': 14,
 ' 🤦': 35,
 ' 🧹': 87,
 ' 🤷': 64,
 ' 🌙': 59,
 ' 🌤': 67,
 ' 😷': 4,
 ' 🛍': 40,
 ' 🤗': 23,
 ' ❤️': 33,
 ' 🍻': 69,
 ' 🙅': 97,
 ' 🤓': 63,
 ' 🤑': 11,
 ' 😆': 29,
 ' 🤝': 2,
 ' 😡': 36,
 ' 🎉': 15,
 ' 💕': 54,
 ' 🤯': 50,
 ' 💔': 42,
 ' 🎂': 52,
 ' 💰': 85,
 ' 🤣': 34,
 ' 🤘': 28,
 ' 🎧': 99,
 ' 😒': 81,
 ' 😠': 57,
 ' 😤': 18,
 ' 🔥': 65,
 ' 😉': 61,
 ' 🌧': 38,
 ' 😬': 46,
 ' 📺': 39,
 ' 🤢': 20,
 ' 🛫': 95,
 ' ☀️': 77,
 ' 🎵': 30,
 ' 🚗': 60,
 ' 🥵': 80,
 ' 💪': 45,
 ' 🍴': 6,
 ' 😍': 51,
 ' 😋': 98,
 ' 🏡': 32,
 ' 🍕': 48,
 ' 😲': 92,
 ' 🍔': 88,
 ' 📖': 76,
 ' 🎶': 37,
 ' 😳': 56,
 ' 😰': 79,
 ' 🎤': 72,
 ' 😃': 27,
 ' 💻': 78,
 ' 🐶': 43,
 ' 😕': 94,
 ' 🙌': 41,
 ' 🍽': 24,
 ' 📚': 73,
 ' 😁': 22,
 ' 🎸': 66,
 ' 😭': 8,
 ' 🤤': 84,
 ' 🗳': 71,
 ' 🍿': 86,
 ' 🏃': 74,
 ' 🌊':

In [14]:
# Hard-coded list of 100 floats, indexed by position in the next cell.
# NOTE(review): these look like class probabilities pasted from a printed
# softmax tensor of an earlier run — confirm the source, or regenerate from the model.
a = [7.9977e-04, 2.9141e-04, 1.7481e-03, 8.9901e-04, 2.2168e-03, 5.4152e-03,
         3.6625e-03, 9.3339e-04, 1.6010e-03, 8.9317e-04, 3.2083e-03, 1.6273e-03,
         1.4378e-03, 1.4740e-03, 1.4684e-03, 5.8376e-03, 2.0788e-03, 1.0593e-03,
         8.9349e-04, 2.4485e-03, 1.5230e-03, 1.3898e-03, 4.0302e-03, 9.9571e-03,
         2.1867e-03, 9.8646e-03, 6.6561e-01, 9.3499e-03, 3.0011e-03, 2.2290e-03,
         3.0850e-03, 4.2704e-03, 7.1119e-03, 6.8402e-03, 1.4497e-03, 3.4174e-04,
         1.2950e-03, 4.0112e-03, 9.7552e-03, 1.9705e-03, 3.2153e-03, 5.6694e-03,
         2.2442e-03, 3.4494e-03, 1.1937e-03, 1.6548e-03, 7.6679e-04, 3.4509e-03,
         3.1187e-03, 2.7716e-03, 5.4364e-04, 5.7858e-03, 3.6586e-03, 7.9227e-04,
         6.3173e-03, 9.1841e-04, 1.0079e-03, 1.0732e-03, 2.7833e-03, 1.0879e-02,
         5.2406e-03, 2.5733e-03, 3.5899e-03, 1.0424e-03, 8.7068e-04, 5.2327e-03,
         3.0119e-03, 1.2747e-02, 8.6398e-03, 2.4391e-03, 1.6232e-03, 1.9866e-03,
         3.7783e-03, 1.2592e-03, 3.2638e-03, 4.1626e-03, 2.1148e-03, 2.6737e-02,
         2.2242e-03, 1.3329e-03, 3.9976e-03, 1.5409e-03, 2.0270e-03, 3.8865e-03,
         2.4626e-03, 1.3738e-03, 3.1394e-03, 2.0015e-03, 3.2569e-03, 3.7861e-03,
         2.2908e-03, 4.3359e-03, 9.9548e-04, 3.3245e-03, 1.2052e-03, 4.4659e-03,
         3.8085e-03, 1.6605e-03, 2.9919e-03, 3.0202e-03]

In [15]:
# Key each value of `a` by itself, storing its position, then show the
# entries from the largest value down. Equal values share one key, so only
# the last position of a repeated value is kept.
b = {value: position for position, value in enumerate(a)}
dict(sorted(b.items(), key=lambda entry: entry[0], reverse=True))

{0.66561: 26,
 0.026737: 77,
 0.012747: 67,
 0.010879: 59,
 0.0099571: 23,
 0.0098646: 25,
 0.0097552: 38,
 0.0093499: 27,
 0.0086398: 68,
 0.0071119: 32,
 0.0068402: 33,
 0.0063173: 54,
 0.0058376: 15,
 0.0057858: 51,
 0.0056694: 41,
 0.0054152: 5,
 0.0052406: 60,
 0.0052327: 65,
 0.0044659: 95,
 0.0043359: 91,
 0.0042704: 31,
 0.0041626: 75,
 0.0040302: 22,
 0.0040112: 37,
 0.0039976: 80,
 0.0038865: 83,
 0.0038085: 96,
 0.0037861: 89,
 0.0037783: 72,
 0.0036625: 6,
 0.0036586: 52,
 0.0035899: 62,
 0.0034509: 47,
 0.0034494: 43,
 0.0033245: 93,
 0.0032638: 74,
 0.0032569: 88,
 0.0032153: 40,
 0.0032083: 10,
 0.0031394: 86,
 0.0031187: 48,
 0.003085: 30,
 0.0030202: 99,
 0.0030119: 66,
 0.0030011: 28,
 0.0029919: 98,
 0.0027833: 58,
 0.0027716: 49,
 0.0025733: 61,
 0.0024626: 84,
 0.0024485: 19,
 0.0024391: 69,
 0.0022908: 90,
 0.0022442: 42,
 0.002229: 29,
 0.0022242: 78,
 0.0022168: 4,
 0.0021867: 24,
 0.0021148: 76,
 0.0020788: 16,
 0.002027: 82,
 0.0020015: 87,
 0.0019866: 71,
 0.

In [16]:
# Show the emoji string stored under key 59 of the inverted map.
reversed_emoji_map[59]

' 🌙'