In [None]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm


# Fetch the NLTK resources used by the preprocessing below.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


# Shared, module-level preprocessing state read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case, tokenize and lemmatize ``text``; return the result space-joined.

    Tokens that are not purely alphabetic, or that appear in the module-level
    ``stop_words`` set, are dropped. Every remaining token is passed through
    ``lemmatizer.lemmatize``.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


class EmotionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: indexable collection of raw strings.
        labels: indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: callable invoked as ``tokenizer(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` entries.
        max_len: value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of items, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``label`` tensors for one item."""
        text = self.texts[index]
        label = self.labels[index]

        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Drop only the leading axis (assumed to be a batch axis of size 1 for a
        # single text). A bare ``squeeze()`` would also remove the sequence axis
        # when ``max_len == 1`` and hand back 0-d tensors that cannot be batched
        # consistently.
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


# Load the dataset; the code below reads its 'text' and 'label' columns.
data = pd.read_csv("emotions.csv")


# Normalise every text with the NLTK-based preprocess_text defined above.
data['text'] = data['text'].apply(preprocess_text)


# Encode labels as contiguous integer ids. The distinct labels are sorted first so
# the encoding does not depend on the row order of the CSV: ``unique()`` returns
# values in order of first appearance, which would silently renumber the classes
# whenever the file is shuffled or filtered.
label_map = {label: idx for idx, label in enumerate(sorted(data['label'].unique()))}
data['label'] = data['label'].map(label_map)


X = data['text'].values
y = data['label'].values


tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")


# Run BERT on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model = bert_model.to(device)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:

print("Computing BERT embeddings...")
def compute_bert_embeddings(texts, tokenizer, model, max_len, batch_size=32, show_progress=True):
    """Encode ``texts`` with ``model`` and return the stacked ``pooler_output`` rows.

    Texts are tokenized and pushed through the model in mini-batches instead of
    one at a time. Every sequence is still padded to ``max_len`` and masked, so
    the embeddings should match per-text encoding up to floating-point noise
    while needing far fewer forward passes.

    Args:
        texts: iterable of strings (list, NumPy array, ...).
        tokenizer: callable that accepts a list of strings plus ``max_length``,
            ``padding``, ``truncation`` and ``return_tensors`` keyword arguments
            and returns a mapping with ``"input_ids"`` and ``"attention_mask"``.
        model: module whose output exposes ``pooler_output``.
        max_len: padding/truncation length forwarded to the tokenizer.
        batch_size: number of texts per forward pass (must be >= 1).
        show_progress: wrap the batch loop in ``tqdm`` when true.

    Returns:
        NumPy array with one row per text; an empty 1-D array when ``texts`` is empty.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()

    # Send inputs to wherever the model's weights live rather than relying on a
    # notebook-level global that may be out of sync with the model.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device("cpu")

    # Materialise once so slicing always yields a plain list of strings.
    texts = list(texts)
    batch_starts = range(0, len(texts), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts)

    chunks = []
    with torch.no_grad():
        for start in batch_starts:
            encoding = tokenizer(
                texts[start:start + batch_size],
                max_length=max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )

            input_ids = encoding["input_ids"].to(model_device)
            attention_mask = encoding["attention_mask"].to(model_device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            chunks.append(outputs.pooler_output.cpu().numpy())

    if not chunks:
        # Same result as before for empty input.
        return np.array([])
    return np.concatenate(chunks, axis=0)

X_embeddings = compute_bert_embeddings(X, tokenizer, bert_model, max_len=128)


# Hold out the test set BEFORE oversampling. Running SMOTE on the full data and
# splitting afterwards lets synthetic points derived from test rows end up in
# the training set (and fills the test set with synthetic rows), which makes
# the reported scores unreliable.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_embeddings, y, test_size=0.2, random_state=42, stratify=y
)


print("Applying SMOTE...")
smote = SMOTE(random_state=42)
# Balance the classes of the training split only; the test split keeps real rows.
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)

# Later cells read X_train / y_train for training and X_resampled for the input width.
X_train, y_train = X_resampled, y_resampled



Computing BERT embeddings...


100%|██████████| 416809/416809 [1:14:18<00:00, 93.48it/s] 


Applying SMOTE...


In [None]:

class EmbeddingDataset(Dataset):
    """Expose precomputed embedding vectors and their labels as a torch ``Dataset``."""

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The embeddings container defines the dataset size.
        return len(self.embeddings)

    def __getitem__(self, index):
        """Return item ``index`` as a float 'embedding' tensor and a long 'label' tensor."""
        vector, target = self.embeddings[index], self.labels[index]
        item = {}
        item["embedding"] = torch.tensor(vector, dtype=torch.float)
        item["label"] = torch.tensor(target, dtype=torch.long)
        return item

class EmotionClassifier(nn.Module):
    """Two-layer feed-forward head mapping an embedding vector to class logits.

    Args:
        input_dim: size of each input embedding vector.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, embeddings):
        """Return unnormalised logits, one row per input embedding."""
        x = self.fc1(embeddings)
        # Functional ReLU: no throw-away nn.ReLU module is built on every call.
        x = nn.functional.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

# Size the classifier from the embedding width and the number of encoded labels.
input_dim = X_resampled.shape[1]
model = EmotionClassifier(input_dim, num_classes=len(label_map)).to(device)

BATCH_SIZE = 16

# Wrap each split in a dataset; only the training loader shuffles.
train_dataset = EmbeddingDataset(X_train, y_train)
test_dataset = EmbeddingDataset(X_test, y_test)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)




In [None]:

def train_epoch(model, data_loader, optimizer, criterion):
    """Run one optimisation pass over ``data_loader``.

    Returns a ``(mean_loss, accuracy)`` tuple: the sum of the per-batch losses
    divided by the number of batches, and the number of correct argmax
    predictions divided by ``len(data_loader.dataset)``.
    """
    model.train()
    loss_sum = 0
    n_correct = 0

    for step in tqdm(data_loader):
        inputs, targets = step["embedding"].to(device), step["label"].to(device)

        # Standard step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = criterion(logits, targets)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        hits = logits.argmax(dim=1) == targets
        n_correct += hits.sum().item()

    mean_loss = loss_sum / len(data_loader)
    accuracy = n_correct / len(data_loader.dataset)
    return mean_loss, accuracy

def eval_model(model, data_loader, criterion):
    """Score ``model`` on ``data_loader`` without updating any weights.

    Returns a ``(mean_loss, accuracy)`` tuple: the sum of the per-batch losses
    divided by the number of batches, and the number of correct argmax
    predictions divided by ``len(data_loader.dataset)``.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0

    with torch.no_grad():
        for step in tqdm(data_loader):
            inputs = step["embedding"].to(device)
            targets = step["label"].to(device)

            logits = model(inputs)
            loss_sum += criterion(logits, targets).item()

            predicted = logits.argmax(dim=1)
            n_correct += (predicted == targets).sum().item()

    n_batches = len(data_loader)
    n_samples = len(data_loader.dataset)
    return loss_sum / n_batches, n_correct / n_samples


# Loss and optimiser for the classification head.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)

EPOCHS = 5
for epoch in range(EPOCHS):
    print(f"Epoch {epoch + 1}/{EPOCHS}")

    train_loss, train_acc = train_epoch(model, train_loader, optimizer, criterion)
    print(f"Train loss: {train_loss:.4f}, accuracy: {train_acc:.4f}")

    # The "validation" figures are computed on test_loader.
    val_loss, val_acc = eval_model(model, test_loader, criterion)
    print(f"Validation loss: {val_loss:.4f}, accuracy: {val_acc:.4f}")


# Collect per-sample predictions and ground truth for the classification report.
y_true, y_pred = [], []

model.eval()
with torch.no_grad():
    for batch in test_loader:
        targets = batch["label"].to(device)
        logits = model(batch["embedding"].to(device))

        y_pred.extend(logits.argmax(dim=1).cpu().numpy())
        y_true.extend(targets.cpu().numpy())


Epoch 1/5


100%|██████████| 42321/42321 [01:38<00:00, 428.48it/s]


Train loss: 1.1108, accuracy: 0.5794


100%|██████████| 10581/10581 [00:09<00:00, 1166.99it/s]


Validation loss: 1.0734, accuracy: 0.5943
Epoch 2/5


100%|██████████| 42321/42321 [01:30<00:00, 466.97it/s]


Train loss: 1.1066, accuracy: 0.5815


100%|██████████| 10581/10581 [00:09<00:00, 1071.76it/s]


Validation loss: 1.0777, accuracy: 0.5942
Epoch 3/5


100%|██████████| 42321/42321 [01:30<00:00, 468.48it/s]


Train loss: 1.1042, accuracy: 0.5826


100%|██████████| 10581/10581 [00:09<00:00, 1079.67it/s]


Validation loss: 1.0726, accuracy: 0.5947
Epoch 4/5


100%|██████████| 42321/42321 [01:30<00:00, 468.10it/s]


Train loss: 1.1005, accuracy: 0.5840


100%|██████████| 10581/10581 [00:08<00:00, 1200.46it/s]


Validation loss: 1.0650, accuracy: 0.5961
Epoch 5/5


100%|██████████| 42321/42321 [01:30<00:00, 469.39it/s]


Train loss: 1.0975, accuracy: 0.5855


100%|██████████| 10581/10581 [00:09<00:00, 1076.87it/s]


Validation loss: 1.0658, accuracy: 0.5978


In [None]:

# Invert the existing label_map (original label -> integer id) to name the classes.
# data['label'] already holds integer ids at this point, so it must not be pushed
# through a freshly built, string-keyed map: no key would match, every label would
# become NaN, and re-running the cell would fail.
inverse_label_map = {idx: label for label, idx in label_map.items()}


# Report rows in id order, each named after the original label it encodes.
class_ids = list(range(len(inverse_label_map)))
target_names = [str(inverse_label_map[i]) for i in class_ids]


print("\nClassification Report:")
print(classification_report(y_true, y_pred, labels=class_ids, target_names=target_names))



Classification Report:
              precision    recall  f1-score   support

           0       0.57      0.63      0.59     28213
           1       0.58      0.45      0.51     28213
           2       0.59      0.67      0.62     28214
           3       0.54      0.58      0.56     28214
           4       0.69      0.72      0.70     28214
           5       0.63      0.54      0.58     28213

    accuracy                           0.60    169281
   macro avg       0.60      0.60      0.60    169281
weighted avg       0.60      0.60      0.60    169281



In [None]:
!pip install datasets



In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from transformers import AutoTokenizer, AutoModel, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from xgboost import XGBClassifier
from datasets import Dataset




# Prefer the GPU when torch can see one.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


data = pd.read_csv("emotions.csv")

# Two-way lookup between integer label ids and emotion names.
numeric_to_string_mapping = dict(
    enumerate(['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'])
)
string_to_numeric_mapping = dict(
    (name, idx) for idx, name in numeric_to_string_mapping.items()
)


# Human-readable copy of the label column.
data["label_string"] = data["label"].map(numeric_to_string_mapping)

# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased").to(device)

def preprocess_text(text):
    """Embed ``text`` with BERT and return the first-position hidden state as a NumPy array.

    The text is tokenized (truncated to at most 128 tokens), moved to ``device``
    and run through ``bert_model`` without gradient tracking. The vector at index
    0 of the sequence axis of ``last_hidden_state`` is squeezed, copied to the
    CPU and returned.

    NOTE: this definition reuses the name of the NLTK-based ``preprocess_text``
    defined earlier in the notebook and shadows it once this cell has run.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)

    with torch.no_grad():
        hidden_states = bert_model(**on_device).last_hidden_state
        first_position = hidden_states[:, 0, :]
        return first_position.squeeze().cpu().numpy()







Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Embed every text with BERT (one forward pass per row) and stack the vectors
# into a single feature array.
data["processed_text"] = data["text"].apply(preprocess_text)
embeddings = np.array(data["processed_text"].to_list())
labels = data["label"].values


X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Oversample the training split only, so the test split contains just real rows.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Request XGBoost's GPU histogram method only when CUDA is actually available;
# a hard-coded "gpu_hist" fails on CPU-only machines even though `device` was
# already detected above.
xgb_tree_method = "gpu_hist" if device.type == "cuda" else "hist"
clf = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42, tree_method=xgb_tree_method)
clf.fit(X_train, y_train)


y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
# Show emotion names instead of bare ids, in ascending id order.
report_labels = sorted(numeric_to_string_mapping)
report_names = [numeric_to_string_mapping[i] for i in report_labels]
print(classification_report(y_test, y_pred, labels=report_labels, target_names=report_names))