In [None]:
# Necessary Libraries
!pip install pandas
!pip install tensorflow
!pip install transformers
!pip install scikit-learn
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install imbalanced-learn
!pip install openpyxl
!pip install tqdm



In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset read later in this notebook lives under this mount point
# (/content/drive/MyDrive/...), so this cell has to run before the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Required Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
import pandas as pd
import numpy as np
from tqdm import tqdm

# Dataset Class
class ERDataset(Dataset):
    """Map-style dataset that pairs each text with its label row.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of label rows, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding="max_length", truncation=True, return_tensors="pt")``;
            its result must expose an ``"input_ids"`` tensor with a leading
            batch dimension of size 1.
        max_length: Length forwarded to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of examples (one per text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``{"input_ids": ..., "labels": ...}`` for example ``idx``."""
        text, label = self.texts[idx], self.labels[idx]

        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = encoded["input_ids"].squeeze(0)
        # Labels are stored as float tensors.
        target = torch.tensor(label, dtype=torch.float)
        return {"input_ids": input_ids, "labels": target}

In [None]:
# RNN Model (LSTM)
class EmotionRNN(nn.Module):
    """(Bi)LSTM classifier that emits one logit per label.

    The sequence summary is built from the LSTM's final hidden states computed
    over the *real* (non-padding) tokens only. Reading ``lstm_out[:, -1, :]``
    on right-padded batches would instead summarise the trailing PAD positions
    (forward direction) and a single-token context (backward direction).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_labels: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        padding_idx: Token id treated as padding when no ``attention_mask`` is
            passed to ``forward``. Defaults to 0; pass ``None`` to treat every
            position as a real token.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_labels, num_layers=1, bidirectional=True,
                 padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, num_layers, bidirectional=bidirectional, batch_first=True
        )
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_labels)
        # Not applied in forward(): the model returns raw logits. Kept so that
        # code accessing ``model.sigmoid`` keeps working.
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask=None):
        """Compute label logits for a batch of right-padded token id sequences.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
            attention_mask: Optional tensor of the same shape, non-zero on real
                tokens. When omitted it is derived from ``padding_idx``.

        Returns:
            Tensor of shape ``(batch, num_labels)`` holding raw logits
            (no sigmoid applied).
        """
        if attention_mask is None:
            if self.padding_idx is None:
                attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
            else:
                attention_mask = input_ids != self.padding_idx

        # pack_padded_sequence needs CPU int64 lengths >= 1; an all-padding row
        # is clamped to length 1 rather than raising.
        lengths = attention_mask.sum(dim=1).clamp(min=1).to(torch.int64).cpu()

        embedded = self.embedding(input_ids)
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.lstm(packed)

        # h_n is laid out as (num_layers * num_directions, batch, hidden); the
        # last layer's states are the final one (or two, if bidirectional) rows.
        if self.bidirectional:
            summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        else:
            summary = h_n[-1]

        logits = self.fc(self.dropout(summary))
        return logits


# Load Dataset
file_path = "/content/drive/MyDrive/Team_Project/Journal_500Dataset.xlsx"
df = pd.read_excel(file_path)
# Keep only rows that have a strategy annotation. The explicit .copy() makes
# the filtered frame independent, so adding the 'labels' column below does not
# trigger pandas' SettingWithCopyWarning when rows are actually dropped.
df = df[df['er_strat'].notna()].copy()

# Prepare Labels
# 'er_strat' holds comma-separated strategy names. Split on the comma and strip
# each piece so that inconsistent spacing ("A,B", "A ,  B", trailing spaces)
# does not create spurious extra classes.
mlb = MultiLabelBinarizer()
df['labels'] = df['er_strat'].apply(
    lambda cell: [part.strip() for part in str(cell).split(",") if part.strip()]
)
y = mlb.fit_transform(df['labels'])

# Train-Test Split (70/30, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    df['text_cleaned'], y, test_size=0.3, random_state=42
)

# Tokenizer
# Only the BERT WordPiece vocabulary is used here (to turn text into ids for
# the LSTM's own embedding table); no BERT weights are loaded in this cell.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
vocab_size = tokenizer.vocab_size

# Dataset and DataLoader
max_length = 128
batch_size = 16

# .tolist() drops the shuffled pandas index so ERDataset can index by position.
train_dataset = ERDataset(X_train.tolist(), y_train, tokenizer, max_length)
test_dataset = ERDataset(X_test.tolist(), y_test, tokenizer, max_length)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Model Parameters
embedding_dim = 128
hidden_dim = 256
num_labels = y.shape[1]
num_layers = 2
bidirectional = True

# Initialize Model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = EmotionRNN(vocab_size, embedding_dim, hidden_dim, num_labels, num_layers, bidirectional)
model = model.to(device)

# Loss Function and Optimizer
# pos_weight = (#negatives / #positives) per label, to counter class imbalance.
class_counts = np.sum(y_train, axis=0)
# A label with no positive example in the training split would make the ratio
# divide by zero and yield an inf weight. Clamp the denominator to 1: the
# resulting weight is finite, and it never contributes to the training loss
# because pos_weight only scales the positive-target term.
safe_counts = np.maximum(class_counts, 1)
pos_weights = torch.tensor((len(y_train) - class_counts) / safe_counts, dtype=torch.float).to(device)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=0.01)

In [None]:
# Training Function
def train_model(model, data_loader, optimizer, loss_fn, device):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids)``.
        data_loader: Sized iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"labels"`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch loss values divided by ``len(data_loader)``.
    """
    model.train()
    batch_losses = []

    for batch in tqdm(data_loader):
        ids = batch["input_ids"].to(device)
        targets = batch["labels"].to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = loss_fn(model(ids), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(data_loader)

In [None]:
# Evaluation Function
def evaluate_model(model, data_loader, device, threshold=0.5):
    """Collect thresholded predictions and ground-truth labels over a loader.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``"input_ids"`` and ``"labels"`` tensors.
        device: Device the input ids are moved to.
        threshold: A label is predicted when its sigmoid probability is
            strictly greater than this value.

    Returns:
        ``(predictions, true_labels)`` as NumPy arrays with one row per example;
        predictions are 0/1 integers.
    """
    model.eval()
    pred_rows = []
    target_rows = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch["input_ids"].to(device))
            probabilities = torch.sigmoid(logits).cpu().numpy()
            hard_preds = (probabilities > threshold).astype(int)

            pred_rows.extend(hard_preds)
            target_rows.extend(batch["labels"].cpu().numpy())

    return np.array(pred_rows), np.array(target_rows)

In [None]:
# Training Loop
epochs = 10
# Start below any attainable score so the first epoch always writes a
# checkpoint. With an initial value of 0 and a strict '>', a run whose macro-F1
# stays at 0 would never save, and the final-evaluation cell would then fail
# to find (or silently load a stale) "best_rnn_model.pth".
best_f1 = float("-inf")

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train_loss = train_model(model, train_loader, optimizer, loss_fn, device)
    print(f"Train Loss: {train_loss:.4f}")

    # Evaluate on validation data
    # NOTE(review): this uses test_loader, the same split the final-evaluation
    # cell reports on, so the checkpoint is selected on test data and the final
    # metrics are optimistic. A separate validation split would fix this.
    predictions, true_labels = evaluate_model(model, test_loader, device)
    # zero_division=0 keeps the score identical to the default behaviour for
    # labels with no predicted/true samples, without the per-epoch warnings.
    f1 = f1_score(true_labels, predictions, average="macro", zero_division=0)
    print(f"Validation F1 Score: {f1:.4f}")

    # Save best model
    if f1 > best_f1:
        best_f1 = f1
        torch.save(model.state_dict(), "best_rnn_model.pth")

Epoch 1/10


100%|██████████| 21/21 [00:22<00:00,  1.07s/it]


Train Loss: 1.1273
Validation F1 Score: 0.2677
Epoch 2/10


100%|██████████| 21/21 [01:10<00:00,  3.36s/it]


Train Loss: 1.1207
Validation F1 Score: 0.2384
Epoch 3/10


100%|██████████| 21/21 [02:17<00:00,  6.53s/it]


Train Loss: 1.1200
Validation F1 Score: 0.3459
Epoch 4/10


100%|██████████| 21/21 [01:59<00:00,  5.67s/it]


Train Loss: 1.1173
Validation F1 Score: 0.2947
Epoch 5/10


100%|██████████| 21/21 [02:19<00:00,  6.64s/it]


Train Loss: 1.1171
Validation F1 Score: 0.3451
Epoch 6/10


100%|██████████| 21/21 [02:52<00:00,  8.23s/it]


Train Loss: 1.1214
Validation F1 Score: 0.2509
Epoch 7/10


100%|██████████| 21/21 [03:25<00:00,  9.78s/it]


Train Loss: 1.1170
Validation F1 Score: 0.3456
Epoch 8/10


100%|██████████| 21/21 [02:29<00:00,  7.12s/it]


Train Loss: 1.1177
Validation F1 Score: 0.2915
Epoch 9/10


100%|██████████| 21/21 [02:49<00:00,  8.09s/it]


Train Loss: 1.1190
Validation F1 Score: 0.3456
Epoch 10/10


100%|██████████| 21/21 [03:02<00:00,  8.68s/it]


Train Loss: 1.1171
Validation F1 Score: 0.2953


In [None]:
# Final Evaluation
# Restore the best checkpoint written by the training loop.
# - map_location=device lets a checkpoint saved on GPU load on a CPU-only runtime.
# - weights_only=True restricts unpickling to tensors/plain containers, which is
#   all a state_dict needs, and avoids torch.load's FutureWarning about the
#   unsafe default.
best_state = torch.load("best_rnn_model.pth", map_location=device, weights_only=True)
model.load_state_dict(best_state)
predictions, true_labels = evaluate_model(model, test_loader, device)

# Classification Report
print("Final Evaluation on Test Set")
print("Classification Report:")
print(classification_report(true_labels, predictions, target_names=mlb.classes_))

  model.load_state_dict(torch.load("best_rnn_model.pth"))


Final Evaluation on Test Set
Classification Report:
                        precision    recall  f1-score   support

Attentional Deployment       0.49      0.37      0.42        51
      Cognitive Change       0.59      0.41      0.48        56
   Response Modulation       0.18      0.70      0.29        27
Situation Modification       0.23      0.69      0.35        13
   Situation Selection       0.15      0.43      0.23        14
                  none       0.18      1.00      0.31        19

             micro avg       0.26      0.53      0.35       180
             macro avg       0.30      0.60      0.35       180
          weighted avg       0.40      0.53      0.39       180
           samples avg       0.23      0.50      0.31       180

