https://www.kaggle.com/datasets/simaanjali/emotion-analysis-based-on-text

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
!pip install torch transformers



##Importing Libraries and Modules

In [28]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import pandas as pd
import torch

In [29]:
# Check if GPU is available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

##Reads the training, validation, and test data from CSV files.

In [30]:
# Read data into pandas DataFrames
df = pd.read_csv('/content/drive/MyDrive/emotion_sentimen_dataset.csv')
df = df.iloc[:1000] # for practice
df.head()

Unnamed: 0.1,Unnamed: 0,text,Emotion
0,0,i seriously hate one subject to death but now ...,hate
1,1,im so full of life i feel appalled,neutral
2,2,i sit here to write i start to dig out my feel...,neutral
3,3,ive been really angry with r and i feel like a...,anger
4,4,i feel suspicious if there is no one outside l...,neutral


In [31]:
df['Emotion'].unique()

array(['hate', 'neutral', 'anger', 'love', 'worry', 'relief', 'happiness',
       'fun', 'empty', 'enthusiasm', 'sadness', 'surprise'], dtype=object)

##Encodes the labels into numerical values using LabelEncoder from sklearn.

In [32]:
#Convert categorical type data into numerical type data
labelencoder = LabelEncoder()
df['label_num'] = labelencoder.fit_transform(df['Emotion'])

In [33]:
# create label and text list
text = df.text.values

#check distribution of data based on labels
df.label_num.value_counts()

label_num
7     842
4      34
8      27
5      20
6      20
0      17
2      10
11      9
3       9
9       7
1       3
10      2
Name: count, dtype: int64

##Tokenizes the text data using the BERT tokenizer and converts them into input IDs.

In [34]:
# Set the maximum sequence length
max_length = 128

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
text_id = [tokenizer.encode(sent, add_special_tokens=True,max_length=max_length, pad_to_max_length=True) for sent in text]
labels = df.label_num.values

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [35]:
print("Actual sentence before tokenization: ",text[3])
print("Encoded Input from dataset: ",text_id[3])

Actual sentence before tokenization:  ive been really angry with r and i feel like an idiot for trusting him in the first place
Encoded Input from dataset:  [101, 4921, 2063, 2042, 2428, 4854, 2007, 1054, 1998, 1045, 2514, 2066, 2019, 10041, 2005, 19836, 2032, 1999, 1996, 2034, 2173, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


##Creates attention masks to differentiate real tokens from padding tokens.

In [36]:
# Create attention masks for each sentence
attention_masks = []

for sent in text_id:
    # Create a mask of 1s for non-padding tokens and 0s for padding tokens
    # Attention mask values: 1 for non-padding tokens, 0 for padding tokens
    att_mask = [1 if token_id != tokenizer.pad_token_id else 0 for token_id in sent]
    attention_masks.append(att_mask)

# Convert the attention masks list to a PyTorch tensor
attention_masks = torch.tensor(attention_masks)
attention_masks[3]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

##Splits the data into training and validation sets.


In [37]:
# Split tokenized text data and attention masks into training and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    text_id,
    labels,
    random_state=41,
    test_size=0.1
)

train_masks, validation_masks, _, _ = train_test_split(
    attention_masks,
    text_id,
    random_state=41,
    test_size=0.1
)

# Convert inputs, masks, and labels to torch tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

  train_masks = torch.tensor(train_masks)
  validation_masks = torch.tensor(validation_masks)


##Converts the data into PyTorch tensors and creates DataLoader objects for batching.


In [38]:
# Select a batch size for training.
batch_size = 16

# Create the DataLoader for the training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for the validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

##Sets hyperparameters for fine-tuning





In [39]:
num_epochs = 3
learning_rate = 2e-5
adam_epsilon = 1e-8
max_grad_norm = 1.0

# Calculate total number of training steps
num_train_samples = len(train_data)
num_train_steps = (num_train_samples // batch_size) * num_epochs

# Define number of warmup steps as 10% of total training steps
num_warmup_steps = int(num_train_steps * 0.1)


In [40]:
##Model Setup
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=df['Emotion'].nunique())

# 冻结前九层（对于 BERT-base，共12层）
for param in model.bert.encoder.layer[:11].parameters():
    param.requires_grad = False

# 只微调最后三层
for param in model.bert.encoder.layer[11:].parameters():
    param.requires_grad = True
model.to(device)  #move model to GPU if available

##Optimizer & Scheduler Setup
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##在每個 epoch 執行訓練和驗證循環

In [41]:
def train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, device, num_epochs):
    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 10)

        # Training phase
        model.train()
        total_train_loss = 0

        step_count = 0

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device).long()  # Convert labels to Long

            model.zero_grad()

            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()

            # Clip the norm of the gradients to prevent the "exploding gradients" problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            optimizer.step()
            scheduler.step()
            step_count += 1

        print(f"step count:{step_count}")

        avg_train_loss = total_train_loss / len(train_dataloader)
        print(f'Average training loss: {avg_train_loss}')

        # Validation phase
        model.eval()
        total_eval_loss = 0
        correct_predictions = 0
        total_predictions = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device).long()  # Convert labels to Long

            with torch.no_grad():
                outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
                logits = outputs.logits

            total_eval_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == b_labels)
            total_predictions += b_labels.size(0)

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        val_accuracy = correct_predictions.double() / total_predictions

        print(f'Validation Loss: {avg_val_loss}')
        print(f'Validation Accuracy: {val_accuracy}\n')

# Train the model
train_model(model, train_dataloader, validation_dataloader, optimizer, scheduler, device, num_epochs)

# Save the model
model.save_pretrained('/content/drive/MyDrive/saved_model/')
tokenizer.save_pretrained('/content/drive/MyDrive/saved_model/')



Epoch 1/3
----------
step count:57
Average training loss: 1.723358482645269
Validation Loss: 0.8025321108954293
Validation Accuracy: 0.85

Epoch 2/3
----------
step count:57
Average training loss: 0.8083417860039493
Validation Loss: 0.6909507108586175
Validation Accuracy: 0.85

Epoch 3/3
----------
step count:57
Average training loss: 0.7769842197497686
Validation Loss: 0.692708786044802
Validation Accuracy: 0.85



('/content/drive/MyDrive/LAB/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/LAB/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/LAB/saved_model/vocab.txt',
 '/content/drive/MyDrive/LAB/saved_model/added_tokens.json')