# Load Dataset

In [1]:
# Install modules
# A '!' in a Jupyter Notebook runs the line in the system's shell, and not in the Python interpreter

# Import necessary libraries
import pandas as pd
import random

# Load dataset 
# you can download this dataset from https://huggingface.co/datasets/stepp1/tweet_emotion_intensity/tree/main
data = pd.read_csv('data/tweet_emotion_intensity/train.csv')

# Preview the data
print(data.head())

      id                                              tweet    class  \
0  40815  Loved @Bethenny independence msg on @WendyWill...     fear   
1  10128  @mark_slifer actually maybe we were supposed t...  sadness   
2  40476  I thought the nausea and headaches had passed ...     fear   
3  20813  Anger, resentment, and hatred are the destroye...    anger   
4  40796  new tires &amp; an alarm system on my car. fwm...     fear   

  sentiment_intensity class_intensity  labels  
0                 low        fear_low       4  
1                high    sadness_high       9  
2              medium     fear_medium       5  
3                high      anger_high       0  
4                 low        fear_low       4  


In [2]:
import re # Import the `re` module for working with regular expressions

# Function to clean the text
def clean_text(text):
    text = text.lower() # Convert all text to lowercase for uniformity
    text = re.sub(r'http\S+', '', text) # Remove URLs from the text
    text = re.sub(r'<.*?>', '', text) # Remove any HTML tags from the text
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation, keep only words and spaces
    return text # Return the cleaned text

# Assume `data` is a pandas DataFrame with a column named 'text'
# Apply the cleaning function to each row of the 'text' column
data['cleaned_text'] = data['tweet'].apply(clean_text)

# Print the first 5 rows of the cleaned text to verify the cleaning process
print(data['cleaned_text'].head())

0    loved bethenny independence msg on wendywillia...
1    mark_slifer actually maybe we were supposed to...
2    i thought the nausea and headaches had passed ...
3    anger resentment and hatred are the destroyer ...
4      new tires amp an alarm system on my car fwm now
Name: cleaned_text, dtype: object


In [3]:
# Check for missing values in the dataset
print(data.isnull().sum()) # Print the count of missing values for each column

# Option 1: Remove rows with missing data in the 'cleaned_text' column
data = data.dropna(subset=['cleaned_text']) # Drop rows where 'cleaned_text' is NaN (missing)

# Option 2: Fill missing values in 'cleaned_text' with a placeholder
data['cleaned_text'].fillna('unknown', inplace=True) # Replace NaN values in 'cleaned_text' with 'unknown'

id                     0
tweet                  0
class                  0
sentiment_intensity    0
class_intensity        0
labels                 0
cleaned_text           0
dtype: int64


In [4]:
from transformers import BertTokenizer

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the cleaned text
tokens = tokenizer(
    data['cleaned_text'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt'
)

print(tokens['input_ids'][:5])  # Preview the first 5 tokenized examples

  from .autonotebook import tqdm as notebook_tqdm


tensor([[  101,  3866,  7014,  2368,  4890,  4336,  5796,  2290,  2006, 12815,
         29602,  6632,  5244,  2022,  3407, 23713, 16829,  2306,  4426, 23713,
         13433, 28032,  7730,  2097, 19311,  2000,  2017,  3407,  2981,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  2928,  1035, 22889, 23780,  2941,  2672,  2057,  2020,  4011,
          2000,  3280,  1998,  2026, 13445,  5552,  2256,  3268, 27451,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2245,  1996, 19029,  1998, 14978,  2015,  2018,  2979,
          2021,  8840,  2140,  1045,  2514,  9643,  2651,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     

In [5]:
# Import necessary modules
import random # Random module for generating random numbers and selections

import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet # NLTK's WordNet corpus for finding synonyms

# Define a function to find and replace a word with a synonym
def synonym_replacement(word):
# Get all synsets (sets of synonyms) for the given word from WordNet
    synonyms = wordnet.synsets(word)

# If the word has synonyms, randomly choose one synonym, otherwise return the original word
    if synonyms:
# Select a random synonym and get the first lemma (word form) of that synonym
        return random.choice(synonyms).lemmas()[0].name()

# If no synonyms are found, return the original word
    return word

# Define a function to augment text by replacing words with synonyms randomly
def augment_text(text):
# Split the input text into individual words
    words = text.split() # Split the input text into individual words

# Replace each word with a synonym with a probability of 20% (random.random() > 0.8)
    augmented_words = [
    synonym_replacement(word) if random.random() > 0.8 else word 
# If random condition met, replace
for word in words] # Iterate over each word in the original text

# Join the augmented words back into a single string and return it
    return ' '.join(augmented_words)

# Apply the text augmentation function to the 'cleaned_text' column in a DataFrame
# Create a new column 'augmented_text' containing the augmented version of 'cleaned_text'
data['augmented_text'] = data['cleaned_text'].apply(augment_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import torch # Import PyTorch library
from torch.utils.data import TensorDataset, DataLoader # Import modules to create datasets and data loaders

# Convert tokenized data into PyTorch tensors
input_ids = tokens['input_ids'] # Extract input IDs from the tokenized data
attention_masks = tokens['attention_mask'] # Extract attention masks from the tokenized data

# Define a mapping function
def map_sentiment(value):
    if value == "high":
        return 1
    elif value == "medium":
        return 0.5
    elif value == "low":
        return 0
    else:
        return None  # Handle unexpected values, if any

# Apply the function to each item in 'sentiment_intensity'
data['sentiment_intensity'] = data['sentiment_intensity'].apply(map_sentiment)

# Drop any rows where 'sentiment_intensity' is None
data = data.dropna(subset=['sentiment_intensity']).reset_index(drop=True)

# Convert the 'sentiment_intensity' column to a tensor
labels = torch.tensor(data['sentiment_intensity'].tolist())

def bin_label(x):
    if x <= 0.33:
        return 0
    elif x <= 0.66:
        return 1
    else:
        return 2
labels = torch.tensor([bin_label(x.item()) for x in labels])

In [7]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_masks[idx],
            'labels': self.labels[idx]
        }

In [8]:
from sklearn.model_selection import train_test_split # Import function to split dataset

# First split: 15% for test set, the rest for training/validation
train_val_inputs, test_inputs, train_val_masks, test_masks, train_val_labels, test_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.15, random_state=42
)

# Second split: 20% for validation set from remaining data
train_inputs, val_inputs, train_masks, val_masks, train_labels, val_labels = train_test_split(
    train_val_inputs, train_val_masks, train_val_labels, test_size=0.2, random_state=42
)

# Create TensorDataset objects for each set, including attention masks
train_dataset =CustomDataset(train_inputs, train_masks, train_labels)
val_dataset = CustomDataset(val_inputs, val_masks, val_labels)
test_dataset = CustomDataset(test_inputs, test_masks, test_labels)

# Create DataLoader objects
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16)
test_dataloader = DataLoader(test_dataset, batch_size=16)

print("Training, validation, and test sets are prepared with attention masks!")

Training, validation, and test sets are prepared with attention masks!


# PEFT

In [29]:
# Load pre-trained BERT model
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Step 1: Freeze all layers except the last one (classification head)
for param in model.base_model.parameters():
    param.requires_grad = False

# If you'd like to fine-tune additional layers (e.g., the last 2 layers), you can unfreeze those layers as well
for param in model.base_model.encoder.layer[-2:].parameters():
    param.requires_grad = True

NameError: name 'BertConfig' is not defined

In [63]:
from transformers import Trainer, TrainingArguments

# Step 1: Set training arguments for fine-tuning the model
training_args = TrainingArguments(
    output_dir='./results',             # Directory where results will be stored
    learning_rate=5e-5,  # Experiment with different learning rates
    num_train_epochs=5,                 # Number of epochs (full passes through the dataset)
    per_device_train_batch_size=16,     # Batch size per GPU/CPU during training
    evaluation_strategy="epoch",        # Evaluate the model at the end of each epoch
)

from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

# Step 2: Fine-tune only the final classification head (since earlier layers were frozen)
trainer = Trainer(
    model=model,                        # Pre-trained BERT model with frozen layers
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,           # Training data for fine-tuning
    eval_dataset=val_dataset,              # Validation data to evaluate performance during training
    compute_metrics=compute_metrics
)

# Step 3: Train the model using PEFT (this performs PEFT because layers were frozen in Step 1)
trainer.train()



🏃 View run ./results at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056/runs/a06a56fa-e41e-4b19-8be1-5e3ef420f549
🧪 View experiment at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.971977,0.58457
2,No log,0.971196,0.58457
3,0.992700,0.98402,0.578635
4,0.992700,0.99714,0.575668
5,0.992700,1.004108,0.568249


🏃 View run ./results at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056/runs/3cd0c046-f826-4827-8e4e-6072ab61450c
🧪 View experiment at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056


TrainOutput(global_step=845, training_loss=0.9683402699126294, metrics={'train_runtime': 731.0016, 'train_samples_per_second': 18.413, 'train_steps_per_second': 1.156, 'total_flos': 297431218579320.0, 'train_loss': 0.9683402699126294, 'epoch': 5.0})

In [64]:
# Evaluate the model
results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Accuracy: {results['eval_accuracy']}")

Test Accuracy: 0.5858585858585859


# LoRA

In [13]:
from transformers import BertForSequenceClassification, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# Load the pretrained model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Define LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["query", "key", "value"],  # For BERT, these are the common attention layers
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS  # Sequence classification
)

# Wrap the model with LoRA
model = get_peft_model(model, peft_config)

# (Optional) Freeze base model weights if desired
for name, param in model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False

# Confirm which parameters are trainable
model.print_trainable_parameters()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 442,368 || all params: 109,929,222 || trainable%: 0.4024


In [16]:
from transformers import Trainer, TrainingArguments

# Step 1: Set training arguments for fine-tuning the model
training_args = TrainingArguments(
    output_dir='./results',             # Directory where results will be stored
    learning_rate=5e-5,  # Experiment with different learning rates
    num_train_epochs=3,                 # Number of epochs (full passes through the dataset)
    per_device_train_batch_size=16,     # Batch size per GPU/CPU during training
    eval_strategy="epoch",        # Evaluate the model at the end of each epoch
)

from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    return {"accuracy": acc}

# Step 2: Fine-tune only the final classification head (since earlier layers were frozen)
trainer = Trainer(
    model=model,                        # Pre-trained BERT model with frozen layers
    args=training_args,                 # Training arguments
    train_dataset=train_dataset,           # Training data for fine-tuning
    eval_dataset=val_dataset,              # Validation data to evaluate performance during training
    compute_metrics=compute_metrics
)

# Step 3: Train the model using PEFT (this performs PEFT because layers were frozen in Step 1)
trainer.train()

🏃 View run ./results at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056/runs/f33e7a45-590a-4307-9866-74defd23ddc0
🧪 View experiment at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.982634,0.58457
2,No log,0.978657,0.58457
3,1.010200,0.977739,0.58457




🏃 View run ./results at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056/runs/6a8c5f97-cfec-480e-ac78-8c4c05f99e4b
🧪 View experiment at: https://australiaeast.api.azureml.ms/mlflow/v2.0/subscriptions/f7f4dd05-cc7b-4921-9af7-d0cd6f686e92/resourceGroups/xyguo94-rg/providers/Microsoft.MachineLearningServices/workspaces/ai-ml-engineer/#/experiments/0f944482-fa23-4d20-985e-2864b5371056


TrainOutput(global_step=507, training_loss=1.0101215204543617, metrics={'train_runtime': 980.0733, 'train_samples_per_second': 8.24, 'train_steps_per_second': 0.517, 'total_flos': 179385259534992.0, 'train_loss': 1.0101215204543617, 'epoch': 3.0})

# QLoRA

In [9]:
from transformers import GPT2ForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch

# Enable 8-bit quantization via bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # QLoRA: load model in 8-bit
    llm_int8_threshold=6.0,
    llm_int8_skip_modules=None,
    load_in_8bit_fp32_cpu_offload=True
)

# Load GPT-2 model for sequence classification in 8-bit
quantized_model = GPT2ForSequenceClassification.from_pretrained(
    "gpt2",
    num_labels=3,
    quantization_config=bnb_config,
    device_map="auto"
)

# Define LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    target_modules=["c_attn"]  # GPT-2 uses fused attention projection layer
)

# Apply LoRA to quantized model
quantized_model = get_peft_model(quantized_model, lora_config)

# Print trainable parameters
quantized_model.print_trainable_parameters()

# (Optional) Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # GPT2 doesn't have a pad token by default

for name, param in quantized_model.named_parameters():
    if "lora" not in name:
        param.requires_grad = False

No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'
Unused kwargs: ['load_in_8bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


RuntimeError: No GPU found. A GPU is needed for quantization.

In [None]:
from transformers import Trainer, TrainingArguments

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
)

# Fine-tune the QLoRA-enhanced model
trainer = Trainer(
    model=quantized_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

In [None]:
# Evaluate the model on the test set
results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Test Accuracy: {results['eval_accuracy']}")