In [4]:
#Task 4
"""
This program demonstrates a binary sentiment classification system using a combination of a pre-trained BERT model 
and a GRU layer. The BERT model extracts contextual embeddings from text, which are passed through a GRU layer 
to learn sequential patterns. A fully connected layer is then used to classify the sentiment as either positive or negative.

The code covers the following steps:
1. Dataset Preparation: Example text data is tokenized using the BERT tokenizer and split into training, validation, and test sets.
2. Model Design: A BERT-GRU model is implemented, where the GRU layer processes embeddings from the BERT model.
3. Training: The model is trained for 5 epochs using binary cross-entropy loss and the Adam optimizer, with the best model saved.
4. Evaluation: The model is evaluated on validation and test datasets, calculating loss and accuracy.
5. Prediction: A function is provided to predict the sentiment of new text inputs, outputting both the sentiment and confidence score.

This code serves as an introduction to using transformers with RNNs for text classification tasks.
"""

import torch  # PyTorch library for deep learning
from torch.utils.data import DataLoader, Dataset  # Classes for handling datasets and batching
import torch.nn as nn  # Neural network module
import torch.optim as optim  # Optimizers for training
from transformers import BertTokenizer, BertModel  # Pre-trained BERT tokenizer and model
from sklearn.model_selection import train_test_split  # Split data into training, validation, and test sets
import time  # To measure training time

# Set random seed for reproducibility
SEED = 1234
torch.manual_seed(SEED)

# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset preparation
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        # Store the input texts and their labels
        self.texts = texts
        self.labels = labels

    def __len__(self):
        # Return the size of the dataset
        return len(self.texts)

    def __getitem__(self, idx):
        # Retrieve a text and its label by index
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize the text and return input IDs, attention mask, and label
        tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
        return tokens['input_ids'].squeeze(0), tokens['attention_mask'].squeeze(0), torch.tensor(label, dtype=torch.float)

# Example dataset
texts = ["I love this movie!", "This is terrible.", "Fantastic work.", "Horrible experience.", "Amazing product."]  # Example texts
labels = [1, 0, 1, 0, 1]  # 1 for positive, 0 for negative

# Train-test split
# Split data into training, validation, and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.4, random_state=SEED)
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_texts, train_labels, test_size=0.5, random_state=SEED)

# Create dataset objects for each split
train_dataset = TextDataset(train_texts, train_labels)
valid_dataset = TextDataset(valid_texts, valid_labels)
test_dataset = TextDataset(test_texts, test_labels)

BATCH_SIZE = 2  # Number of samples per batch

# Create DataLoaders to batch and shuffle the data
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define the BERT-GRU model
class BERTGRUSentiment(nn.Module):
    def __init__(self, bert, hidden_dim, output_dim, dropout):
        super().__init__()
        self.bert = bert  # Pre-trained BERT model
        self.rnn = nn.GRU(bert.config.hidden_size, hidden_dim, batch_first=True, bidirectional=True)  # GRU layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Fully connected layer for binary classification
        self.dropout = nn.Dropout(dropout)  # Dropout for regularization

    def forward(self, input_ids, attention_mask):
        # Pass input through BERT and extract embeddings
        with torch.no_grad():  # Freeze BERT weights
            embedded = self.bert(input_ids, attention_mask=attention_mask).last_hidden_state
        # Pass embeddings through GRU
        _, hidden = self.rnn(embedded)
        # Concatenate the last forward and backward GRU hidden states
        hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        return self.fc(hidden)  # Pass through the fully connected layer

# Initialize the BERT-GRU model
bert = BertModel.from_pretrained('bert-base-uncased')
model = BERTGRUSentiment(bert, hidden_dim=128, output_dim=1, dropout=0.3).to(torch.device("cpu"))

# Freeze BERT parameters to avoid updating them during training
for param in model.bert.parameters():
    param.requires_grad = False

# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary cross-entropy loss
optimizer = optim.Adam(model.parameters())  # Adam optimizer

# Define helper functions
def binary_accuracy(preds, y):
    # Calculate accuracy by comparing predictions to true labels
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    return correct.sum() / len(correct)

def train(model, loader, optimizer, criterion):
    # Train the model for one epoch
    model.train()
    epoch_loss = 0
    for input_ids, attention_mask, labels in loader:
        optimizer.zero_grad()
        predictions = model(input_ids, attention_mask).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def evaluate_with_accuracy(model, loader, criterion):
    # Evaluate the model and calculate accuracy
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for input_ids, attention_mask, labels in loader:
            predictions = model(input_ids, attention_mask).squeeze(1)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            predicted = (torch.sigmoid(predictions) > 0.5).float()
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return epoch_loss / len(loader), correct / total

# Training loop
N_EPOCHS = 5
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate_with_accuracy(model, valid_loader, criterion)
    end_time = time.time()

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert_gru_model.pt')  # Save the best model

    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f} | Valid Loss: {valid_loss:.3f} | Valid Acc: {valid_acc*100:.2f}% | Time: {end_time - start_time:.2f}s")

# Load the best model and evaluate on the test set
model.load_state_dict(torch.load('bert_gru_model.pt'))
test_loss, test_acc = evaluate_with_accuracy(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%")

# Prediction function
def predict_sentiment(model, tokenizer, text):
    # Predict sentiment for a single input text
    tokens = tokenizer(text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    model.eval()
    with torch.no_grad():
        prediction = torch.sigmoid(model(input_ids, attention_mask)).item()
    return "Positive" if prediction > 0.5 else "Negative", prediction

# Example predictions
print(predict_sentiment(model, tokenizer, "I absolutely love this!"))
print(predict_sentiment(model, tokenizer, "This was a horrible experience."))

Epoch 1 | Train Loss: 0.712 | Valid Loss: 0.795 | Valid Acc: 50.00% | Time: 6.04s
Epoch 2 | Train Loss: 0.276 | Valid Loss: 0.997 | Valid Acc: 50.00% | Time: 6.73s
Epoch 3 | Train Loss: 0.129 | Valid Loss: 1.255 | Valid Acc: 50.00% | Time: 7.75s
Epoch 4 | Train Loss: 0.069 | Valid Loss: 1.517 | Valid Acc: 50.00% | Time: 7.54s
Epoch 5 | Train Loss: 0.019 | Valid Loss: 1.756 | Valid Acc: 50.00% | Time: 6.75s
Test Loss: 1.100 | Test Acc: 0.00%
('Negative', 0.35799726843833923)
('Negative', 0.32282501459121704)
