In [None]:
# Use a dataset like IMDb for text classification (e.g., sentiment analysis) and preprocess it for BERT input.

from datasets import load_dataset

# Fetch IMDb and keep handles on its 'train' and 'test' splits.
dataset = load_dataset("imdb")
train_data, test_data = dataset['train'], dataset['test']

# Show the first training example together with its label.
first_example = train_data[0]
print("Sample Text:", first_example['text'])
print("Label:", first_example['label'])

In [None]:
# Tokenize text data into input IDs and attention masks using a BERT tokenizer.

from transformers import BertTokenizer

# Build the tokenizer from the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode the first training review: pad/truncate to 512 positions and
# ask for PyTorch tensors ('pt') in the result.
sample_text = train_data[0]['text']
encode_options = dict(
    padding='max_length',
    truncation=True,
    max_length=512,
    return_tensors='pt',
)
encoded = tokenizer(sample_text, **encode_options)

# Print the two entries of the encoding that are used later on.
print("Input IDs:", encoded['input_ids'])
print("Attention Mask:", encoded['attention_mask'])

In [None]:
# Wrap each split in a Dataset that tokenizes examples on access, and build PyTorch dataloaders for training and evaluation.

import torch
from torch.utils.data import DataLoader, Dataset

# Define a custom dataset class
class IMDbDataset(Dataset):
    """Map-style dataset that tokenizes one example per lookup.

    Args:
        data: Indexable, sized collection whose items are mappings with a
            'text' entry and a 'label' entry.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=max_len, return_tensors='pt')``; its result must
            provide 'input_ids' and 'attention_mask' tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of items in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the item stored at ``index``.

        Returns:
            dict with three tensors: 'input_ids' and 'attention_mask'
            (tokenizer outputs after ``squeeze(0)``) and 'label'
            (the item's label as a ``torch.long`` tensor).
        """
        # Look the row up once: indexing the backing collection separately
        # for each field would fetch the same record twice per sample.
        row = self.data[index]
        encoded = self.tokenizer(
            row['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )
        return {
            # squeeze(0) drops the size-1 leading dimension added by
            # return_tensors='pt' for a single input text.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# One shared sequence length, then an IMDbDataset wrapper per split.
max_len = 128
train_dataset, test_dataset = (
    IMDbDataset(split, tokenizer, max_len) for split in (train_data, test_data)
)

# Batches of 16 for both loaders; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16)

In [None]:
# Load a pre-trained BERT model and fine-tune it for text classification.

from transformers import BertForSequenceClassification

# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load "bert-base-uncased" configured with num_labels=2 and place it on that device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model = model.to(device)

In [None]:
# Fine-tune the BERT classifier with the AdamW optimizer, minimising the loss
# that the model itself returns when it is called with `labels`.

from torch.nn import CrossEntropyLoss
# AdamW comes from PyTorch. A preceding `from transformers import AdamW` was
# dropped: it was immediately rebound by this import, and it raises ImportError
# on transformers releases that no longer export AdamW.
from torch.optim import AdamW

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE: the loop below back-propagates `outputs.loss` computed by the model;
# `loss_fn` is not called here and is only kept for code that may refer to it.
loss_fn = CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0  # sum of per-batch losses for this epoch
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Forward pass: passing `labels` makes the model populate `outputs.loss`
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization (clear stale gradients first)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss of the epoch
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader):.4f}")

In [None]:
# Evaluate the model’s accuracy on the test dataset.

from sklearn.metrics import accuracy_score

# Put the model in evaluation mode and collect predictions next to the
# reference labels for every batch of the test loader.
model.eval()
predictions = []
true_labels = []

with torch.no_grad():  # gradients are not needed for scoring
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are only compared on the host after the loop, so they are
        # not transferred to `device` (avoids a copy there and back per batch).
        labels = batch['label']

        # Make predictions: index of the largest logit along dim 1
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)

        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy:.4f}")