In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset

# Load the Jigsaw dataset
dataset = load_dataset('csv', data_files={'train': 'data/archive/train.csv', 'validation': '/path/to/validation.csv'})

# Load the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize the text data and convert the tokens into input features
def tokenize_and_encode(sentences):
    input_ids = []
    attention_masks = []
    for sentence in sentences:
        encoded_dict = tokenizer.encode_plus(
                            sentence,
                            add_special_tokens = True,
                            max_length = 64,
                            padding = 'max_length',
                            truncation=True,
                            return_attention_mask = True,
                            return_tensors = 'pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)

# Define the training and validation data loaders
train_loader = DataLoader(dataset['train'], batch_size=32, shuffle=True)
val_loader = DataLoader(dataset['validation'], batch_size=32, shuffle=False)

# Define the optimizer and the loss function
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-8)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model
for epoch in range(3):
    model.train()
    for i, batch in enumerate(train_loader):
        input_ids, attention_masks = tokenize_and_encode(batch['comment_text'])
        labels = batch['toxicity']
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluate the model on the validation set
    model.eval()
    total_correct = 0
    total_samples = 0
    for batch in val_loader:
        input_ids, attention_masks = tokenize_and_encode(batch['comment_text'])
        labels = batch['toxicity']
        outputs = model(input_ids, attention_mask=attention_masks)
        _, predicted = torch.max(outputs.logits, 1)
        total_correct += (predicted == labels).sum().item()
        total_samples += len(labels)

    accuracy = total_correct / total_samples
    print(f"Epoch {epoch+1}, Validation Accuracy: {accuracy:.4f}")

# Test the model on a held-out test set
test_dataset = load_dataset("jigsaw_toxic_comment_classification", split="test")
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)




  from .autonotebook import tqdm as notebook_tqdm


FileNotFoundError: Couldn't find a dataset script at /home/kd/Documents/USC/CSCI544/Project/Code/CSCI544_NLP_Project/finetuning/jigsaw_toxic_comment_classification/jigsaw_toxic_comment_classification.py or any data file in the same directory. Couldn't find 'jigsaw_toxic_comment_classification' on the Hugging Face Hub either: FileNotFoundError: Dataset 'jigsaw_toxic_comment_classification' doesn't exist on the Hub. If the repo is private or gated, make sure to log in with `huggingface-cli login`.

In [None]:
model.eval()
total_correct = 0
total_samples = 0
for batch in test_loader:
input_ids, attention_masks = tokenize_and_encode(batch['comment_text'])
labels = batch['toxicity']
outputs = model(input_ids, attention_mask=attention_masks)
_, predicted = torch.max(outputs.logits, 1)
total_correct += (predicted == labels).sum().item()
total_samples += len(labels)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy:.4f}")

