In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class CustomDataset(Dataset):
    """Map-style dataset pairing per-sample encodings with per-sample labels.

    Args:
        encodings: Mapping from feature name to an indexable of per-sample
            values; ``encodings[key][idx]`` must be convertible to a tensor.
        labels: Indexable of per-sample labels (list, array or tensor). Its
            length defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label container."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per key of ``encodings`` plus a ``'labels'``
        entry holding the label of the sample.
        """
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor that does not share storage with it.

        ``torch.tensor(t)`` on an existing tensor emits a copy-construct
        warning for every sample; ``t.detach().clone()`` is the documented
        equivalent and stays silent.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

# --- Data loading and preparation -------------------------------------------
# Only the 'text' and 'label_sexist' columns of the CSV are used below.
dataset = pd.read_csv('./data/edos-labelled-annotations.csv')

# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it is not passed here.
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-large')
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

text = list(dataset['text'])

# Map the string labels to class ids. `Series.map` turns any value missing
# from the dict into NaN, which would silently produce float labels, so fail
# loudly instead.
label_ids = dataset['label_sexist'].map({'sexist': 1, 'not sexist': 0})
if label_ids.isna().any():
    unknown = sorted(dataset.loc[label_ids.isna(), 'label_sexist'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'label_sexist': {unknown}")
labels = torch.tensor(label_ids.astype('int64').values)

# Hold out 20% for testing. (The previous `train_size=0.2` trained on only
# 20% of the data and evaluated on the remaining 80%.)
train_x, test_x, train_y, test_y = train_test_split(text, labels, test_size=0.2, random_state=42)

# padding=True pads every sequence to the longest one in its split;
# truncation=True clips sequences that exceed the model's maximum length.
train_encodings = tokenizer(list(train_x), truncation=True, padding=True)
test_encodings = tokenizer(list(test_x), truncation=True, padding=True)

train_dataset = CustomDataset(train_encodings, train_y)
test_dataset = CustomDataset(test_encodings, test_y)

# Shuffle only the training data; keep evaluation order deterministic.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# --- Training ---------------------------------------------------------------
# Use a GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning a pretrained transformer needs a small learning rate; the
# previous 1e-3 is orders of magnitude above the usual 1e-5..5e-5 range and
# typically makes the model diverge or collapse to a single class.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# NOTE: not used by the loop below -- the model returns its own loss when
# `labels` is passed. Kept so that any later code referring to it still works.
criterion = torch.nn.CrossEntropyLoss()

epochs = 25
for epoch in range(epochs):
    model.train()
    total_loss = 0

    pbar = tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch + 1}/{epochs}')

    for step, batch in enumerate(pbar, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Separate name so the module-level `labels` tensor is not overwritten.
        batch_labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Running mean over the batches seen so far (dividing by the total
        # number of batches under-reports the loss until the last step).
        pbar.set_postfix({'Training Loss': total_loss / step})

    pbar.close()

# --- Evaluation -------------------------------------------------------------
# Switch to eval mode and count correct predictions over the test loader.
model.eval()
correct = 0
total = 0

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Separate name so the module-level `labels` tensor is not overwritten.
        batch_labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit for each sample.
        predicted = outputs.logits.argmax(dim=1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

# Guard against an empty test loader, which would otherwise raise
# ZeroDivisionError.
if total:
    print(f'Accuracy on test set: {(correct / total) * 100:.2f}%')
else:
    print('Accuracy on test set: n/a (no test samples)')

ModuleNotFoundError: No module named 'transformers'