In [182]:
import transformers
import pandas as pd
import sklearn
import torch

In [183]:
torch.manual_seed(1)
device = torch.device('cpu')

In [184]:
model_name = "KB/bert-base-swedish-cased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedi

In [185]:
class QuestionDataset(torch.utils.data.Dataset):
    def __init__(self, questions, labels, tokenizer):
        self.questions = questions
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, item):
        question = str(self.questions[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=64,
            return_token_type_ids=False,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt",
        ).to(device)

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": label,
        }

In [186]:
data = pd.read_csv('dataset.csv')

# Divide the data into training, validation and test
training_data, testing_data = sklearn.model_selection.train_test_split(data, test_size=0.2, random_state = 1)
training_data, validation_data = sklearn.model_selection.train_test_split(training_data, test_size = 0.2, random_state = 1)

print("training data: ", training_data.shape)
print("validation data: ", validation_data.shape)
print("testing data: ", testing_data.shape)

training data:  (5120, 2)
validation data:  (1280, 2)
testing data:  (1600, 2)


In [187]:
# Load datasets into QuestionDataset objects
training_dataset = QuestionDataset(questions=training_data["question"].tolist(), labels=training_data["label"].tolist(), tokenizer=tokenizer)
validation_dataset = QuestionDataset(questions=validation_data["question"].tolist(), labels=validation_data["label"].tolist(), tokenizer=tokenizer)
testing_dataset = QuestionDataset(questions=testing_data["question"].tolist(), labels=testing_data["label"].tolist(), tokenizer=tokenizer)

In [192]:
def collate_fn(batch):
    # Extract the sequences and labels from the batch
    print("batch", 1)
    sequences = [item[0] for item in batch]
    labels = [item[1] for item in batch]

    # Pad the sequences to a fixed length
    max_len = max(len(seq) for seq in sequences)
    padded_sequences = [seq + [0] * (max_len - len(seq)) for seq in sequences]

    # Convert the sequences and labels to tensors
    padded_sequences = torch.tensor(padded_sequences)
    labels = torch.tensor(labels)

    return padded_sequences, labels

In [193]:
# Load the pre-trained model
model_name = "KB/bert-base-swedish-cased"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

# Set up the optimizer and the loss function
optimizer = torch.optim.Adam(model.parameters(), lr = 2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Load your dataset into a DataLoader
train_loader = torch.utils.data.DataLoader(training_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# Fine-tune the model on your dataset
num_epochs = 3
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    for batch in train_loader:
        # Show procentual progress for every batch
        print(f"\r{batch['label'].shape[0] * (epoch + 1) / len(train_loader.dataset):.2%}", end="")

        # Load the batch into the device (GPU)
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Clear the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask = attention_mask)
        logits = outputs.logits

        # Compute the loss
        loss = loss_fn(logits, labels)

        # Backward pass
        loss.backward()

        # Update the parameters
        optimizer.step()

    # Evaluate the model on the validation set
    val_loader = torch.utils.data.DataLoader(validation_dataset, batch_size = 32)
    model.eval()
    val_preds = []
    val_labels = []
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask = attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, axis=1)
        val_preds.extend(preds.tolist())
        val_labels.extend(labels.tolist())
    val_acc = sklearn.metrics.accuracy_score(val_labels, val_preds)
    print(f"Validation accuracy: {val_acc:.4f}")

Some weights of the model checkpoint at KB/bert-base-swedish-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at KB/bert-base-swedi

Epoch 1/3
batch [{'input_ids': tensor([    2,  1362,    54,   102,  4326, 10230, 18967,   302,     3,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'label': 0}, {'input_ids': tensor([    2,  1362,  2200, 11106, 25641, 22178,    68, 17547,   100,  8620,
          302,     3,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,  

KeyError: 0