In [1]:
from datasets import load_dataset
# Load the "imdb" dataset; the recorded cell output shows train, test and
# unsupervised splits being generated.
imdb_dataset = load_dataset(path="imdb")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 488061.63 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 672474.48 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 820719.61 examples/s]


In [2]:
from transformers import AutoTokenizer
# Tokenizer for the "bert-base-uncased" checkpoint; used by preprocess_function
# and handed to the Trainer further down.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased"
)

In [3]:
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of raw strings when
            called through ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever ``tokenizer`` returns for that text, with truncation enabled
        and every sequence padded to a fixed length of 512.
    """
    raw_text = examples["text"]
    # Fixed-length encoding: truncate anything longer, pad anything shorter.
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(raw_text, **encoding_options)

# Apply preprocess_function to the loaded dataset in batches; the tokenizer
# output is added alongside the existing columns.
tokenized_dataset = imdb_dataset.map(
    function=preprocess_function,
    batched=True,
)

Map: 100%|██████████| 25000/25000 [00:04<00:00, 5902.08 examples/s]
Map: 100%|██████████| 25000/25000 [00:04<00:00, 6159.35 examples/s]
Map: 100%|██████████| 50000/50000 [00:08<00:00, 5947.56 examples/s]


In [None]:
from transformers import AutoModelForSequenceClassification
# Load "bert-base-uncased" for sequence classification with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=2,
)

In [None]:
import torch.nn as nn
from transformers import BertModel

class RegisterClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        n_classes: Number of output classes (width of the final linear layer).
        pretrained_model_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Dropout probability applied to the pooled encoder output.
            Defaults to ``0.3``.
    """

    def __init__(self, n_classes, pretrained_model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        # Load the pretrained BERT encoder
        self.bert = BertModel.from_pretrained(pretrained_model_name)

        # Dropout layer for regularization
        self.drop = nn.Dropout(p=dropout)

        # Fully-connected layer mapping the encoder's hidden size to n_classes
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Run the encoder and return the output of the classification layer.

        Args:
            input_ids: Token ids, forwarded unchanged to the BERT encoder.
            attention_mask: Attention mask, forwarded unchanged to the encoder.
            token_type_ids: Optional segment ids for sentence-pair inputs,
                forwarded unchanged to the encoder. ``None`` (the default)
                leaves the choice to the encoder.

        Returns:
            The raw output of the final linear layer (no softmax is applied
            here), with ``n_classes`` values per example.
        """
        # Pass inputs through BERT
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Use the encoder's pooled output (its summary vector for the whole
        # sequence) as the classification feature
        pooled_output = outputs.pooler_output

        # Apply dropout and the final classification layer
        return self.out(self.drop(pooled_output))

In [None]:
from transformers import TrainingArguments, Trainer

def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: The ``(logits, labels)`` pair the Trainer passes to
            ``compute_metrics`` after each evaluation run.

    Returns:
        ``{"accuracy": <fraction of examples whose arg-max logit equals the label>}``.
    """
    import numpy as np  # local import keeps this cell self-contained

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # Evaluate on the eval set at the end of every epoch.
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # NOTE(review): newer transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` - confirm against the installed version.
    tokenizer=tokenizer,
    # Without this, per-epoch evaluation reports only the loss.
    compute_metrics=compute_metrics,
)

trainer.train()