In [11]:
import random
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [12]:
# Generate random positive and negative sentences
def generate_random_sentences(num_samples):
    positive_sentences = ["I love BERT!", "BERT is amazing.", "I'm a fan of BERT.", "BERT makes me happy."]
    negative_sentences = ["I dislike BERT.", "BERT is not good.", "I'm not impressed with BERT.", "BERT disappoints me."]

    random.seed(42)  # For reproducibility
    random_samples = []
    labels = []

    for _ in range(num_samples):
        sentiment = random.choice([0, 1])  # 0 for negative, 1 for positive
        if sentiment == 0:
            random_samples.append(random.choice(negative_sentences))
            labels.append(0)
        else:
            random_samples.append(random.choice(positive_sentences))
            labels.append(1)

    return random_samples, labels

# Generate the dataset
train_texts, train_labels = generate_random_sentences(80)
eval_texts, eval_labels = generate_random_sentences(20)

In [13]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
# Tokenize the input
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
eval_encodings = tokenizer(eval_texts, truncation=True, padding=True)

In [15]:
# Define a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(train_encodings, train_labels)
eval_dataset = CustomDataset(eval_encodings, eval_labels)

In [16]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=2,   # batch size for training
    per_device_eval_batch_size=2,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [18]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset            # evaluation dataset
)


In [19]:
# Train the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

print(f"Evaluation results: {eval_result}")

Step,Training Loss
10,0.6059
20,0.7405
30,0.6734
40,0.6806
50,0.5605
60,0.4071
70,0.2585
80,0.1995
90,0.1422
100,0.0956


Evaluation results: {'eval_loss': 0.029392385855317116, 'eval_runtime': 0.2392, 'eval_samples_per_second': 83.616, 'eval_steps_per_second': 41.808, 'epoch': 3.0}
