In [None]:
# ==========================
# 1. Library Installation
# ==========================
!pip install transformers datasets torch nlpaug
!pip install evaluate

In [None]:
# ==========================
# 2. Import Required Libraries
# ==========================
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, pipeline, DataCollatorWithPadding
from datasets import load_dataset
import torch

In [None]:
# ==========================
# 3. Configure Model and Tokenizer
# ==========================
# Select the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Checkpoint identifier; the tokenizer below and the models in later cells are
# all loaded from it.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# ==========================
# 4. Load and Process Dataset
# ==========================
# Fetch the "imdb" dataset through the `datasets` library.
dataset = load_dataset(path="imdb")

# Tokenization
def tokenize_function(examples):
    """Tokenize the ``'text'`` field of a batch of examples.

    Inputs longer than 512 tokens are truncated. Padding is deliberately NOT
    applied here: padding every example to 512 tokens up front makes each
    batch as expensive as the longest possible input. Padding is left to the
    ``DataCollatorWithPadding`` passed to the ``Trainer``, which pads each
    batch only to the length of its own longest sequence.

    Args:
        examples: Mapping with a ``'text'`` entry; ``Dataset.map`` with
            ``batched=True`` calls this with a batch of rows.

    Returns:
        The tokenizer's output for the given texts.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

# Run the tokenizer over the dataset in batches.
# NOTE(review): this tokenizes everything in `dataset` before the subsets below
# are taken; subsetting first would be cheaper if only the subsets are needed.
tokenized_datasets = dataset.map(tokenize_function, batched=True)


def _shuffled_head(split_name, n_rows):
    """Return the first `n_rows` rows of a tokenized split after a seed-42 shuffle."""
    shuffled_split = tokenized_datasets[split_name].shuffle(seed=42)
    return shuffled_split.select(range(n_rows))


# Small fixed-size subsets so a full run stays quick.
train_dataset = _shuffled_head("train", 2000)
test_dataset = _shuffled_head("test", 500)

In [None]:
# ==========================
# 5. Baseline Model (Pretrained, Not Fine-Tuned)
# ==========================
# Baseline: load the checkpoint with a 2-label classification head and do not
# fine-tune it, so it can be compared against the trained model.
untrained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Pass the `device` selected during setup instead of a hard-coded GPU index
# (device=0), which fails on machines without CUDA.
untrained_pipeline = pipeline("sentiment-analysis", model=untrained_model, tokenizer=tokenizer, device=device)

In [None]:
# ==========================
# 6. Train the Model
# ==========================
# Batch collator built from the shared tokenizer; pads the examples of a batch together.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fresh 2-label classifier loaded from the same checkpoint, placed on the chosen device.
trained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
trained_model = trained_model.to(device)

training_args = TrainingArguments(
    # Output and logging locations
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Evaluate and checkpoint once per epoch; the two strategies match, as
    # load_best_model_at_end requires.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation hyper-parameters
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Wire model, arguments, data subsets and collator into the Trainer.
trainer = Trainer(
    model=trained_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
# Fine-tune the model, then write the result to ./trained_model.
trainer.train()
trainer.save_model("./trained_model")

# Load the saved model into an inference pipeline. Pass the `device` selected
# during setup instead of a hard-coded GPU index (device=0), which fails on
# machines without CUDA.
trained_pipeline = pipeline("sentiment-analysis", model="./trained_model", tokenizer=tokenizer, device=device)