In [1]:
# ==========================
# 1. Library Installation
# ==========================
!pip install transformers datasets torch nlpaug
!pip install evaluate

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.2 

In [2]:
# ==========================
# 2. Import Required Libraries
# ==========================
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, pipeline, DataCollatorWithPadding
from datasets import load_dataset
import torch

In [3]:
# ==========================
# 3. Configure Model and Tokenizer
# ==========================
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Configure Device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# ==========================
# 4. Load and Process Dataset
# ==========================
# Load IMDB dataset
dataset = load_dataset("imdb")

# Tokenization
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Reduce dataset size for quick testing
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # Subset
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))    # Subset


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
# ==========================
# 5. Untrained Model (Pretrained)
# ==========================
# Load the untrained model
untrained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Configure pipeline for the untrained model
untrained_pipeline = pipeline("sentiment-analysis", model=untrained_model, tokenizer=tokenizer, device=0)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [6]:
# ==========================
# 6. Train the Model
# ==========================
# Load the model for fine-tuning
trained_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

# Configure training
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Switch to eval_strategy
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    report_to="none"  # Disable W&B
)

# Create a DataCollator that handles dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=trained_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator  # Replace tokenizer here
)

# Train the model
trainer.train()

# Save the trained model
trainer.save_model("./trained_model")

# Configure pipeline for the trained model
trained_pipeline = pipeline("sentiment-analysis", model="./trained_model", tokenizer=tokenizer, device=0)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3441,0.296655
2,0.1799,0.284635
3,0.1192,0.319512


Device set to use cuda:0


In [7]:
# ==========================
# 7. Compare Models
# ==========================
def compare_models(texts):
    print("\n=== Model Comparison ===\n")
    for text in texts:
        untrained_prediction = untrained_pipeline(text)[0]
        trained_prediction = trained_pipeline(text)[0]

        print(f"Text: {text}")
        print(f"Untrained Model: {untrained_prediction}")
        print(f"Trained Model: {trained_prediction}")
        print("\n")

# Examples for comparison
texts = [
    "This movie was fantastic! I loved the characters and the plot.",
    "The film was an absolute disaster. Terrible acting and no story.",
    "An average movie with decent acting but a predictable plot."
]

compare_models(texts)



=== Model Comparison ===

Text: This movie was fantastic! I loved the characters and the plot.
Untrained Model: {'label': 'LABEL_1', 'score': 0.586042582988739}
Trained Model: {'label': 'LABEL_1', 'score': 0.9852017760276794}


Text: The film was an absolute disaster. Terrible acting and no story.
Untrained Model: {'label': 'LABEL_1', 'score': 0.5936596393585205}
Trained Model: {'label': 'LABEL_0', 'score': 0.9703332781791687}


Text: An average movie with decent acting but a predictable plot.
Untrained Model: {'label': 'LABEL_1', 'score': 0.6172637939453125}
Trained Model: {'label': 'LABEL_0', 'score': 0.973656177520752}




In [8]:
# ==========================
# 8. Evaluate Accuracy on Test Set
# ==========================
from evaluate import load

# Truncate sequences during tokenization to avoid errors
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding="max_length",  # Pad to maximum size
        truncation=True,       # Truncate long sequences
        max_length=512         # Maximum length allowed by BERT
    )

# Tokenize the dataset with truncation
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Reduce dataset size for quick testing (optional)
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(2000))  # Subset
test_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))    # Subset

# Load accuracy metric
metric = load("accuracy")

# Function to evaluate a model
def evaluate_model(pipeline_model, dataset):
    predictions = []
    references = []
    for example in dataset:
        # Apply explicit truncation during evaluation
        inputs = tokenizer(
            example["text"],
            truncation=True,         # Truncate long sequences
            padding="max_length",    # Pad to maximum size
            max_length=512,          # Maximum allowed length
            return_tensors="pt"      # Return PyTorch-compatible tensors
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}  # Send to GPU/CPU

        # Get prediction
        with torch.no_grad():
            outputs = pipeline_model.model(**inputs)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).item()

        predictions.append(pred)
        references.append(example["label"])
    return metric.compute(predictions=predictions, references=references)


# Evaluate the untrained model
untrained_accuracy = evaluate_model(untrained_pipeline, test_dataset)
print("\nUntrained Model Accuracy:", untrained_accuracy["accuracy"])

# Evaluate the trained model
trained_accuracy = evaluate_model(trained_pipeline, test_dataset)
print("Trained Model Accuracy:", trained_accuracy["accuracy"])


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Untrained Model Accuracy: 0.494
Trained Model Accuracy: 0.9
