<a href="https://colab.research.google.com/github/lee1201zxc/BERT_IMDB/blob/main/IMDB_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install libraries

In [None]:
import os

In [None]:
!python3 -m pip install pandas
!python3 -m pip install transformers



### Import libraries

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import random
import re, copy
from torch.cuda.amp import autocast, GradScaler



### Specify your GPU number if necessary

In [None]:
%env CUDA_VISIBLE_DEVICES = 0

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env: CUDA_VISIBLE_DEVICES=0


In [None]:
!pip -q install nltk
import nltk
# Fetch the 'wordnet' and 'omw-1.4' NLTK data packages.
# NOTE(review): no visible cell uses nltk or WordNet — confirm this download is still needed.
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
def basic_clean(text: str) -> str:
    """Normalise a raw review string.

    HTML line-break tags (``<br>``, ``<br/>``, ``<br />``; lowercase only) are
    replaced by a space, every run of whitespace is collapsed to a single
    space, and leading/trailing whitespace is removed.
    """
    without_breaks = re.sub(r"<br\s*/?>", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_breaks)
    return single_spaced.strip()

def random_deletion(words, p=0.08, rng=None):
    """Randomly drop words from a token list.

    Args:
        words: list of word tokens.
        p: a word is kept only when a fresh ``rng.random()`` draw is greater than ``p``.
        rng: object providing ``random()`` and ``choice()``; the global
            ``random`` module is used when this is falsy.

    Returns:
        ``words`` itself when it has three or fewer items; otherwise the list
        of kept words, or a single randomly chosen word if every word was dropped.
    """
    rng = rng if rng else random
    if len(words) <= 3:
        # Too short to delete from safely.
        return words

    survivors = []
    for token in words:
        if rng.random() > p:
            survivors.append(token)

    if not survivors:
        # Never return an empty sentence.
        return [rng.choice(words)]
    return survivors

def augment_words(words, del_p=0.08, rng=None):
    """Apply random word deletion to sentences of at least eight words.

    Shorter inputs are returned unchanged (the same list object); longer ones
    are passed to ``random_deletion`` with deletion probability ``del_p``.
    """
    is_too_short = len(words) < 8
    return words if is_too_short else random_deletion(words, p=del_p, rng=rng)

## Preparing dataset

Link: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

1. Download the dataset from the link above.
2. Move the downloaded zip file into the "data" directory and unzip it there.
3. Run the following cell, which reads `./data/IMDB Dataset Train.csv`.

In [None]:
def load_imdb_data(data_file_path):
    """Read the IMDB CSV and return ``(texts, labels)``.

    The ``review`` column is returned as a list of strings. The ``sentiment``
    column is mapped to integer labels: 1 for the exact value ``"positive"``,
    0 for anything else.

    Raises:
        FileNotFoundError: if ``data_file_path`` does not exist.
    """
    if not os.path.exists(data_file_path):
        raise FileNotFoundError(f"The file '{data_file_path}' does not exist.")

    frame = pd.read_csv(data_file_path)
    reviews = frame['review'].tolist()
    targets = [int(sentiment == "positive") for sentiment in frame['sentiment'].tolist()]
    return reviews, targets

# Location of the training CSV, relative to the notebook's working directory.
data_file_path = './data/IMDB Dataset Train.csv'
# Raises FileNotFoundError if the CSV is not at the path above.
texts, labels = load_imdb_data(data_file_path)

## Dataset class

In [None]:
class EpochDoubleAugDataset(Dataset):
    """Dataset serving the cleaned originals followed by word-deletion augmented copies.

    Indices ``0 .. n-1`` return the cleaned original texts; indices
    ``n .. n*(1+aug_multiplier)-1`` return the augmented texts produced by the
    most recent ``regenerate_aug`` call (or un-augmented placeholders if it has
    not been called yet). Tokenisation happens lazily in ``__getitem__``.
    """

    def __init__(self, texts, labels, tokenizer, max_seq_length, del_p=0.08, aug_multiplier=1):
        """
        Args:
            texts: raw review strings; each is passed through ``basic_clean``.
            labels: integer class labels aligned with ``texts``.
            tokenizer: callable invoked as ``tokenizer(text, max_length=...,
                padding="max_length", truncation=True, return_tensors="pt")``.
            max_seq_length: pad/truncate length handed to the tokenizer.
            del_p: word-deletion probability used when augmenting.
            aug_multiplier: number of augmented copies per original.
                aug_multiplier=1 -> 2x total (n originals + n augmented)
                aug_multiplier=2 -> 3x total (n originals + 2n augmented)
        """
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.del_p = del_p
        self.aug_multiplier = aug_multiplier

        self.base_texts = [basic_clean(t) for t in texts]
        self.base_words = [t.split() for t in self.base_texts]
        self.n = len(self.base_texts)

        # Placeholder until regenerate_aug() runs: un-augmented copies, sized to
        # n * aug_multiplier so that every index reported by __len__ is valid.
        self.aug_texts = self.base_texts * self.aug_multiplier

    def set_del_p(self, del_p: float):
        """Set the deletion probability used by the next ``regenerate_aug`` call."""
        self.del_p = del_p

    def regenerate_aug(self, epoch_seed: int):
        """Rebuild all augmented texts using a private RNG seeded with ``epoch_seed``."""
        rng = random.Random(epoch_seed)
        aug_all = []
        # Copies are appended pass by pass, so augmented item j derives from
        # original j % n (relied on by __getitem__ for the label lookup).
        for _ in range(self.aug_multiplier):
            for words in self.base_words:
                new_words = augment_words(words, del_p=self.del_p, rng=rng)
                aug_all.append(" ".join(new_words))
        self.aug_texts = aug_all

    def __len__(self):
        return self.n * (1 + self.aug_multiplier)

    def __getitem__(self, idx):
        """Return a dict with 1-D ``input_ids``, ``attention_mask`` and a scalar long ``label``."""
        if idx < self.n:
            text = self.base_texts[idx]
            label = self.labels[idx]
        else:
            j = idx - self.n
            text = self.aug_texts[j]
            label = self.labels[j % self.n]

        encoding = self.tokenizer(
            text,
            max_length=self.max_seq_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # flatten() drops the leading batch dimension added by return_tensors="pt".
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long)
        }


## Classifier head for BERT (design your model's prediction head)

In [None]:
class CustomBERTClassifier(nn.Module):
    """BERT encoder with a learned mix of its last layers and a two-layer MLP head.

    The last ``last_k`` hidden-state tensors are combined with softmax-normalised
    learned weights. The first-token vector and the attention-masked mean of the
    mixed states are concatenated, layer-normalised, passed through dropout and
    classified.
    """

    def __init__(self, bert_model_name, num_classes, dropout_p=0.2, last_k=4):
        super().__init__()
        # All hidden states are requested because forward() mixes the last `last_k` of them.
        self.bert = BertModel.from_pretrained(
            bert_model_name, output_hidden_states=True, return_dict=True
        )
        self.last_k = last_k
        width = self.bert.config.hidden_size

        # One logit per mixed layer; all zeros gives uniform weights after softmax.
        self.layer_weights = nn.Parameter(torch.zeros(last_k))

        pooled_width = width * 2  # first-token vector concatenated with the masked mean
        self.norm = nn.LayerNorm(pooled_width)
        self.dropout = nn.Dropout(p=dropout_p)

        self.classifier = nn.Sequential(
            nn.Linear(pooled_width, width),
            nn.GELU(),
            nn.Dropout(p=dropout_p),
            nn.Linear(width, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits, one row per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Weighted sum over the last `last_k` hidden-state tensors.
        top_layers = torch.stack(encoded.hidden_states[-self.last_k:], dim=0)
        mix = torch.softmax(self.layer_weights, dim=0).view(self.last_k, 1, 1, 1)
        token_states = (mix * top_layers).sum(dim=0)

        first_token = token_states[:, 0]

        # Mean over positions where attention_mask is non-zero; the clamp avoids
        # a division by zero for an all-zero mask row.
        keep = attention_mask.unsqueeze(-1).float()
        token_counts = keep.sum(dim=1).clamp(min=1.0)
        masked_mean = (token_states * keep).sum(dim=1) / token_counts

        features = self.norm(torch.cat([first_token, masked_mean], dim=1))
        return self.classifier(self.dropout(features))


In [None]:
def build_optimizer_with_llrd(model, base_lr=1.5e-5, weight_decay=0.01, layer_decay=0.95):
    """Build an AdamW optimizer with layer-wise learning-rate decay (LLRD).

    Learning rates:
        * encoder layer ``i`` (of ``n``): ``base_lr * layer_decay ** (n - 1 - i)``
        * embeddings: ``base_lr * layer_decay ** n``
        * everything else (e.g. the classification head): ``base_lr``

    Weight decay is disabled for biases and for every LayerNorm parameter.
    LayerNorm parameters are detected by module type as well as by name, so a
    LayerNorm stored under another attribute name (for example ``norm``) is
    excluded too.

    Args:
        model: module exposing ``model.bert.config.num_hidden_layers`` and
            parameters named ``bert.encoder.layer.<i>.*`` / ``bert.embeddings.*``.
        base_lr: learning rate of the top encoder layer and of the head.
        weight_decay: decay applied to all parameters not excluded above.
        layer_decay: multiplicative LR factor per layer going down the stack.

    Returns:
        ``AdamW`` with one parameter group per trainable parameter.
    """
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

    # Fully qualified names of parameters owned directly by an nn.LayerNorm module.
    layernorm_params = {
        f"{module_name}.{param_name}" if module_name else param_name
        for module_name, module in model.named_modules()
        if isinstance(module, nn.LayerNorm)
        for param_name, _ in module.named_parameters(recurse=False)
    }

    n_layers = model.bert.config.num_hidden_layers
    lr_map = {}
    for i in range(n_layers):
        # The trailing dot keeps "layer.1." from also matching "layer.10.", "layer.11.", ...
        lr_map[f"bert.encoder.layer.{i}."] = base_lr * (layer_decay ** (n_layers - 1 - i))
    lr_map["bert.embeddings."] = base_lr * (layer_decay ** n_layers)

    head_lr = base_lr

    param_groups = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue

        lr = head_lr
        for prefix, layer_lr in lr_map.items():
            if name.startswith(prefix):
                lr = layer_lr
                break

        skip_decay = name in layernorm_params or any(nd in name for nd in no_decay)
        wd = 0.0 if skip_decay else weight_decay

        param_groups.append({"params": [param], "lr": lr, "weight_decay": wd})

    return AdamW(param_groups)


In [None]:
class EMA:
    """Exponential moving average of a model's trainable parameters.

    ``shadow`` holds the averaged tensors keyed by parameter name. Call
    ``apply_to`` to swap the averaged weights into the model (for example for
    evaluation) and ``restore`` to put the raw weights back afterwards.
    """

    def __init__(self, model, decay=0.999):
        """Start the average from the model's current trainable parameters."""
        self.decay = decay
        self.shadow = {}
        # Raw weights saved by apply_to(); None whenever the model holds its own weights.
        self.backup = None
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = p.data.clone()

    @torch.no_grad()
    def update(self, model):
        """Blend the current weights in: shadow = decay * shadow + (1 - decay) * param."""
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n].mul_(self.decay).add_(p.data, alpha=1.0 - self.decay)

    def apply_to(self, model):
        """Copy the averaged weights into ``model``, backing up the raw weights first.

        Raises:
            RuntimeError: if the averaged weights are already applied; a second
                backup would overwrite the raw weights with averaged ones.
        """
        if self.backup is not None:
            raise RuntimeError("EMA weights are already applied; call restore() first.")
        backup = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                backup[n] = p.data.clone()
                p.data.copy_(self.shadow[n])
        self.backup = backup

    def restore(self, model):
        """Put back the raw weights saved by the preceding ``apply_to`` call.

        Raises:
            RuntimeError: if ``apply_to`` has not been called since the last restore.
        """
        if self.backup is None:
            raise RuntimeError("No EMA backup to restore; call apply_to() first.")
        for n, p in model.named_parameters():
            if p.requires_grad:
                p.data.copy_(self.backup[n])
        self.backup = None


## Training and evaluation methods

In [None]:
# R-Drop settings.
# NOTE(review): the visible train_model does not read USE_RDROP or RDROP_ALPHA — confirm intent.
USE_RDROP = False
RDROP_ALPHA = 1
EMA_DECAY = 0.999

def rdrop_kl(logits1, logits2):
    """Symmetric KL divergence between the class distributions of two logit tensors.

    Softmax is taken over the last dimension. The result is the average of
    the two directed KL terms, each reduced with ``batchmean``.

    Args:
        logits1, logits2: logit tensors of identical shape.

    Returns:
        Scalar tensor; zero when both inputs induce the same distribution.
    """
    # Imported locally: the notebook's import cell does not bring `F` into scope.
    import torch.nn.functional as F

    log_p1 = F.log_softmax(logits1, dim=-1)
    log_p2 = F.log_softmax(logits2, dim=-1)
    prob1 = F.softmax(logits1, dim=-1)
    prob2 = F.softmax(logits2, dim=-1)
    # F.kl_div(input, target) expects log-probabilities as input and probabilities as target.
    kl_1_2 = F.kl_div(log_p1, prob2, reduction="batchmean")
    kl_2_1 = F.kl_div(log_p2, prob1, reduction="batchmean")
    return 0.5 * (kl_1_2 + kl_2_1)

def train_model(model, data_loader, optimizer, scheduler, device, ema=None, label_smoothing=0.02):
    """Train ``model`` for one pass over ``data_loader``.

    Mixed precision (autocast + GradScaler) is enabled only when ``device`` is
    a CUDA device. Gradients are unscaled before being clipped to a global
    norm of 1.0.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``, returning logits.
        data_loader: iterable of dicts with ``input_ids``, ``attention_mask`` and ``label`` tensors.
        optimizer: optimizer over the model's parameters.
        scheduler: LR scheduler, stepped once per successful optimizer step.
        device: ``torch.device`` the batches are moved to.
        ema: optional object with ``update(model)``, called after each successful step.
        label_smoothing: smoothing factor passed to the cross-entropy loss.
    """
    model.train()
    criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)
    use_amp = device.type == "cuda"
    scaler = GradScaler(enabled=use_amp)

    for batch in tqdm(data_loader, desc="Train"):
        optimizer.zero_grad(set_to_none=True)
        input_ids = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        labels = batch["label"].to(device, non_blocking=True)

        with autocast(enabled=use_amp):
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them first
        # so the clipping threshold applies to the true gradient norm.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() < scale_before:
            # The scaler lowers its scale only after finding inf/NaN gradients, in
            # which case optimizer.step() was skipped: do not advance the schedule
            # or the EMA for a step that never happened.
            continue

        scheduler.step()

        if ema is not None:
            ema.update(model)

def evaluate_model(model, data_loader, device, ema=None):
    """Evaluate ``model`` on ``data_loader`` and return ``(accuracy, report)``.

    When ``ema`` is given, its averaged weights are swapped into the model for
    the duration of the evaluation and the raw weights are restored afterwards,
    even if evaluation raises.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``, returning logits.
        data_loader: iterable of dicts with ``input_ids``, ``attention_mask`` and ``label`` tensors.
        device: ``torch.device`` the batches are moved to.
        ema: optional object with ``apply_to(model)`` and ``restore(model)``.

    Returns:
        Tuple of ``accuracy_score(gold, preds)`` and ``classification_report(gold, preds)``.
    """
    model.eval()
    if ema is not None:
        ema.apply_to(model)

    preds, gold = [], []
    try:
        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Validation"):
                input_ids = batch["input_ids"].to(device, non_blocking=True)
                attention_mask = batch["attention_mask"].to(device, non_blocking=True)
                labels = batch["label"].to(device, non_blocking=True)

                logits = model(input_ids=input_ids, attention_mask=attention_mask)
                p = logits.argmax(dim=1)
                preds.extend(p.cpu().tolist())
                gold.extend(labels.cpu().tolist())
    finally:
        # Always hand the raw training weights back, otherwise a failed evaluation
        # would leave the averaged weights in the model being trained.
        if ema is not None:
            ema.restore(model)

    return accuracy_score(gold, preds), classification_report(gold, preds)


## Hyper-parameter settings

In [None]:

# Pretrained checkpoint name and number of target classes.
bert_model_name = 'bert-base-uncased'
num_classes = 2

max_seq_length = 512  # pad/truncate length handed to the tokenizer
batch_size = 8
num_epochs = 4
learning_rate = 2e-5  # passed as base_lr to build_optimizer_with_llrd

# NOTE(review): aug_prob and aug_mode are not referenced by any visible cell — confirm they are still needed.
aug_prob = 0.25
aug_mode = "rd"
EMA_DECAY = 0.999  # decay passed to EMA; the same value is also assigned in the train/eval cell

# Word-deletion probability: del_p_start for the first two epochs, del_p_late afterwards.
del_p_start = 0.08
del_p_late  = 0.05


## Data utilities (split, tokenizer, datasets, loaders)

In [None]:

# Hold out 20% of the labelled reviews for validation; the fixed random_state keeps the split stable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training set: every original review plus two augmented copies of it.
train_dataset = EpochDoubleAugDataset(
    train_texts,
    train_labels,
    tokenizer,
    max_seq_length,
    del_p=del_p_start,
    aug_multiplier=2,
)

# Validation set: originals only (aug_multiplier=0 adds no augmented copies).
val_dataset = EpochDoubleAugDataset(
    val_texts,
    val_labels,
    tokenizer,
    max_seq_length,
    del_p=del_p_start,
    aug_multiplier=0,
)

# Only the training loader shuffles; both load in the main process without pinned memory.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Define model, optimizer and scheduler

In [None]:
# The train/validation split and the augmentation are already seeded (42), but the
# torch RNG was not, so head initialisation, dropout and DataLoader shuffling changed
# from run to run. Seed it before anything draws from it.
# NOTE(review): GPU kernels may still introduce small run-to-run differences.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

model = CustomBERTClassifier(bert_model_name, num_classes, dropout_p=0.2, last_k=4).to(device)
optimizer = build_optimizer_with_llrd(model, base_lr=learning_rate, weight_decay=0.005, layer_decay=0.95)

# Linear warm-up over the first 6% of all optimizer steps, then linear decay to zero.
total_steps = len(train_loader) * num_epochs
warmup_steps = int(0.06 * total_steps)
scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

## Train model and save best model

In [None]:
ema = EMA(model, decay=EMA_DECAY)

best_acc = 0.0
model_path = "./finetuned_bert.pth"
final_epoch_lr_scale = 0.3  # extra LR reduction applied for the last epoch

for epoch in range(num_epochs):
    # Stronger word deletion for the first two epochs, milder afterwards.
    train_dataset.set_del_p(del_p_start if epoch < 2 else del_p_late)

    # Fresh augmented copies every epoch, reproducible through the epoch-dependent seed.
    train_dataset.regenerate_aug(epoch_seed=42 + epoch)
    print(f"\nEpoch {epoch+1}/{num_epochs} | del_p={train_dataset.del_p} | train_len={len(train_dataset)}")

    if epoch == num_epochs - 1:
        # The scheduler recomputes each group's LR from scheduler.base_lrs on every
        # step(), so scaling only param_groups would be undone after the first batch.
        # Scale the scheduler's base LRs too so the reduction lasts the whole epoch.
        scheduler.base_lrs = [lr * final_epoch_lr_scale for lr in scheduler.base_lrs]
        for g in optimizer.param_groups:
            g["lr"] *= final_epoch_lr_scale
        print("Lowered LR for final epoch.")

    train_model(model, train_loader, optimizer, scheduler, device, ema=ema, label_smoothing=0.02)
    acc, report = evaluate_model(model, val_loader, device, ema=ema)

    print(f"Validation Accuracy: {acc:.4f}")
    print(report)

    if acc > best_acc:
        best_acc = acc
        # `acc` was measured with the EMA weights, so checkpoint those same weights;
        # the raw weights are swapped back in afterwards so training continues unchanged.
        ema.apply_to(model)
        try:
            torch.save(model.state_dict(), model_path)
        finally:
            ema.restore(model)
        print("Saved Trained Model (best).")

  scaler = GradScaler(enabled=(device.type == "cuda"))



Epoch 1/4 | del_p=0.08 | train_len=96000


  with autocast(enabled=(device.type == "cuda")):
Train: 100%|█████████████████████████████████████████████████████████████████████| 12000/12000 [35:58<00:00,  5.56it/s]
Validation: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [01:54<00:00,  8.72it/s]


Validation Accuracy: 0.9436
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      3911
           1       0.95      0.94      0.94      4089

    accuracy                           0.94      8000
   macro avg       0.94      0.94      0.94      8000
weighted avg       0.94      0.94      0.94      8000

Saved Trained Model (best).


  scaler = GradScaler(enabled=(device.type == "cuda"))



Epoch 2/4 | del_p=0.08 | train_len=96000


  with autocast(enabled=(device.type == "cuda")):
Train: 100%|█████████████████████████████████████████████████████████████████████| 12000/12000 [35:45<00:00,  5.59it/s]
Validation: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [01:55<00:00,  8.66it/s]


Validation Accuracy: 0.9461
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3911
           1       0.95      0.95      0.95      4089

    accuracy                           0.95      8000
   macro avg       0.95      0.95      0.95      8000
weighted avg       0.95      0.95      0.95      8000

Saved Trained Model (best).


  scaler = GradScaler(enabled=(device.type == "cuda"))



Epoch 3/4 | del_p=0.05 | train_len=96000


  with autocast(enabled=(device.type == "cuda")):
Train: 100%|█████████████████████████████████████████████████████████████████████| 12000/12000 [35:52<00:00,  5.57it/s]
Validation: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [01:54<00:00,  8.70it/s]


Validation Accuracy: 0.9463
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      3911
           1       0.95      0.94      0.95      4089

    accuracy                           0.95      8000
   macro avg       0.95      0.95      0.95      8000
weighted avg       0.95      0.95      0.95      8000

Saved Trained Model (best).


  scaler = GradScaler(enabled=(device.type == "cuda"))



Epoch 4/4 | del_p=0.05 | train_len=96000
Lowered LR for final epoch.


  with autocast(enabled=(device.type == "cuda")):
Train: 100%|█████████████████████████████████████████████████████████████████████| 12000/12000 [36:27<00:00,  5.49it/s]
Validation: 100%|██████████████████████████████████████████████████████████████████| 1000/1000 [01:56<00:00,  8.55it/s]

Validation Accuracy: 0.9454
              precision    recall  f1-score   support

           0       0.94      0.95      0.94      3911
           1       0.95      0.94      0.95      4089

    accuracy                           0.95      8000
   macro avg       0.95      0.95      0.95      8000
weighted avg       0.95      0.95      0.95      8000




