### Setup and imports

In [None]:
# Attach Google Drive to the Colab VM; the checkpoint directory used later in
# this notebook lives under /content/drive.
from google.colab import drive

drive.mount(
    '/content/drive',
    force_remount=True,  # re-mount even when a mount is already present
)

Mounted at /content/drive


In [None]:
%%capture
!pip install transformers[torch]
!pip install transformers datasets

In [None]:
from pathlib import Path
from datasets import load_dataset, load_metric
import numpy as np
import pickle
import scipy as sp
import torch
import torch.nn as nn
from transformers import BertForSequenceClassification, BertTokenizer, EarlyStoppingCallback, Trainer, TrainingArguments

### Load dataset

In [None]:
# Fetch the IMDB review dataset; every split it provides is kept in `raw_datasets`.
raw_datasets = load_dataset(path="imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

## BERT fine-tuning with the `Trainer` class (based on the Hugging Face tutorial)

### Prepare datasets

In [None]:
def tokenize_function(examples):
  """Tokenize a batch of reviews after removing HTML line-break markers.

  Args:
    examples: mapping whose "text" entry is an iterable of review strings
      (the batch layout passed in when this is used with `map(..., batched=True)`).

  Returns:
    Whatever the module-level `tokenizer` returns for the cleaned texts when
    called with max-length padding, truncation and PyTorch tensors.
  """
  # Replace each "<br />" with a space rather than deleting it: deleting the
  # marker glues the words on either side together ("one<br />two" -> "onetwo").
  cleaned_texts = [review.replace("<br />", " ") for review in examples["text"]]
  return tokenizer(cleaned_texts, padding="max_length", return_tensors="pt", truncation=True)

In [None]:
# Tokenizer for the same "bert-base-cased" checkpoint that is fine-tuned below.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Pretrained BERT encoder with a two-label sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-cased",
    num_labels=2,
)

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Run `tokenize_function` over every split, one batch of examples at a time.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Pick the two labelled splits: "train" for fitting and "test" for evaluation.
# NOTE(review): the "test" split is later passed to the Trainer as its
# eval_dataset, so checkpoint selection is driven by test data -- confirm this
# is intended or carve a validation set out of "train".
full_train_dataset, full_eval_dataset = (
    tokenized_datasets[split_name] for split_name in ("train", "test")
)

### Train models

In [None]:
def compute_accuracy(eval_pred):
    """Compute classification accuracy for the Trainer's `compute_metrics` hook.

    The accuracy is computed directly with NumPy instead of going through the
    deprecated `datasets.load_metric('accuracy')` helper; the returned mapping
    has the same single "accuracy" entry.

    Args:
        eval_pred: a pair that unpacks into (predictions, labels), where
            predictions holds one row of per-class scores per example and
            labels holds the reference class index for each example.

    Returns:
        dict: {"accuracy": fraction of examples whose highest-scoring class
        equals the label}, as a Python float.
    """
    scores, labels = eval_pred
    # The predicted class is the index of the largest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    is_correct = predicted_classes == np.asarray(labels)
    return {"accuracy": float(np.mean(is_correct))}

  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
# Keyword arguments that are unpacked into TrainingArguments further down.
training_params = dict(
    # optimisation
    learning_rate=2e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=5e-06,
    max_steps=3500,
    # evaluate every 500 steps, write a checkpoint every 1000 steps
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    load_best_model_at_end=True,
)

# Patience handed to EarlyStoppingCallback when the Trainer is built.
# NOTE(review): per the transformers documentation, patience is counted in
# evaluation calls rather than training steps; with max_steps=3500 and
# eval_steps=500 there are only 7 evaluations, so a value of 1000 looks like it
# can never trigger -- confirm whether a step count was intended.
early_stopping_patience = 1000

In [None]:
# Directory on the mounted Drive that serves as the Trainer's output location.
model_path = Path(
    "/content/drive/MyDrive/models_new/bert_imdb/full_trainer_20230912_custom_training_params_3"
)

In [None]:
# Turn the hyper-parameter dict into TrainingArguments, writing into `model_path`.
training_args = TrainingArguments(output_dir=model_path, **training_params)

# Build the Trainer on the full train/eval splits, reporting accuracy at every
# evaluation and registering the early-stopping callback.
trainer_full = Trainer(
    args=training_args,
    model=model,
    eval_dataset=full_eval_dataset,
    train_dataset=full_train_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)],
    compute_metrics=compute_accuracy,
)

# Kept as the last expression so the resulting TrainOutput is displayed.
trainer_full.train()

Step,Training Loss,Validation Loss,Accuracy
500,0.3168,0.211346,0.9184
1000,0.2406,0.239908,0.92112
1500,0.2223,0.203125,0.93164
2000,0.1494,0.235149,0.93328
2500,0.1344,0.256136,0.92692
3000,0.1316,0.238516,0.93608
3500,0.0805,0.26464,0.93768


TrainOutput(global_step=3500, training_loss=0.18221623338971818, metrics={'train_runtime': 10481.7858, 'train_samples_per_second': 5.343, 'train_steps_per_second': 0.334, 'total_flos': 1.473000932327424e+16, 'train_loss': 0.18221623338971818, 'epoch': 2.24})

In [None]:
# Write the model currently held by the trainer into the "_final" subdirectory
# of the run's output directory.
trainer_full.model.save_pretrained(
    save_directory=model_path / "_final",
)