### Fine-Tune Big Bird Large

- Joel Stremmel
- 04-12-23

##### About

Fine-Tune Big Bird Large on the formatted data using K-Fold Cross-Validation and save the scores.

##### Install Libraries

In [1]:
!pip install -q pdfminer.six
!pip install -q pandas
!pip install -q transformers
!pip install -q openpyxl
!pip install -q datasets
!pip install -q sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.3 MB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

##### Imports

In [2]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

##### Connect to Google Drive

In [3]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read and written through the local file system.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### Set Parameters

In [4]:
# --- Tokenization and batching ---
max_seq_len = 4096        # token length every document is padded/truncated to
batch_size = 2            # per-device train batch size (also the tokenization map batch size)
accumulation_steps = 16   # gradient accumulation steps per optimizer update

# --- Optimizer and schedule ---
lr = 2e-5
weight_decay = 1e-2
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
warmup_steps = 5

# --- Trainer bookkeeping ---
logging_steps = 1
num_workers = 2           # dataloader worker processes
seed = 44
epochs = 10
fp16 = True               # train with mixed precision

# --- Runtime checks ---
colab = True              # print nvidia-smi output in the runtime check
require_high_ram = False  # report system RAM in the runtime check

# --- Paths and model identifiers ---
input_dir = "/content/drive/MyDrive/data/"
model_output_dir = "model_output"
results_dir = "/content/drive/MyDrive/results/"
model_key = "bbl"         # run name and prefix of the saved result files
lm_path = "google/bigbird-roberta-large"

##### Disable Tokenizer Parallelism
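Setting `TOKENIZERS_PARALLELISM` to `false` is mostly to avoid the warnings the tokenizers library emits when the data loader starts worker processes.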

In [5]:
# Switch off the tokenizers library's own parallelism for this process.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

##### Check Runtime

In [6]:
if colab:
  
    gpu_info = !nvidia-smi
    gpu_info = '\n'.join(gpu_info)
    if gpu_info.find('failed') >= 0:
      print('Not connected to a GPU')
    else:
      print(gpu_info)

if require_high_ram:

    from psutil import virtual_memory
    ram_gb = virtual_memory().total / 1e9
    print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

    if ram_gb < 20:
      print('Not using a high-RAM runtime')
    else:
      print('You are using a high-RAM runtime!')

Thu Apr 13 19:58:48 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

##### Load Formatted Data

In [7]:
def _read_pickle(file_name):
    """Return the object unpickled from `file_name` inside `input_dir`."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(os.path.join(input_dir, file_name), 'rb') as handle:
        return pickle.load(handle)


# Per-fold inputs and labels written by the data formatting step.
X_folds = _read_pickle('X_folds.pkl')
y_folds = _read_pickle('y_folds.pkl')

##### Check Data Shape

In [8]:
# X_folds and y_folds must describe the same number of folds.
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."

# Drop the fold keys and keep the per-fold data as parallel lists, so that
# X[i] and y[i] hold the samples and labels of fold i.
X = list(X_folds.values())
y = list(y_folds.values())

# Fail fast if any fold pairs a different number of samples and labels;
# otherwise the mismatch would only surface inside the training loop.
assert all(len(X_fold) == len(y_fold) for X_fold, y_fold in zip(X, y)), \
    "Expected the same number of samples and labels in every fold."

##### Check Target Prevalence

In [9]:
# Mean of the labels pooled across all folds.
target_prevalence = np.mean(np.concatenate(y))
print(f"Target prevalance: {target_prevalence}.")

Target prevalance: 0.5.


##### Check that GPU is Available

In [10]:
# Stop here unless PyTorch can see a CUDA device, then report the PyTorch version.
gpu_ready = torch.cuda.is_available()
assert gpu_ready, "Run this script on a GPU."
print(torch.__version__)

2.0.0+cu118


##### Tokenize Text and Fit Model to Each Fold

In [None]:
# Load the tokenizer once: it is the same for every fold, so there is no need
# to reload it inside the cross-validation loop.
tokenizer = AutoTokenizer.from_pretrained(lm_path)


def tokenize_function(batch):
    """Tokenize a batch of texts, padding or truncating each to max_seq_len tokens."""
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_len
    )


# Out-of-fold class-1 probabilities and true labels, one array per fold.
y_probs, y_trues = [], []
for i in range(len(X)):

    print(f"Fitting model using fold {i} as out of fold data.")

    # Seed NumPy and PyTorch before anything random happens in this fold so
    # that the shuffle below and the initialization of the new classification
    # head are reproducible across runs.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Identify train folds (every fold except fold i) and shuffle samples
    X_train = np.concatenate(X[0:i] + X[i+1:], axis=0)
    y_train = np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]

    # Identify test fold
    X_test, y_test = X[i], y[i]

    # Format text and label data as HuggingFace datasets
    train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
    test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

    # Load a fresh model from the pretrained checkpoint.
    # This resets the model weights with each new iteration.
    model = AutoModelForSequenceClassification.from_pretrained(
        lm_path,
        num_labels=2,
        return_dict=True,
        problem_type="single_label_classification"
    )

    # Tokenize train dataset
    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    train_dataset.set_format("pt")

    # Tokenize test dataset
    test_dataset = test_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    test_dataset.set_format("pt")

    # Define training arguments.
    # `sharded_ddp` is intentionally not passed: False is its default, and
    # passing it explicitly is reported to fail on recent transformers releases
    # (TODO confirm against the installed version).
    # NOTE(review): load_best_model_at_end=True is kept from the original, but
    # with save_strategy="no" and no evaluation there appears to be no
    # checkpoint to reload -- confirm whether it can simply be set to False.
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=accumulation_steps,
        warmup_steps=warmup_steps,
        logging_steps=logging_steps,
        weight_decay=weight_decay,
        learning_rate=lr,
        seed=seed,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        dataloader_num_workers=num_workers,
        fp16=fp16,
        run_name=model_key,
        logging_strategy="steps",
        save_strategy="no",
        lr_scheduler_type='linear',
        optim="adamw_torch",
        do_eval=False,
        fp16_full_eval=False,
        gradient_checkpointing=True,
        load_best_model_at_end=True,
        prediction_loss_only=False,
        disable_tqdm=True,
        logging_dir=None,
    )

    # Define model training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )

    # Train model
    trainer.train()

    # Predict on the held-out fold
    output = trainer.predict(test_dataset)
    labels = output.label_ids

    # The model emits one logit per class for a single-label problem, so the
    # probability of class 1 is the softmax over the two logits. An
    # element-wise sigmoid of the class-1 logit would ignore the class-0 logit.
    logits = torch.tensor(output.predictions).double()
    y_prob = torch.softmax(logits, dim=-1).numpy()[:, 1]

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(labels)

Fitting model using fold 0 as out of fold data.


Downloading pytorch_model.bin:   0%|          | 0.00/1.44G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassific

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

{'loss': 0.7, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.59}
{'loss': 0.7019, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.19}
{'loss': 0.7074, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.78}
{'loss': 0.6825, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.37}
{'loss': 0.7032, 'learning_rate': 1.2e-05, 'epoch': 2.96}
{'loss': 0.6927, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.56}
{'loss': 0.714, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.15}
{'loss': 0.6617, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.74}
{'loss': 0.6678, 'learning_rate': 2e-05, 'epoch': 5.33}
{'loss': 0.6809, 'learning_rate': 1.6000000000000003e-05, 'epoch': 5.93}
{'train_runtime': 312.7493, 'train_samples_per_second': 1.727, 'train_steps_per_second': 0.032, 'train_loss': 0.6912033081054687, 'epoch': 5.93}
Fitting model using fold 1 as out of fold data.


Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassific

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.6953, 'learning_rate': 0.0, 'epoch': 0.55}
{'loss': 0.6889, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.1}
{'loss': 0.6841, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.66}
{'loss': 0.703, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.21}
{'loss': 0.684, 'learning_rate': 1.2e-05, 'epoch': 2.76}
{'loss': 0.6784, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.31}
{'loss': 0.6921, 'learning_rate': 2e-05, 'epoch': 3.86}
{'loss': 0.6436, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.41}
{'loss': 0.7148, 'learning_rate': 1.2e-05, 'epoch': 4.97}
{'loss': 0.6738, 'learning_rate': 8.000000000000001e-06, 'epoch': 5.52}
{'train_runtime': 305.4558, 'train_samples_per_second': 1.866, 'train_steps_per_second': 0.033, 'train_loss': 0.6857963562011719, 'epoch': 5.52}
Fitting model using fold 2 as out of fold data.


Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassific

Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.7088, 'learning_rate': 0.0, 'epoch': 0.55}
{'loss': 0.6867, 'learning_rate': 0.0, 'epoch': 1.1}
{'loss': 0.6926, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.66}
{'loss': 0.7065, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.21}
{'loss': 0.6941, 'learning_rate': 8.000000000000001e-06, 'epoch': 2.76}
{'loss': 0.6954, 'learning_rate': 1.2e-05, 'epoch': 3.31}
{'loss': 0.6794, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.86}
{'loss': 0.7065, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.41}
{'loss': 0.6678, 'learning_rate': 2e-05, 'epoch': 4.97}
{'loss': 0.6921, 'learning_rate': 1.6000000000000003e-05, 'epoch': 5.52}
{'train_runtime': 305.5573, 'train_samples_per_second': 1.865, 'train_steps_per_second': 0.033, 'train_loss': 0.6930000305175781, 'epoch': 5.52}
Fitting model using fold 3 as out of fold data.


Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassific

Map:   0%|          | 0/61 [00:00<?, ? examples/s]

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

{'loss': 0.6964, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.52}
{'loss': 0.7184, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.03}
{'loss': 0.6842, 'learning_rate': 1.2e-05, 'epoch': 1.55}
{'loss': 0.7066, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.06}
{'loss': 0.711, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.58}
{'loss': 0.6773, 'learning_rate': 2e-05, 'epoch': 3.1}
{'loss': 0.6911, 'learning_rate': 2e-05, 'epoch': 3.61}
{'loss': 0.6731, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.13}
{'loss': 0.6837, 'learning_rate': 1.2e-05, 'epoch': 4.65}
{'loss': 0.6989, 'learning_rate': 1.2e-05, 'epoch': 5.16}
{'train_runtime': 306.2327, 'train_samples_per_second': 1.992, 'train_steps_per_second': 0.033, 'train_loss': 0.6940544128417969, 'epoch': 5.16}
Fitting model using fold 4 as out of fold data.


Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassific

Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.694, 'learning_rate': 0.0, 'epoch': 0.53}
{'loss': 0.6836, 'learning_rate': 4.000000000000001e-06, 'epoch': 1.07}
{'loss': 0.694, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.6}
{'loss': 0.7166, 'learning_rate': 1.2e-05, 'epoch': 2.13}
{'loss': 0.6867, 'learning_rate': 1.2e-05, 'epoch': 2.67}
{'loss': 0.6927, 'learning_rate': 1.6000000000000003e-05, 'epoch': 3.2}
{'loss': 0.6827, 'learning_rate': 2e-05, 'epoch': 3.73}
{'loss': 0.6886, 'learning_rate': 2e-05, 'epoch': 4.27}
{'loss': 0.6728, 'learning_rate': 1.6000000000000003e-05, 'epoch': 4.8}


##### Save Model Probabilities on Test Folds and True Labels

In [None]:
# Make sure the results directory exists before writing, so that a missing
# folder does not discard the scores after all folds have been trained.
os.makedirs(results_dir, exist_ok=True)

# True labels of each held-out fold.
with open(os.path.join(results_dir, f'{model_key}_y_trues.pkl'), 'wb') as f:
    pickle.dump(y_trues, f)

# Predicted class-1 probabilities for each held-out fold.
with open(os.path.join(results_dir, f'{model_key}_y_probs.pkl'), 'wb') as f:
    pickle.dump(y_probs, f)