### Fine-Tune Language Models

- Joel Stremmel
- 04-24-23

##### About

Fine-Tune pretrained language models on the formatted data using K-Fold Cross-Validation and save the scores.

##### Set Parameters

In [1]:
size = "large"
params = {
    "env": {"colab": True, "require_high_ram": True},
    "data": {"add_summaries": False},
    "training": {
        "lr": 0.000005,
        "weight_decay": 0.01,
        "adam_beta1": 0.9,
        "adam_beta2": 0.999,
        "adam_epsilon": 0.00000001,
        "warmup_steps": 10,
        "logging_steps": 1,
        "num_workers": 2,
        "epochs": 200,
        "early_stopping_patience": 5,
    },
    "evaluation": {
        "evaluation_strategy": "epoch",
        "save_strategy": "epoch",
        "fp16_full_eval": False,
        "eval_accumulation_steps": 100,
    },
    "small_models": {
          'gpt2': {
            'path': 'gpt2',
            'fp16': False,
            'max_seq_len': 1024,
            'batch_size': 1,
            'accumulation_steps': 16,
            'gradient_checkpointing': True,
            'type': 'gpt'
        },
        'roberta_base': {
            'path': 'roberta-base',
            'fp16': True,
            'max_seq_len': 512,
            'batch_size': 2,
            'accumulation_steps': 8,
            'gradient_checkpointing': True,
            'type': 'mlm'
        },
        'lf_mini': {
            'path': 'kiddothe2b/longformer-mini-1024',
            'max_seq_len': 1024,
            'fp16': True,
            'batch_size': 1,
            'accumulation_steps': 16,
            'gradient_checkpointing': False,
            'type': 'mlm'
        },
        "flan_t5_small": {
            "path": "google/flan-t5-small",
            "max_seq_len": 1024,
            "output_max_seq_len": 5,
            "fp16": False,
            "batch_size": 1,
            "accumulation_steps": 16,
            "gradient_checkpointing": False,
            "type": "seq2seq",
        }
    },
    "large_models": {
        "bb_large": {
            "path": "google/bigbird-roberta-large",
            "max_seq_len": 4096,
            "fp16": True,
            "batch_size": 16,
            "accumulation_steps": 1,
            "gradient_checkpointing": True,
            "type": "mlm",
        },
        "roberta_large": {
            "path": "roberta-large",
            "fp16": True,
            "max_seq_len": 512,
            "batch_size": 16,
            "accumulation_steps": 1,
            "gradient_checkpointing": False,
            "type": "mlm",
        },
    },
    "io": {
        "results_dir": "/content/drive/MyDrive/nlp4psychotherapy/results",
        "input_dir": "/content/drive/MyDrive/nlp4psychotherapy/data",
        "model_output_dir": "/content/drive/MyDrive/nlp4psychotherapy/model_output",
    },
    "augmentation": {
        "add_synthetic": False,
        "aug_p": 0.2,
        "glove_file": "data/glove.6B.50d.txt",
        "glove_zip": "data/glove.6B.zip",
        "glove_url": "http://nlp.stanford.edu/data/glove.6B.zip",
    },
    "random": {"seed": 42},
}

#         "lf_base": {
#             "path": "allenai/longformer-base-4096",
#             "max_seq_len": 4096,
#             "fp16": True,
#             "batch_size": 4,
#             "accumulation_steps": 4,
#             "gradient_checkpointing": False,
#             "type": "mlm",
#         },
#         "bb_base": {
#             "max_seq_len": 4096,
#             "fp16": True,
#             "batch_size": 4,
#             "accumulation_steps": 4,
#             "gradient_checkpointing": False,
#             "type": "mlm",
#         },
#         "gpt_neo_1_3b": {
#             "path": "EleutherAI/gpt-neo-1.3B",
#             "max_seq_len": 2048,
#             "fp16": False,
#             "batch_size": 1,
#             "accumulation_steps": 16,
#             "gradient_checkpointing": False,
#             "type": "gpt",
#         },
#         "flan_t5_large": {
#             "path": "google/flan-t5-large",
#             "max_seq_len": 1024,
#             "output_max_seq_len": 6,
#             "fp16": False,
#             "batch_size": 1,
#             "accumulation_steps": 16,
#             "gradient_checkpointing": False,
#             "type": "seq2seq",
#         },

##### Mount Google Drive and Install Requirements if Using Colab

In [2]:
if params["env"]["colab"]:

    import os
    from google.colab import drive
    drive.mount("/content/drive")
    !pip install -q -r "/content/drive/MyDrive/nlp4psychotherapy/requirements.txt"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


##### Imports

In [3]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    GPT2ForSequenceClassification,
    GPTNeoForSequenceClassification,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
)

##### Check Colab Runtime

In [4]:
if params["env"]["colab"]:
    gpu_info = !nvidia-smi
    gpu_info = "\n".join(gpu_info)
    if gpu_info.find("failed") >= 0:
        print("Not connected to a GPU")
    else:
        print(gpu_info)

if params["env"]["require_high_ram"]:
    from psutil import virtual_memory

    ram_gb = virtual_memory().total / 1e9
    print("Your runtime has {:.1f} gigabytes of available RAM\n".format(ram_gb))

    if ram_gb < 20:
        print("Not using a high-RAM runtime")
    else:
        print("You are using a high-RAM runtime!")

Mon Apr 24 16:32:37 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    45W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [6]:
if params["data"]["add_summaries"]:
    with open(os.path.join(params["io"]["input_dir"], "Xwsum_folds.pkl"), "rb") as f:
        X_folds = pickle.load(f)

else:
    with open(os.path.join(params["io"]["input_dir"], "X_folds.pkl"), "rb") as f:
        X_folds = pickle.load(f)

    with open(os.path.join(params["io"]["input_dir"], "y_folds.pkl"), "rb") as f:
        y_folds = pickle.load(f)

##### Check Data Shape

In [7]:
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X = list(X_folds.values())
y = list(y_folds.values())

##### Check Target Prevalence

In [8]:
print(f"Target prevalance: {round(np.mean(np.concatenate(y)), 3)}.")

Target prevalance: 0.517.


##### Check that GPU is Available

In [9]:
assert torch.cuda.is_available(), "Run this script on a GPU."
print(torch.__version__)

2.0.0+cu118


##### Select and Preprocess Text and Fit Model to Each Data Fold

In [None]:
y_probs, y_trues = {}, {}
for model in params[f"{size}_models"].keys():

    y_probs[model], y_trues[model] = [], []
    for i in range(len(X)):
        
        # Print model and fold
        print(f"Fitting model: {model} using fold {i} as out of fold test data.")

        # Identify train and test folds
        X_train_temp, y_train_temp = X[0:i] + X[i + 1 :], y[0:i] + y[i + 1 :]
        X_test, y_test = X[i], y[i]

        # Select a validation fold at random
        indices_temp = np.arange(len(y_train_temp))
        val_index = np.random.choice(indices_temp)
        X_val, y_val = X_train_temp[val_index], y_train_temp[val_index]

        # Identify the training folds as the indices not including the validation index
        # Concatenate all examples in the training folds to form the full training set
        X_train = np.concatenate(np.delete(X_train_temp, val_index), axis=0)
        y_train = np.concatenate(np.delete(y_train_temp, val_index), axis=0)

        # Shuffle training data
        indices = np.arange(len(y_train))
        np.random.shuffle(indices)
        X_train, y_train = X_train[indices], y_train[indices]

        # Print data shapes
        print(f"Train data sizes: {len(X_train), len(y_train)}.")
        print(f"Val data sizes: {len(X_val), len(y_val)}.")
        print(f"Test data sizes: {len(X_test), len(y_test)}.")

        # Format text and label data as HuggingFace dataset
        if params[f"{size}_models"][model]["type"] == "seq2seq":
            train_dataset = Dataset.from_dict(
                {"text": X_train, "label_ids": [str(label) for label in y_train]}
            )
            val_dataset = Dataset.from_dict(
                {"text": X_val, "label_ids": [str(label) for label in y_val]}
            )
            test_dataset = Dataset.from_dict(
                {"text": X_test, "label_ids": [str(label) for label in y_test]}
            )

        else:
            train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
            val_dataset = Dataset.from_dict({"text": X_val, "label": y_val})
            test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            params[f"{size}_models"][model]["path"]
        )

        # Load model by model type
        if params[f"{size}_models"][model]["type"] == "mlm":
            
            # Load masked language model with a sequence classification head
            lm = AutoModelForSequenceClassification.from_pretrained(
                params[f"{size}_models"][model]["path"],
                num_labels=2,
                return_dict=True,
                problem_type="single_label_classification",
            )

        elif params[f"{size}_models"][model]["type"] == "gpt":
            
            # Use the end of sentence token as a pad token for GPT models
            tokenizer.pad_token = tokenizer.eos_token

            if model == "gpt2":
                
                # Load GPT-2
                lm = GPT2ForSequenceClassification.from_pretrained(
                    params[f"{size}_models"][model]["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            elif "gpt_neo" in model:
                
                # Load a GPT Neo version
                lm = GPTNeoForSequenceClassification.from_pretrained(
                    params[f"{size}_models"][model]["path"],
                    num_labels=2,
                    return_dict=True,
                    problem_type="single_label_classification",
                )

            else:
                raise ValueError("Expected GPT model to be gpt2 or a gpt_neo version.")

        elif params[f"{size}_models"][model]["type"] == "seq2seq":
            lm = AutoModelForSeq2SeqLM.from_pretrained(
                params[f"{size}_models"][model]["path"]
            )
        else:
            raise ValueError(
                f"Unexpected model type: {params[f'{size}_models'][model]['path']}."
            )

        # Define function to preprocess and tokenize text
        if params[f"{size}_models"][model]["type"] == "seq2seq":

            def preprocess_function(
                sample, padding="max_length", output_max_seq_len=20
            ):
                
                # Add prefix to the input for t5
                inputs = [
                    "Classify this text as either 1 or 0: " + item
                    for item in sample["text"]
                ]

                # tokenize inputs
                model_inputs = tokenizer(
                    inputs,
                    max_length=params[f"{size}_models"][model]["max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # Tokenize targets with the `text_target` keyword argument
                labels = tokenizer(
                    text_target=sample["label_ids"],
                    max_length=params[f"{size}_models"][model]["output_max_seq_len"],
                    padding=padding,
                    truncation=True,
                )

                # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
                # padding in the loss.
                if padding == "max_length":
                    labels["input_ids"] = [
                        [(l if l != tokenizer.pad_token_id else -100) for l in label]
                        for label in labels["input_ids"]
                    ]

                model_inputs["label_ids"] = labels["input_ids"]

                return model_inputs

        else:

            def preprocess_function(batch):
                return tokenizer(
                    batch["text"],
                    padding="max_length",
                    truncation=True,
                    max_length=params[f"{size}_models"][model]["max_seq_len"],
                )

        # Preprocess datasets
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=params[f"{size}_models"][model]["batch_size"],
        )
        train_dataset.set_format("pt")
        val_dataset = val_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=params[f"{size}_models"][model]["batch_size"],
        )
        val_dataset.set_format("pt")
        test_dataset = test_dataset.map(
            preprocess_function,
            batched=True,
            remove_columns=["text"],
            batch_size=params[f"{size}_models"][model]["batch_size"],
        )
        test_dataset.set_format("pt")

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=params["io"]["model_output_dir"],
            num_train_epochs=params["training"]["epochs"],
            warmup_steps=params["training"]["warmup_steps"],
            logging_steps=params["training"]["logging_steps"],
            weight_decay=params["training"]["weight_decay"],
            learning_rate=params["training"]["lr"],
            adam_beta1=params["training"]["adam_beta1"],
            adam_beta2=params["training"]["adam_beta2"],
            adam_epsilon=params["training"]["adam_epsilon"],
            dataloader_num_workers=params["training"]["num_workers"],
            seed=params["random"]["seed"],
            run_name=params[f"{size}_models"][model],
            fp16=params[f"{size}_models"][model]["fp16"],
            gradient_checkpointing=params[f"{size}_models"][model][
                "gradient_checkpointing"
            ],
            per_device_train_batch_size=params[f"{size}_models"][model]["batch_size"],
            per_device_eval_batch_size=params[f"{size}_models"][model]["batch_size"],
            gradient_accumulation_steps=params[f"{size}_models"][model][
                "accumulation_steps"
            ],
            evaluation_strategy=params["evaluation"]["evaluation_strategy"],
            save_strategy=params["evaluation"]["save_strategy"],
            fp16_full_eval=params["evaluation"]["fp16_full_eval"],
            eval_accumulation_steps=params["evaluation"]["eval_accumulation_steps"],
            save_total_limit=1,
            logging_strategy="steps",
            lr_scheduler_type="linear",
            optim="adamw_torch",
            sharded_ddp=False,
            prediction_loss_only=False,
            load_best_model_at_end=True,
            disable_tqdm=True,
            logging_dir=None,
        )
        
        # Define special training arguments
        if params[f"{size}_models"][model]["type"] == "seq2seq":
            training_args.generation_max_length = params[f"{size}_models"][model]["output_max_seq_len"]
            training_args.predict_with_generate = True
            training_args.generation_num_beams = None

        # Define early stopping callback
        early_stopping = EarlyStoppingCallback(
            early_stopping_patience=params["training"]["early_stopping_patience"]
        )

        # Define trainer
        if params[f"{size}_models"][model]["type"] == "seq2seq":
            trainer = Seq2SeqTrainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                callbacks=[early_stopping],
            )
        else:
            trainer = Trainer(
                model=lm,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                callbacks=[early_stopping],
            )

        # Train model
        trainer.train()

        # Predict on test dataset for seq2seq models
        if params[f"{size}_models"][model]["type"] == "seq2seq":
            
            # Predict on test dataset with greedy generation
            output = trainer.predict(
                test_dataset,
                do_sample=False,
                max_length=params[f"{size}_models"][model]["output_max_seq_len"],
                early_stopping=True,
            )
            preds_decoded = tokenizer.batch_decode(
                output.predictions, skip_special_tokens=True
            )
            labels = np.where(
                output.label_ids != -100, output.label_ids, tokenizer.pad_token_id
            )
            labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=False)

            # Convert preds to ints
            # We allow additional characters to be generated by check
            # that the first one is a 1 or 0
            preds = []
            for pred in preds_decoded:
                if pred[0] == "1":
                    preds.append(1)
                elif pred[0] == "0":
                    preds.append(0)
                else:
                    print(f"Got unexpected pred: {pred}.")
                    preds.append(np.random.choice([0, 1]))

            # Save scores and labels
            # The labels may contain additional characters, but the first should be
            # a 1 or 0
            y_probs[model].append(preds)
            y_trues[model].append([int(label[0]) for label in labels_decoded])

        # Predict on test set for other model types
        else:
            # Generate scores
            output = trainer.predict(test_dataset)
            labels = output.label_ids
            y_prob = torch.sigmoid(torch.tensor(output.predictions).double()).numpy()[
                :, 1
            ]

            # Save scores and labels
            y_probs[model].append(y_prob)
            y_trues[model].append(labels)

        # Empty cuda cache
        torch.cuda.empty_cache()

Fitting model: bb_large using fold 0 as out of fold test data.
Train data sizes: (34, 34).
Val data sizes: (13, 13).
Test data sizes: (13, 13).


  arr = asarray(arr)
Some weights of the model checkpoint at google/bigbird-roberta-large were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBir

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.7145, 'learning_rate': 5.000000000000001e-07, 'epoch': 0.33}
{'loss': 0.7245, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.67}
{'loss': 0.8362, 'learning_rate': 1.5e-06, 'epoch': 1.0}
{'eval_loss': 0.7054911851882935, 'eval_runtime': 1.4204, 'eval_samples_per_second': 9.153, 'eval_steps_per_second': 0.704, 'epoch': 1.0}


##### Save Model Scores on Test Folds and True Labels

In [None]:
with open(os.path.join(params["io"]["results_dir"], "lm_y_trues.pkl"), "wb") as f:
    pickle.dump(y_trues, f)

with open(os.path.join(params["io"]["results_dir"], "lm_y_probs.pkl"), "wb") as f:
    pickle.dump(y_probs, f)