### Fine-Tune Flan T5

- Joel Stremmel
- 04-20-23

##### About

Fine-Tune Flan T5 on the formatted data using K-Fold Cross-Validation and save the scores.

Reference: https://www.philschmid.de/fine-tune-flan-t5

##### Imports

In [1]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Trainer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EvalPrediction,
    EarlyStoppingCallback
)

2023-04-20 17:41:41.907735: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-20 17:41:42.143129: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-20 17:41:42.146315: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Set Parameters

In [2]:
# --- Sequence lengths ---
max_seq_len = 1024          # maximum number of input tokens per example
output_max_seq_len = 2      # maximum generation length used at prediction time

# --- Batching ---
batch_size = 1
accumulation_steps = 32
num_workers = 8

# --- Optimizer and schedule ---
lr = 2e-5
weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 1e-8
warmup_steps = 2
epochs = 10

# --- Logging and reproducibility ---
logging_steps = 1
seed = 44

# --- Feature flags ---
fp16 = False
colab = False
gradient_checkpointing = False
add_summaries = False

# --- Paths and model identifiers ---
input_dir = "./data"  # "/content/drive/MyDrive/data/"
model_output_dir = "model_output"
results_dir = "./results/"  # "/content/drive/MyDrive/results/"
model_key = "flan_t5_small"
lm_path = "google/flan-t5-small"

##### Optionally Connect to Google Drive

In [3]:
# Mount Google Drive only when the `colab` flag from the parameters cell is set.
if colab:

    # Imported here rather than at the top of the notebook so that
    # google.colab is not required when `colab` is False.
    from google.colab import drive
    drive.mount('/content/drive')

##### Disable Tokenizer Parallelism
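Disabling tokenizer parallelism mainly suppresses the warnings that the `tokenizers` library emits when the process is forked (for example by dataloader workers) after the tokenizer has already been used.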

In [4]:
# Turn off parallelism inside the tokenizers library for this process.
os.environ.update(TOKENIZERS_PARALLELISM="false")

##### Load Formatted Data

In [5]:
# Choose which feature file to read based on the `add_summaries` flag
# (presumably the "wsum" file holds the inputs with summaries -- TODO confirm).
x_filename = 'Xwsum_folds.pkl' if add_summaries else 'X_folds.pkl'

# Paths are built from `input_dir` so that changing it in the parameters cell
# (e.g. to the Google Drive location) is honoured; previously "data/" was
# hard-coded here and `input_dir` was ignored.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(os.path.join(input_dir, x_filename), 'rb') as f:
    X_folds = pickle.load(f)

with open(os.path.join(input_dir, 'y_folds.pkl'), 'rb') as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [6]:
# Both fold mappings must contain the same number of folds.
assert len(y_folds) == len(X_folds), "Expected the same number of folds in X and y."

# Turn each mapping into a list of per-fold values (in the mapping's iteration order).
X, y = ([*fold_map.values()] for fold_map in (X_folds, y_folds))

##### Check Target Prevalence

In [7]:
# Mean of the labels pooled across every fold.
print(f"Target prevalance: {np.concatenate(y).mean()}.")

Target prevalance: 0.5277777777777778.


##### Check that GPU is Available

In [8]:
# Stop immediately when PyTorch cannot see a CUDA device.
assert torch.cuda.is_available(), "Run this script on a GPU."
# Show the installed PyTorch version in the notebook output.
print(torch.__version__)

1.8.1+cu101


##### Tokenize Text and Fit Model to Each Fold

In [9]:
# Per-fold out-of-fold scores and true labels; the last cell of the notebook pickles both lists.
y_probs, y_trues = [], []

assert len(X) > 1, "K-fold cross-validation needs at least two folds."

# Dedicated, seeded generator so the per-fold training shuffle is reproducible.
# (The unseeded global NumPy RNG was used before; `seed` only reached the Trainer.)
shuffle_rng = np.random.default_rng(seed)

# The tokenizer is the same for every fold, so load it once instead of once per fold.
tokenizer = AutoTokenizer.from_pretrained(lm_path)


def preprocess_function(sample, padding="max_length", output_max_seq_len=20):
    """Tokenize a batch of examples for sequence-to-sequence fine-tuning.

    Args:
        sample: Batch dictionary with a "text" list of strings and a
            "label_ids" list of label strings.
        padding: Padding strategy forwarded to the tokenizer.
        output_max_seq_len: Maximum tokenized length of the targets.
            NOTE(review): this default (20) shadows the notebook-level
            `output_max_seq_len` (2). The `map` calls below do not pass it,
            so targets are padded to 20 tokens while generation is capped
            at 2 -- confirm this is intended.

    Returns:
        The tokenizer output for the prompts, with the tokenized targets
        stored under "label_ids". When `padding == "max_length"`, pad token
        ids in the targets are replaced by -100.
    """

    # Add prefix to the input for t5
    inputs = ["Classify this text as either 1 or 0: " + item for item in sample["text"]]

    # Tokenize inputs
    model_inputs = tokenizer(
        inputs,
        max_length=max_seq_len,
        padding=padding,
        truncation=True
    )

    # Tokenize targets with the `text_target` keyword argument
    labels = tokenizer(
        text_target=sample["label_ids"],
        max_length=output_max_seq_len,
        padding=padding,
        truncation=True
    )

    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
    # padding in the loss.
    if padding == "max_length":
        labels["input_ids"] = [
            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
        ]

    # Overwrites the string labels with their token ids. No custom collator is
    # passed to the trainer, so the library's default collator is what turns
    # "label_ids" into the model's `labels` input.
    model_inputs["label_ids"] = labels["input_ids"]

    return model_inputs


for i in range(len(X)):

    print(f"Fitting model using fold {i} as out of fold data.")

    # Identify train folds (every fold except i) and shuffle samples
    X_train = np.concatenate(X[0:i] + X[i+1:], axis=0)
    y_train = np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = shuffle_rng.permutation(len(y_train))
    X_train, y_train = X_train[indices], y_train[indices]

    # Identify test fold
    X_test, y_test = X[i], y[i]

    # Format text and label data as HuggingFace datasets; labels become strings
    # because the model is trained to generate them as text.
    train_dataset = Dataset.from_dict({"text": X_train, "label_ids": [str(label) for label in y_train]})
    test_dataset = Dataset.from_dict({"text": X_test, "label_ids": [str(label) for label in y_test]})

    # Load a fresh model so the weights are reset with each new fold
    model = AutoModelForSeq2SeqLM.from_pretrained(lm_path)

    # Tokenize train dataset
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    train_dataset.set_format("pt")

    # Tokenize test dataset
    test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    test_dataset.set_format("pt")

    # Define training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=accumulation_steps,
        warmup_steps=warmup_steps,
        logging_steps=logging_steps,
        weight_decay=weight_decay,
        learning_rate=lr,
        seed=seed,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        dataloader_num_workers=num_workers,
        fp16=fp16,
        logging_strategy="steps",
        save_strategy="no",
        lr_scheduler_type='linear',
        optim="adamw_torch",
        run_name=model_key,
        gradient_checkpointing=gradient_checkpointing,
        generation_max_length=output_max_seq_len,
        predict_with_generate=True,
        generation_num_beams=None,
        do_eval=False,
        fp16_full_eval=False,
        sharded_ddp=False,
        # No checkpoints are saved and no evaluation runs during training,
        # so there is no "best" model that could be reloaded.
        load_best_model_at_end=False,
        prediction_loss_only=False,
        disable_tqdm=True,
        logging_dir=None
    )

    # Define model training
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )

    # Train model
    trainer.train()

    # Predict on test dataset with greedy generation.
    # NOTE(review): with max_length=2 the decoder can presumably emit only one
    # token after its start token; confirm every label string fits in one token.
    output = trainer.predict(test_dataset, do_sample=False, max_length=output_max_seq_len, early_stopping=True)
    preds_decoded = tokenizer.batch_decode(output.predictions, skip_special_tokens=True)

    # Restore pad ids where -100 was inserted so the labels can be decoded
    labels = np.where(output.label_ids != -100, output.label_ids, tokenizer.pad_token_id)
    labels_decoded = tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(preds_decoded)
    print(labels_decoded)

    # Save scores and labels for this fold. Generation yields text rather than
    # a probability, so the score is a hard indicator: 1.0 when the decoded
    # output is exactly "1", otherwise 0.0 (this includes empty generations).
    y_prob = np.array([1.0 if pred.strip() == "1" else 0.0 for pred in preds_decoded])
    y_probs.append(y_prob)
    y_trues.append(np.asarray(y_test))

Fitting model using fold 0 as out of fold data.


Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Map:   0%|          | 0/17 [00:00<?, ? examples/s]

{'loss': 0.3895, 'learning_rate': 1e-05, 'epoch': 0.58}
{'loss': 0.4664, 'learning_rate': 2e-05, 'epoch': 1.16}
{'loss': 0.4501, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.75}
{'loss': 0.493, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.33}
{'loss': 0.3362, 'learning_rate': 1.25e-05, 'epoch': 2.91}
{'loss': 0.4275, 'learning_rate': 1e-05, 'epoch': 3.49}
{'loss': 0.4177, 'learning_rate': 7.500000000000001e-06, 'epoch': 4.07}
{'loss': 0.5317, 'learning_rate': 5e-06, 'epoch': 4.65}
{'loss': 0.2912, 'learning_rate': 2.5e-06, 'epoch': 5.24}
{'loss': 0.3808, 'learning_rate': 0.0, 'epoch': 5.82}
{'train_runtime': 127.7166, 'train_samples_per_second': 4.306, 'train_steps_per_second': 0.078, 'train_loss': 0.41840178370475767, 'epoch': 5.82}
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '1', '1', '0', '0', '0', '1', '0', '1', '1', '1', '0', '1', '0', '0', '0', '0']
Fitting model using fold 1 as out of fold data.


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.4277, 'learning_rate': 1e-05, 'epoch': 0.56}
{'loss': 0.5368, 'learning_rate': 2e-05, 'epoch': 1.12}
{'loss': 0.4616, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.68}
{'loss': 0.4199, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.25}
{'loss': 0.3911, 'learning_rate': 1.25e-05, 'epoch': 2.81}
{'loss': 0.4434, 'learning_rate': 1e-05, 'epoch': 3.37}
{'loss': 0.424, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.93}
{'loss': 0.3509, 'learning_rate': 5e-06, 'epoch': 4.49}
{'loss': 0.3892, 'learning_rate': 2.5e-06, 'epoch': 5.05}
{'loss': 0.3529, 'learning_rate': 0.0, 'epoch': 5.61}
{'train_runtime': 122.4304, 'train_samples_per_second': 4.656, 'train_steps_per_second': 0.082, 'train_loss': 0.41974426805973053, 'epoch': 5.61}
['', '1', '1', '1', '1', '', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '1', '0', '0', '0', '1', '1', '1', '1', '0', '1', '0', '0', '0', '1']
Fitting model using fold 2 as out of fold data.


Map:   0%|          | 0/57 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

{'loss': 0.4115, 'learning_rate': 1e-05, 'epoch': 0.56}
{'loss': 0.5513, 'learning_rate': 2e-05, 'epoch': 1.12}
{'loss': 0.484, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.68}
{'loss': 0.3656, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.25}
{'loss': 0.41, 'learning_rate': 1.25e-05, 'epoch': 2.81}
{'loss': 0.4662, 'learning_rate': 1e-05, 'epoch': 3.37}
{'loss': 0.375, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.93}
{'loss': 0.3847, 'learning_rate': 5e-06, 'epoch': 4.49}
{'loss': 0.3889, 'learning_rate': 2.5e-06, 'epoch': 5.05}
{'loss': 0.357, 'learning_rate': 0.0, 'epoch': 5.61}
{'train_runtime': 122.4186, 'train_samples_per_second': 4.656, 'train_steps_per_second': 0.082, 'train_loss': 0.4194279730319977, 'epoch': 5.61}
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '1', '0', '1', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '1']
Fitting model using fold 3 as out of fold data.


Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

{'loss': 0.4458, 'learning_rate': 1e-05, 'epoch': 0.53}
{'loss': 0.4319, 'learning_rate': 2e-05, 'epoch': 1.07}
{'loss': 0.4636, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.6}
{'loss': 0.4592, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.13}
{'loss': 0.4393, 'learning_rate': 1.25e-05, 'epoch': 2.67}
{'loss': 0.3981, 'learning_rate': 1e-05, 'epoch': 3.2}
{'loss': 0.3538, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.73}
{'loss': 0.3548, 'learning_rate': 5e-06, 'epoch': 4.27}
{'loss': 0.3974, 'learning_rate': 2.5e-06, 'epoch': 4.8}
{'loss': 0.3547, 'learning_rate': 0.0, 'epoch': 5.33}
{'train_runtime': 122.4298, 'train_samples_per_second': 4.901, 'train_steps_per_second': 0.082, 'train_loss': 0.40986343324184416, 'epoch': 5.33}
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1']
['1', '1', '0', '1', '1', '0', '0', '1', '0', '1', '0', '1']
Fitting model using fold 4 as out of fold data.


Map:   0%|          | 0/59 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 0.4861, 'learning_rate': 1e-05, 'epoch': 0.54}
{'loss': 0.5247, 'learning_rate': 2e-05, 'epoch': 1.08}
{'loss': 0.4139, 'learning_rate': 1.7500000000000002e-05, 'epoch': 1.63}
{'loss': 0.4133, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.17}
{'loss': 0.4468, 'learning_rate': 1.25e-05, 'epoch': 2.71}
{'loss': 0.4705, 'learning_rate': 1e-05, 'epoch': 3.25}
{'loss': 0.355, 'learning_rate': 7.500000000000001e-06, 'epoch': 3.8}
{'loss': 0.4556, 'learning_rate': 5e-06, 'epoch': 4.34}
{'loss': 0.4138, 'learning_rate': 2.5e-06, 'epoch': 4.88}
{'loss': 0.3356, 'learning_rate': 0.0, 'epoch': 5.42}
{'train_runtime': 122.389, 'train_samples_per_second': 4.821, 'train_steps_per_second': 0.082, 'train_loss': 0.43153181970119475, 'epoch': 5.42}
['1', '1', '1', '1', '1', '1', '', '1', '1', '1', '1', '1', '1']
['1', '0', '1', '1', '0', '0', '1', '1', '0', '1', '1', '0', '0']


##### Save Model Probabilities on Test Folds and True Labels

In [10]:
# Create the results directory if it is missing so the writes below do not
# fail with FileNotFoundError on a fresh checkout.
os.makedirs(results_dir, exist_ok=True)

# Write the per-fold true labels and scores to
# <results_dir>/<model_key>_y_trues.pkl and <results_dir>/<model_key>_y_probs.pkl.
for suffix, values in (("y_trues", y_trues), ("y_probs", y_probs)):
    with open(os.path.join(results_dir, f'{model_key}_{suffix}.pkl'), 'wb') as f:
        pickle.dump(values, f)