### Fine-Tune GPT Neo

- Joel Stremmel
- 04-19-23

##### About

Fine-Tune GPT Neo on the formatted data using K-Fold Cross-Validation and save the scores.

##### Install Libraries

In [1]:
# !pip install -q pdfminer.six
# !pip install -q pandas
# !pip install -q transformers
# !pip install -q openpyxl
# !pip install -q datasets

##### Imports

In [2]:
import os
import re
import glob
import pickle
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    GPT2ForSequenceClassification,
    GPTNeoForSequenceClassification,
    Trainer,
    TrainingArguments
)

2023-04-21 10:39:36.926081: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-21 10:39:36.974844: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-21 10:39:36.975370: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Set Parameters

In [3]:
max_seq_len = 1024 # 2048
batch_size = 1
accumulation_steps = 32
lr = 2e-5
weight_decay = 0.01
adam_beta1 = 0.9
adam_beta2 = 0.999
adam_epsilon = 0.00000001
warmup_steps = 4
logging_steps = 1
num_workers = 2
seed = 44
epochs = 5
fp16 = False
colab = False
gradient_checkpointing = True
input_dir = './data' # "/content/drive/MyDrive/data/"
model_output_dir = "model_output"
results_dir = './results/' #"/content/drive/MyDrive/results/"
model_key = "gpt2" # "gpt_neo_125m"
lm_path = "gpt2" # "EleutherAI/gpt-neo-125m" # "EleutherAI/gpt-neo-1.3B"

##### Optionally Connect to Google Drive

In [4]:
if colab:

    from google.colab import drive
    drive.mount('/content/drive')

##### Disable Tokenizer Parallelism
This is mostly to avoid warnings.

In [5]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Load Formatted Data

In [6]:
with open(os.path.join(input_dir, 'X_folds.pkl'), 'rb') as f:
    X_folds = pickle.load(f)

with open(os.path.join(input_dir, 'y_folds.pkl'), 'rb') as f:
    y_folds = pickle.load(f)

##### Check Data Shape

In [7]:
assert len(X_folds) == len(y_folds), "Expected the same number of folds in X and y."
X = list(X_folds.values())
y = list(y_folds.values())

##### Check Target Prevalence

In [8]:
print(f"Target prevalance: {np.mean(np.concatenate(y))}.")

Target prevalance: 0.5166666666666667.


##### Check that GPU is Available

In [9]:
assert torch.cuda.is_available(), "Run this script on a GPU."
print(torch.__version__)

1.8.1+cu101


##### Tokenize Text and Fit Model to Each Fold

In [10]:
y_probs, y_trues = [], []
for i in range(len(X)):
    
    print(f"Fitting model using fold {i} as out of fold data.")
    
    # Identify train folds and shuffle samples
    X_train, y_train = np.concatenate(X[0:i] + X[i+1:], axis=0), np.concatenate(y[0:i] + y[i+1:], axis=0)
    indices = np.arange(len(y_train))
    np.random.shuffle(indices)
    X_train, y_train = X_train[indices], y_train[indices]
    
    # Identify test folds
    X_test, y_test = X[i], y[i]
    
    # Format text and label data as HuggingFace dataset
    train_dataset = Dataset.from_dict({"text": X_train, "label": y_train})
    test_dataset = Dataset.from_dict({"text": X_test, "label": y_test})
    
    # Load model and tokenizer
    # This will reset the model weights with each new iteration
    tokenizer = AutoTokenizer.from_pretrained(lm_path)
    tokenizer.pad_token = tokenizer.eos_token
    # GPTNeoForSequenceClassification
    model = GPT2ForSequenceClassification.from_pretrained(
        lm_path,
        num_labels=2,
        return_dict=True,
        problem_type="single_label_classification"
    )
    
    # Define function to tokenize text
    def tokenize_function(batch):
        
        return tokenizer(
            batch["text"],
            padding="max_length",
            truncation=True,
            max_length=max_seq_len
        )
    
    # Tokenize train dataset
    train_dataset = train_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    train_dataset.set_format("pt")
    
    # Tokenize test dataset
    test_dataset = test_dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=["text"],
        batch_size=batch_size
    )
    test_dataset.set_format("pt")
    
    # Define training arguments
    training_args= TrainingArguments(
        output_dir=model_output_dir,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=accumulation_steps,
        warmup_steps=warmup_steps,
        logging_steps=logging_steps,
        weight_decay=weight_decay,
        learning_rate=lr,
        seed=seed,
        adam_beta1=adam_beta1,
        adam_beta2=adam_beta2,
        adam_epsilon=adam_epsilon,
        dataloader_num_workers=num_workers,
        fp16=fp16,
        run_name=model_key,
        gradient_checkpointing=gradient_checkpointing,
        logging_strategy="steps",
        save_strategy="no",
        lr_scheduler_type='linear',
        optim="adamw_torch",
        do_eval=False,
        fp16_full_eval=False,
        sharded_ddp=False,
        load_best_model_at_end=True,
        prediction_loss_only=False,
        disable_tqdm=True,
        logging_dir=None,
    )
    
    # Define model training
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )
    
    # Train model
    trainer.train()
    
    # Predict on test dataset
    output = trainer.predict(test_dataset)
    labels = output.label_ids
    y_prob = torch.sigmoid(torch.tensor(output.predictions).double()).numpy()[:, 1]

    # Save scores and labels
    y_probs.append(y_prob)
    y_trues.append(labels)

Fitting model using fold 0 as out of fold data.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 2.0993, 'learning_rate': 5e-06, 'epoch': 0.68}
{'loss': 2.9252, 'learning_rate': 1e-05, 'epoch': 1.36}
{'loss': 2.1166, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.04}
{'loss': 2.4474, 'learning_rate': 2e-05, 'epoch': 2.72}
{'loss': 1.4642, 'learning_rate': 0.0, 'epoch': 3.4}
{'train_runtime': 231.9369, 'train_samples_per_second': 1.013, 'train_steps_per_second': 0.022, 'train_loss': 2.2105465650558473, 'epoch': 3.4}
Fitting model using fold 1 as out of fold data.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 7.4091, 'learning_rate': 5e-06, 'epoch': 0.68}
{'loss': 6.8118, 'learning_rate': 1e-05, 'epoch': 1.36}
{'loss': 6.6715, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.04}
{'loss': 6.8829, 'learning_rate': 2e-05, 'epoch': 2.72}
{'loss': 5.4715, 'learning_rate': 0.0, 'epoch': 3.4}
{'train_runtime': 232.0578, 'train_samples_per_second': 1.013, 'train_steps_per_second': 0.022, 'train_loss': 6.6493833541870115, 'epoch': 3.4}
Fitting model using fold 2 as out of fold data.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/48 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

{'loss': 7.7568, 'learning_rate': 5e-06, 'epoch': 0.67}
{'loss': 7.3869, 'learning_rate': 1e-05, 'epoch': 1.33}
{'loss': 8.0632, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.0}
{'loss': 6.7216, 'learning_rate': 2e-05, 'epoch': 2.67}
{'loss': 6.7424, 'learning_rate': 0.0, 'epoch': 3.33}
{'train_runtime': 232.0231, 'train_samples_per_second': 1.034, 'train_steps_per_second': 0.022, 'train_loss': 7.33416748046875, 'epoch': 3.33}
Fitting model using fold 3 as out of fold data.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/51 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

{'loss': 6.7751, 'learning_rate': 5e-06, 'epoch': 0.63}
{'loss': 9.5796, 'learning_rate': 1e-05, 'epoch': 1.25}
{'loss': 9.0351, 'learning_rate': 1.5000000000000002e-05, 'epoch': 1.88}
{'loss': 7.0917, 'learning_rate': 2e-05, 'epoch': 2.51}
{'loss': 7.0614, 'learning_rate': 0.0, 'epoch': 3.14}
{'train_runtime': 232.5776, 'train_samples_per_second': 1.096, 'train_steps_per_second': 0.021, 'train_loss': 7.908562278747558, 'epoch': 3.14}
Fitting model using fold 4 as out of fold data.


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/47 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

{'loss': 8.2867, 'learning_rate': 5e-06, 'epoch': 0.68}
{'loss': 7.3387, 'learning_rate': 1e-05, 'epoch': 1.36}
{'loss': 7.581, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.04}
{'loss': 5.4619, 'learning_rate': 2e-05, 'epoch': 2.72}
{'loss': 7.5689, 'learning_rate': 0.0, 'epoch': 3.4}
{'train_runtime': 234.0161, 'train_samples_per_second': 1.004, 'train_steps_per_second': 0.021, 'train_loss': 7.247429847717285, 'epoch': 3.4}


##### Save Model Probabilities on Test Folds and True Labels

In [11]:
with open(os.path.join(results_dir, f'{model_key}_y_trues.pkl'), 'wb') as f:
    pickle.dump(y_trues, f)

with open(os.path.join(results_dir, f'{model_key}_y_probs.pkl'), 'wb') as f:
    pickle.dump(y_probs, f)