# Transformers

In [1]:
# warnings
from warnings import simplefilter
simplefilter("ignore", category=FutureWarning)
simplefilter("ignore", category=DeprecationWarning)

# import libraries
seed=42
import os
os.environ['PYTHONHASHSEED'] = str(seed)
import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import time
import pandas as pd

# preprocess
from datasets import Dataset, DatasetDict, concatenate_datasets

# train
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# prediction
import torch

# custom built functions
from logs.get_logs import setup_logger
from dataPrep.get_data_fold import data_read
from models.TFs.Transformers_model import BatchTokenize, BatchTokenizeCombine, Transformers_train, Transformers_predict
from utils.utils import set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
# functions go here

def getModelType(model_select):
    
    if model_select=="RoBERTa": # batch size 10
        model_type="roberta-large" # https://huggingface.co/models?sort=downloads&search=roberta
    elif model_select=="Longformer": # batch size 1
        model_type="allenai/longformer-large-4096" # https://huggingface.co/models?sort=downloads&search=longformer-large
    elif model_select=="OpenAIGPT2": # batch size 6
        model_type="gpt2" # https://huggingface.co/models?sort=downloads&search=gpt2
    
    return model_type

In [4]:
## inputs
ite = 1

# Choose model
model_select = "OpenAIGPT2" # Options: RoBERTa, Longformer, OpenAIGPT2
model_type = getModelType(model_select)

# Choose
model_tokenize=0
TokenizeCombine=0
model_train=0
model_predict=0

# logger
task = "_Tokenize_Train_Test_"+str(ite) # Train Test
taskName = model_select + task
root_dir = '/home/ravi/raviProject/DataModelsResults/'
model_folder = root_dir + "/Results/" + model_select + "_" + str(ite) + "/"
log_dir_fname = model_folder + taskName +".log"
print("log_dir_fname: {}".format(log_dir_fname))
logger = setup_logger(log_dir_fname=log_dir_fname)

logger.info("=========================================================")  
logger.info("==================== New execution ======================")
logger.info("=========================================================")
execution_st = time.time()



log_dir_fname: /home/ravi/raviProject/DataModelsResults//Results/OpenAIGPT2_1/OpenAIGPT2_Tokenize_Train_Test_1.log


In [5]:
# elif model_tokenize: 
# inputs
logger.info("Get inputs data")
# Load data. Get K-Fold data. Save 5 fold indices (80% train, 20% test)
all_train_data = pd.read_json("/home/ravi/raviProject/DataModelsResults/Data/V1_Labeled_300_sampled.json", orient='records')
all_train_data = all_train_data.drop(columns=['label'])
all_train_data['FinalLabel'] = all_train_data['FinalLabel'].astype('int64')
logger.info("all_train_data.shape {}".format(all_train_data.shape))

# format
all_train_data = all_train_data.rename(columns={"FinalLabel": "label"})

# Sample the DataFrame with replacement
# n=2000
val_data = all_train_data.sample(frac=0.2, random_state=42, replace=False)
# Drop the sampled rows from the DataFrame
train_data = all_train_data.drop(val_data.index)

train_data = train_data[['text','label']]
val_data = val_data[['text','label']]

train_data.reset_index(drop=True, inplace=True)
val_data.reset_index(drop=True)

# Tokenize
BatchTokenize(logger, model_tokenize, model_type, model_select, model_folder, train_data, val_data)

# end
logger.info("Execution time {} seconds".format(time.time()-execution_st))

2024-04-09 12:58:24,810 | 1269898119.py: 3: <cell line: 3>() | INFO: Get inputs data
2024-04-09 12:58:24,826 | 1269898119.py: 8: <cell line: 8>() | INFO: all_train_data.shape (300, 12)
2024-04-09 12:58:24,830 | 1269898119.py: 29: <cell line: 29>() | INFO: Execution time 0.030343294143676758 seconds


In [6]:
# BatchTokenizeCombine
BatchTokenizeCombine(logger, model_folder)



0
1
2
3
4
5
6
7
8
9


2024-04-09 12:58:45,035 | Transformers_model.py: 221: BatchTokenizeCombine() | INFO: tokenized_AllTrainData is 
 DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 240
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 60
    })
})
2024-04-09 12:58:45,035 | Transformers_model.py: 223: BatchTokenizeCombine() | INFO: Combine Tokens n time 20.195966720581055 seconds


In [7]:
# warnings
from warnings import simplefilter
simplefilter("ignore", category=FutureWarning)
simplefilter("ignore", category=DeprecationWarning)

# import libraries
seed=42
import os
os.environ['PYTHONHASHSEED'] = str(seed)
import random
random.seed(seed)
import numpy as np
np.random.seed(seed)
import time

import time
import joblib
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
import json

import pickle
import json
import io
import shutil

# preprocess
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding

# preprocess
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset, load_from_disk
import pandas as pd

# train
import evaluate
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# prediction
import torch
import glob
from natsort import natsorted

In [8]:
model_st = time.time()

# del AllTrainData # to clear memory

logger.info("=========================================================")  
logger.info("================Finetuning on Train data=================")
logger.info("=========================================================")

logger.info("=======load Tokenize train  and val data==========")

tokenized_AllTrainData = load_from_disk(model_folder+'/tokenized_AllTrainData')
# tokenized_AllTrainData = load_from_disk(model_folder+'/gpt2-large-tokenized_AllTrainData') # model OOM

logger.info("tokenized_AllTrainData is \n {}".format(tokenized_AllTrainData))

2024-04-09 12:58:45,071 | 3493274202.py: 14: <cell line: 14>() | INFO: tokenized_AllTrainData is 
 DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 240
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 60
    })
})


In [9]:
tokenized_AllTrainData

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 240
    })
    val: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 60
    })
})

In [10]:
# Now create a batch of examples using DataCollatorWithPadding. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.
# The next step is to load a DistilBERT tokenizer to preprocess the text field:
tokenizer = AutoTokenizer.from_pretrained(model_type)

if model_select=="OpenAIGPT2": # https://github.com/huggingface/transformers/issues/3859 and https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/ 
    # default to left padding
    tokenizer.padding_side = "left"
    # Define PAD Token = EOS Token = 50256
    tokenizer.pad_token = tokenizer.eos_token
    
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
logger.info("======== train =========")

# compute metric
weighted_f1_metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return weighted_f1_metric.compute(predictions=predictions, references=labels, average="weighted")



In [12]:
# Before you start training your model, create a map of the expected ids to their labels with id2label and label2id:
id2label = {1: "1", 2: "2", 3: "3"}
logger.info("id2label is \n {}".format(id2label))
label2id = {'1': 1, '2': 2, '3': 3}

2024-04-09 12:58:46,995 | 2021215553.py: 3: <cell line: 3>() | INFO: id2label is 
 {1: '1', 2: '2', 3: '3'}


In [13]:
# train

logger.info("========Training Model=========")

# from epoch 0
# multi class and single label; not problem_type="multi_label_classification"
model = AutoModelForSequenceClassification.from_pretrained(model_type, num_labels=3, id2label=id2label, label2id=label2id)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [15]:
model.to(device)

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [16]:
next(model.parameters()).device

device(type='cuda', index=0)

In [17]:
# # path to the model checkpoint from the 36th epoch
# model_checkpoint = "/home/ravi/UCF Dropbox/KAMALAKKANNAN RAVI/guyonDesktop/DATA_AutomatedHarmDetection/DataModelsResults/Results/OpenAIGPT2/checkpoint-288000/"
# # Load the model from the checkpoint
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

if model_select=="OpenAIGPT2": # https://github.com/huggingface/transformers/issues/3859 and https://gmihaila.github.io/tutorial_notebooks/gpt2_finetune_classification/ 
    # resize model embedding to match new tokenizer
    model.resize_token_embeddings(len(tokenizer))

    # fix model padding token id
    model.config.pad_token_id = model.config.eos_token_id

In [22]:
batch_size = 1
training_args = TrainingArguments(
    output_dir=model_folder,
    seed=seed,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size, # to avoid OOM
    gradient_accumulation_steps=1, # to avoid OOM
    per_device_eval_batch_size=batch_size, # to avoid OOM
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    save_total_limit=2,
    save_steps=10,
    eval_steps=10,
    fp16=True, # to avoid OOM
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
# Set CUDA_LAUNCH_BLOCKING environment variable
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_AllTrainData["train"],
    eval_dataset=tokenized_AllTrainData["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

RuntimeError: CUDA error: device-side assert triggered

In [25]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2ForSequenceClassification.forward` and have been ignored: text. If text are not expected by `GPT2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 240
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 30


RuntimeError: CUDA error: device-side assert triggered

In [None]:
logger.info("========Saving Model=========")
trainer.save_model()