In [1]:
from psutil import virtual_memory
# Report the total memory psutil sees (in units of 1e9 bytes) and say whether
# it reaches the 20 GB mark this notebook treats as a "high-RAM" runtime.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 134.9 gigabytes of available RAM

You are using a high-RAM runtime!


In [2]:
# common imports

import sys
# version_info compares field by field as integers, so this check is numeric.
assert sys.version_info >= (3, 5)


import re
import sklearn
# Compare the scikit-learn version numerically: as plain strings
# "0.9" >= "0.20" is True, so a string comparison would let releases older
# than 0.20 through. Only the leading "major.minor" digits are parsed, which
# also copes with suffixes such as "rc1" or ".dev0".
_sk_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sk_version and tuple(map(int, _sk_version.groups())) >= (0, 20)


import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

# Seed NumPy and TensorFlow. Training below runs through TensorFlow (newly
# initialised classifier weights, shuffled tf.data batches), so seeding NumPy
# alone would leave the runs non-reproducible.
np.random.seed(42)
tf.random.set_seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Default font sizes for axis labels and tick labels in every figure.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

2026-01-12 23:01:19.760226: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-12 23:01:19.760262: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-12 23:01:19.761505: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-12 23:01:19.767899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import datasets

# Read the saved train/test splits from disk and merge them into one dataset;
# the k-fold loop further down re-splits it.
_saved_splits = datasets.load_from_disk('../../datasets/ARID_supporting_scripts/5_1_training_set')
dataset = datasets.concatenate_datasets([_saved_splits[split] for split in ('train', 'test')])

In [4]:
# Display the merged dataset's summary (feature names and row count).
dataset

Dataset({
    features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label'],
    num_rows: 2396
})

In [5]:
# Show the class names of the 'label' feature. `train_fold` later uses each
# name's position in this list as its class id.
dataset.features['label'].names

['may_signal_keyword_general_text',
 'may_signal_keyword_requirement',
 'may_signal_keyword_srs_text',
 'must_signal_keyword_general_text',
 'must_signal_keyword_requirement',
 'must_signal_keyword_srs_text',
 'no_signal_keyword_general_text',
 'no_signal_keyword_srs_text',
 'shall_signal_keyword_general_text',
 'shall_signal_keyword_requirement',
 'shall_signal_keyword_srs_text',
 'should_signal_keyword_general_text',
 'should_signal_keyword_requirement',
 'should_signal_keyword_srs_text',
 'will_signal_keyword_general_text',
 'will_signal_keyword_requirement',
 'will_signal_keyword_srs_text']

In [6]:
import evaluate
from transformers.keras_callbacks import KerasMetricCallback

# Metrics are loaded lazily and kept here so that `evaluate.load` runs once per
# metric instead of on every call (this function is invoked after each epoch).
_METRIC_CACHE = {}


def _load_metric(name):
    """Return the `evaluate` metric called `name`, loading it only on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_predictions):
    """Compute macro-averaged precision, recall and F1 for one evaluation pass.

    Args:
        eval_predictions: a `(predictions, labels)` pair. `predictions` holds
            one row of per-class scores per example (the arg-max along axis 1
            is taken as the predicted class); `labels` holds the reference
            class ids.

    Returns:
        dict with the keys "precision", "recall" and "f1_macro", each computed
        with `average='macro'`.
    """
    predictions, labels = eval_predictions
    predicted_ids = np.argmax(predictions, axis = 1)

    scores = {}
    # (key in the returned dict, name of the metric in `evaluate`)
    for key, metric_name in (("precision", "precision"), ("recall", "recall"), ("f1_macro", "f1")):
        result = _load_metric(metric_name).compute(predictions = predicted_ids, references = labels, average = 'macro')
        scores[key] = result[metric_name]
    return scores

In [7]:
from transformers import create_optimizer
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification


def train_fold(model_ckpt, encoded_tt_splits, lbl_, save_path):
    """Fine-tune a GPT-2 sequence classifier on one fold and save the result.

    Besides its arguments, this function reads the notebook-level globals
    `tokenizer`, `batch_size` and `num_epochs`; they must be defined before
    the first call.

    Args:
        model_ckpt: checkpoint name or path handed to `from_pretrained`.
        encoded_tt_splits: mapping with tokenized 'train' and 'test' splits.
        lbl_: sequence of class names; a name's position is used as its id.
        save_path: directory the fine-tuned model and tokenizer are saved to.

    Returns:
        The object returned by `model.fit`, so callers can inspect the
        per-epoch training record. Existing callers that ignore the return
        value are unaffected.
    """
    label2id = {name: idx for idx, name in enumerate(lbl_)}
    id2label = {idx: name for name, idx in label2id.items()}

    model = TFGPT2ForSequenceClassification.from_pretrained(
        model_ckpt,
        num_labels = len(lbl_),
        id2label = id2label,
        label2id = label2id,
    )
    # Give the model a pad token id when its config has none, taking it from the tokenizer.
    if model.config.pad_token_id is None:
        model.config.pad_token_id = tokenizer.pad_token_id
        print("New Pad Token ID set:", model.config.pad_token_id)

    tf_train_dataset = model.prepare_tf_dataset(encoded_tt_splits['train'], shuffle = True, batch_size = batch_size, tokenizer = tokenizer)
    tf_valid_dataset = model.prepare_tf_dataset(encoded_tt_splits['test'], shuffle = False, batch_size = batch_size, tokenizer = tokenizer)

    # The optimizer is told how many steps the whole run will take.
    batches_per_epoch = len(encoded_tt_splits['train']) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)
    # create_optimizer also returns the schedule object, which is not needed here.
    optimizer, _ = create_optimizer(init_lr = 2e-5, num_warmup_steps = 0, num_train_steps = total_train_steps)

    # Computes precision / recall / F1 on the validation split via compute_metrics.
    metric_callback = KerasMetricCallback(metric_fn = compute_metrics, eval_dataset = tf_valid_dataset)

    # No loss is passed to compile().
    model.compile(optimizer = optimizer)
    history = model.fit(tf_train_dataset, validation_data = tf_valid_dataset, epochs = num_epochs, callbacks = [metric_callback])

    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    return history

In [8]:
def preprocess_function(dataset):
    """Tokenize the 'Requirement Sentences' column of a batch.

    Uses the notebook-level `tokenizer`, asking it to pad to `max_length`
    with a maximum length of 64, to truncate, and to return TensorFlow
    tensors.
    """
    sentences = dataset['Requirement Sentences']
    tokenizer_options = dict(padding = 'max_length', max_length = 64, truncation = True, return_tensors = 'tf')
    return tokenizer(sentences, **tokenizer_options)

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Cross-validation and training settings. `batch_size` and `num_epochs` are
# read as globals inside `train_fold`.
K = 10
batch_size = 16
num_epochs = 30
model_ckpt = 'openai-community/gpt2'
model_name = model_ckpt.split('/')[-1]  # last path component of the checkpoint id; not used in the cells shown here
label_names = dataset.features['label'].names

# The folds are stratified on the values of the 'label' column.
labels = np.array(dataset["label"])
skf = StratifiedKFold(n_splits = K, shuffle = True, random_state = 42)

# Shared tokenizer, read as a global by `preprocess_function` and `train_fold`.
# The end-of-sequence token is reused as the padding token and padding is
# applied on the left.
tokenizer = GPT2Tokenizer.from_pretrained(model_ckpt)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "left"

# One fine-tuning run per fold. The zeros array only supplies the sample count
# to `split`; the labels determine the stratification. Folds are numbered from 1.
for fold, (train_idx, test_idx) in enumerate(skf.split(np.zeros(len(labels)), labels), start = 1):
    print(f'[TRAINING] Fold {fold}/{K}')
    train_split = dataset.select(train_idx.tolist())
    test_split  = dataset.select(test_idx.tolist())
    train_data = datasets.DatasetDict({"train": train_split, "test": test_split})
    # Tokenize both splits; the un-tokenized `train_data` is what gets saved below.
    encoded_tt_splits = train_data.map(preprocess_function, batched = True)        

    save_path = f'./models/tuned_10_fold/reqseek_gpt2_kfold_trained/trained_fold_{fold}'
    train_fold(model_ckpt, encoded_tt_splits, label_names, save_path)

    # Persist this fold's exact train/test split next to the trained model.
    train_data.save_to_disk(f'./models_10fold_dataset_splits/kfold_gpt2_data/train_test_fold{fold}')

[TRAINING] Fold 1/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

2026-01-12 23:01:33.769781: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-01-12 23:01:33.771894: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-01-12 23:01:33.775691: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256


2026-01-12 23:01:36.980260: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Epoch 1/30
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2026-01-12 23:02:04.757642: I external/local_xla/xla/service/service.cc:168] XLA service 0x7ab570077590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2026-01-12 23:02:04.757673: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2026-01-12 23:02:04.757678: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2026-01-12 23:02:04.762865: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2026-01-12 23:02:04.786134: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
I0000 00:00:1768255324.844701  184135 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
  1/134 [..............................] - ETA: 15s - loss: 2.5665

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.6188

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.3748

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 1.5582

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
  3/134 [..............................] - ETA: 4s - loss: 0.9266

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 2/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 5s - loss: 2.6182

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  1/134 [..............................] - ETA: 6s - loss: 2.8680

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.4170

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  1/134 [..............................] - ETA: 6s - loss: 1.4395

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
  3/134 [..............................] - ETA: 5s - loss: 0.5034

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 3/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 16s - loss: 2.9614

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.4982

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 1.1743

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 0.8617

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
  3/134 [..............................] - ETA: 5s - loss: 0.8146

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/30
  3/134 [..............................] - ETA: 5s - loss: 0.7935

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 4/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  3/134 [..............................] - ETA: 5s - loss: 2.7226

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.6927

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.5642

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 2.3357

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
  3/134 [..............................] - ETA: 5s - loss: 1.2823

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 5/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 17s - loss: 2.8007

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.6069

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  1/134 [..............................] - ETA: 6s - loss: 2.4268

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  1/134 [..............................] - ETA: 5s - loss: 1.6007

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 6/10


Map:   0%|          | 0/2156 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 14s - loss: 2.8003

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.7891

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.4966

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 2.5363

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2156 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/240 [00:00<?, ? examples/s]

[TRAINING] Fold 7/10


Map:   0%|          | 0/2157 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 6s - loss: 2.7790

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  1/134 [..............................] - ETA: 7s - loss: 2.7421

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  1/134 [..............................] - ETA: 5s - loss: 2.4245

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 0.9025

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/239 [00:00<?, ? examples/s]

[TRAINING] Fold 8/10


Map:   0%|          | 0/2157 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 15s - loss: 2.9184

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.7327

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.6524

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 2.1261

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/239 [00:00<?, ? examples/s]

[TRAINING] Fold 9/10


Map:   0%|          | 0/2157 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 16s - loss: 2.7930

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30
  3/134 [..............................] - ETA: 5s - loss: 2.6539

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.6326

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 1.9093

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/239 [00:00<?, ? examples/s]

[TRAINING] Fold 10/10


Map:   0%|          | 0/2157 [00:00<?, ? examples/s]

Map:   0%|          | 0/239 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFGPT2ForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFGPT2ForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


New Pad Token ID set: 50256
Epoch 1/30
Epoch 2/30
  1/134 [..............................] - ETA: 15s - loss: 2.5169

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 3/30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 4/30
  3/134 [..............................] - ETA: 5s - loss: 2.4283

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 5/30
  3/134 [..............................] - ETA: 5s - loss: 1.8717

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 6/30
  3/134 [..............................] - ETA: 5s - loss: 0.9191

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


Saving the dataset (0/1 shards):   0%|          | 0/2157 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/239 [00:00<?, ? examples/s]