In [1]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this notebook uses to label a runtime as "high-RAM".
print(
    'You are using a high-RAM runtime!'
    if ram_gb >= 20
    else 'Not using a high-RAM runtime'
)

Your runtime has 134.9 gigabytes of available RAM

You are using a high-RAM runtime!


In [2]:
# common imports

import sys
assert sys.version_info >= (3, 5)


import sklearn
# Compare the release numerically. A plain string comparison is lexicographic,
# so e.g. "0.9" >= "0.20" would be True even though 0.9 is older than 0.20.
# Only the leading "major.minor" fields are parsed, so suffixes that follow
# them (patch numbers, dev/rc tags) do not matter.
_sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
assert _sklearn_major_minor >= (0, 20), "scikit-learn >= 0.20 is required"


import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

# Seed NumPy's global random generator.
# NOTE(review): TensorFlow's own RNG is not seeded anywhere in this notebook —
# confirm whether run-to-run reproducibility of training is expected.
np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label sizes for plots: 14 for axis labels, 12 for the tick labels on both axes.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

2024-05-09 17:03:00.649870: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-09 17:03:00.649907: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-09 17:03:00.651209: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-09 17:03:00.657855: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [7]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget; later cells push to and upload to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
import datasets

# Load the dataset that was previously saved to the local ./ARID/ directory.
dataset = datasets.load_from_disk('./ARID/')

In [9]:
# Display the loaded dataset's summary (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label'],
        num_rows: 480
    })
})

In [11]:
from transformers import GPT2Tokenizer, TFGPT2ForSequenceClassification


# Batch size shared by the tf.data pipelines and the optimizer step count below.
batch_size = 16


# Class names are read from the 'label' feature of the training split; the two
# dicts map name -> integer id and integer id -> name for the model config.
# (A bare `id2label` expression used to follow here; it was not the last
# expression of the cell, so it displayed nothing and has been removed.)
lbl_ = dataset['train'].features['label'].names
label2id = {lbl: idx for idx, lbl in enumerate(lbl_)}
id2label = {val: key for key, val in label2id.items()}

# Tokenizer and classification model are both loaded from the same checkpoint.
model_ckpt = 'openai-community/gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_ckpt)

# One output label per class name, with the label maps stored in the config.
model = TFGPT2ForSequenceClassification.from_pretrained(model_ckpt,
                                                        num_labels = len(lbl_),
                                                        id2label = id2label,
                                                        label2id = label2id)

2024-05-09 17:03:16.403172: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-09 17:03:16.403439: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-09 17:03:16.404412: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [12]:
# The model config may not define a pad token id. The original author notes
# that training fails with an error if none is set, so define one here before
# the data is tokenized with padding.
if model.config.pad_token_id is None:
    print("Pad token ID is not set. Setting now...")
    # Reuse the end-of-sequence token as the padding token.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    # Keep the model config in sync with the tokenizer's pad id.
    model.config.pad_token_id = tokenizer.pad_token_id
    # Pad on the left, so padding precedes the real tokens of each sequence.
    # NOTE(review): this is only applied when the branch runs — confirm that
    # left padding is not also wanted when a pad id already exists.
    tokenizer.padding_side = "left"
    print("New Pad Token ID set:", model.config.pad_token_id)

Pad token ID is not set. Setting now...
New Pad Token ID set: 50256


In [14]:
def preprocess_function(dataset):
    """Tokenize the 'Requirement Sentences' entries of `dataset`.

    The tokenizer is asked to pad to `max_length`, truncate, use a maximum
    length of 256 and return TensorFlow tensors. Its result is returned
    unchanged.
    """
    sentences = dataset['Requirement Sentences']
    tokenizer_options = {
        'padding': 'max_length',
        'max_length': 256,
        'truncation': True,
        'return_tensors': 'tf',
    }
    return tokenizer(sentences, **tokenizer_options)

In [15]:
# Tokenize every split in batches. Despite the name, the result holds both the
# 'train' and the 'test' split, each with the tokenizer's output columns added.
X_train_encoded = dataset.map(preprocess_function, batched = True)

In [16]:
# Fetch the first training row once and read its fields from that single row.
# Indexing the column first (dataset['col'][0]) retrieves the whole column on
# every access; row-first access prints exactly the same values.
first_example = X_train_encoded['train'][0]
print(first_example['Requirement Sentences'])
print(first_example['input_ids'])
print(tokenizer.convert_ids_to_tokens(first_example['input_ids']))

The DWA must request DWA acknowledgment flashing when the DWA has assumed the "armed" state and the outer skin is closed.
[50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50

In [17]:
# Display the tokenized dataset's summary to confirm the added columns.
X_train_encoded

DatasetDict({
    train: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label', 'input_ids', 'attention_mask'],
        num_rows: 480
    })
})

In [18]:
# Build both tf.data pipelines with the same settings; only the training split
# ('train') is shuffled, the validation pipeline reads the 'test' split in order.
tf_train_dataset, tf_valid_dataset = (
    model.prepare_tf_dataset(
        X_train_encoded[split_name],
        shuffle = shuffle_split,
        batch_size = batch_size,
        tokenizer = tokenizer,
    )
    for split_name, shuffle_split in (('train', True), ('test', False))
)

In [19]:
from transformers import create_optimizer

num_epochs = 30

# Whole batches per epoch (floor division), then the step count over all
# epochs, which is passed to the optimizer factory as num_train_steps.
batches_per_epoch = len(X_train_encoded['train']) // batch_size
total_train_steps = int(num_epochs * batches_per_epoch)

# Initial learning rate 2e-5 with no warm-up steps.
optimizer, schedule = create_optimizer(
    init_lr = 2e-5,
    num_train_steps = total_train_steps,
    num_warmup_steps = 0,
)

In [20]:
import evaluate
from transformers.keras_callbacks import KerasMetricCallback


# Metric objects are loaded on first use and then reused, so repeated calls to
# compute_metrics do not call evaluate.load again.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it only the first time."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_predictions):
    """Compute macro-averaged precision, recall and F1.

    Args:
        eval_predictions: a `(predictions, labels)` pair. `predictions` holds
            per-class scores; the predicted class is the argmax along axis 1.

    Returns:
        dict with the keys "precision", "recall" and "f1" (in that order),
        each computed with `average='macro'`.
    """
    predictions, labels = eval_predictions
    predictions = np.argmax(predictions, axis = 1)

    return {
        name: _get_metric(name).compute(
            predictions = predictions, references = labels, average = 'macro'
        )[name]
        for name in ("precision", "recall", "f1")
    }

# Keras callback wired to run compute_metrics on the validation dataset.
metric_callback = KerasMetricCallback(metric_fn = compute_metrics, eval_dataset = tf_valid_dataset)

In [21]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

# The Hub repository name is built from the last path segment of the checkpoint id.
model_name = model_ckpt.rsplit("/", 1)[-1]
print(model_name)
push_to_hub_model_id = model_name + '-finetuned-iso29148-req-detector'

# TensorBoard logs go into a "logs" folder inside the Hub callback's output directory.
tensorboard_callback = TensorBoard(log_dir="./requirement_detector_model_save_2/logs")

push_to_hub_callback = PushToHubCallback(
    hub_model_id = push_to_hub_model_id,
    output_dir = "./requirement_detector_model_save_2",
    tokenizer = tokenizer,
)

# Callback order as passed to fit(): Hub push, TensorBoard, then metrics.
callbacks = [
    push_to_hub_callback,
    tensorboard_callback,
    metric_callback,
]

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/kasra/classification/requirement_detector_model_save_2 is already a clone of https://huggingface.co/kasrahabib/gpt2-finetuned-iso29148-req-detector. Make sure you pull the latest changes with `repo.git_pull()`.


gpt2


In [22]:
# Only the optimizer is configured here; no loss or metrics are passed to compile().
model.compile(optimizer = optimizer)

# NOTE(review): the recorded run ends with "OSError: ... failed to push some refs"
# after the last epoch. If that error propagates out of fit(), `history` is
# never assigned — confirm before relying on it in later cells.
history = model.fit(
    tf_train_dataset,
    validation_data = tf_valid_dataset,
    epochs = num_epochs,
    callbacks = callbacks,
)

Epoch 1/30


2024-05-09 17:03:36.970930: I external/local_xla/xla/service/service.cc:168] XLA service 0x70145b511880 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-09 17:03:36.970960: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-05-09 17:03:36.970966: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA GeForce RTX 4090, Compute Capability 8.9
2024-05-09 17:03:36.975908: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-09 17:03:36.990703: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8907
I0000 00:00:1715267017.066167  748698 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
  1/119 [..............................] - ETA: 13s - loss: 2.7783

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (2) will be pushed upstream.


Epoch 3/30
  1/119 [..............................] - ETA: 13s - loss: 2.7185

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (3) will be pushed upstream.


Epoch 4/30
  1/119 [..............................] - ETA: 16s - loss: 2.6976

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (4) will be pushed upstream.


Epoch 5/30
  1/119 [..............................] - ETA: 13s - loss: 2.8288

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (5) will be pushed upstream.


Epoch 6/30
  1/119 [..............................] - ETA: 13s - loss: 2.2832

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (6) will be pushed upstream.


Epoch 7/30

Several commits (7) will be pushed upstream.


Epoch 8/30

Several commits (8) will be pushed upstream.


Epoch 9/30

Several commits (9) will be pushed upstream.


Epoch 10/30

Several commits (10) will be pushed upstream.


Epoch 11/30

Several commits (11) will be pushed upstream.


Epoch 12/30

Several commits (12) will be pushed upstream.


Epoch 13/30

Several commits (13) will be pushed upstream.


Epoch 14/30

Several commits (14) will be pushed upstream.


Epoch 15/30

Several commits (15) will be pushed upstream.


Epoch 16/30

Several commits (16) will be pushed upstream.


Epoch 17/30

Several commits (17) will be pushed upstream.


Epoch 18/30

Several commits (18) will be pushed upstream.


Epoch 19/30

Several commits (19) will be pushed upstream.


Epoch 20/30

Several commits (20) will be pushed upstream.


Epoch 21/30

Several commits (21) will be pushed upstream.


Epoch 22/30

Several commits (22) will be pushed upstream.


Epoch 23/30

Several commits (23) will be pushed upstream.


Epoch 24/30

Several commits (24) will be pushed upstream.


Epoch 25/30

Several commits (25) will be pushed upstream.


Epoch 26/30

Several commits (26) will be pushed upstream.


Epoch 27/30

Several commits (27) will be pushed upstream.


Epoch 28/30

Several commits (28) will be pushed upstream.


Epoch 29/30

Several commits (29) will be pushed upstream.


Epoch 30/30

Several commits (30) will be pushed upstream.




Several commits (31) will be pushed upstream.
The progress bars may be unreliable.
EOF
EOF
error: failed to push some refs to 'https://huggingface.co/kasrahabib/gpt2-finetuned-iso29148-req-detector'



OSError: EOF
EOF
error: failed to push some refs to 'https://huggingface.co/kasrahabib/gpt2-finetuned-iso29148-req-detector'


In [23]:
from huggingface_hub import HfApi

api = HfApi()

# Upload the local training output directory to the model repository on the Hub.
hub_repo_id = "kasrahabib/" + push_to_hub_model_id
api.upload_folder(
    repo_id = hub_repo_id,
    repo_type = "model",
    folder_path = "./requirement_detector_model_save_2",
)

events.out.tfevents.1715264632.iste.735681.0.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715264960.iste.740663.1.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715265874.iste.746549.0.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715264715.iste.740663.0.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Upload 14 LFS files:   0%|          | 0/14 [00:00<?, ?it/s]

events.out.tfevents.1715265195.iste.740663.2.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715265886.iste.746549.1.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715266154.iste.746549.2.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715266268.iste.746549.3.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715266585.iste.746549.4.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715266792.iste.747601.0.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715266929.iste.748139.0.v2:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

events.out.tfevents.1715267006.iste.748518.0.v2:   0%|          | 0.00/2.15M [00:00<?, ?B/s]

events.out.tfevents.1715267040.iste.748518.1.v2:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kasrahabib/gpt2-finetuned-iso29148-req-detector/commit/338204e8f6d3ecbcbdaa253b06aa089975603342', commit_message='Upload folder using huggingface_hub', commit_description='', oid='338204e8f6d3ecbcbdaa253b06aa089975603342', pr_url=None, pr_revision=None, pr_num=None)