In [24]:
# from google.colab import drive
# drive.mount('/content/drive')

In [25]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 134.9 gigabytes of available RAM

You are using a high-RAM runtime!


In [26]:
# common imports

import sys
assert sys.version_info >= (3, 5)


import sklearn
assert sklearn.__version__ >= "0.20"


import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import os

np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [29]:
from huggingface_hub import notebook_login

# Render the Hugging Face login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
import datasets

# Load the dataset stored under ./ARID/ (path is relative to the working directory).
dataset = datasets.load_from_disk('./ARID/')

In [31]:
# Display the splits, feature names and row counts of the loaded dataset.
dataset

DatasetDict({
    train: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label'],
        num_rows: 480
    })
})

In [33]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


# Mini-batch size; read again by later cells that build the TF datasets and
# compute the number of training steps.
batch_size = 16


# Class names of the 'label' feature in index order, and the two lookup tables
# (name -> id, id -> name) handed to from_pretrained below.
lbl_ = dataset['train'].features['label'].names
label2id = {name: idx for idx, name in enumerate(lbl_)}
id2label = dict(enumerate(lbl_))

# Checkpoint used for both the tokenizer and the classifier; the
# classification head is sized to the number of class names.
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels = len(lbl_),
    id2label = id2label,
    label2id = label2id,
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
def preprocess_function(dataset):
    """Tokenize the 'Requirement Sentences' column of a batch.

    Args:
        dataset: mapping-like batch that provides a 'Requirement Sentences'
            entry (this is what ``Dataset.map`` passes in when the function
            is used as its callback).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those sentences
        when called with truncation enabled.
    """
    sentences = dataset['Requirement Sentences']
    encoded = tokenizer(sentences, truncation = True)
    return encoded

In [36]:
# Tokenize every split in batches. Despite its name, X_train_encoded is the
# whole DatasetDict, so it holds both the 'train' and the 'test' split.
X_train_encoded = dataset.map(preprocess_function, batched = True)

Map:   0%|          | 0/1916 [00:00<?, ? examples/s]

Map:   0%|          | 0/480 [00:00<?, ? examples/s]

In [37]:
# Sanity check on the first training example: raw sentence, token ids, and the
# tokens those ids decode to.
train_split = X_train_encoded['train']
first_input_ids = train_split['input_ids'][0]

print(train_split['Requirement Sentences'][0])
print(first_input_ids)
print(tokenizer.convert_ids_to_tokens(first_input_ids))

The DWA must request DWA acknowledgment flashing when the DWA has assumed the "armed" state and the outer skin is closed.
[101, 1996, 1040, 4213, 2442, 5227, 1040, 4213, 9353, 2243, 19779, 3709, 21693, 4765, 12659, 2043, 1996, 1040, 4213, 2038, 5071, 1996, 1000, 4273, 1000, 2110, 1998, 1996, 6058, 3096, 2003, 2701, 1012, 102]
['[CLS]', 'the', 'd', '##wa', 'must', 'request', 'd', '##wa', 'ac', '##k', '##now', '##led', '##gm', '##ent', 'flashing', 'when', 'the', 'd', '##wa', 'has', 'assumed', 'the', '"', 'armed', '"', 'state', 'and', 'the', 'outer', 'skin', 'is', 'closed', '.', '[SEP]']


In [38]:
# Display the tokenized DatasetDict to see which columns the tokenizer added.
X_train_encoded

DatasetDict({
    train: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1916
    })
    test: Dataset({
        features: ['REQID', 'REQID_expanded', 'Requirement Sentences', 'Open/ Closed Source', 'class', 'signal_keyword', 'Source', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 480
    })
})

In [39]:
# Arguments shared by both splits; only the source split and the shuffling differ.
shared_dataset_kwargs = dict(batch_size = batch_size, tokenizer = tokenizer)

# Training data is shuffled.
tf_train_dataset = model.prepare_tf_dataset(
    X_train_encoded['train'], shuffle = True, **shared_dataset_kwargs
)

# The 'test' split serves as validation data and keeps its order.
tf_valid_dataset = model.prepare_tf_dataset(
    X_train_encoded['test'], shuffle = False, **shared_dataset_kwargs
)

In [40]:
from transformers import create_optimizer

num_epochs = 30

# Number of full batches in one pass over the training split; with the epoch
# count this gives the total number of steps the schedule has to cover.
batches_per_epoch = len(X_train_encoded['train']) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# Initial learning rate 2e-5 with no warm-up steps.
optimizer, schedule = create_optimizer(
    init_lr = 2e-5,
    num_train_steps = total_train_steps,
    num_warmup_steps = 0,
)

In [41]:
import evaluate
from transformers.keras_callbacks import KerasMetricCallback


# Load each metric once here. compute_metrics is invoked repeatedly during
# training, so loading inside the function would redo this work on every call.
_MACRO_METRICS = {name: evaluate.load(name) for name in ("precision", "recall", "f1")}


def compute_metrics(eval_predictions):
    """Compute macro-averaged precision, recall and F1.

    Args:
        eval_predictions: a ``(predictions, labels)`` pair. ``predictions`` is
            reduced with ``argmax`` over axis 1, so it is presumably a
            (n_samples, n_classes) array of class scores -- TODO confirm
            against what KerasMetricCallback passes in.

    Returns:
        dict with the keys ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    predictions, labels = eval_predictions
    predicted_ids = np.argmax(predictions, axis = 1)

    # Each metric reports its score under its own name, so the metric name
    # doubles as the key used to read the result.
    return {
        name: metric.compute(
            predictions = predicted_ids, references = labels, average = 'macro'
        )[name]
        for name, metric in _MACRO_METRICS.items()
    }

metric_callback = KerasMetricCallback(metric_fn = compute_metrics, eval_dataset = tf_valid_dataset)

In [42]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard

# Name the Hub repository after the last path component of the checkpoint id.
model_name = model_ckpt.split("/")[-1]
print(model_name)
push_to_hub_model_id = f'{model_name}-finetuned-iso29148-req-detector'

# TensorBoard logs are written inside the directory that is pushed to the Hub.
tensorboard_callback = TensorBoard(log_dir="./requirement_detector_model_save/logs")

push_to_hub_callback = PushToHubCallback(
    output_dir = "./requirement_detector_model_save",
    tokenizer = tokenizer,
    hub_model_id = push_to_hub_model_id,
)

# Keras runs callbacks in list order. metric_callback adds precision/recall/f1
# to the epoch logs, so it must come before the callbacks that read those logs;
# otherwise TensorBoard and the Hub push never see the extra metrics.
callbacks = [metric_callback, tensorboard_callback, push_to_hub_callback]

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/kasrahabib/all-MiniLM-L6-v2-finetuned-isobased-req-detector_v3 into local empty directory.


all-MiniLM-L6-v2


In [43]:
# Only the optimizer is given to compile(); no loss argument is passed here,
# yet the training log below still reports a loss value.
model.compile(optimizer = optimizer)

# Train for num_epochs epochs, validating on tf_valid_dataset and running the
# callbacks assembled above.
history = model.fit(
    tf_train_dataset,
    validation_data = tf_valid_dataset,
    epochs = num_epochs,
    callbacks = callbacks,
)

Epoch 1/30
Epoch 2/30
  1/119 [..............................] - ETA: 12s - loss: 2.6834

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (2) will be pushed upstream.


Epoch 3/30
  1/119 [..............................] - ETA: 16s - loss: 1.8646

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (3) will be pushed upstream.


Epoch 4/30
  5/119 [>.............................] - ETA: 3s - loss: 1.4577

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (4) will be pushed upstream.


Epoch 5/30
  1/119 [..............................] - ETA: 11s - loss: 1.4434

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (5) will be pushed upstream.


Epoch 6/30
  5/119 [>.............................] - ETA: 3s - loss: 0.8370

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




Several commits (6) will be pushed upstream.


Epoch 7/30

Several commits (7) will be pushed upstream.


Epoch 8/30

Several commits (8) will be pushed upstream.


Epoch 9/30

Several commits (9) will be pushed upstream.


Epoch 10/30

Several commits (10) will be pushed upstream.


Epoch 11/30

Several commits (11) will be pushed upstream.


Epoch 12/30

Several commits (12) will be pushed upstream.


Epoch 13/30

Several commits (13) will be pushed upstream.


Epoch 14/30

Several commits (14) will be pushed upstream.


Epoch 15/30

Several commits (15) will be pushed upstream.


Epoch 16/30

Several commits (16) will be pushed upstream.


Epoch 17/30

Several commits (17) will be pushed upstream.


Epoch 18/30

Several commits (18) will be pushed upstream.


Epoch 19/30

Several commits (19) will be pushed upstream.


Epoch 20/30

Several commits (20) will be pushed upstream.


Epoch 21/30

Several commits (21) will be pushed upstream.


Epoch 22/30

Several commits (22) will be pushed upstream.


Epoch 23/30

Several commits (23) will be pushed upstream.


Epoch 24/30

Several commits (24) will be pushed upstream.


Epoch 25/30

Several commits (25) will be pushed upstream.


Epoch 26/30

Several commits (26) will be pushed upstream.


Epoch 27/30

Several commits (27) will be pushed upstream.


Epoch 28/30

Several commits (28) will be pushed upstream.


Epoch 29/30

Several commits (29) will be pushed upstream.


Epoch 30/30

Several commits (30) will be pushed upstream.




Several commits (31) will be pushed upstream.
The progress bars may be unreliable.
EOF
error: failed to push some refs to 'https://huggingface.co/kasrahabib/all-MiniLM-L6-v2-finetuned-isobased-req-detector_v3'



OSError: EOF
error: failed to push some refs to 'https://huggingface.co/kasrahabib/all-MiniLM-L6-v2-finetuned-isobased-req-detector_v3'


In [44]:
from huggingface_hub import HfApi

api = HfApi()

# Target repository: the fixed "kasrahabib/" namespace plus the model id built earlier.
hub_repo_id = "kasrahabib/" + push_to_hub_model_id

# Upload the whole local save directory to that model repository.
api.upload_folder(
    repo_id = hub_repo_id,
    repo_type = "model",
    folder_path = "./requirement_detector_model_save",
)

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1715005761.iste.358702.2.v2:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/91.0M [00:00<?, ?B/s]

events.out.tfevents.1715005788.iste.358702.3.v2:   0%|          | 0.00/4.75k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kasrahabib/all-MiniLM-L6-v2-finetuned-isobased-req-detector_v3/commit/429fdd3b4de59bc7f4b4eb09aad306db2b187a80', commit_message='Upload folder using huggingface_hub', commit_description='', oid='429fdd3b4de59bc7f4b4eb09aad306db2b187a80', pr_url=None, pr_revision=None, pr_num=None)