<a href="https://colab.research.google.com/github/limshaocong/SysBERT/blob/main/t3_finetuning_seqlabel_cr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Preliminaries**

In [1]:
! pip install --user datasets transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
! nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-5dba96d0-d296-ee9a-89bf-12769d386cf1)


In [3]:
# ! huggingface-cli login
# # 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

# **Import & Pre-process Data**

In [4]:
model_type_dict = {
    'bert-base-cased' : 'bert-base-cased',
    'roberta-base' : 'roberta-base',
    'allenai/scibert_scivocab_cased' : 'allenai/scibert_scivocab_cased',
    'limsc/reqbert-tapt-epoch29' : 'bert-base-cased', # preferred
    'limsc/reqbert-tapt-epoch30' : 'bert-base-cased',
    'limsc/reqroberta-tapt-epoch20' : 'roberta-base',
    'limsc/reqroberta-tapt-epoch33' : 'roberta-base',
    'limsc/reqroberta-tapt-epoch43' : 'roberta-base', # preferred
    'limsc/reqroberta-tapt-epoch50' : 'roberta-base',
    'limsc/reqscibert-tapt-epoch10' : 'allenai/scibert_scivocab_cased', # preferred
    'limsc/reqscibert-tapt-epoch20' : 'allenai/scibert_scivocab_cased', # preferred
    'limsc/reqscibert-tapt-epoch31' : 'allenai/scibert_scivocab_cased',
    'limsc/reqscibert-tapt-epoch49' : 'allenai/scibert_scivocab_cased',
}

model_name_dict = {
    'bert-base-cased' : 'bert',
    'roberta-base' : 'roberta',
    'allenai/scibert_scivocab_cased' : 'scibert',
    'limsc/reqbert-tapt-epoch29' : 'reqbert-e29',
    'limsc/reqbert-tapt-epoch30' : 'reqbert-e30',
    'limsc/reqroberta-tapt-epoch20' : 'reqroberta-e20',
    'limsc/reqroberta-tapt-epoch33' : 'reqroberta-e33',
    'limsc/reqroberta-tapt-epoch43' : 'reqroberta-e43',
    'limsc/reqroberta-tapt-epoch50' : 'reqroberta-e50',
    'limsc/reqscibert-tapt-epoch10' : 'reqscibert-e10',
    'limsc/reqscibert-tapt-epoch20' : 'reqscibert-e20',
    'limsc/reqscibert-tapt-epoch31' : 'reqscibert-e31',
    'limsc/reqscibert-tapt-epoch49' : 'reqscibert-e49',
}

task_name_dict = {
    'limsc/fr-nfr-classification' : 'frnfr',
    'limsc/nfr-subclass-classification' : 'subclass',
    'limsc/concept-recognition' : 'cr',
    'limsc/concept-recognition-not-iob' : 'cr',
    'limsc/sysmlv2-entity-extraction' : 'ee'
}

In [5]:
from datasets import load_dataset

ds_name = 'limsc/concept-recognition-not-iob'
ds = load_dataset(ds_name)
ds

Using custom data configuration limsc--concept-recognition-not-iob-49ad4a4453826183
Reusing dataset parquet (/root/.cache/huggingface/datasets/limsc___parquet/limsc--concept-recognition-not-iob-49ad4a4453826183/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['id', 'tokens', 'concept_tags'],
        num_rows: 132
    })
    train: Dataset({
        features: ['id', 'tokens', 'concept_tags'],
        num_rows: 611
    })
    val: Dataset({
        features: ['id', 'tokens', 'concept_tags'],
        num_rows: 131
    })
})

In [6]:
label_list = ds["train"].features['concept_tags'].feature.names
label_list

['Materials / EEEs',
 'O',
 'GN&C',
 'Thermal',
 'Parameter',
 'Quality control',
 'Safety / Risk (Control)',
 'System engineering',
 'Space Environment',
 'Cleanliness',
 'Measurement',
 'Telecom.',
 'Project Organisation / Documentation',
 'Project Scope',
 'Power',
 'OBDH',
 'Structure & Mechanism',
 'Nonconformity',
 'Propulsion']

In [7]:
model_checkpoint = 'bert-base-cased'

In [8]:
from transformers import AutoTokenizer

prefix_space = True if model_type_dict[model_checkpoint] == 'roberta-base' else False

tokenizer = AutoTokenizer.from_pretrained(
    model_type_dict[model_checkpoint],
    use_fast = True,
    add_prefix_space = prefix_space
)

In [9]:
def tokenize_and_align_labels(examples, label_all_tokens = False):

    tokenized_inputs = tokenizer(
        examples['tokens'],
        truncation = True,
        is_split_into_words = True
    )
    labels = []
    
    for i, label in enumerate(examples['concept_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:           
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.

            if word_idx is None:
                label_ids.append(-100)
            
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs

In [10]:
tokenized_ds = ds.map(tokenize_and_align_labels, batched = True)

Loading cached processed dataset at /root/.cache/huggingface/datasets/limsc___parquet/limsc--concept-recognition-not-iob-49ad4a4453826183/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-18edfbbac157d63e.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/limsc___parquet/limsc--concept-recognition-not-iob-49ad4a4453826183/0.0.0/0b6d5799bb726b24ad7fc7be720c170d8e497f575d02d47537de9a5bac074901/cache-8a00082e2ee7aac4.arrow


# **Model Fine-tuning (Single Loop)**

In [11]:
from transformers import DataCollatorForTokenClassification

batch_size = 16

data_collator = DataCollatorForTokenClassification(
    tokenizer = tokenizer,
    padding = True,
    return_tensors = 'tf'
)

def batching(tokenized_ds, batch_size):

    batched_train_ds = tokenized_ds['train'].to_tf_dataset(
        columns = ['attention_mask', 'input_ids', 'labels'],
        shuffle = True,
        batch_size = batch_size,
        collate_fn = data_collator,
    )

    batched_val_ds = tokenized_ds['val'].to_tf_dataset(
        columns = ['attention_mask', 'input_ids', 'labels'],
        shuffle = False,
        batch_size = batch_size,
        collate_fn = data_collator,
    )

    batched_test_ds = tokenized_ds['test'].to_tf_dataset(
        columns = ['attention_mask', 'input_ids', 'labels'],
        shuffle = False,
        batch_size = batch_size,
        collate_fn = data_collator,
    )
    
    return batched_train_ds, batched_val_ds, batched_test_ds

batched_train_ds, batched_val_ds, batched_test_ds = batching(tokenized_ds, batch_size)

  tensor = as_tensor(value)


In [12]:
import tensorflow as tf
from transformers import TFAutoModelForTokenClassification, create_optimizer

seed = 6789767
tf.random.set_seed(seed)

num_epochs = 10
initial_lr = 2e-5

def create_model(num_epochs, initial_lr):

    model = TFAutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        num_labels = len(label_list)
    )

    batches_per_epoch = len(tokenized_ds['train']) // batch_size
    total_train_steps = int(batches_per_epoch * num_epochs)

    optimizer, schedule = create_optimizer(
        init_lr = initial_lr,
        num_warmup_steps = total_train_steps // 20,
        num_train_steps = total_train_steps,
        weight_decay_rate = 0.01
    )

    model.compile(optimizer = optimizer)

    return model

model = create_model(num_epochs, initial_lr)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [13]:
# import numpy as np
# from transformers.keras_callbacks import KerasMetricCallback

# metric = load_metric("seqeval")
# labels = [label_list[i] for i in example['concept_tags']]
# metric.compute(predictions=[labels], references=[labels])

# def compute_metrics(p):
#     predictions, labels = p
#     # predictions = np.argmax(predictions, axis=2)

#     # Remove ignored index (special tokens)
#     true_predictions = [
#         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]
#     true_labels = [
#         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
#         for prediction, label in zip(predictions, labels)
#     ]

#     results = metric.compute(predictions=true_predictions, references=true_labels)
#     return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
#         "accuracy": results["overall_accuracy"],
#     }

# metric_callback = KerasMetricCallback(
#     metric_fn=compute_metrics, eval_dataset=validation_set
# )

In [14]:
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, CSVLogger

class update_logger(Callback):

    def __init__(self):    
        super(update_logger, self).__init__()

    def on_epoch_end(self, epoch, logs = {}):
        logs['seed'] = seed
        logs['batch_size'] = batch_size
        logs['learning_rate'] = initial_lr

update_logger_cb = update_logger()

checkpoint_path = '/content/cr/'
modelcheckpoint_cb = ModelCheckpoint(
    filepath = checkpoint_path,
    save_weights_only = True,
    verbose = 1
)

csvlogger_file = f'{model_name_dict[model_checkpoint]}-{task_name_dict[ds_name]}.csv'
csvlogger_cb = CSVLogger(csvlogger_file, append = True)

callbacks = [update_logger_cb, modelcheckpoint_cb, csvlogger_cb]

In [None]:
# model = create_model(num_epochs, initial_lr)

model.fit(
    batched_train_ds,
    validation_data = batched_val_ds,
    epochs = num_epochs,
    callbacks = callbacks
)

# **Hyperparameter tuning**

In [25]:
batch_sizes = [16, 32]
initial_lrs = [5e-5, 3e-5, 2e-5]
seeds = [21916, 25412, 56281, 61712, 30488,
         28215, 78867, 87843, 67918, 93327,
         95420, 11905, 86349, 12082, 81996]

num_epochs = 5

for batch_size in batch_sizes:

    batched_train_ds, batched_val_ds, batched_test_ds = batching(tokenized_ds, batch_size)
    
    for initial_lr in initial_lrs:
    
        for seed in seeds:
    
            tf.random.set_seed(seed)
            model = create_model(num_epochs, initial_lr)

            csvlogger_file = f'{model_name_dict[model_checkpoint]}-{task_name_dict[ds_name]}.csv'
            csvlogger_cb = CSVLogger(csvlogger_file, append = True)

            callbacks = [update_logger_cb, modelcheckpoint_cb, csvlogger_cb]

            model.fit(
                batched_train_ds,
                validation_data = batched_val_ds,
                epochs = num_epochs,
                callbacks = callbacks
            )

  tensor = as_tensor(value)
All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour, please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


Epoch 1/5
Epoch 1: saving model to /content/cr/
Epoch 2/5
Epoch 2: saving model to /content/cr/
Epoch 3/5
Epoch 3: saving model to /content/cr/
Epoch 4/5
Epoch 4: saving model to /content/cr/
Epoch 5/5
Epoch 5: saving model to /content/cr/


AttributeError: ignored

# **Evaluate on test set**

In [16]:
predicts = [model.predict(batch) for batch in batched_test_ds]

  tensor = as_tensor(value)


In [17]:
import numpy as np

all_preds = []
all_trues = []

for batch in batched_test_ds:

    y_preds_logits = model.predict(batch)['logits']
    y_preds = np.argmax(y_preds_logits, axis = 2)
    y_trues = batch['labels']
    p = y_preds, y_trues

    all_pred = [
        [label_list[p] for (p, l) in zip(y_pred, y_true) if l != -100]
        for y_pred, y_true in zip(y_preds, y_trues)]
    all_true = [
        [label_list[l] for (p, l) in zip(y_pred, y_true) if l != -100]
        for y_pred, y_true in zip(y_preds, y_trues)]
    
    all_preds.extend([item for sublist in all_pred for item in sublist])
    all_trues.extend([item for sublist in all_true for item in sublist])
    
    # break

  tensor = as_tensor(value)


In [18]:
from sklearn.metrics import classification_report

print(classification_report(all_trues, all_preds))
cr = classification_report(all_trues, all_preds, output_dict = True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                                      precision    recall  f1-score   support

                         Cleanliness       0.41      0.46      0.43        28
                                GN&C       0.67      0.63      0.65        54
                    Materials / EEEs       0.49      0.46      0.47        79
                         Measurement       0.91      0.78      0.84        78
                       Nonconformity       0.00      0.00      0.00        23
                                   O       0.96      0.93      0.94      3028
                                OBDH       0.46      0.83      0.60       104
                           Parameter       0.64      0.48      0.54       103
                               Power       0.67      0.75      0.71       166
Project Organisation / Documentation       0.33      0.31      0.32        29
                       Project Scope       0.66      0.56      0.61        66
                          Propulsion       0.53      0.61      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
import pandas as pd

pd.DataFrame(cr)

Unnamed: 0,Cleanliness,GN&C,Materials / EEEs,Measurement,Nonconformity,O,OBDH,Parameter,Power,Project Organisation / Documentation,...,Quality control,Safety / Risk (Control),Space Environment,Structure & Mechanism,System engineering,Telecom.,Thermal,accuracy,macro avg,weighted avg
precision,0.40625,0.666667,0.486486,0.910448,0.0,0.959589,0.464865,0.636364,0.67027,0.333333,...,0.470588,0.453125,0.509901,0.645161,0.4,0.648148,0.581818,0.821307,0.548903,0.835004
recall,0.464286,0.62963,0.455696,0.782051,0.0,0.925363,0.826923,0.475728,0.746988,0.310345,...,0.680851,0.517857,0.774436,0.43956,0.516854,0.426829,0.477612,0.821307,0.558919,0.821307
f1-score,0.433333,0.647619,0.470588,0.841379,0.0,0.942165,0.595156,0.544444,0.706553,0.321429,...,0.556522,0.483333,0.614925,0.522876,0.45098,0.514706,0.52459,0.821307,0.544252,0.823882
support,28.0,54.0,79.0,78.0,23.0,3028.0,104.0,103.0,166.0,29.0,...,94.0,56.0,133.0,91.0,89.0,82.0,67.0,0.821307,4421.0,4421.0


# **Prediction Pipeline**

In [20]:
from transformers import TokenClassificationPipeline

pipe = TokenClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    # aggregation_strategy = 'simple'
)

In [21]:
text = 'The Micro-VCM datasheet shall contain the product type.'
text = 'Storage conditions shall prevent the degredation of the structure.'
text = 'A record of the process data shall be part of the process procedure'
text = 'In case of doubt, the internal NRB shall classify nonconformances as major.'
text = 'The Reserved shall be an 8-bit field that is set to 0x00.'

pipe(text)

[{'end': 3,
  'entity': 'LABEL_1',
  'index': 1,
  'score': 0.99723744,
  'start': 0,
  'word': 'The'},
 {'end': 11,
  'entity': 'LABEL_15',
  'index': 2,
  'score': 0.3509737,
  'start': 4,
  'word': 'Reserve'},
 {'end': 12,
  'entity': 'LABEL_1',
  'index': 3,
  'score': 0.9812609,
  'start': 11,
  'word': '##d'},
 {'end': 18,
  'entity': 'LABEL_1',
  'index': 4,
  'score': 0.9972826,
  'start': 13,
  'word': 'shall'},
 {'end': 21,
  'entity': 'LABEL_1',
  'index': 5,
  'score': 0.9973143,
  'start': 19,
  'word': 'be'},
 {'end': 24,
  'entity': 'LABEL_1',
  'index': 6,
  'score': 0.9972166,
  'start': 22,
  'word': 'an'},
 {'end': 26,
  'entity': 'LABEL_15',
  'index': 7,
  'score': 0.65452904,
  'start': 25,
  'word': '8'},
 {'end': 27,
  'entity': 'LABEL_15',
  'index': 8,
  'score': 0.63611233,
  'start': 26,
  'word': '-'},
 {'end': 30,
  'entity': 'LABEL_15',
  'index': 9,
  'score': 0.6254104,
  'start': 27,
  'word': 'bit'},
 {'end': 36,
  'entity': 'LABEL_15',
  'index': 10,