# Loading data using Hugging Face datasets methods

https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html

https://huggingface.co/learn/nlp-course/chapter5/5?fw=tf

In [1]:
import tensorflow as tf
from datasets import load_dataset, Dataset, DatasetDict 
import pandas as pd
import sys
import os
import glob
import numpy as np
import toml
import json

from transformers import (AutoTokenizer,
                         TFAutoModelForSequenceClassification,
                          TFBertForSequenceClassification,
                         DataCollatorWithPadding,
                         TFPreTrainedModel,)

# keras for training
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import sklearn.metrics as metrics
from lxml import etree

currentdir = os.path.abspath(os.path.curdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
sys.path.insert(0,parentdir+'/embed') 
from classifier_trainer.trainer import stream_arxiv_paragraphs
import parsing_xml as px
import peep_tar as peep

from train_lstm import gen_cfg, find_best_cutoff
%load_ext autoreload
%autoreload 2
from extract import Definiendum
args = []
#xml_lst, cfg = gen_cfg(config_path='../config.toml')

In [4]:
tf.config.list_physical_devices("GPU")                                  

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
cfg = toml.load('../config.toml')

In [4]:
cfg = cfg['finetuning']

In [6]:
xml_lst = glob.glob('/media/hd1/training_defs/math18/*.xml.gz')
#xml_lst = xml_lst[:len(xml_lst)//4]

In [19]:
stream = stream_arxiv_paragraphs(xml_lst, samples=cfg['batch_size'])

all_data = []
all_labels = []
all_texts = []
for s in stream:
    try:
        #all_data += list(zip(s[0], s[1]))
        all_texts += s[0]
        all_labels += s[1]
    except IndexError:
        logger.warning('Index error in the data stream.')
data_dict = {
    'text': all_texts,
    'label': all_labels
}
ds = Dataset.from_dict(data_dict)

In [20]:
all_texts[:3]

[' If _inline_math_ is given and _inline_math_ is a continuous function on the horizontal line _inline_math_ such that _inline_math_ for all _inline_math_, then _display_math_ ',
 ' Let _inline_math_ be a relation on _inline_math_. The (complementary) neighborhood _inline_math_ of _inline_math_ w.r.t. _inline_math_ is defined as follows: _display_math_ Moreover, we define the set of neighborhoods w.r.t. _inline_math_ as follows: _inline_math_ _inline_math_ ',
 ' When _inline_math_ satisfies () but is allowed to be discontinuous, much less is known about the regularity of _inline_math_. Caffarelli showed that _inline_math_ is _inline_math_ up to the boundary for some small dimensional _inline_math_ _citation_. In terms of Sobolev regularity, Wang _citation_ showed that for any _inline_math_, one can find sufficiently large _inline_math_ such that _inline_math_ fails to be in _inline_math_ even in the interior of the domain. Nevertheless, for fixed _inline_math_, De Philippis-Figalli _ci

In [9]:
# quick check the loading of the model
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
#sequences = all_texts[:10]
#batch = dict(
#    tokenizer(sequences, return_tensors='tf',
#             padding=True, truncation=True)
#)
#model.compile(
#    optimizer='adam',
#    loss='binary_crossentropy'
#)
#labels = tf.convert_to_tensor(all_labels[:10])
#model.train_on_batch(batch, labels)

In [11]:
tokenizer(['hi, this is exciting',
          'this is getting boring'])

{'input_ids': [[101, 7632, 1010, 2023, 2003, 10990, 102], [101, 2023, 2003, 2893, 11771, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [21]:
def tok_function(example):
    # This function can be used with the Dataset.map() method
    return tokenizer(example['text'], truncation=True)

tkn_data = ds.map(tok_function, batched=True)
tkn_data

  0%|          | 0/140 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 139787
})

In [22]:
# shrink the data
# and split into train, test, and validation
tkn_data = tkn_data.select(range(int(0.1*len(tkn_data))))
temp1_dd = tkn_data.train_test_split(test_size=0.1, shuffle=True)
temp2_dd = temp1_dd['train'].train_test_split(test_size=0.1, shuffle=True)

tkn_data = DatasetDict({
    'train': temp2_dd['train'],
    'test': temp1_dd['test'],
    'valid': temp2_dd['test'],
})
del temp1_dd
del temp2_dd
tkn_data  

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11322
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1398
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1258
    })
})

In [23]:
# This function does no accept the return_tensors argument.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

# Take care of everyting using `to_tf_dataset()`
tf_train_data = tkn_data['train'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=8 )

tf_valid_data = tkn_data['valid'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=8 )

tf_test_data = tkn_data['test'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=False,
       collate_fn=data_collator,
       batch_size=8 )

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
%%script echo This is the training cell
# Decay the learning rate w/ PolynomialDecay

batch_size = 8
num_epochs = 1
num_train_steps = len(tf_train_data)*num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

#reload the model to change the optimizer
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#loss = tf.keras.losses.BinaryCrossentropy()

model.compile(optimizer=opt,
             loss=loss,
             metrics=['accuracy'])

# Training
model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)

This is the training cell


In [10]:
with open(model_path+'/cfg_dict.json', 'r') as fobj:
    cfg = json.loads(fobj.read())
cfg

{'shrink_data_factor': 1.0,
 'checkpoint': 'bert-base-uncased',
 'glob_data_source': 'training_defs/math*/*.xml.gz',
 'data_stream_batch_size': 5000,
 'num_epochs': 3,
 'batch_size': 32,
 'initial_lr': 5e-06,
 'end_lr': 0.0,
 'savedir': '/opt/data_dir/finetune/class-2023-06-29_1436/model',
 'configpath': '/opt/arxivDownload/config.toml',
 'base_dir': '/opt/data_dir',
 'local_dir': '/tmp/trainer',
 'timestamp': 'Jun-29_14-36',
 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Jun-29_14-36',
 'num_train_steps': 471501}

In [2]:
#model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)
    
model_path = '/media/hd1/TransformersFineTuned/class-2023-06-29_1436/'
with open(model_path+'/cfg_dict.json', 'r') as fobj:
    cfg = json.loads(fobj.read())
    
#model = TFAutoModelForSequenceClassification.from_pretrained(model_path+'model')
model = TFBertForSequenceClassification.from_pretrained(model_path+'model')
tokenizer = AutoTokenizer.from_pretrained(cfg['checkpoint'])

2023-08-16 16:31:34.328216: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-16 16:31:34.329187: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-16 16:31:34.329349: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-08-16 16:31:34.329766: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [4]:
# test the the opening of the promath tar.gz, parsing and extracting workflow with HF transformers
tarpath = '/media/hd1/promath/math19/1906_002.tar.gz'
tar_iter = peep.tar_iter(tarpath, '.xml')
fname, tobj = next(tar_iter)
parsing_obj = px.DefinitionsXML(tobj)
dd = Definiendum(parsing_obj, model, None, None, tokenizer)

  0%|          | 0/1 [00:00<?, ?ba/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [1]:
import transformers
transformers.__version__

'4.25.1'

In [5]:
for ele in dd.root.findall('definition'):
    print(ele.find('stmnt').text)
    print("----------------------------------------------")

 We say a continuous function _inline_math_ is convex on _inline_math_, if _inline_math_, one has (2.16) Equation 2.16 _display_math_ 
----------------------------------------------
 _inline_math_ is _inline_math_-strongly convex on _inline_math_ _inline_math_ _inline_math_, if _inline_math_ is _inline_math_-uniformly convex on _inline_math_ _inline_math_ _inline_math_. 
----------------------------------------------
 We say a function _inline_math_ on _inline_math_ has _inline_math_-Hölder continuous derivatives _inline_math_ _inline_math_, if _inline_math_, one has 
----------------------------------------------
 _inline_math_ is said to have _inline_math_-Lipschitz continuous derivatives on _inline_math_ _inline_math_ _inline_math_ if _inline_math_ has _inline_math_-Hölder continuous derivatives on _inline_math_ _inline_math_ _inline_math_. 
----------------------------------------------
 In Definition , we unify the definition of Hölder continuous gradients _inline_math_ and high-o

In [33]:
prepreds = model.predict(tf_test_data)

In [37]:
preds

array([[ 3.9831173, -4.1443624],
       [-2.4828198,  2.5047653],
       [-3.7847457,  3.9654768],
       ...,
       [-3.8161604,  4.048975 ],
       [-1.749708 ,  1.7498296],
       [ 1.2877778, -1.6421337]], dtype=float32)

In [24]:
preds = model.predict(tf_test_data)['logits']
class_preds = np.argmax(preds, axis=1)

In [25]:
targets = []
for b in tf_test_data.as_numpy_iterator():
    targets.extend(list(b[1])) 

In [26]:
metric_str = metrics.classification_report((class_preds > 0.5).astype(int), targets)
print(metric_str)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       686
           1       0.96      0.97      0.97       712

    accuracy                           0.96      1398
   macro avg       0.96      0.96      0.96      1398
weighted avg       0.96      0.96      0.96      1398



In [14]:
%%script echo uncomment if want to save
#model.save_pretrained(save_directory='/media/hd1/TransformersFineTuned/BertHF1/')

uncomment if want to save


In [15]:
#dataset = load_dataset("rotten_tomatoes", split="train")

In [16]:
#dataset._info.features

## class-2023-08-14_1829
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.9500941619585688
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     55519
           1       0.95      0.95      0.95     57465

    accuracy                           0.95    112984
   macro avg       0.95      0.95      0.95    112984
weighted avg       0.95      0.95      0.95    112984

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     55519
           1       0.95      0.95      0.95     57465

    accuracy                           0.95    112984
   macro avg       0.95      0.95      0.95    112984
weighted avg       0.95      0.95      0.95    112984

{'shrink_data_factor': 1.0, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math1*/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 2, 'batch_size': 32, 'initial_lr': 2e-06, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-14_1829/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-14_18-29', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-14_18-29', 'num_train_steps': 228792}
```

## class-2023-08-07_1327
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.9479913667061198
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      6788
           1       0.95      0.95      0.95      7191

    accuracy                           0.95     13979
   macro avg       0.95      0.95      0.95     13979
weighted avg       0.95      0.95      0.95     13979

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      6788
           1       0.95      0.95      0.95      7191

    accuracy                           0.95     13979
   macro avg       0.95      0.95      0.95     13979
weighted avg       0.95      0.95      0.95     13979

{'shrink_data_factor': 1.0, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math18/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 3, 'batch_size': 32, 'initial_lr': 6e-06, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-07_1327/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-07_13-28', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-07_13-28', 'num_train_steps': 42462}
```

## class-2023-08-05_1512
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.944743935309973
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       653
           1       0.95      0.94      0.94       745

    accuracy                           0.94      1398
   macro avg       0.94      0.94      0.94      1398
weighted avg       0.94      0.94      0.94      1398

              precision    recall  f1-score   support

           0       0.93      0.94      0.94       653
           1       0.95      0.94      0.94       745

    accuracy                           0.94      1398
   macro avg       0.94      0.94      0.94      1398
weighted avg       0.94      0.94      0.94      1398

{'shrink_data_factor': 0.1, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math18/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 3, 'batch_size': 32, 'initial_lr': 2e-05, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-05_1512/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-05_15-12', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-05_15-12', 'num_train_steps': 4248}
```