# Loading data using Hugging Face datasets methods

https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html

https://huggingface.co/learn/nlp-course/chapter5/5?fw=tf

In [3]:
import tensorflow as tf
from datasets import load_dataset, Dataset, DatasetDict 
import pandas as pd
import sys
import os
import glob
import numpy as np
import toml
import json

from transformers import (AutoTokenizer,
                         TFAutoModelForSequenceClassification,
                         DataCollatorWithPadding,
                         TFPreTrainedModel,)

# keras for training
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import sklearn.metrics as metrics

currentdir = os.path.abspath(os.path.curdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
sys.path.insert(0,parentdir+'/embed') 
from classifier_trainer.trainer import stream_arxiv_paragraphs
from extract import Definiendum
import parsing_xml as px
import peep_tar as peep

from train_lstm import gen_cfg, find_best_cutoff
args = []
#xml_lst, cfg = gen_cfg(config_path='../config.toml')

In [4]:
cfg = toml.load('../config.toml')

In [5]:
cfg = cfg['finetuning']

In [13]:
class RandClf():
    def __init__(self):
        pass
    def predict(self, lst, **kwargs):                                                 
        np.random.seed(seed=42)
        return np.zeros(len(lst))                                               
        
class FarseVectorizer():
    def __init__(self):                                                               
        pass                                                                          
    def transform(self, lst):
        return [0.0 for _ in lst]

class FarseBio():
    def __init__(self):
        pass
    def parse(self, lst):
        return [0.0 for _ in lst]

tarpath = '/media/hd1/promath/math11/1105_005.tar.gz'
tar_iter = peep.tar_iter(tarpath, '.xml')
fname, tobj = next(tar_iter)
parsing_obj = px.DefinitionsXML(tobj)
clf = RandClf()
vtr = FarseVectorizer()
dd = Definiendum(parsing_obj, clf, None, vtr, None)

In [19]:
dd.first_col

(1, 3, 4, 5, 7, 8, 9, 10, 11)

In [None]:
xml_lst = glob.glob('/media/hd1/training_defs/math18/*.xml.gz')
#xml_lst = xml_lst[:len(xml_lst)//4]

In [13]:
stream = stream_arxiv_paragraphs(xml_lst, samples=cfg['batch_size'])

all_data = []
all_labels = []
all_texts = []
for s in stream:
    try:
        #all_data += list(zip(s[0], s[1]))
        all_texts += s[0]
        all_labels += s[1]
    except IndexError:
        logger.warning('Index error in the data stream.')
data_dict = {
    'text': all_texts,
    'label': all_labels
}
ds = Dataset.from_dict(data_dict)

In [14]:
ds

Dataset({
    features: ['text', 'label'],
    num_rows: 139787
})

In [17]:
# quick check the loading of the model
checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

#model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)
#sequences = all_texts[:10]
#batch = dict(
#    tokenizer(sequences, return_tensors='tf',
#             padding=True, truncation=True)
#)
#model.compile(
#    optimizer='adam',
#    loss='binary_crossentropy'
#)
#labels = tf.convert_to_tensor(all_labels[:10])
#model.train_on_batch(batch, labels)

In [21]:
def tok_function(example):
    # This function can be used with the Dataset.map() method
    return tokenizer(example['text'], truncation=True)

tkn_data = ds.map(tok_function, batched=True)
tkn_data

  0%|          | 0/140 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 139787
})

In [22]:
# shrink the data
# and split into train, test, and validation
tkn_data = tkn_data.select(range(int(0.1*len(tkn_data))))
temp1_dd = tkn_data.train_test_split(test_size=0.1, shuffle=True)
temp2_dd = temp1_dd['train'].train_test_split(test_size=0.1, shuffle=True)

tkn_data = DatasetDict({
    'train': temp2_dd['train'],
    'test': temp1_dd['test'],
    'valid': temp2_dd['test'],
})
del temp1_dd
del temp2_dd
tkn_data  

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 11322
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1398
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1258
    })
})

In [23]:
# This function does no accept the return_tensors argument.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

# Take care of everyting using `to_tf_dataset()`
tf_train_data = tkn_data['train'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=8 )

tf_valid_data = tkn_data['valid'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=8 )

tf_test_data = tkn_data['test'].to_tf_dataset(
       columns=['attention_mask', 'input_ids', 'token_type_ids'],
       label_cols=['label'],
       shuffle=False,
       collate_fn=data_collator,
       batch_size=8 )

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [9]:
%%script echo This is the training cell
# Decay the learning rate w/ PolynomialDecay

batch_size = 8
num_epochs = 1
num_train_steps = len(tf_train_data)*num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

#reload the model to change the optimizer
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#loss = tf.keras.losses.BinaryCrossentropy()

model.compile(optimizer=opt,
             loss=loss,
             metrics=['accuracy'])

# Training
model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)

This is the training cell


{'_name_or_path': 'bert-base-uncased',
 'architectures': ['BertForSequenceClassification'],
 'attention_probs_dropout_prob': 0.1,
 'classifier_dropout': None,
 'gradient_checkpointing': False,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'layer_norm_eps': 1e-12,
 'max_position_embeddings': 512,
 'model_type': 'bert',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'pad_token_id': 0,
 'position_embedding_type': 'absolute',
 'transformers_version': '4.30.2',
 'type_vocab_size': 2,
 'use_cache': True,
 'vocab_size': 30522}

In [32]:
#model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)
with open(model_path+'/config.json', 'r') as fobj:
    HF_cfg = json.loads(fobj.read())
    
model_path = '/media/hd1/TransformersFineTuned/class-2023-06-29_1436/model'
model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(HF_cfg['_name_or_path'])

Some layers from the model checkpoint at /media/hd1/TransformersFineTuned/class-2023-06-29_1436/model were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /media/hd1/TransformersFineTuned/class-2023-06-29_1436/model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further

In [33]:
prepreds = model.predict(tf_test_data)

In [37]:
preds

array([[ 3.9831173, -4.1443624],
       [-2.4828198,  2.5047653],
       [-3.7847457,  3.9654768],
       ...,
       [-3.8161604,  4.048975 ],
       [-1.749708 ,  1.7498296],
       [ 1.2877778, -1.6421337]], dtype=float32)

In [24]:
preds = model.predict(tf_test_data)['logits']
class_preds = np.argmax(preds, axis=1)

In [25]:
targets = []
for b in tf_test_data.as_numpy_iterator():
    targets.extend(list(b[1])) 

In [26]:
metric_str = metrics.classification_report((class_preds > 0.5).astype(int), targets)
print(metric_str)

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       686
           1       0.96      0.97      0.97       712

    accuracy                           0.96      1398
   macro avg       0.96      0.96      0.96      1398
weighted avg       0.96      0.96      0.96      1398



In [14]:
%%script echo uncomment if want to save
#model.save_pretrained(save_directory='/media/hd1/TransformersFineTuned/BertHF1/')

uncomment if want to save


In [15]:
#dataset = load_dataset("rotten_tomatoes", split="train")

In [16]:
#dataset._info.features

In [2]:
d = {'a': 1, 'b':2}
d.update({'c':3})
d

{'a': 1, 'b': 2, 'c': 3}

In [5]:
2 in d

False

In [6]:
import argparse

In [24]:
parser = argparse.ArgumentParser()
p.add_argument('filename')           # positional argument
p.parse_args()

usage: ipykernel_launcher.py [-h] filename filename
ipykernel_launcher.py: error: the following arguments are required: filename


SystemExit: 2

In [27]:
P = parser.parse_args("")
type(P)

argparse.Namespace

In [28]:
dict(P)

TypeError: 'Namespace' object is not iterable