# Loading data using Hugging Face datasets methods

https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html

https://huggingface.co/learn/nlp-course/chapter5/5?fw=tf

In [1]:
import tensorflow as tf
from datasets import load_dataset, Dataset, DatasetDict 
import pandas as pd
import sys
import os
import glob
import numpy as np
import toml
import json

from transformers import (AutoTokenizer,
                         TFAutoModelForSequenceClassification,
                          TFBertForSequenceClassification,
                         DataCollatorWithPadding,
                         TFPreTrainedModel,
              TFGPT2ForSequenceClassification,)

# keras for training
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
import sklearn.metrics as metrics
from lxml import etree

currentdir = os.path.abspath(os.path.curdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
sys.path.insert(0,parentdir+'/embed') 
from classifier_trainer.trainer import stream_arxiv_paragraphs
import parsing_xml as px
import peep_tar as peep

from train_lstm import gen_cfg, find_best_cutoff
%load_ext autoreload
%autoreload 2
from extract import Definiendum
args = []
#xml_lst, cfg = gen_cfg(config_path='../config.toml')

In [2]:
tf.config.list_physical_devices("GPU")                                  

2023-11-06 20:07:08.359990: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2023-11-06 20:07:08.360019: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: zorn
2023-11-06 20:07:08.360024: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: zorn
2023-11-06 20:07:08.360141: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 525.147.5
2023-11-06 20:07:08.360159: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 525.125.6
2023-11-06 20:07:08.360164: E tensorflow/stream_executor/cuda/cuda_diagnostics.cc:313] kernel version 525.125.6 does not match DSO version 525.147.5 -- cannot find working devices in this configuration


[]

In [3]:
cfg = toml.load('../config.toml')
cfg = cfg['finetuning']

In [4]:
xml_lst = glob.glob('/media/hd1/training_defs/math18/*.xml.gz')
#xml_lst = xml_lst[:len(xml_lst)//4]

In [5]:
stream = stream_arxiv_paragraphs(xml_lst, samples=cfg['batch_size'])

all_data = []
all_labels = []
all_texts = []
for s in stream:
    try:
        #all_data += list(zip(s[0], s[1]))
        all_texts += s[0]
        all_labels += s[1]
    except IndexError:
        logger.warning('Index error in the data stream.')
data_dict = {
    'text': all_texts,
    'label': all_labels
}
ds = Dataset.from_dict(data_dict)

In [6]:
all_texts[:3]

[' Let _inline_math_ be a relation on _inline_math_ that satisfies HLC. The edge-labeled tree _inline_math_ has the following cluster set _display_math_ and each edge _inline_math_ of _inline_math_ obtains the label _display_math_ ',
 ' An _inline_math_-category _inline_math_ is called minimal if _inline_math_. ',
 ' Combining the lower and upper bounds in () gives _display_math_ Since _inline_math_ and _inline_math_, this implies that _inline_math_ _inline_math_ _inline_math_ using _inline_math_, using _inline_math_, _inline_math_ by the definition of _inline_math_ in (). This finishes the proof. ∎ ']

In [8]:
# quick check the loading of the model
#checkpoint = 'bert-base-uncased'
#checkpoint = 'gpt2' Does not work
checkpoint = 'facebook/bart-base'
#checkpoint = 'distilgpt2'
#checkpoint = 'openai-gpt'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True)
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

#tok2 = AutoTokenizer.from_pretrained('roberta-base')

#sequences = all_texts[:10]
#batch = dict(
#    tokenizer(sequences, return_tensors='tf',
#             padding=True, truncation=True)
#)
#model.compile(
#    optimizer='adam',
#    loss='binary_crossentropy'
#)
#labels = tf.convert_to_tensor(all_labels[:10])
#model.train_on_batch(batch, labels)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ValueError: Unrecognized configuration class <class 'transformers.models.bart.configuration_bart.BartConfig'> for this kind of AutoModel: TFAutoModelForSequenceClassification.
Model type should be one of AlbertConfig, BertConfig, CamembertConfig, ConvBertConfig, CTRLConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, EsmConfig, FlaubertConfig, FunnelConfig, GPT2Config, GPTJConfig, LayoutLMConfig, LayoutLMv3Config, LongformerConfig, MobileBertConfig, MPNetConfig, OpenAIGPTConfig, RemBertConfig, RobertaConfig, RoFormerConfig, TapasConfig, TransfoXLConfig, XLMConfig, XLMRobertaConfig, XLNetConfig.

In [8]:
print(tokenizer(['hi, this is exciting',
          'this is getting boring']))

{'input_ids': [[5303, 11, 428, 318, 7895], [5661, 318, 1972, 14262]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1]]}


In [9]:
def tok_function(example):
    # This function can be used with the Dataset.map() method
    return tokenizer(example['text'], truncation=True)

def add_missing_token_type(ex):
    #print(len(ex['attention_mask']))
    ex['token_type_ids'] = [len(x)*[0] for x in ex['attention_mask']]
    return ex

tkn_data = ds.map(tok_function, batched=True)
print(tkn_data)

  0%|          | 0/140 [00:00<?, ?ba/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 139787
})


In [10]:
# shrink the data
# and split into train, test, and validation
tkn_data = tkn_data.select(range(int(0.1*len(tkn_data))))
temp1_dd = tkn_data.train_test_split(test_size=0.1, shuffle=True)
temp2_dd = temp1_dd['train'].train_test_split(test_size=0.1, shuffle=True)

tkn_data = DatasetDict({
    'train': temp2_dd['train'],
    'test': temp1_dd['test'],
    'valid': temp2_dd['test'],
})
del temp1_dd
del temp2_dd
tkn_data  

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 11322
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1398
    })
    valid: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1258
    })
})

In [11]:
# This function does no accept the return_tensors argument.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

#column_lst = ['attention_mask', 'input_ids', 'token_type_ids']
column_lst = ['attention_mask', 'input_ids' ]

#For GPT2 which has no padding token

#tokenizer.pad_token = tokenizer.special_tokens_map['eos_token']

# Take care of everyting using `to_tf_dataset()`
tf_train_data = tkn_data['train'].to_tf_dataset(
       columns=column_lst,
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=2 )

tf_valid_data = tkn_data['valid'].to_tf_dataset(
       columns=column_lst,
       label_cols=['label'],
       shuffle=True,
       collate_fn=data_collator,
       batch_size=2 )

tf_test_data = tkn_data['test'].to_tf_dataset(
       columns=column_lst,
       label_cols=['label'],
       shuffle=False,
       collate_fn=data_collator,
       batch_size=1 )

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [12]:
ii = next(iter(tf_train_data))
#ii['input_ids']
print(ii[0])
print(ii[1])
M = model(**ii[0])

{'input_ids': <tf.Tensor: shape=(2, 112), dtype=int64, numpy=
array([[ 1309,  4808, 45145,    62, 11018,    62,   290,  4808, 45145,
           62, 11018,    62,   307,   884,   326,  4808, 45145,    62,
        11018,    62,    11,  4808, 45145,    62, 11018,    62,   290,
         4808, 45145,    62, 11018,    62,   329,   477,  4808, 45145,
           62, 11018,    62,   329,  4808, 45145,    62, 11018, 44807,
         3244,  4808, 45145,    62, 11018,    62,   290,   611,  4808,
        45145,    62, 11018,    62,   788,  4808, 45145,    62, 11018,
           62,   318,  2048, 10403,   257,  2880,  1352,    11,   981,
          611,  4808, 45145,    62, 11018,    62,   788,    11,  2048,
        10403,    11,  2035,  4808, 45145,    62, 11018,    62,  2048,
         8347,   393,  4808, 45145,    62, 11018,    62,  2048,  8347,
           13,   220, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256],
       [ 3914,  4808, 45145,    62, 11018, 44807,

In [13]:
#%%script echo This is the training cell
# Decay the learning rate w/ PolynomialDecay

batch_size = 8
num_epochs = 1
num_train_steps = len(tf_train_data)*num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)

opt = Adam(learning_rate=lr_scheduler)

#reload the model to change the optimizer
#model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
#model = TFGPT2ForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
#model.resize_token_embeddings(new_num_tokens=50257 )
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
#loss = tf.keras.losses.BinaryCrossentropy()

model.compile(optimizer=opt,
             loss=loss,
             metrics=['accuracy'])

# Training
model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)



<keras.callbacks.History at 0x7fb05012b100>

In [14]:
#model.fit( tf_train_data,  epochs=3)

In [15]:
%%script echo no hacer
#model.fit( tf_train_data, validation_data=tf_valid_data, epochs=num_epochs)
    
model_path = '/media/hd1/TransformersFineTuned/class-2023-06-29_1436/'
with open(model_path+'/cfg_dict.json', 'r') as fobj:
    cfg = json.loads(fobj.read())
    
#model = TFAutoModelForSequenceClassification.from_pretrained(model_path+'model')
model = TFBertForSequenceClassification.from_pretrained(model_path+'model')
tokenizer = AutoTokenizer.from_pretrained(cfg['checkpoint'])

no hacer


In [16]:
%%script echo no hacer
# test the the opening of the promath tar.gz, parsing and extracting workflow with HF transformers
tarpath = '/media/hd1/promath/math19/1906_002.tar.gz'
tar_iter = peep.tar_iter(tarpath, '.xml')
fname, tobj = next(tar_iter)
parsing_obj = px.DefinitionsXML(tobj)
dd = Definiendum(parsing_obj, model, None, None, tokenizer)

no hacer


In [17]:
it = iter(tf_test_data)
next(it)

({'input_ids': <tf.Tensor: shape=(1, 94), dtype=int64, numpy=
  array([[  317, 14136,    72,   495, 22664,    89,    12,   305,  5191,
          28418,   375,   282,  1641,   318,   257,  1641,  4808, 45145,
             62, 11018,    62,    11,  4808, 45145,    62, 11018,    62,
             11,   286,  1729,    12, 13500,   877,   378,   311,    12,
            403,   320,   375,   282,  8739,   319,  4808, 45145,    62,
          11018,    62,   351,   262,  4688,   966,  4808, 45145,    62,
          11018, 44807,   775,  2421,   326,  4808, 45145,    62, 11018,
             62,   318,   257, 14136,    72,   495, 22664,    89,  3975,
            290,  4808, 45145,    62, 11018,    62,   318,  4808, 45145,
             62, 11018,    62,   355,   257,  2163,   286,  4808, 45145,
             62, 11018, 44807,   220]])>,
  'attention_mask': <tf.Tensor: shape=(1, 94), dtype=int64, numpy=
  array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1

In [31]:
shapes_lst = []
for dat, tar in tf_test_data:
    shapes_lst.append(dat['input_ids'].shape[1])
max(shapes_lst)

986

In [32]:
for _ in range(3):
    tt = next(iter(tf_test_data))[0]
prepreds = model.predict(tt)

2023-10-25 22:52:04.944769: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 282.0KiB (rounded to 288768)requested by op tfgpt2_for_sequence_classification/transformer/h_._0/attn/split
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-10-25 22:52:04.945300: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2023-10-25 22:52:04.945315: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 1212, Chunks in use: 1212. 303.0KiB allocated for chunks. 303.0KiB in use in bin. 9.1KiB client-requested in use in bin.
2023-10-25 22:52:04.945325: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 1, Chunks in use: 1. 512B allocated for chunks. 512B in use in bin. 376B client-request

ResourceExhaustedError: Graph execution error:

Detected at node 'tfgpt2_for_sequence_classification/transformer/h_._0/attn/split' defined at (most recent call last):
    File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/usr/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/luis/.local/lib/python3.10/site-packages/traitlets/config/application.py", line 1041, in launch_instance
      app.start()
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 724, in start
      self.io_loop.start()
    File "/home/luis/.local/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 512, in dispatch_queue
      await self.process_one()
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 501, in process_one
      await dispatch(*args)
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 408, in dispatch_shell
      await result
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 731, in execute_request
      reply_content = await reply_content
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 417, in do_execute
      res = shell.run_cell(
    File "/home/luis/.local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2945, in run_cell
      result = self._run_cell(
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3000, in _run_cell
      return runner(coro)
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3203, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3382, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/luis/.local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3442, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_263569/86284319.py", line 1, in <module>
      prepreds = model.predict(next(iter(tf_test_data))[0])
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1982, in predict
      tmp_batch_outputs = self.predict_function(iterator)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1801, in predict_function
      return step_function(self, iterator)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1790, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1783, in run_step
      outputs = model.predict_step(data)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/training.py", line 1751, in predict_step
      return self(x, training=False)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1148, in run_call_with_unpacked_inputs
      return cls._from_config(config, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 1160, in call
      transformer_outputs = self.transformer(
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/modeling_tf_utils.py", line 1148, in run_call_with_unpacked_inputs
      return cls._from_config(config, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 476, in call
      for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 480, in call
      outputs = block(
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 255, in call
      output_attn = self.attn(
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1096, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
      return fn(*args, **kwargs)
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 168, in call
      if encoder_hidden_states is not None:
    File "/home/luis/.local/lib/python3.10/site-packages/transformers/models/gpt2/modeling_tf_gpt2.py", line 181, in call
      query, key, value = tf.split(x, 3, axis=2)
Node: 'tfgpt2_for_sequence_classification/transformer/h_._0/attn/split'
OOM when allocating tensor with shape[1,94,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node tfgpt2_for_sequence_classification/transformer/h_._0/attn/split}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_predict_function_110438]

.cc:1074] 6 Chunks of size 932608 totalling 5.34MiB
2023-10-25 22:52:05.324476: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 1 Chunks of size 935424 totalling 913.5KiB
2023-10-25 22:52:05.324480: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 1 Chunks of size 936448 totalling 914.5KiB
2023-10-25 22:52:05.324484: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 60 Chunks of size 940032 totalling 53.79MiB
2023-10-25 22:52:05.324488: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 1 Chunks of size 940288 totalling 918.2KiB
2023-10-25 22:52:05.324492: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 1 Chunks of size 941312 totalling 919.2KiB
2023-10-25 22:52:05.324496: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 37 Chunks of size 946176 totalling 33.39MiB
2023-10-25 22:52:05.324500: I tensorflow/core/common_runtime/bfc_allocator.cc:1074] 1 Chunks of size 946944 totalling 924.8KiB
2023-10-25 22:52:05.324505: I tensorflow/core/common_runt

In [34]:
model.

<transformers.models.gpt2.modeling_tf_gpt2.TFGPT2ForSequenceClassification at 0x7fb0b6dcfbe0>

In [None]:
preds = model.predict(tf_test_data)['logits']
class_preds = np.argmax(preds, axis=1)

In [None]:
%%script echo no hacer
targets = []
for b in tf_test_data.as_numpy_iterator():
    targets.extend(list(b[1])) 

In [None]:
%%script echo no hacer
metric_str = metrics.classification_report((class_preds > 0.5).astype(int), targets)
print(metric_str)

In [35]:
#%%script echo uncomment if want to save
model.save_pretrained(save_directory='/home/luis/rm_me_culito')

2023-10-25 22:54:52.451346: W tensorflow/core/common_runtime/bfc_allocator.cc:462] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.00MiB (rounded to 3145728)requested by op ReadVariableOp
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2023-10-25 22:54:52.451681: I tensorflow/core/common_runtime/bfc_allocator.cc:1010] BFCAllocator dump for GPU_0_bfc
2023-10-25 22:54:52.451700: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (256): 	Total Chunks: 1209, Chunks in use: 1209. 302.2KiB allocated for chunks. 302.2KiB in use in bin. 9.1KiB client-requested in use in bin.
2023-10-25 22:54:52.451711: I tensorflow/core/common_runtime/bfc_allocator.cc:1017] Bin (512): 	Total Chunks: 0, Chunks in use: 0. 0B allocated for chunks. 0B in use in bin. 0B client-requested in use in bin.
2023-10-25 22:54:52.451721: I tensorf

ResourceExhaustedError: OOM when allocating tensor with shape[1024,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:ReadVariableOp]

In [None]:
#dataset = load_dataset("rotten_tomatoes", split="train")

In [None]:
#dataset._info.features

## class-2023-08-14_1829
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.9500941619585688
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     55519
           1       0.95      0.95      0.95     57465

    accuracy                           0.95    112984
   macro avg       0.95      0.95      0.95    112984
weighted avg       0.95      0.95      0.95    112984

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     55519
           1       0.95      0.95      0.95     57465

    accuracy                           0.95    112984
   macro avg       0.95      0.95      0.95    112984
weighted avg       0.95      0.95      0.95    112984

{'shrink_data_factor': 1.0, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math1*/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 2, 'batch_size': 32, 'initial_lr': 2e-06, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-14_1829/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-14_18-29', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-14_18-29', 'num_train_steps': 228792}
```

## class-2023-08-07_1327
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.9479913667061198
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      6788
           1       0.95      0.95      0.95      7191

    accuracy                           0.95     13979
   macro avg       0.95      0.95      0.95     13979
weighted avg       0.95      0.95      0.95     13979

              precision    recall  f1-score   support

           0       0.94      0.95      0.95      6788
           1       0.95      0.95      0.95      7191

    accuracy                           0.95     13979
   macro avg       0.95      0.95      0.95     13979
weighted avg       0.95      0.95      0.95     13979

{'shrink_data_factor': 1.0, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math18/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 3, 'batch_size': 32, 'initial_lr': 6e-06, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-07_1327/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-07_13-28', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-07_13-28', 'num_train_steps': 42462}
```

## class-2023-08-05_1512
```
List of XLA GPUs: []
opt_prob=0.1 and f1_max=0.944743935309973
              precision    recall  f1-score   support

           0       0.93      0.94      0.94       653
           1       0.95      0.94      0.94       745

    accuracy                           0.94      1398
   macro avg       0.94      0.94      0.94      1398
weighted avg       0.94      0.94      0.94      1398

              precision    recall  f1-score   support

           0       0.93      0.94      0.94       653
           1       0.95      0.94      0.94       745

    accuracy                           0.94      1398
   macro avg       0.94      0.94      0.94      1398
weighted avg       0.94      0.94      0.94      1398

{'shrink_data_factor': 0.1, 'checkpoint': 'bert-large-cased', 'glob_data_source': 'training_defs/math18/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 3, 'batch_size': 32, 'initial_lr': 2e-05, 'end_lr': 0, 'savedir': '/opt/data_dir/finetune/class-2023-08-05_1512/model', 'configpath': '/opt/arxivDownload/config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Aug-05_15-12', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Aug-05_15-12', 'num_train_steps': 4248}
```

## class-2023-10-20_1733
```
opt_prob=0.1 and f1_max=0.9485215543995035
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     77142
           1       0.95      0.95      0.95     78085

    accuracy                           0.95    155227
   macro avg       0.95      0.95      0.95    155227
weighted avg       0.95      0.95      0.95    155227

              precision    recall  f1-score   support

           0       0.95      0.95      0.95     77142
           1       0.95      0.95      0.95     78085

    accuracy                           0.95    155227
   macro avg       0.95      0.95      0.95    155227
weighted avg       0.95      0.95      0.95    155227

{'shrink_data_factor': 1.0, 'checkpoint': 'roberta-large', 'glob_data_source': 'training_defs/math*/*.xml.gz', 'data_stream_batch_size': 5000, 'num_epochs': 3, 'batch_size': 32, 'initial_lr': 5e-06, 'end_lr': 0.0, 'savedir': '/opt/data_dir/finetune/class-2023-10-20_1733/model', 'configpath': '/opt/arxivDownload/rmme_config.toml', 'base_dir': '/opt/data_dir', 'local_dir': '/tmp/trainer', 'timestamp': 'Oct-20_17-33', 'save_path_dir': '/opt/data_dir/trained_models/finetuning/HFTransformers_Oct-20_17-33', 'num_train_steps': 117876, 'opt_thresh': 0.1, 'f1_max': 0.9485215543995035}
```