In [1]:
# https://www.kdnuggets.com/using-hugging-face-transformers-with-pytorch-and-tensorflow

In [2]:
# loading dataset

In [3]:
from datasets import load_dataset

In [4]:
#dataset = load_dataset("asuender/motivational-quotes", "quotes", split="train")
dataset = load_dataset("SetFit/bbc-news", split = "train")
#dataset = load_dataset("imdb", split="train")

In [5]:
dataset[0]

{'text': 'wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh rugby union  despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through  before the talented stand-off retu

In [6]:
dataset.format

{'type': None,
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

In [7]:
# option 1: set the format of columns using with_format(), create PyTorch tensors by setting type="torch"

In [8]:
py_1 = dataset.with_format(type="torch")

In [9]:
py_1.format

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

In [10]:
tf_1 = dataset.with_format(type="tf")

In [11]:
tf_1.format

{'type': 'tensorflow',
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

In [12]:
# we can also set our tokenizer options

In [13]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [14]:
tokenizer(dataset[0]['text'], max_length=20, padding='max_length', truncation=True, return_tensors="pt")

{'input_ids': tensor([[ 101, 3575, 2215, 4043, 2223, 2731, 3575, 2071, 3582, 2563, 1055, 2599,
         2011, 2731, 2007, 1037, 4043, 2223, 2252,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [15]:
tokenizer(dataset[0]['text'], max_length=20, padding='max_length', truncation=True, return_tensors="tf")

{'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[ 101, 3575, 2215, 4043, 2223, 2731, 3575, 2071, 3582, 2563, 1055,
        2599, 2011, 2731, 2007, 1037, 4043, 2223, 2252,  102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [16]:
# or apply transformations after tokenization

In [17]:
def tokenize_function(examples):
    return tokenizer(examples['text'], max_length=50, padding='max_length', truncation=True)

tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.remove_columns(['text', 'label_text'])

Map:   0%|          | 0/1225 [00:00<?, ? examples/s]

In [18]:
tokenized_py_1 = tokenized.with_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [19]:
tokenized_py_1[0]

{'label': tensor(2),
 'input_ids': tensor([  101,  3575,  2215,  4043,  2223,  2731,  3575,  2071,  3582,  2563,
          1055,  2599,  2011,  2731,  2007,  1037,  4043,  2223,  2252,  1012,
          2563,  2031,  2525,  2018,  1037,  2093,  1011,  2154,  5219,  2007,
          7873, 24091,  2015,  1998,  3575,  2024,  2245,  2000,  2022,  4699,
          1999,  1037,  2714,  9349,  2007,  9169,  2358, 24074,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

In [20]:
tokenized_py_1 = tokenized_py_1.train_test_split(test_size=0.3)

In [21]:
# now with tensorflow
# pip install --no-deps tensorflow (running without no deps broke numpy)
import tensorflow as tf

In [22]:
tokenized_tf_1 = tokenized.with_format(type='tf', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [23]:
tokenized_tf_1[0]

{'label': <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([  101,  3575,  2215,  4043,  2223,  2731,  3575,  2071,  3582,
         2563,  1055,  2599,  2011,  2731,  2007,  1037,  4043,  2223,
         2252,  1012,  2563,  2031,  2525,  2018,  1037,  2093,  1011,
         2154,  5219,  2007,  7873, 24091,  2015,  1998,  3575,  2024,
         2245,  2000,  2022,  4699,  1999,  1037,  2714,  9349,  2007,
         9169,  2358, 24074,  1012,   102], dtype=int64)>,
 'token_type_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int64)>,
 'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      

In [24]:
tokenized_tf_1 = tokenized_tf_1.train_test_split(test_size=0.3)

In [25]:
# loading model
# pip install tf-keras

#pytorch
from transformers import AutoModelForSequenceClassification
model_pt = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5, torch_dtype="auto")

#tensorflow
from transformers import TFAutoModelForSequenceClassification
model_tf = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.






All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
# pytorch model
# we can use the trainer for high level cutsomization 
#PyTorch
training_args = TrainingArguments(
    output_dir="./pt_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model_pt,
    args=training_args,
    train_dataset=tokenized_py_1['train'],
    eval_dataset=tokenized_py_1['test'],
)

trainer.train()

Epoch,Training Loss,Validation Loss


In [33]:
#TensorFlow
model_tf.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

model_tf.fit(tokenized_tf_1['train'], epochs=2)

ValueError: dictionary update sequence element #0 has length 3; 2 is required