## Load dataset

In [1]:
from datasets import load_dataset

In [2]:
dataset = load_dataset("SetFit/bbc-news", split = "train")

In [3]:
dataset[0]

{'text': 'wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh rugby union  despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through  before the talented stand-off retu

In [4]:
dataset.format # note type = None

{'type': None,
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

## Convert dataset

### Option 1: format using with_format()

**Pytorch**

In [5]:
pt_format = dataset.with_format(type="torch")
pt_format.format # note type = Torch

{'type': 'torch',
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

**Tensorflow**

In [6]:
tf_format = dataset.with_format(type="tf")
tf_format.format # note type = tensorflow

{'type': 'tensorflow',
 'format_kwargs': {},
 'columns': ['text', 'label', 'label_text'],
 'output_all_columns': False}

### Option 2: set our tokenizer to return the desired format

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

**Pytorch**

In [8]:
tokenizer(dataset[0]['text'], max_length=20, padding='max_length', truncation=True, return_tensors="pt")

{'input_ids': tensor([[ 101, 3575, 2215, 4043, 2223, 2731, 3575, 2071, 3582, 2563, 1055, 2599,
         2011, 2731, 2007, 1037, 4043, 2223, 2252,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

**Tensorflow**

In [9]:
tokenizer(dataset[0]['text'], max_length=20, padding='max_length', truncation=True, return_tensors="tf")

{'input_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=
array([[ 101, 3575, 2215, 4043, 2223, 2731, 3575, 2071, 3582, 2563, 1055,
        2599, 2011, 2731, 2007, 1037, 4043, 2223, 2252,  102]])>, 'token_type_ids': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(1, 20), dtype=int32, numpy=array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

### Option 3: apply transformations after tokenization

In [10]:
# tokenize our dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], max_length=50, padding='max_length', truncation=True) # shorter max length to speed up training

tokenized = dataset.map(tokenize_function, batched=True)
tokenized = tokenized.remove_columns(['text', 'label_text'])

**Pytorch**

In [11]:
tokenized_pt = tokenized.with_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [12]:
tokenized_pt = tokenized_pt.train_test_split(test_size=0.3)

In [13]:
tokenized_pt['train'][0]

{'label': tensor(4),
 'input_ids': tensor([  101,  8359,  2000,  4895,  3726,  4014,  7521,  2933,  2047,  7711,
          2006,  3171, 16836,  1998, 12347,  3675, 14703,  2097,  2022,  2112,
          1997,  2231,  3488, 11521,  2006,  6928,  1012,  2188,  3187,  2798,
          8359,  4122,  2000,  8970,  1037,  2685,  2291,  2005,  3171, 16836,
          1998,  3623, 23702,  2015,  1997,  3478, 11386, 24071,  1012,   102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1])}

**Tensorflow**

In [14]:
# pip install --no-deps tensorflow (running without no deps broke numpy)
import tensorflow as tf

In [32]:
tokenized_tf = tokenized.with_format(type='tf', columns=['input_ids', 'token_type_ids', 'attention_mask', 'label'])

In [33]:
tokenized_tf = tokenized_tf.train_test_split(test_size=0.3)

In [34]:
tokenized_tf['train'][0]

{'label': <tf.Tensor: shape=(), dtype=int64, numpy=2>,
 'input_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([  101,  5982, 11124,  2015, 23439,  6904,  6834,  2346,  5823,
         3306, 19938, 12849,  9153,  2015,  5982, 11124,  2015,  2038,
         6380,  4447,  2008,  2002,  8275,  2094,  1037,  5013,  5638,
         3489,  5823,  2000,  4468,  1037, 23799,  3231,  2420,  2077,
         1996,  2707,  1997,  1996,  3783,  1012,  5982, 11124,  2015,
         1998,  3507, 19938,  5736,   102], dtype=int64)>,
 'token_type_ids': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int64)>,
 'attention_mask': <tf.Tensor: shape=(50,), dtype=int64, numpy=
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      

## Load model

In [18]:
# pip install tf-keras

**Pytorch**

In [19]:
from transformers import AutoModelForSequenceClassification
model_pt = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Tensorflow**

In [20]:
from transformers import TFAutoModelForSequenceClassification
model_tf = TFAutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train model

**Pytorch**

In [21]:
from transformers import TrainingArguments, Trainer

In [22]:
training_args = TrainingArguments(
    output_dir="pt_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01
)

trainer = Trainer(
    model=model_pt,
    args=training_args,
    train_dataset=tokenized_pt['train'],
    eval_dataset=tokenized_pt['test']
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.291571
2,No log,0.151326


TrainOutput(global_step=216, training_loss=0.47597309395119, metrics={'train_runtime': 404.9987, 'train_samples_per_second': 4.232, 'train_steps_per_second': 0.533, 'total_flos': 44041454705400.0, 'train_loss': 0.47597309395119, 'epoch': 2.0})

**Tensorflow**

In [35]:
tokenized_tf = tokenized_tf['train'].to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols=["label"],
    shuffle=True,
    batch_size=16
) # you could also run this directly on tokenized without doing the other steps above in option 3

In [36]:
model_tf.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

model_tf.fit(tokenized_tf, epochs=2)

Epoch 1/2
Epoch 2/2


<tf_keras.src.callbacks.History at 0x1f4a2ea4dd0>