## Prepare dataset

In [1]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    'hfl/rbt3'
)

In [2]:
tokenizer

BertTokenizerFast(name_or_path='hfl/rbt3', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [3]:
#trail
tokenizer.batch_encode_plus(['Êò•Â§©ÊòØÂêÉÂÜ∞Ê∑áÂáåÁöÑÊó∂ËäÇ', 'ÊàëÂú®ÂñúÈ©¨ÊãâÈõÖÂ±±ÁöÑ‰∏äÁ©∫ÊäòÁ∫∏È£ûÊú∫'], truncation = True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 3217, 1921, 3221, 1391, 1102, 3899, 1119, 4638, 3198, 5688, 102], [101, 2769, 1762, 1599, 7716, 2861, 7414, 2255, 4638, 677, 4958, 2835, 5291, 7607, 3322, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [4]:
from datasets import load_from_disk
dataset = load_from_disk('../ChnSentiCorp/')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [6]:
dataset['train'] = dataset['train'].shuffle().select(range(2000))
dataset['test'] = dataset['test'].shuffle().select(range(100))

In [7]:
dataset['train'][0]

{'text': '‰ª∑Ê†ºËøòÂèØ‰ª•ÂÜçÂæÄ‰∏ã‰∏ÄÁÇπ(Ê∏ØË°åËøô‰∏™‰ª∑Èí±ÂèØÊòØÂèåÁîµ+ÂÜÖÂåÖ+xp),3ËäØÁîµÊ±†ÊúâÁÇπÂ∞è,1gÂÜÖÂ≠òÊúâÁÇπÂ∞è,ÊúÄÈáçË¶ÅÁöÑÊòØÂàÜËæ®ÁéáÊòØ1024*576,Ëøô‰∏™ÊúâÁÇπÁÉ¶',
 'label': 0}

In [8]:
def f(data, tokenizer):
    return tokenizer.batch_encode_plus(data['text'], truncation = True)

In [9]:
dataset = dataset.map(f,
                      batched = True,
                      batch_size = 1000,
                      num_proc = 4,
                      remove_columns = ['text'],
                      fn_kwargs = {'tokenizer':tokenizer})

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [11]:
dataset['train'][0]

{'label': 0,
 'input_ids': [101,
  817,
  3419,
  6820,
  1377,
  809,
  1086,
  2518,
  678,
  671,
  4157,
  113,
  3949,
  6121,
  6821,
  702,
  817,
  7178,
  1377,
  3221,
  1352,
  4510,
  116,
  1079,
  1259,
  116,
  8766,
  114,
  117,
  124,
  5708,
  4510,
  3737,
  3300,
  4157,
  2207,
  117,
  10719,
  1079,
  2100,
  3300,
  4157,
  2207,
  117,
  3297,
  7028,
  6206,
  4638,
  3221,
  1146,
  6795,
  4372,
  3221,
  11570,
  115,
  8272,
  8158,
  117,
  6821,
  702,
  3300,
  4157,
  4172,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [12]:
# delete long sentences
def f2(data):
    return [len(i) <=  512 for i in data['input_ids']]

dataset = dataset.filter(
    f2,
    batched = True,
    batch_size = 1000,
    num_proc = 4
)

Filter (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

`batch_size` doesn‚Äôt change what gets filtered (the output result will be the same) ‚Äîit just controls how efficiently the filtering happens.
- It affects memory usage: Larger batches use more RAM.
- It affects speed: Larger batches usually mean fewer function calls and faster processing.
- It affects parallelism: With num_proc=4, each of the 4 processes will handle batches of 1000 examples independently.

In [13]:
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification

W0705 11:40:19.551000 34752 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [14]:
!pip show transformers

Name: transformers
Version: 4.41.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/anaconda3/envs/Advanced_AI/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [15]:
!pip show accelerate

Name: accelerate
Version: 0.30.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /opt/anaconda3/envs/Advanced_AI/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [16]:
# model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=2)
model = BertForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
sum([i.nelement() for i in model.parameters()])

38478338

There are about 38 millions parameters in the model

In [18]:
import torch

In [19]:
#trail
data = {
    'input_ids': torch.ones(4,10, dtype = torch.long),
    'token_type_ids': torch.ones(4,10, dtype = torch.long),
    'attention_mask':torch.ones(4,10, dtype = torch.long),
    'labels':torch.ones(4, dtype = torch.long),
}

In [20]:
data

{'input_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_type_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([1, 1, 1, 1])}

In [21]:
out = model(**data)

In [22]:
out

SequenceClassifierOutput(loss=tensor(0.9882, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0209, -0.5017],
        [ 0.0209, -0.5017],
        [ 0.0209, -0.5017],
        [ 0.0209, -0.5017]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [23]:
out['loss']

tensor(0.9882, grad_fn=<NllLossBackward0>)

In [24]:
out['logits']

tensor([[ 0.0209, -0.5017],
        [ 0.0209, -0.5017],
        [ 0.0209, -0.5017],
        [ 0.0209, -0.5017]], grad_fn=<AddmmBackward0>)

## Evaluation metrics

In [25]:
from evaluate import load as load_metric
metric = load_metric('accuracy')

In [26]:
import numpy as np
from transformers.trainer_utils import EvalPrediction

In [27]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    pred = logits.argmax(axis = 1)
    print (pred)
    return metric.compute(predictions = pred, references = labels)
    # return {'accuracy' : (pred == labels).mean()}
    

In [28]:
eval_pred = EvalPrediction(
    predictions = np.array([[1,0],[9,3],[0.4,0.3],[6,7]]),
    label_ids = np.array([1,1,0,1])
)


In [35]:
logits, labels = eval_pred

In [36]:
logits

array([[1. , 0. ],
       [9. , 3. ],
       [0.4, 0.3],
       [6. , 7. ]])

In [40]:
logits.argmax(axis = 1)

array([0, 0, 0, 1])

In [39]:
eval_pred[1]

array([1, 1, 0, 1])

In [31]:
compute_metrics(eval_pred)

[0 0 0 1]


{'accuracy': 0.5}

## Parameter in training
In Huggingface, the training parameters are packed in a **class -- TrainingArguments**

I was using transformers 4.50.0 and accelerate 1.8.1, but they are not doing well with the below code.

So I swithed to transfomers 4.41.0 and accelerate 0.30.0 


In [30]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [31]:
args = TrainingArguments(
    output_dir = './output_dir', # temporary output data's saving path
    # evaluation_strategy = 'steps',
    
    eval_steps = 30,#how many steps to perform an evaluation
    save_strategy = 'steps', # it can be 'no', 'epoch', 'steps'
    save_steps = 30,

    num_train_epochs = 2,

    learning_rate = 0.001,

    weight_decay = 0.01, #ùúÜ to multiply on the sum of square of weights in loss function, to reduce overfitting

    per_device_eval_batch_size = 8,
    per_device_train_batch_size = 8,

    no_cuda = False, #It will use MPS if available

    optim="adamw_torch",

)

In [32]:
#define the trainer
from transformers import Trainer
from transformers.data.data_collator import DataCollatorWithPadding 

In [33]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    compute_metrics = compute_metrics,
    data_collator = DataCollatorWithPadding(tokenizer)
)

Data collator standadize the input data and make them the same length. Check out the test below.

In [34]:
#test
data_collator = DataCollatorWithPadding(tokenizer)

In [35]:
data = dataset['train'][:5]
print(data)

{'label': [0, 0, 0, 0, 0], 'input_ids': [[101, 122, 8021, 2242, 2391, 3221, 100, 7262, 7481, 100, 4638, 8024, 680, 2145, 3302, 722, 1184, 4638, 1726, 5031, 4685, 1353, 1557, 8020, 2769, 702, 782, 1599, 3614, 4836, 3763, 7481, 4638, 8024, 7262, 7481, 1922, 3230, 4706, 8021, 8013, 123, 8021, 7599, 2794, 1898, 7509, 3683, 2682, 6496, 4638, 1920, 679, 2208, 8013, 124, 8021, 8403, 2940, 9769, 11379, 8024, 5905, 2242, 8024, 7444, 6206, 934, 3121, 9324, 6392, 5390, 8020, 4801, 4669, 1346, 3144, 3121, 711, 11319, 3175, 2466, 8021, 102], [101, 7564, 6163, 8403, 3221, 711, 749, 5688, 4689, 2768, 3315, 1416, 8024, 2940, 5143, 5320, 4924, 7937, 4172, 749, 4157, 511, 7241, 4669, 4802, 2141, 3300, 4157, 1510, 2418, 679, 6639, 4638, 7309, 7579, 8024, 2902, 7241, 6206, 4924, 4500, 1213, 8020, 679, 6814, 1377, 5543, 3221, 1728, 711, 3173, 3315, 2094, 679, 5650, 2533, 4500, 1213, 2902, 4638, 1333, 1728, 8021, 511, 1377, 5543, 3221, 3227, 1305, 1922, 4162, 1416, 8024, 2802, 3952, 2767, 3126, 3362, 679, 4

In [36]:
data

{'label': [0, 0, 0, 0, 0],
 'input_ids': [[101,
   122,
   8021,
   2242,
   2391,
   3221,
   100,
   7262,
   7481,
   100,
   4638,
   8024,
   680,
   2145,
   3302,
   722,
   1184,
   4638,
   1726,
   5031,
   4685,
   1353,
   1557,
   8020,
   2769,
   702,
   782,
   1599,
   3614,
   4836,
   3763,
   7481,
   4638,
   8024,
   7262,
   7481,
   1922,
   3230,
   4706,
   8021,
   8013,
   123,
   8021,
   7599,
   2794,
   1898,
   7509,
   3683,
   2682,
   6496,
   4638,
   1920,
   679,
   2208,
   8013,
   124,
   8021,
   8403,
   2940,
   9769,
   11379,
   8024,
   5905,
   2242,
   8024,
   7444,
   6206,
   934,
   3121,
   9324,
   6392,
   5390,
   8020,
   4801,
   4669,
   1346,
   3144,
   3121,
   711,
   11319,
   3175,
   2466,
   8021,
   102],
  [101,
   7564,
   6163,
   8403,
   3221,
   711,
   749,
   5688,
   4689,
   2768,
   3315,
   1416,
   8024,
   2940,
   5143,
   5320,
   4924,
   7937,
   4172,
   749,
   4157,
   511,
   7241,
   4669,
   4

In [37]:
for i in data['input_ids']:
    print(len(i))

84
83
37
42
40


In [38]:
#test the data_collator
data = data_collator(data)

In [39]:
data

{'input_ids': tensor([[  101,   122,  8021,  2242,  2391,  3221,   100,  7262,  7481,   100,
          4638,  8024,   680,  2145,  3302,   722,  1184,  4638,  1726,  5031,
          4685,  1353,  1557,  8020,  2769,   702,   782,  1599,  3614,  4836,
          3763,  7481,  4638,  8024,  7262,  7481,  1922,  3230,  4706,  8021,
          8013,   123,  8021,  7599,  2794,  1898,  7509,  3683,  2682,  6496,
          4638,  1920,   679,  2208,  8013,   124,  8021,  8403,  2940,  9769,
         11379,  8024,  5905,  2242,  8024,  7444,  6206,   934,  3121,  9324,
          6392,  5390,  8020,  4801,  4669,  1346,  3144,  3121,   711, 11319,
          3175,  2466,  8021,   102],
        [  101,  7564,  6163,  8403,  3221,   711,   749,  5688,  4689,  2768,
          3315,  1416,  8024,  2940,  5143,  5320,  4924,  7937,  4172,   749,
          4157,   511,  7241,  4669,  4802,  2141,  3300,  4157,  1510,  2418,
           679,  6639,  4638,  7309,  7579,  8024,  2902,  7241,  6206,  4924,


In [40]:
for k, v in data.items():
    print(k, v.shape)

input_ids torch.Size([5, 84])
token_type_ids torch.Size([5, 84])
attention_mask torch.Size([5, 84])
labels torch.Size([5])


In [41]:
tokenizer.decode(data['input_ids'][1])

'[CLS] È¢Ñ Ë£Ö linux ÊòØ ‰∏∫ ‰∫Ü ËäÇ ÁúÅ Êàê Êú¨ Âêß Ôºå Êç¢ Á≥ª Áªü Á®ç È∫ª ÁÉ¶ ‰∫Ü ÁÇπ „ÄÇ ÈîÆ Áõò Á°Æ ÂÆû Êúâ ÁÇπ Âìç Â∫î ‰∏ç Ë∂≥ ÁöÑ ÈóÆ È¢ò Ôºå Êåâ ÈîÆ Ë¶Å Á®ç Áî® Âäõ Ôºà ‰∏ç Ëøá ÂèØ ËÉΩ ÊòØ Âõ† ‰∏∫ Êñ∞ Êú¨ Â≠ê ‰∏ç Ëàç Âæó Áî® Âäõ Êåâ ÁöÑ Âéü Âõ† Ôºâ „ÄÇ ÂèØ ËÉΩ ÊòØ Êòæ Âç° Â§™ ÁÉÇ Âêß Ôºå Êâì Ê∏∏ Êàè Êïà Êûú ‰∏ç ÁêÜ ÊÉ≥ „ÄÇ [SEP] [PAD]'

In [42]:
torch.mps.empty_cache()

In [43]:
#Do a trail
trainer.evaluate()



[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


{'eval_loss': 0.6947501301765442,
 'eval_accuracy': 0.5102040816326531,
 'eval_runtime': 1.187,
 'eval_samples_per_second': 82.56,
 'eval_steps_per_second': 10.952}

In [44]:
print(TrainingArguments.optim)

adamw_torch


In [45]:
torch.mps.empty_cache()

In [46]:
trainer.train()



Step,Training Loss




TrainOutput(global_step=494, training_loss=0.7254352029035931, metrics={'train_runtime': 137.4792, 'train_samples_per_second': 28.746, 'train_steps_per_second': 3.593, 'total_flos': 115669785493536.0, 'train_loss': 0.7254352029035931, 'epoch': 2.0})

In [47]:
trainer.evaluate()



[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


{'eval_loss': 0.6929463148117065,
 'eval_accuracy': 0.5102040816326531,
 'eval_runtime': 0.4867,
 'eval_samples_per_second': 201.337,
 'eval_steps_per_second': 26.708,
 'epoch': 2.0}

## Save and load the model
By default, all the model related files are saved in the checkpoint folders under output_dir. But you can also save it manually.

In [48]:
trainer.save_model(output_dir = './output_dir/save_model')

In [57]:
# To load a model, the old way of using pytorch doesn't work any more
# model.load_state_dict(torch.load('./output_dir/save_model/pytorch_model.bin', weights_only=True))

In [58]:
# Load a model with transformers API

In [59]:
AutoModelForSequenceClassification.from_pretrained('./output_dir/save_model')

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

### Resume from a checkpoint

In [73]:
# Patch Trainer to skip RNG state loading
trainer._load_rng_state = lambda *args, **kwargs: None

# Resume training
trainer.train(resume_from_checkpoint='./output_dir/checkpoint-450')



Step,Training Loss


TrainOutput(global_step=494, training_loss=0.061844929992428674, metrics={'train_runtime': 11.0817, 'train_samples_per_second': 356.624, 'train_steps_per_second': 44.578, 'total_flos': 115669785493536.0, 'train_loss': 0.061844929992428674, 'epoch': 2.0})

In [74]:
trainer.evaluate()



[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


{'eval_loss': 0.692955493927002,
 'eval_accuracy': 0.5102040816326531,
 'eval_runtime': 0.5656,
 'eval_samples_per_second': 173.271,
 'eval_steps_per_second': 22.985,
 'epoch': 2.0}

In [75]:
trainer.train(resume_from_checkpoint='./output_dir/checkpoint-420')



Step,Training Loss


TrainOutput(global_step=494, training_loss=0.10614210584385675, metrics={'train_runtime': 17.3678, 'train_samples_per_second': 227.547, 'train_steps_per_second': 28.443, 'total_flos': 115669785493536.0, 'train_loss': 0.10614210584385675, 'epoch': 2.0})

In [76]:
trainer.evaluate()



[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


{'eval_loss': 0.6930639147758484,
 'eval_accuracy': 0.5102040816326531,
 'eval_runtime': 0.5189,
 'eval_samples_per_second': 188.878,
 'eval_steps_per_second': 25.055,
 'epoch': 2.0}

## Prediction

In [78]:
model.eval()

for i, data in enumerate(trainer.get_eval_dataloader()):
    break

for k, v in data.items():
    #put the data to gpu
    data[k] = v.to('mps')

out = model(**data)
pred = out['logits'].argmax(dim = 1)

In [86]:
data

{'input_ids': tensor([[ 101, 6006, 4197,  ...,    0,    0,    0],
        [ 101, 7478, 2382,  ...,    0,    0,    0],
        [ 101,  517, 4895,  ...,    0,    0,    0],
        ...,
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101,  976, 2339,  ..., 2742, 8013,  102],
        [ 101, 6821,  702,  ...,    0,    0,    0]], device='mps:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='mps:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='mps:0'), 'labels': tensor([0, 0, 1, 1, 1, 1, 1, 1], device='mps:0')}

In [85]:
data.items()

dict_items([('input_ids', tensor([[ 101, 6006, 4197,  ...,    0,    0,    0],
        [ 101, 7478, 2382,  ...,    0,    0,    0],
        [ 101,  517, 4895,  ...,    0,    0,    0],
        ...,
        [ 101, 2791, 7313,  ...,    0,    0,    0],
        [ 101,  976, 2339,  ..., 2742, 8013,  102],
        [ 101, 6821,  702,  ...,    0,    0,    0]], device='mps:0')), ('token_type_ids', tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='mps:0')), ('attention_mask', tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]], device='mps:0')), ('labels', tensor([0, 0, 1, 1, 1, 1, 1, 1], device='mps:0'))])

In [89]:
out

SequenceClassifierOutput(loss=tensor(0.6757, device='mps:0', grad_fn=<NllLossBackward0>), logits=tensor([[0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848],
        [0.3124, 0.3848]], device='mps:0', grad_fn=<LinearBackward0>), hidden_states=None, attentions=None)

In [79]:
pred

tensor([1, 1, 1, 1, 1, 1, 1, 1], device='mps:0')

In [88]:
for i in range(8):
    print(tokenizer.decode(data['input_ids'][i], skip_special_tokens = True))
    print('label = ' , data['labels'][i].item())
    print('predictions = ', pred[i].item())

ËôΩ ÁÑ∂ cpu Âèë ÁÉ≠ Èáè ‰∏ç Â§ß Ôºå ‰ΩÜ Êú∫ Âô® Á°¨ Áõò Âèë ÁÉ≠ Èáè ËæÉ Â§ß Ôºå ÂØº Ëá¥ cpu È£é Êâá ‰∏ç ÂÅú Âú∞ È´ò ÈÄü ËΩ¨ Âä® Ôºå Êôö ‰∏ä ‰Ωø Áî® Ëßâ Âæó Âô™ Â£∞ Â§™ Â§ß Ôºå ÊØî Êàë ÁöÑ 15 ÂØ∏ ÁöÑ hp ÁöÑ Â£∞ Èü≥ Âìç Âæó Â§ö Ôºà Âêå Ê†∑ ÁöÑ Â∑• ‰Ωú Èáè Ôºå ‰∏ç Ë∞à ÈÄü Â∫¶ ‰∫Ü Ôºâ Ôºå Ëøô ÊòØ ‰∏™ ÈÅó ÊÜæ ÔºÅ
label =  0
predictions =  1
Èùû Â∏∏ ‰∏Ä Ëà¨ ÁöÑ ‰∏Ä Êú¨ ‰π¶ Ôºå ÂÖÖ Êª° ‰∫Ü ÂÅá ÊÉ≥ ÁöÑ ÁêÜ ÊÉ≥ ‰∏ª ‰πâ Ëâ≤ ÂΩ© Ôºå Âª∫ ËÆÆ Âàö ÊØï ‰∏ö ÁöÑ ËÅå Âú∫ Êñ∞ ‰∫∫ ÂçÉ ‰∏á ‰∏ç Ë¶Å Áúã „ÄÇ
label =  0
predictions =  1
„Ää Á¶ª Â©ö „Äã ‰πü ËØª ÂÆå ‰∫Ü „ÄÇ Á¶ª Â©ö Áøª ËØë Êàê Êõ¥ Êòé ÁôΩ ÁöÑ ËØù Ôºå Â∫î ËØ• Âè´ Âπª ÁÅ≠ „ÄÇ ÊâÄ Êúâ ÁöÑ ÂØπ Áîü Ê¥ª ÁöÑ Â∏å Êúõ ÈÉΩ ‰º¥ Èöè ÁùÄ ËØ• Á¶ª Â©ö ÁöÑ ‰∫∫ ÁöÑ ‰∏ç Á¶ª Â©ö ËÄå Á†¥ ÁÅ≠ ‰∫Ü „ÄÇ
label =  1
predictions =  1
„Ää Èò¥ Èò≥ Â∏à. Êô¥ Êòé Âèñ Áò§ „Äã Ëøô Êú¨ ‰π¶ ‰π∞ Âõû Êù• Êîæ Âú® ‰π¶ Êû∂ ‰∏ä Â•Ω ÊÆµ Êó• Â≠ê Ôºå Êàë ÈÉΩ ÂÆå ‰∫Ü ÊòØ ‰ªÄ ‰πà Êó∂ ÂÄô ‰π∞ ÁöÑ ‰∫Ü „ÄÇ Âú® Êï¥ ÁêÜ ‰∏ú Ë•ø ÁöÑ Êó∂ ÂÄô Áúã Âà∞ Ê≠£ Â•Ω Áù° ‰∏ç ÁùÄ Â∞± Áúã Áúã Âêß „ÄÇ Áúã ÂÆå ‰