In [1]:
import math
import numpy as np
import torch
from datasets import load_dataset
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaTokenizerFast, RobertaConfig, RobertaModelWithHeads
from transformers import Trainer, TrainingArguments, EvalPrediction
from transformers import DataCollatorForLanguageModeling
from transformers import RobertaForMaskedLM
from transformers import AdapterType
from sklearn.metrics import f1_score
import datetime

In [2]:
today = datetime.date.today().strftime('%Y%m%d')

In [3]:
def check_gpu():
    '''
    check gpu status
    '''
    try:
        print('GPU available:', torch.cuda.is_available())
        print(torch.cuda.device_count(), 'GPUs detected')
        print('Current GPU id:', torch.cuda.current_device())
        print('Current GPU Name:', torch.cuda.get_device_name(torch.cuda.current_device()))
    except:
        print('GPU not available')
        
def encode_batch(batch):
    '''
    Encodes a batch of input data using the model tokenizer
    using 512
    '''
    return tokenizer(batch["text"], max_length=120, truncation=True, padding="max_length")
#     return tokenizer(batch["text"], padding=True, truncation=True)

def compute_accuracy(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {'acc': (preds==p.label_ids).mean()}

def compute_f1(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1': f1_score(p.label_ids, preds, average=f1_type)}

def compute_score(p: EvalPrediction):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1-micro': f1_score(p.label_ids, preds, average='micro'),'f1-macro': f1_score(p.label_ids, preds, average='macro'), 'acc': (preds==p.label_ids).mean()}

In [4]:
check_gpu()

GPU available: True
1 GPUs detected
Current GPU id: 0
Current GPU Name: NVIDIA GeForce GTX 980 Ti


In [5]:
# dictionary for dataset, name: (classes, type of f1 score)
dataset_dict = {'chemprot': (13, 'micro'), 'rct': (5, 'micro'),
                'CI': (6, 'macro'), 'sciie': (7, 'm2cro'),
                'HN': (2, 'macro'), 'ag': (4, 'macro'),
                'amazon': (2, 'macro'), 'imdb': (2, 'macro')}

In [6]:
ds_name = 'CI'
n_labels = dataset_dict[ds_name][0]
f1_type = dataset_dict[ds_name][1]

In [7]:
dataset = load_dataset(f'data_loaders/{ds_name}_data_loader.py')

Reusing dataset task_dataset (C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\0f3e47e2404d89192a80ec5140374817ceee56a5e8ced3e23f680be3c7719815)


In [8]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

In [9]:
dataset_encoded = dataset.map(encode_batch, batched=True)
# tokenized_datasets = dataset.map(tokenizer, batched=True, num_proc=4, remove_columns=["text"])

Loading cached processed dataset at C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\0f3e47e2404d89192a80ec5140374817ceee56a5e8ced3e23f680be3c7719815\cache-2a3cfd2cdaee0e91.arrow
Loading cached processed dataset at C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\0f3e47e2404d89192a80ec5140374817ceee56a5e8ced3e23f680be3c7719815\cache-b19c49c1112d7830.arrow
Loading cached processed dataset at C:\Users\snow-\.cache\huggingface\datasets\task_dataset\task\1.0.0\0f3e47e2404d89192a80ec5140374817ceee56a5e8ced3e23f680be3c7719815\cache-e00caab88a68deb3.arrow


In [10]:
dataset_encoded.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 1688
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 114
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 139
    })
})

## Train adapter language model 100 epochs

In [14]:
model = RobertaForMaskedLM.from_pretrained('roberta-base')

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
model.add_adapter('mlm', AdapterType.text_lang)

In [16]:
model.set_active_adapters(['mlm'])

In [17]:
model.train_adapter(["mlm"])

In [18]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio)
print(WARMUP_STEP)
GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
print(GRADIENT_ACC_STEP)

0
16.0


In [19]:
training_args_mlm = TrainingArguments(
    output_dir=f'model/{ds_name}/mlm-adapter/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=0.00025,
    remove_unused_columns=False,
    num_train_epochs=100,
    fp16=True,
    per_device_train_batch_size=max_train_batch_size_mlm,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
    gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [20]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [21]:
trainer_mlm = Trainer(
    model=model,
    args=training_args_mlm,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    data_collator=data_collator,
)

In [22]:
trainer_mlm.train()

Epoch,Training Loss,Validation Loss
0,No log,2.565787
1,No log,2.320081
2,No log,2.409102
3,No log,2.174234
4,No log,2.011321
5,No log,1.944114
6,No log,2.399259
7,No log,2.0293
8,No log,1.890951
9,No log,2.010864


TrainOutput(global_step=600, training_loss=2.024574788411458)

In [23]:
trainer_mlm.evaluate()

{'eval_loss': 1.698409914970398, 'epoch': 99.90566037735849}

In [24]:
# eval_results = trainer.evaluate()
# print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [27]:
model.save_pretrained(f'model/CI/{today}/model')

In [28]:
model.save_adapter(f'model/CI/{today}/adapter', 'mlm')

## Baseline Finetune

In [12]:
model = RobertaModelWithHeads.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [13]:
model.add_classification_head('CI_classifier', num_labels=n_labels)

In [14]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = max(1, int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio))
print(WARMUP_STEP)
# GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
# print(GRADIENT_ACC_STEP)

1


In [16]:
training_args_ft = TrainingArguments(
    output_dir=f'model/{ds_name}/ft/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,
    num_train_epochs=3,
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
#     gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [17]:
trainer_ft = Trainer(
    model=model,
    args=training_args_ft,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics= compute_score
#     data_collator=data_collator,
)

In [18]:
trainer_ft.train()

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro,Acc
1,No log,1.068134,0.640351,0.243859,0.640351
2,No log,0.869458,0.736842,0.368697,0.736842
3,No log,0.856226,0.719298,0.363781,0.719298


TrainOutput(global_step=318, training_loss=1.055605522491647)

In [19]:
trainer_ft.evaluate()

{'eval_loss': 0.8562257885932922,
 'eval_f1-micro': 0.7192982456140351,
 'eval_f1-macro': 0.3637812848716722,
 'eval_acc': 0.7192982456140351,
 'epoch': 3.0}

In [20]:
trainer_ft.evaluate(dataset_encoded['test'])

{'eval_loss': 0.8781317472457886,
 'eval_f1-micro': 0.7194244604316546,
 'eval_f1-macro': 0.41375367551838144,
 'eval_acc': 0.7194244604316546,
 'epoch': 3.0}

## TAPT Pretrained baseline finetune

In [12]:
model = RobertaModelWithHeads.from_pretrained('allenai/dsp_roberta_base_tapt_citation_intent_1688')

Some weights of the model checkpoint at allenai/dsp_roberta_base_tapt_citation_intent_1688 were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at allenai/dsp_roberta_base_tapt_citation_intent_1688 and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this

In [13]:
model.add_classification_head('CI_classifier', num_labels=n_labels)

In [14]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = max(1, int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio))
print(WARMUP_STEP)
# GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
# print(GRADIENT_ACC_STEP)

1


In [16]:
training_args_ft = TrainingArguments(
    output_dir=f'model/{ds_name}/tapt-ft/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,
    num_train_epochs=3,
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
#     gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [17]:
trainer_ft = Trainer(
    model=model,
    args=training_args_ft,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_score
#     data_collator=data_collator,
)

In [18]:
trainer_ft.train()

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro,Acc
1,No log,0.98981,0.692982,0.322286,0.692982
2,No log,0.80423,0.692982,0.352298,0.692982
3,No log,0.78064,0.692982,0.413875,0.692982


TrainOutput(global_step=318, training_loss=0.9623676036138954)

In [19]:
trainer_ft.evaluate()

{'eval_loss': 0.7806396484375,
 'eval_f1-micro': 0.6929824561403509,
 'eval_f1-macro': 0.41387488328664795,
 'eval_acc': 0.6929824561403509,
 'epoch': 3.0}

In [20]:
trainer_ft.evaluate(dataset_encoded['test'])

{'eval_loss': 0.835127592086792,
 'eval_f1-micro': 0.7553956834532374,
 'eval_f1-macro': 0.4780421932739813,
 'eval_acc': 0.7553956834532374,
 'epoch': 3.0}

## Adapter Pretrain Finetune

In [22]:
model = RobertaModelWithHeads.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [23]:
model.load_adapter('model/CI/20210428/adapter')

'mlm'

In [24]:
model.add_classification_head('CI_classifier', num_labels=n_labels)

In [25]:
model.set_active_adapters(["mlm"])

In [26]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = max(1, int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio))
print(WARMUP_STEP)
# GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
# print(GRADIENT_ACC_STEP)

1


In [27]:
training_args_ft = TrainingArguments(
    output_dir=f'model/{ds_name}/adapter-pt-ft/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,
    num_train_epochs=3,
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
#     gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [28]:
trainer_ft = Trainer(
    model=model,
    args=training_args_ft,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics=compute_score
#     data_collator=data_collator,
)

In [29]:
trainer_ft.train()

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro,Acc
1,No log,1.062377,0.640351,0.243859,0.640351
2,No log,0.91378,0.710526,0.344047,0.710526
3,No log,0.883724,0.710526,0.361469,0.710526


TrainOutput(global_step=318, training_loss=1.0902853911777712)

In [30]:
trainer_ft.evaluate()

{'eval_loss': 0.8837238550186157,
 'eval_f1-micro': 0.7105263157894737,
 'eval_f1-macro': 0.3614690320572674,
 'eval_acc': 0.7105263157894737,
 'epoch': 3.0}

In [31]:
trainer_ft.evaluate(dataset_encoded['test'])

{'eval_loss': 0.9179366827011108,
 'eval_f1-micro': 0.6906474820143885,
 'eval_f1-macro': 0.34601846902089034,
 'eval_acc': 0.6906474820143885,
 'epoch': 3.0}

## Adapter Pretrain Adapter Finetune

In [12]:
model = RobertaModelWithHeads.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [13]:
model.load_adapter('model/CI/20210428/adapter')

'mlm'

In [14]:
model.add_classification_head('CI_classifier', num_labels=n_labels)

In [15]:
model.add_adapter("CI_classifier", adapter_type=AdapterType.text_task, config="pfeiffer")

In [16]:
model.set_active_adapters([['mlm', 'CI_classifier']])

In [17]:
model.train_adapter(['CI_classifier'])

In [18]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = max(1, int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio))
print(WARMUP_STEP)
# GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
# print(GRADIENT_ACC_STEP)

1


In [20]:
training_args_ft = TrainingArguments(
    output_dir=f'model/{ds_name}/adapter-pt-adapter-ft/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,
    num_train_epochs=3,
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
#     gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [21]:
trainer_ft = Trainer(
    model=model,
    args=training_args_ft,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics= compute_score
#     data_collator=data_collator,
)

In [22]:
trainer_ft.train()

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro,Acc
1,No log,1.408942,0.517544,0.11368,0.517544
2,No log,1.356578,0.517544,0.11368,0.517544
3,No log,1.346379,0.517544,0.11368,0.517544


TrainOutput(global_step=318, training_loss=1.4399384312659689)

In [23]:
trainer_ft.evaluate()

{'eval_loss': 1.346379041671753,
 'eval_f1-micro': 0.5175438596491229,
 'eval_f1-macro': 0.11368015414258188,
 'eval_acc': 0.5175438596491229,
 'epoch': 3.0}

In [24]:
trainer_ft.evaluate(dataset_encoded['test'])

{'eval_loss': 1.3637382984161377,
 'eval_f1-micro': 0.5107913669064749,
 'eval_f1-macro': 0.1126984126984127,
 'eval_acc': 0.5107913669064749,
 'epoch': 3.0}

## Adapter Pretrain Adapter Finetune more epochs

In [12]:
model = RobertaModelWithHeads.from_pretrained('roberta-base')

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [13]:
model.load_adapter('model/CI/20210428/adapter')

'mlm'

In [14]:
model.add_classification_head('CI_classifier', num_labels=n_labels)

In [15]:
model.add_adapter("CI_classifier", adapter_type=AdapterType.text_task, config="pfeiffer")

In [16]:
model.set_active_adapters([['mlm', 'CI_classifier']])

In [17]:
model.train_adapter(['CI_classifier'])

In [18]:
# get warm up steps for given warmup ratio

warmup_ratio = 0.006
max_train_batch_size_mlm = 16
WARMUP_STEP = max(1, int(dataset_encoded['train'].num_rows / max_train_batch_size_mlm * warmup_ratio))
print(WARMUP_STEP)
# GRADIENT_ACC_STEP = 256 / max_train_batch_size_mlm
# print(GRADIENT_ACC_STEP)

1


In [19]:
training_args_ft = TrainingArguments(
    output_dir=f'model/{ds_name}/adapter-pt-adapter-ft/{today}/',
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    remove_unused_columns=False,
    num_train_epochs=40,
    fp16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
#     warmup_ratio=0.006, not supported in adapter-transformers
    warmup_steps=WARMUP_STEP,
    weight_decay=0.01,
#     gradient_accumulation_steps=GRADIENT_ACC_STEP
)

In [20]:
trainer_ft = Trainer(
    model=model,
    args=training_args_ft,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    compute_metrics= compute_score
#     data_collator=data_collator,
)

In [21]:
trainer_ft.train()

Epoch,Training Loss,Validation Loss,F1-micro,F1-macro,Acc
1,No log,1.375445,0.517544,0.11368,0.517544
2,No log,1.322572,0.517544,0.11368,0.517544
3,No log,1.307114,0.517544,0.11368,0.517544
4,No log,1.285271,0.517544,0.11368,0.517544
5,1.381693,1.262777,0.517544,0.11368,0.517544
6,1.381693,1.230609,0.517544,0.11368,0.517544
7,1.381693,1.186542,0.526316,0.14046,0.526316
8,1.381693,1.112144,0.587719,0.214447,0.587719
9,1.381693,1.063328,0.622807,0.274325,0.622807
10,1.198104,1.023273,0.649123,0.283257,0.649123


TrainOutput(global_step=4240, training_loss=0.973677610001474)

In [22]:
trainer_ft.evaluate()

{'eval_loss': 0.7908603549003601,
 'eval_f1-micro': 0.7368421052631579,
 'eval_f1-macro': 0.435762679055362,
 'eval_acc': 0.7368421052631579,
 'epoch': 40.0}

In [23]:
trainer_ft.evaluate(dataset_encoded['test'])

{'eval_loss': 0.9183534979820251,
 'eval_f1-micro': 0.7050359712230215,
 'eval_f1-macro': 0.43262295348761515,
 'eval_acc': 0.7050359712230215,
 'epoch': 40.0}