In [1]:
# Show the GPU, driver and CUDA version visible to this runtime before training starts.
!nvidia-smi

Mon Jan 29 02:06:32 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              47W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
!pip install -r requirements.txt



In [3]:
!pip install kaleido cohere openai tiktoken



In [4]:
from datasets import load_dataset

# Load the "yelp_review_full" dataset; later cells index the result by split name ("train", "test").
dataset = load_dataset("yelp_review_full")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
from transformers import AutoTokenizer

# Tokenizer for the same "bert-base-cased" checkpoint that is loaded as the model further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The tokenizer is asked to pad to its maximum length and to truncate
    longer inputs.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw text(s).

    Returns:
        The encoding produced by the notebook-level ``tokenizer``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded


# Tokenize the dataset; batched=True hands tokenize_function a batch of examples per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# Use 1/10 of each split to keep the fine-tuning run short.
# The subset sizes are derived from the split lengths instead of being hard-coded;
# for this dataset they evaluate to the previous literals (65000 train / 5000 eval rows).
SUBSET_DIVISOR = 10

train_split = tokenized_datasets["train"]
test_split = tokenized_datasets["test"]

# A fixed shuffle seed keeps the sampled subsets reproducible across kernel restarts.
small_train_dataset = train_split.shuffle(seed=42).select(range(len(train_split) // SUBSET_DIVISOR))
small_eval_dataset = test_split.shuffle(seed=42).select(range(len(test_split) // SUBSET_DIVISOR))

In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained "bert-base-cased" weights with a sequence-classification head for 5 labels.
# The "newly initialized" warning shown in the output is expected: the classifier layer
# is not part of the checkpoint and is trained during fine-tuning.
# NOTE(review): num_labels=5 presumably matches the number of label classes in the dataset — confirm.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments

# Root directory for everything this notebook writes for the bert-base-cased run.
model_dir = "models/bert-base-cased"

# Training configuration; any option not listed here keeps the library default.
training_args = TrainingArguments(
    output_dir=f"{model_dir}/test_trainer",
    logging_dir=f"{model_dir}/test_trainer/runs",
    # Run an evaluation pass once per epoch.
    evaluation_strategy="epoch",
    per_device_train_batch_size=48,
    logging_steps=100,
)

In [9]:
import numpy as np
import evaluate

# Load the "accuracy" metric; compute_metrics reads this notebook-level object.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score model outputs with the notebook-level ``metric``.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted versus reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_labels = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_labels, references=gold_labels)

In [10]:
from transformers import Trainer

# Wire the model, training configuration, data subsets and metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,  # evaluated per epoch, as configured in training_args
    compute_metrics=compute_metrics,  # supplies the accuracy value in each evaluation report
)

In [11]:
# Run fine-tuning. This is the expensive cell: the recorded run reports a train_runtime of ~4372 s.
# NOTE(review): in the recorded run the validation loss rises in epoch 3 (0.81 -> 0.93) while the
# training loss keeps falling, which suggests overfitting — consider fewer epochs or early stopping.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.8438,0.809795,0.6374
2,0.6621,0.811489,0.6548
3,0.4649,0.929868,0.6536


TrainOutput(global_step=4065, training_loss=0.6848024675620291, metrics={'train_runtime': 4371.7928, 'train_samples_per_second': 44.604, 'train_steps_per_second': 0.93, 'total_flos': 5.130803778048e+16, 'train_loss': 0.6848024675620291, 'epoch': 3.0})

In [13]:
# Evaluate on a 1000-example sample of the "test" split drawn with a different shuffle seed.
# NOTE(review): small_eval_dataset is also sampled from the "test" split, so the two samples
# may overlap; this is not a fully independent hold-out set.
small_test_dataset = tokenized_datasets["test"].shuffle(seed=64).select(range(1000))
trainer.evaluate(small_test_dataset)

{'eval_loss': 0.9014902114868164,
 'eval_accuracy': 0.659,
 'eval_runtime': 8.5886,
 'eval_samples_per_second': 116.434,
 'eval_steps_per_second': 14.554,
 'epoch': 3.0}

In [14]:
# Save the fine-tuned model to the "finetuned-trainer" directory under model_dir.
trainer.save_model(f"{model_dir}/finetuned-trainer")

In [15]:
# Persist the Trainer's own state in addition to the model saved above.
# NOTE(review): presumably written under the output_dir set in training_args — confirm in the Trainer docs.
trainer.save_state()