In [None]:
! pip install transformers datasets
! pip install accelerate -U
! pip install transformers[torch]
! pip install evaluate

In [6]:
from datasets import load_dataset

dataset = load_dataset("yelp_review_full")
print(dataset)

# dataset["train"][100]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})


In [8]:
from datasets import load_dataset, DatasetDict

# 載入資料集
dataset = load_dataset("yelp_review_full")

# 對每個分割隨機選取 10% 的資料，並保持 DatasetDict 類型
subset_dict = DatasetDict({
    split: ds.train_test_split(train_size=0.1, seed=42)['train'] for split, ds in dataset.items()
})

# 檢視子資料集大小
print(subset_dict)

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 65000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 5000
    })
})


In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = subset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/65000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [11]:
small_train_dataset = tokenized_datasets["train"]
small_eval_dataset = tokenized_datasets["test"]

In [12]:
tokenized_datasets["train"][100]

{'label': 4,
 'text': "Wow...we finally tried this place and it's phenomenal!\\n\\nTraditional seafood boils, plastic baggies, no dishes, just ol' bay and big ol' mess. What out for the spices, they can add up and make sure you order one of everything on the menu...at least once!",
 'input_ids': [101,
  11750,
  119,
  119,
  119,
  1195,
  1921,
  1793,
  1142,
  1282,
  1105,
  1122,
  112,
  188,
  14343,
  1233,
  106,
  165,
  183,
  165,
  183,
  1942,
  9871,
  8934,
  1348,
  2343,
  24263,
  171,
  20708,
  1116,
  117,
  5828,
  3821,
  19310,
  117,
  1185,
  10514,
  117,
  1198,
  184,
  1233,
  112,
  5952,
  1105,
  1992,
  184,
  1233,
  112,
  6477,
  119,
  1327,
  1149,
  1111,
  1103,
  25133,
  117,
  1152,
  1169,
  5194,
  1146,
  1105,
  1294,
  1612,
  1128,
  1546,
  1141,
  1104,
  1917,
  1113,
  1103,
  13171,
  119,
  119,
  119,
  1120,
  1655,
  1517,
  106,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


<a id='trainer'></a>

In [13]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from transformers import TrainingArguments

training_args = TrainingArguments(output_dir="test_trainer")

In [15]:
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [17]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")



In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from transformers import pipeline
pipe = pipeline("sentiment-analysis", model='test_trainer/checkpoint-3500', tokenizer=tokenizer)

![image.png](attachment:image.png)

In [None]:
pipe("will go again")

## 用PyTorch來訓練

<a id='pytorch_native'></a>

In [None]:
import torch
del model
del trainer
torch.cuda.empty_cache()

In [None]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [None]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10000))

In [None]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

In [None]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [None]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

<a id='additional-resources'></a>