In [1]:
import torch
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Pick the compute device once: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    # CPU fallback: there is no GPU to describe, so just report it.
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4080 Laptop GPU


In [4]:
# 1. Load the pretrained model and its tokenizer.
# The checkpoint name is kept in one variable so the model and the tokenizer
# are always loaded from the same checkpoint.
checkpoint = "bert-large-uncased"

model = BertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,  # number of target labels; adjust per task (2 = binary classification)
)
tokenizer = BertTokenizer.from_pretrained(checkpoint)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 2. Load the dataset used for fine-tuning.
# Note: the dataset loaded here is "imdb" (not GLUE MRPC).
dataset = load_dataset(path="imdb")

def tokenize_function(batch):
    """Tokenize the ``"text"`` field of ``batch`` with the module-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and
    ``padding="max_length"``; its result is returned unchanged.
    """
    texts = batch["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

# Apply tokenize_function across the dataset; batched=True passes it batches
# of examples rather than one example per call.
tokenized_dataset = dataset.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [10]:
# 3. Configure the training hyperparameters.
training_args = TrainingArguments(
    output_dir="./bert_finetuned",   # output directory
    per_device_train_batch_size=8,   # batch size per device
    num_train_epochs=3,              # number of training epochs
    logging_dir="./logs",            # log directory
    save_steps=500,                  # save the model every 500 steps
    eval_strategy="steps",           # evaluate on a step-based schedule...
    eval_steps=500,                  # ...every 500 steps
    # Mixed precision is intended for the NVIDIA GPU path. Request it only
    # when CUDA is actually available, so the notebook's CPU fallback does not
    # ask for fp16 on a device where it is unsupported.
    fp16=torch.cuda.is_available(),
)

# 4. Build the Trainer from the model, the training arguments and the
# tokenized train/test splits.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
    # NOTE(review): `processing_class` needs a recent transformers release
    # (the one that emits the deprecation warning) — confirm the pinned version.
    processing_class=tokenizer,
)

  trainer = Trainer(


In [11]:
# 5. Start training.
# Left as the cell's last expression so the notebook displays whatever
# trainer.train() returns.
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss,Validation Loss


KeyboardInterrupt: 