In [None]:
## BERT 情感分析微调 (Unsloth 框架 + Yelp 数据集)

In [1]:
# 安装依赖（建议在终端运行）
!pip install transformers datasets peft bitsandbytes accelerate tqdm



In [2]:
# 1. 加载 unsloth + bert 模型
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 2. 加载 Amazon US Reviews 数据集 - Electronics 类目 (3C)
from datasets import load_dataset

dataset = load_dataset("amazon_polarity")


In [4]:
# 3. 数据预处理：使用 content -> text，label -> label
from tqdm import tqdm

def preprocess_function(examples):
    result = tokenizer(examples["content"], truncation=True, padding="max_length")
    result["labels"] = examples["label"]
    return result

print("开始进行数据预处理...")
encoded_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset["train"].column_names,
    desc="Tokenizing with tqdm",
    num_proc=4  # 使用多进程加速
)
print("数据预处理完成！")

开始进行数据预处理...


Tokenizing with tqdm (num_proc=4):   0%|          | 1395/3600000 [00:00<05:34, 10765.63 examples/s]

Tokenizing with tqdm (num_proc=4): 100%|██████████| 3600000/3600000 [03:41<00:00, 16247.30 examples/s]
Tokenizing with tqdm (num_proc=4): 100%|██████████| 400000/400000 [00:24<00:00, 16188.50 examples/s]

数据预处理完成！





In [5]:
!pip show transformers
!which python
!pip show transformers
!conda info --envs

Name: transformers
Version: 4.53.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /home/workspace/miniconda3/envs/LLMTraing_py311/lib/python3.11/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, trl, unsloth, unsloth_zoo
/home/workspace/miniconda3/envs/LLMTraing_py311/bin/python
Name: transformers
Version: 4.53.2
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contribut

In [20]:
!pip install -U transformers



In [9]:
encoded_dataset.set_format("torch")
from transformers import __version__
print(__version__)

# 4. 减少训练数据规模用于快速调试
small_train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(5000))
small_eval_dataset = encoded_dataset["test"].shuffle(seed=42).select(range(1000))

# 5. 定义训练参数（调小 batch size，增加训练稳定性）
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./bert_sentiment_amazon_polarity",
    learning_rate=3e-5,
    per_device_train_batch_size=8,  # 更小 batch size 防止显存溢出
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset
)

4.53.2


In [10]:
# 5. 开始训练
trainer.train()

Step,Training Loss
10,0.6986
20,0.6674
30,0.592
40,0.4989
50,0.4378
60,0.3473
70,0.3669
80,0.4574
90,0.3256
100,0.4131


TrainOutput(global_step=1875, training_loss=0.18060689756100376, metrics={'train_runtime': 160.3405, 'train_samples_per_second': 93.551, 'train_steps_per_second': 11.694, 'total_flos': 3946665830400000.0, 'train_loss': 0.18060689756100376, 'epoch': 3.0})

In [11]:
# 6. 保存模型
model.save_pretrained("./bert_sentiment_model_3c")
tokenizer.save_pretrained("./bert_sentiment_model_3c")

print("训练完成，3C类目模型已保存！")

训练完成，3C类目模型已保存！


In [23]:
# 8. 测试模型
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
print("加载测试管道...")
model = AutoModelForSequenceClassification.from_pretrained("./bert_sentiment_model_3c")
tokenizer = AutoTokenizer.from_pretrained("./bert_sentiment_model_3c")
sentiment_pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

id2label = {0: "Negative", 1: "Positive"}

def test_sentiment(text):
    result = sentiment_pipeline(text)[0]
    label_id = int(result['label'].split('_')[-1])
    label = id2label[label_id]
    print(f"输入文本: {text}")
    print(f"预测情感: {label} (置信度: {result['score']:.2f})")
    return {"label": label, "score": result['score']}

# 示例测试
example_text = "This product is amazing and works perfectly! Highly recommend."
test_sentiment(example_text)

example_text2 = "Terrible experience. Completely useless product."
test_sentiment(example_text2)


Device set to use cuda:0


加载测试管道...
输入文本: This product is amazing and works perfectly! Highly recommend.
预测情感: Positive (置信度: 1.00)
输入文本: Terrible experience. Completely useless product.
预测情感: Negative (置信度: 1.00)


{'label': 'Negative', 'score': 0.9995960593223572}