In [None]:
## BERT 情感分析微调 (Hugging Face Transformers + Amazon US Reviews Electronics 数据集)

In [2]:
# 安装依赖（建议在终端运行）
! pip install unsloth datasets bitsandbytes accelerate peft

Defaulting to user installation because normal site-packages is not writeable




In [None]:
# 1. Load the pretrained BERT classifier and its tokenizer with Hugging Face
#    Transformers (unsloth is installed above but is not used by this cell).
from transformers import AutoModelForSequenceClassification, AutoTokenizer

MODEL_NAME = "bert-base-uncased"
# Preprocessing maps star_rating 1-5 to class ids 0-4, so the classification
# head needs five outputs; with two outputs the loss would index out of range.
NUM_LABELS = 5

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 2. Load the Amazon US Reviews dataset, Electronics (3C) category
from datasets import load_dataset

# NOTE(review): this Hub dataset may no longer be downloadable -- confirm
# availability before relying on this cell.
reviews = load_dataset("amazon_us_reviews", "Electronics_v1_00")  # 3C category: Electronics

# The Trainer further down evaluates on the "test" split. When the loaded
# dataset ships without one, carve a reproducible hold-out set out of "train"
# so that both dataset["train"] and dataset["test"] exist.
if "test" in reviews:
    dataset = reviews
else:
    dataset = reviews["train"].train_test_split(test_size=0.1, seed=42)


In [None]:
# 3. Preprocessing: review_body -> model input text, star_rating -> label (shifted to 0-4)
def preprocess_function(examples):
    """Tokenize one review and attach its class label.

    Args:
        examples: a single dataset row (the map call is not batched) holding
            "review_body" (the review text) and "star_rating" (1-5, as an
            int or a numeric string).

    Returns:
        One flat dict with the tokenizer's fields plus "labels" (0-4).
        `Dataset.map` requires a mapping, so the label is merged into the
        tokenizer output instead of being returned alongside it in a tuple.
    """
    # A missing review body would make the tokenizer raise; treat it as empty text.
    text = examples["review_body"] or ""
    label = int(examples["star_rating"]) - 1
    features = dict(tokenizer(text, truncation=True, padding="max_length"))
    features["labels"] = label
    return features

# Drop every original column so that only the fields produced by
# preprocess_function remain, then expose the result as torch tensors.
raw_columns = dataset["train"].column_names
encoded_dataset = dataset.map(preprocess_function, remove_columns=raw_columns)

encoded_dataset.set_format("torch")

In [None]:
# 4. Define the training arguments
import inspect

from transformers import TrainingArguments, Trainer

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install is unpinned, so pick whichever keyword the installed version
# accepts; this keeps the cell working on both old and new releases.
_init_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _init_params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./bert_sentiment_amazon_3c",
    save_strategy="epoch",  # checkpoint once per epoch, matching the evaluation cadence
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=20,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch
)

# Wire the model, the training arguments and the tokenized splits into a Trainer.
train_split = encoded_dataset["train"]
eval_split = encoded_dataset["test"]

trainer = Trainer(model=model, args=training_args, train_dataset=train_split, eval_dataset=eval_split)

In [None]:
# 5. Start training
trainer.train()

In [None]:
# 6. Save the model and the tokenizer side by side in one directory
SAVE_DIR = "./bert_sentiment_model_3c"
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)

print("训练完成，3C类目模型已保存！")