<a id="part1"></a>
## 1. 环境配置与依赖安装

In [4]:
!pip install mindnlp==0.4.0
!pip uninstall mindformers transformers -y
!pip install transformers==4.40.0
!pip install mindspore==2.4.1
%env HF_ENDPOINT=https://hf-mirror.com

Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple
You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python3.9 -m pip install --upgrade pip' command.[0m
Found existing installation: transformers 4.40.0
Uninstalling transformers-4.40.0:
  Successfully uninstalled transformers-4.40.0
Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple
Collecting transformers==4.40.0
  Downloading http://pip.modelarts.private.com:8888/repository/pypi/packages/transformers/4.40.0/transformers-4.40.0-py3-none-any.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 52.2 MB/s eta 0:00:01
Installing collected packages: transformers
Successfully installed transformers-4.40.0
You should consider upgrading via the '/home/ma-user/anaconda3/envs/MindSpore/bin/python3.9 -m pip install --upgrade pip' command.[0m
Looking in indexes: http://pip.modelarts.private.com:8888/repository/pypi/simple
You should consider upg

<a id="part2"></a>
## 2. 数据预处理扩展（情感分类任务）

In [43]:
import math
import numpy as np
import pandas as pd
import os
import math
import random
import codecs
from pathlib import Path

import mindspore
import mindspore.dataset as ds
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.train.model import Model
from mindspore.nn.metrics import Accuracy
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.ops import operations as ops

import argparse
import os

import mindspore
from mindnlp.core.optim import AdamW
from tqdm import tqdm
import evaluate
from mindnlp.dataset import load_dataset
from mindnlp.engine import set_seed
from mindnlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from mindnlp.transformers.optimization import get_linear_schedule_with_warmup

from mindnlp.peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    PromptTuningConfig,
)

mindspore.set_context(device_target='Ascend')

class MovieReview:
    """情感分类数据预处理（移植自TextCNN_MindSpore.ipynb）"""
    def __init__(self, root_dir, maxlen=51, split=0.0):
        self.path = root_dir
        self.feelMap = {'neg':0, 'pos':1}
        self.files = []
        
        self.Vocab = {}  # 显式创建词表字典
        
        mypath = Path(self.path)
        if not mypath.exists() or not mypath.is_dir():
            print("please check the root_dir!")
            raise ValueError

        # 在数据目录中找到文件
        for root,_,filename in os.walk(self.path):
            for each in filename:
                self.files.append(os.path.join(root,each))
            break

        # 确认是否为两个文件.neg与.pos
        if len(self.files) != 2:
            print("There are {} files in the root_dir".format(len(self.files)))
            raise ValueError

        # 读取数据
        self.word_num = 0
        self.maxlen = 0
        self.minlen = float("inf")
        self.maxlen = float("-inf")
        self.Pos = []
        self.Neg = []
        for filename in self.files:
            f = codecs.open(filename, 'r')
            ff = f.read()
            file_object = codecs.open(filename, 'w', 'utf-8')
            file_object.write(ff)
            self.read_data(filename)
        self.PosNeg = self.Pos + self.Neg

        self.text2vec(maxlen=maxlen)
        self.split_dataset(split=split)

    def read_data(self, filePath):

        with open(filePath,'r') as f:
            
            for sentence in f.readlines():
                sentence = sentence.replace('\n','')\
                                    .replace('"','')\
                                    .replace('\'','')\
                                    .replace('.','')\
                                    .replace(',','')\
                                    .replace('[','')\
                                    .replace(']','')\
                                    .replace('(','')\
                                    .replace(')','')\
                                    .replace(':','')\
                                    .replace('--','')\
                                    .replace('-',' ')\
                                    .replace('\\','')\
                                    .replace('0','')\
                                    .replace('1','')\
                                    .replace('2','')\
                                    .replace('3','')\
                                    .replace('4','')\
                                    .replace('5','')\
                                    .replace('6','')\
                                    .replace('7','')\
                                    .replace('8','')\
                                    .replace('9','')\
                                    .replace('`','')\
                                    .replace('=','')\
                                    .replace('$','')\
                                    .replace('/','')\
                                    .replace('*','')\
                                    .replace(';','')\
                                    .replace('<b>','')\
                                    .replace('%','')
                sentence = sentence.split(' ')
                sentence = list(filter(lambda x: x, sentence))
                if sentence:
                    self.word_num += len(sentence)
                    self.maxlen = self.maxlen if self.maxlen >= len(sentence) else len(sentence)
                    self.minlen = self.minlen if self.minlen <= len(sentence) else len(sentence)
                    if 'pos' in filePath:
                        self.Pos.append([sentence,self.feelMap['pos']])
                    else:
                        self.Neg.append([sentence,self.feelMap['neg']])
        
    def text2vec(self, maxlen):
        """文本向量化"""
        for sentence_label in self.Pos + self.Neg:
            vector = [0]*maxlen
            for idx, word in enumerate(sentence_label[0]):
                if idx >= maxlen: break
                if word not in self.Vocab:
                    self.Vocab[word] = len(self.Vocab)
                vector[idx] = self.Vocab[word]
            sentence_label[0] = vector
    
    def create_dataset(self, batch_size=64):
        """生成MindSpore Dataset"""
        def generator():
            for item in self.train:
                yield (np.array(item[0], dtype=np.int32), 
                       np.array(item[1], dtype=np.int32))
        return ds.GeneratorDataset(source=generator(), 
                              column_names=["input_ids", "labels"]).batch(batch_size)
    def split_dataset(self, split=0.0):  # split参数无效
        self.train = self.Pos + self.Neg
        random.shuffle(self.train)
        self.test = self.train





<a id="part3"></a>
## 3. 硬提示推理

In [46]:
def inference_with_hard_prompt():
    # --- 配置 ---
    HARD_PROMPT = "这个句子的情感是："
    print("\n" + "="*50)
    print("硬提示直接推理（没经过训练）")
    print(f"提示模板: '{HARD_PROMPT}'")
    print("="*50 + "\n")
    
    # --- 加载模型 ---
    model = AutoModelForSequenceClassification.from_pretrained(
        "AI-ModelScope/roberta-large", 
        num_labels=2  # 二分类任务
    )
    
    # --- 加载数据并添加提示 ---
    sentiment_data = MovieReview(root_dir="./data/", maxlen=51)
    vocab_inv = {v:k for k,v in sentiment_data.Vocab.items()}  # 反向词表
    dataset = sentiment_data.create_dataset(batch_size=32)
    
    # --- 推理与评估 ---
    metric = evaluate.load("accuracy")
    model.set_train(False)  # 设置为推理模式
    
    for batch in dataset.create_dict_iterator():
        # 将词索引转换为文本并添加提示
        batch_texts = [
            f"{HARD_PROMPT}{' '.join([vocab_inv[idx] for idx in seq if idx != 0])}" 
            for seq in batch["input_ids"].asnumpy()
        ]
        
        # 编码输入
        inputs = tokenizer(
            batch_texts,
            padding="max_length",    # 自动填充到最大长度
            truncation=True,         # 启用截断
            max_length=64,           # 设置与训练一致的max_length
            return_tensors="ms"
        )["input_ids"]
        
        # 推理
        outputs = model(inputs)
        preds = outputs.logits.argmax(-1).asnumpy()
        
        # 计算指标
        metric.add_batch(
            predictions=preds,
            references=batch["labels"].asnumpy()
        )
    
    # 输出结果
    acc = metric.compute()["accuracy"]
    print("\n" + "="*50)
    print(f"硬提示推理结果:")
    print(f"- 总样本数: {len(dataset)*32}")
    print(f"- 准确率: {acc*100:.2f}%")
    print("="*50)

# ===== 执行推理 =====
if __name__ == "__main__":
    inference_with_hard_prompt()


硬提示直接推理（没经过训练）
提示模板: '这个句子的情感是：'



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AI-ModelScope/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.



硬提示推理结果:
- 总样本数: 10688
- 准确率: 50.00%


<a id="part4"></a>
## 4. 软提示训练

In [86]:
# --- 软提示配置 ---
batch_size = 8
model_name_or_path = "AI-ModelScope/roberta-large"
task = "mrpc"
peft_type = PeftType.PROMPT_TUNING
# num_epochs = 20
num_epochs = 5

# peft config
peft_config = PromptTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
# learning rate
lr = 1e-3

In [87]:
# load tokenizer
if any(k in model_name_or_path for k in ("gpt", "opt", "bloom")):
    padding_side = "left"
else:
    padding_side = "right"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side=padding_side, mirror="modelscope")
if getattr(tokenizer, "pad_token_id") is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

In [88]:
datasets = load_dataset("glue", task)
print(next(datasets['train'].create_dict_iterator()))

{'sentence1': Tensor(shape=[], dtype=String, value= 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .'), 'sentence2': Tensor(shape=[], dtype=String, value= 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .'), 'label': Tensor(shape=[], dtype=Int64, value= 1), 'idx': Tensor(shape=[], dtype=Int64, value= 0)}


In [89]:
from mindnlp.dataset import BaseMapFunction

class MapFunc(BaseMapFunction):
    def __call__(self, sentence1, sentence2, label, idx):
        outputs = tokenizer(sentence1, sentence2, truncation=True, max_length=None)
        return outputs['input_ids'], outputs['attention_mask'], label


def get_dataset(dataset, tokenizer):
    input_colums=['sentence1', 'sentence2', 'label', 'idx']
    output_columns=['input_ids', 'attention_mask', 'labels']
    dataset = dataset.map(MapFunc(input_colums, output_columns),
                          input_colums, output_columns)
    dataset = dataset.padded_batch(batch_size, pad_info={'input_ids': (None, tokenizer.pad_token_id),
                                                         'attention_mask': (None, 0)})
    return dataset

train_dataset = get_dataset(datasets['train'], tokenizer)
eval_dataset = get_dataset(datasets['validation'], tokenizer)

In [90]:
print(next(train_dataset.create_dict_iterator()))

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': Tensor(shape=[8, 67], dtype=Int64, value=
[[    0, 10127,  1001 ...     1,     1,     1],
 [    0,   975, 26802 ...     1,     1,     1],
 [    0,  1213,    56 ...     1,     1,     1],
 ...
 [    0,  9064, 32497 ...     1,     1,     1],
 [    0,   133,  4417 ...     1,     1,     1],
 [    0,   133, 19888 ...     1,     1,     1]]), 'attention_mask': Tensor(shape=[8, 67], dtype=Int64, value=
[[1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 ...
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0],
 [1, 1, 1 ... 0, 0, 0]]), 'labels': Tensor(shape=[8], dtype=Int64, value= [1, 0, 1, 0, 1, 1, 0, 1])}


In [91]:
metric = evaluate.load("glue", task)

In [92]:
# load model
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, return_dict=True, mirror="modelscope")
model = get_peft_model(model, peft_config)
# print number of trainable parameters
model.print_trainable_parameters()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at AI-ModelScope/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,061,890 || all params: 356,423,684 || trainable%: 0.2979291353713745


模型微调

In [93]:
optimizer = AdamW(params=model.trainable_params(), lr=lr)

# Instantiate scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0.06 * (len(train_dataset) * num_epochs),
    num_training_steps=(len(train_dataset) * num_epochs),
)

In [94]:
# print name of trainable parameters
model.trainable_params()

(Tensor(shape=[1024, 1024], dtype=Float32, value=
 [[ 1.38089573e-02,  2.89921463e-02,  1.32645434e-02 ... -2.05632485e-02, -2.86122505e-03,  2.10555382e-02],
  [ 4.60442342e-03,  4.34034653e-02, -8.66422988e-03 ... -2.42341571e-02,  1.46407923e-02, -2.72690002e-02],
  [ 1.18833138e-02, -6.02551457e-03, -1.69392396e-02 ... -4.50211251e-03,  7.50867603e-03,  3.92194232e-03],
  ...
  [ 3.13470233e-03, -4.23548520e-02,  2.29569934e-02 ... -2.14548297e-02, -2.52000801e-02, -1.21975020e-02],
  [ 2.33502574e-02,  1.67301262e-03, -9.02926270e-03 ... -2.08490994e-02, -2.75446083e-02, -8.77535250e-03],
  [ 2.20639468e-03, -2.47005536e-03,  5.73945278e-03 ... -2.47196835e-02, -3.58031616e-02, -9.51369666e-03]]),
 Tensor(shape=[1024], dtype=Float32, value= [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00 ...  0.00000000e+00,  0.00000000e+00,  0.00000000e+00]),
 Tensor(shape=[2, 1024], dtype=Float32, value=
 [[-3.27868313e-02, -2.51472760e-02, -3.97190116e-02 ... -1.66799501e-02, -8.69967137e-03

In [95]:
def forward_fn(**batch):
    outputs = model(**batch)
    loss = outputs.loss
    return loss

grad_fn = mindspore.value_and_grad(forward_fn, None, model.trainable_params())

for epoch in range(num_epochs):
    model.set_train()
    train_total_size = train_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):

        loss, grads = grad_fn(**batch)
        optimizer.step(grads)
        lr_scheduler.step()

    model.set_train(False)
    eval_total_size = eval_dataset.get_dataset_size()
    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)):
        outputs = model(**batch)
        predictions = outputs.logits.argmax(axis=-1)
        predictions, references = predictions, batch["labels"]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    print(f"epoch {epoch}:", eval_metric)

  0%|          | 0/459 [00:00<?, ?it/s]

..

  0%|          | 1/459 [00:13<1:39:17, 13.01s/it]

...

100%|█████████▉| 458/459 [02:06<00:00,  6.00it/s]

..

100%|██████████| 459/459 [02:17<00:00,  3.33it/s]
100%|██████████| 51/51 [00:03<00:00, 14.50it/s]


epoch 0: {'accuracy': 0.6838235294117647, 'f1': 0.8122270742358079}


100%|██████████| 459/459 [01:16<00:00,  5.99it/s]
100%|██████████| 51/51 [00:03<00:00, 14.89it/s]


epoch 1: {'accuracy': 0.6887254901960784, 'f1': 0.8140556368960469}


100%|██████████| 459/459 [01:18<00:00,  5.87it/s]
100%|██████████| 51/51 [00:03<00:00, 14.52it/s]


epoch 2: {'accuracy': 0.7132352941176471, 'f1': 0.8208269525267994}


100%|██████████| 459/459 [01:33<00:00,  4.92it/s]
100%|██████████| 51/51 [00:04<00:00, 12.40it/s]


epoch 3: {'accuracy': 0.7083333333333334, 'f1': 0.7965811965811965}


100%|██████████| 459/459 [01:29<00:00,  5.14it/s]
100%|██████████| 51/51 [00:03<00:00, 13.24it/s]


epoch 4: {'accuracy': 0.7034313725490197, 'f1': 0.8169440242057489}
