```shell
conda create -n AI python=3.9; conda activate AI
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple transformers==4.39.2
conda install cudatoolkit==11.8
pip3 install numpy --pre torch torchvision torchaudio --force-reinstall --index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install auto-gptq --no-build-isolation
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple ipykernel evaluate tqdm scikit-learn
````

In [1]:
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from pprint import pprint
from peft import get_peft_model, LoraConfig, PeftModel
import evaluate
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForMaskedLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset = load_dataset("stanfordnlp/sst2")

### 一、加载语句分类模型

In [2]:
model_checkpoint = "FacebookAI/roberta-large"
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

# 这样会自动创建一个分类模型
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
 

### 二、创建little-SST-2数据集

In [3]:
def small_sst(path): 
    f = open(path)
    ds = []
    label = []
    for line in f:
        text, l = line.split('\t')
        ds.append(text)
        label.append(int(l[0]))
    return {'text': ds, 'label': label}

train_ds = Dataset.from_dict(small_sst("SST-2/train.txt"))
test_ds = Dataset.from_dict(small_sst("SST-2/test.txt"))
# 创建DatasetDict对象
dataset = DatasetDict({
    'train': train_ds,
    'validation': test_ds
})

# 打印数据集信息
pprint(dataset)

{'train': Dataset({
    features: ['text', 'label'],
    num_rows: 7393
}),
 'validation': Dataset({
    features: ['text', 'label'],
    num_rows: 1749
})}


In [4]:
# 创建分词器
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [5]:
# create tokenize function
def tokenize_function(examples):
    text = examples['text']

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
    )
    return tokenized_inputs

# 数据预处理
tokenized_dataset = dataset.map(tokenize_function, batched=True)
pprint(tokenized_dataset)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 7393/7393 [00:00<00:00, 69804.20 examples/s]
Map: 100%|██████████| 1749/1749 [00:00<00:00, 79416.25 examples/s]

{'train': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 7393
}),
 'validation': Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 1749
})}





In [6]:
# 创建评估函数
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script: 4.20kB [00:00, 18.5MB/s]                   


In [7]:
# 定义一些栗子， 这里先测试没有经过微调的模型
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", 
             "This is not worth watching even once.", "I really love it.", 
             "I cannot stand it.", 
             "They didn't praise him slightly."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt")
    logits = model(inputs).logits
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Negative
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Negative
I really love it. - Positive
I cannot stand it. - Negative
They didn't praise him slightly. - Negative


### 三、用Lora微调模型

In [8]:
peft_config = LoraConfig(task_type="SEQ_CLS",
                        r=4,
                        lora_alpha=32,
                        lora_dropout=0.01,
                        target_modules = ['query', 'key', 'value']) # 这个是可以微调的层

In [9]:
# 打印参数情况
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 1,641,474 || all params: 357,003,268 || trainable%: 0.4598


In [10]:
# 超参数
lr = 1e-5
batch_size = 8
num_epochs = 7

In [11]:
# 训练的信息
training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
# 创建trainer对象
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    tokenizer=tokenizer,
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics,
)

# 训练模型
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  8%|▊         | 502/6475 [00:45<09:29, 10.49it/s]

{'loss': 0.6934, 'grad_norm': 2.2893123626708984, 'learning_rate': 9.227799227799229e-06, 'epoch': 0.54}


                                                  
 14%|█▍        | 925/6475 [01:32<08:27, 10.94it/s]

{'eval_loss': 0.26943090558052063, 'eval_accuracy': {'accuracy': 0.9291023441966838}, 'eval_runtime': 8.7184, 'eval_samples_per_second': 200.609, 'eval_steps_per_second': 25.119, 'epoch': 1.0}


 15%|█▌        | 1002/6475 [01:40<08:04, 11.29it/s] 

{'loss': 0.4861, 'grad_norm': 26.86524200439453, 'learning_rate': 8.455598455598457e-06, 'epoch': 1.08}


 23%|██▎       | 1502/6475 [02:25<07:43, 10.73it/s]

{'loss': 0.3804, 'grad_norm': 0.3137076497077942, 'learning_rate': 7.683397683397685e-06, 'epoch': 1.62}


                                                   
 29%|██▊       | 1850/6475 [03:06<06:16, 12.29it/s]

{'eval_loss': 0.24798475205898285, 'eval_accuracy': {'accuracy': 0.9422527158376215}, 'eval_runtime': 8.7397, 'eval_samples_per_second': 200.121, 'eval_steps_per_second': 25.058, 'epoch': 2.0}


 31%|███       | 2001/6475 [03:21<07:14, 10.30it/s]  

{'loss': 0.3414, 'grad_norm': 6.949513912200928, 'learning_rate': 6.911196911196911e-06, 'epoch': 2.16}


 39%|███▊      | 2501/6475 [04:07<06:01, 11.01it/s]

{'loss': 0.3013, 'grad_norm': 0.18884636461734772, 'learning_rate': 6.13899613899614e-06, 'epoch': 2.7}


                                                   
 43%|████▎     | 2775/6475 [04:41<05:16, 11.69it/s]

{'eval_loss': 0.2555335760116577, 'eval_accuracy': {'accuracy': 0.9468267581475128}, 'eval_runtime': 8.709, 'eval_samples_per_second': 200.827, 'eval_steps_per_second': 25.146, 'epoch': 3.0}


 46%|████▋     | 3002/6475 [05:02<05:38, 10.26it/s]  

{'loss': 0.2963, 'grad_norm': 22.516517639160156, 'learning_rate': 5.366795366795368e-06, 'epoch': 3.24}


 54%|█████▍    | 3501/6475 [05:47<04:28, 11.09it/s]

{'loss': 0.2757, 'grad_norm': 79.44569396972656, 'learning_rate': 4.594594594594596e-06, 'epoch': 3.78}


                                                   
 57%|█████▋    | 3700/6475 [06:14<03:58, 11.61it/s]

{'eval_loss': 0.225992813706398, 'eval_accuracy': {'accuracy': 0.9531160663236135}, 'eval_runtime': 8.7314, 'eval_samples_per_second': 200.312, 'eval_steps_per_second': 25.082, 'epoch': 4.0}


 62%|██████▏   | 4002/6475 [06:43<03:45, 10.95it/s]  

{'loss': 0.2815, 'grad_norm': 2.7410101890563965, 'learning_rate': 3.822393822393823e-06, 'epoch': 4.32}


 70%|██████▉   | 4502/6475 [07:29<02:49, 11.63it/s]

{'loss': 0.2652, 'grad_norm': 23.52754783630371, 'learning_rate': 3.0501930501930503e-06, 'epoch': 4.86}


                                                   
 71%|███████▏  | 4625/6475 [07:49<02:48, 11.00it/s]

{'eval_loss': 0.2372749149799347, 'eval_accuracy': {'accuracy': 0.9514008004574043}, 'eval_runtime': 8.7421, 'eval_samples_per_second': 200.066, 'eval_steps_per_second': 25.051, 'epoch': 5.0}


 77%|███████▋  | 5001/6475 [08:25<02:14, 10.92it/s]

{'loss': 0.2624, 'grad_norm': 47.604286193847656, 'learning_rate': 2.2779922779922782e-06, 'epoch': 5.41}


 85%|████████▍ | 5502/6475 [09:11<01:29, 10.83it/s]

{'loss': 0.2482, 'grad_norm': 135.29986572265625, 'learning_rate': 1.505791505791506e-06, 'epoch': 5.95}


                                                   
 86%|████████▌ | 5550/6475 [09:24<01:18, 11.80it/s]

{'eval_loss': 0.2300196886062622, 'eval_accuracy': {'accuracy': 0.9519725557461407}, 'eval_runtime': 8.7322, 'eval_samples_per_second': 200.293, 'eval_steps_per_second': 25.08, 'epoch': 6.0}


 93%|█████████▎| 6002/6475 [10:06<00:43, 10.88it/s]

{'loss': 0.2433, 'grad_norm': 0.2586665153503418, 'learning_rate': 7.335907335907337e-07, 'epoch': 6.49}


                                                   
100%|██████████| 6475/6475 [10:58<00:00, 11.62it/s]

{'eval_loss': 0.23166055977344513, 'eval_accuracy': {'accuracy': 0.9531160663236135}, 'eval_runtime': 8.6809, 'eval_samples_per_second': 201.478, 'eval_steps_per_second': 25.228, 'epoch': 7.0}


100%|██████████| 6475/6475 [10:59<00:00,  9.82it/s]

{'train_runtime': 659.5291, 'train_samples_per_second': 78.467, 'train_steps_per_second': 9.818, 'train_loss': 0.3344619291445463, 'epoch': 7.0}





TrainOutput(global_step=6475, training_loss=0.3344619291445463, metrics={'train_runtime': 659.5291, 'train_samples_per_second': 78.467, 'train_steps_per_second': 9.818, 'train_loss': 0.3344619291445463, 'epoch': 7.0})

In [13]:
# 使用微调后的模型
model.to('cpu') 

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    print(logits)
    predictions = torch.max(logits, 1).indices
    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
tensor([[-2.9299,  4.1849]], grad_fn=<AddmmBackward0>)
It was good. - Positive
tensor([[ 2.7952, -3.4325]], grad_fn=<AddmmBackward0>)
Not a fan, don't recommed. - Negative
tensor([[-2.4072,  3.4096]], grad_fn=<AddmmBackward0>)
Better than the first one. - Positive
tensor([[ 2.9686, -3.4959]], grad_fn=<AddmmBackward0>)
This is not worth watching even once. - Negative
tensor([[-3.0375,  4.0711]], grad_fn=<AddmmBackward0>)
I really love it. - Positive
tensor([[ 2.8249, -3.3651]], grad_fn=<AddmmBackward0>)
I cannot stand it. - Negative
tensor([[ 2.0054, -2.2860]], grad_fn=<AddmmBackward0>)
They didn't praise him slightly. - Negative


In [14]:
#  保存模型
path = "pretrained_q_k_v"
model = model.save_pretrained(path)
tokenizer = tokenizer.save_pretrained(path)



### 四、加载模型，进行推理

In [15]:
base_model = "FacebookAI/roberta-large"
model_checkpoint = "pretrained_q_k_v"
# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}

inference_model = AutoModelForSequenceClassification.from_pretrained(
    base_model, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(base_model)
model = PeftModel.from_pretrained(inference_model, model_checkpoint)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
text = "I don't hate or like it."
# 对输入文本进行编码
inputs = tokenizer(text, return_tensors="pt")

# 模型推理
with torch.no_grad():
    outputs = model(**inputs)

# 获取预测结果
predictions = torch.argmax(outputs.logits, dim=-1)

# 打印预测结果
print(outputs)
print(f"Predicted label: {id2label[predictions.item()]}")

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.3639, -2.9012]]), hidden_states=None, attentions=None)
Predicted label: Negative
