# 文本相似度实例(单塔模型)

单塔模型顾名思义，是指在整个过程中只进行一次模型计算。在单塔模型下，我们需要把两句文本通过 [SEP] 拼接成一条输入喂给模型，再利用输出中 [CLS] token 的表示接一个输出头来判断两句话是否相似（本例设置 num_labels=1，模型输出一个相似度分数，并以 0.5 为阈值判定"相似/不相似"）。
- 优点：准确性高
- 缺点：计算慢

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:

dataset = load_dataset('json', data_files='./data/train_pair_1w.json', split="train") # use load_dataset (with the 'json' builder) when reading a fixed local JSON file
# dataset = DatasetDict.load_from_disk('./data') # alternative for loading a Hugging Face dataset from disk
dataset

Dataset({
    features: ['sentence1', 'sentence2', 'label'],
    num_rows: 10000
})

In [3]:
# Show the first record to check the field names and how the label is stored.
dataset[0]

{'sentence1': '找一部小时候的动画片', 'sentence2': '求一部小时候的动画片。谢了', 'label': '1'}

## Step3 划分数据集

In [4]:
# Hold out 20% of the rows as the 'test' split; the rest becomes 'train'.
# NOTE(review): no seed is passed, so the split presumably differs between runs.
mydatasets = dataset.train_test_split(test_size=0.2)
mydatasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

## Step4 数据集预处理

In [5]:
import torch

# Load the tokenizer from a local checkpoint directory (machine-specific path;
# the directory name refers to hfl/chinese-macbert-base).
tokenizer = AutoTokenizer.from_pretrained('D:/pretrained_model/models--hfl--chinese-macbert-base')

def process_function(examples):
    """Tokenize a batch of sentence pairs and attach float labels.

    Args:
        examples: A batch mapping with 'sentence1', 'sentence2' and 'label'
            columns, each holding one entry per example.

    Returns:
        The tokenizer output for the sentence pairs (truncated to at most
        200 tokens) with an extra 'labels' entry holding each label
        converted to float.
    """
    # Passing both columns makes the tokenizer encode each pair as a single
    # two-segment input.
    encoded = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        max_length=200,
        truncation=True,
    )
    # Labels are stored as strings such as '1' in the raw data; the
    # single-output model (num_labels=1) is trained on float targets.
    encoded['labels'] = list(map(float, examples['label']))
    return encoded

In [6]:
# Tokenize both splits batch by batch and drop the original text/label columns,
# leaving only the fields produced by process_function.
tokenizer_examples = mydatasets.map(process_function, batched=True, remove_columns=mydatasets['train'].column_names)
tokenizer_examples

Map: 100%|██████████| 8000/8000 [00:01<00:00, 7237.30 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 7193.17 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [7]:
# Check which fields a single processed training example contains.
tokenizer_examples['train'][0].keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

## Step5 创建模型

In [14]:
from transformers import BertForSequenceClassification 
# num_labels=1 gives the sequence-classification head a single output, which
# this notebook treats as a similarity score and thresholds at 0.5.
model = AutoModelForSequenceClassification.from_pretrained('D:/pretrained_model/models--hfl--chinese-macbert-base', num_labels=1)

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at D:/pretrained_model/models--hfl--chinese-macbert-base were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForS

## Step6 创建评估函数

In [25]:
import evaluate

# Load the accuracy and F1 metrics from local metric scripts.
acc_metric = evaluate.load('./metric_accuracy.py')
f1_metric = evaluate.load('./metric_f1.py')

In [26]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for the single-output similarity model.

    Args:
        eval_predict: A ``(predictions, labels)`` pair as passed by the
            Trainer. ``predictions`` holds one score per example (possibly
            with a trailing axis of size 1) and ``labels`` holds the 0/1
            targets stored as floats.

    Returns:
        A dict merging the results of the accuracy metric and the F1 metric.
    """
    import numpy as np

    predictions, labels = eval_predict
    # Flatten first: with one output per example each row is a length-1
    # array, and calling int() on such an array is deprecated in NumPy.
    scores = np.asarray(predictions).reshape(-1)
    # A score above 0.5 counts as "similar" (1), anything else as 0.
    predicted = (scores > 0.5).astype(int).tolist()
    references = np.asarray(labels).reshape(-1).astype(int).tolist()
    acc = acc_metric.compute(predictions=predicted, references=references)
    f1 = f1_metric.compute(predictions=predicted, references=references)
    # Build a fresh dict instead of mutating the accuracy result in place.
    return {**acc, **f1}

## Step7 创建TrainingArguments

In [27]:
train_args = TrainingArguments(
    # Checkpoint location; existing contents may be overwritten.
    output_dir='./cross_model',
    overwrite_output_dir=True,
    # Optimisation settings.
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=64,
    # Report the training loss every 100 steps.
    logging_steps=100,
    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best F1 when training ends (the two strategies are kept identical).
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
)

## Step8 创建Trainer

In [28]:
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenizer_examples['train'],
    eval_dataset=tokenizer_examples['test'],
    compute_metrics=eval_metric,
    # No data_collator is passed and the examples were tokenized without
    # padding; presumably the Trainer pads each batch using this tokenizer
    # (verify against the Trainer documentation).
    tokenizer=tokenizer,
)

## Step9 模型训练

In [29]:
# Run fine-tuning; the returned TrainOutput carries the global step count, the
# training loss and runtime metrics.
trainer.train()

 33%|███▎      | 2000/6000 [10:00<20:01,  3.33it/s]

                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [04:35<00:00, 10.40it/s]

[A[A

{'loss': 0.0536, 'learning_rate': 1.9e-05, 'epoch': 0.05}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [04:45<00:00, 10.40it/s]

[A[A

{'loss': 0.0477, 'learning_rate': 1.8e-05, 'epoch': 0.1}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [04:54<00:00, 10.40it/s]

[A[A

{'loss': 0.0302, 'learning_rate': 1.7e-05, 'epoch': 0.15}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:04<00:00, 10.40it/s]

[A[A

{'loss': 0.0402, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.2}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:14<00:00, 10.40it/s]

[A[A

{'loss': 0.0321, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.25}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:25<00:00, 10.40it/s]

[A[A

{'loss': 0.0295, 'learning_rate': 1.4e-05, 'epoch': 0.3}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:35<00:00, 10.40it/s]

[A[A

{'loss': 0.0406, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.35}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:45<00:00, 10.40it/s]

[A[A

{'loss': 0.0318, 'learning_rate': 1.2e-05, 'epoch': 0.4}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [05:55<00:00, 10.40it/s]

[A[A

{'loss': 0.0317, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.45}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:05<00:00, 10.40it/s]

[A[A

{'loss': 0.0294, 'learning_rate': 1e-05, 'epoch': 0.5}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:15<00:00, 10.40it/s]

[A[A

{'loss': 0.0279, 'learning_rate': 9e-06, 'epoch': 0.55}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:24<00:00, 10.40it/s]

[A[A

{'loss': 0.0438, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.6}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:34<00:00, 10.40it/s]

[A[A

{'loss': 0.04, 'learning_rate': 7e-06, 'epoch': 0.65}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:45<00:00, 10.40it/s]

[A[A

{'loss': 0.0375, 'learning_rate': 6e-06, 'epoch': 0.7}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [06:54<00:00, 10.40it/s]

[A[A

{'loss': 0.0292, 'learning_rate': 5e-06, 'epoch': 0.75}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:04<00:00, 10.40it/s]

[A[A

{'loss': 0.0492, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.8}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:13<00:00, 10.40it/s]

[A[A

{'loss': 0.0483, 'learning_rate': 3e-06, 'epoch': 0.85}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:23<00:00, 10.40it/s]

[A[A

{'loss': 0.0436, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.9}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:34<00:00, 10.40it/s]

[A[A

{'loss': 0.0459, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.95}


                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:43<00:00, 10.40it/s]

[A[A

{'loss': 0.0561, 'learning_rate': 0.0, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
  predictions = [int(p > 0.5) for p in predictions]
                                                   


[A[A[A                                      
[A                                                

100%|██████████| 2000/2000 [07:56<00:00, 10.40it/s]

[A[A
[A
[A

{'eval_loss': 0.07564018666744232, 'eval_accuracy': 0.913, 'eval_f1': 0.8886043533930857, 'eval_runtime': 13.4288, 'eval_samples_per_second': 148.934, 'eval_steps_per_second': 2.383, 'epoch': 1.0}


  state_dict = torch.load(best_model_path, map_location="cpu")
                                                   


[A[A[A                                      
100%|██████████| 2000/2000 [07:58<00:00, 10.40it/s]

100%|██████████| 2000/2000 [03:32<00:00,  9.41it/s]

{'train_runtime': 212.4913, 'train_samples_per_second': 37.649, 'train_steps_per_second': 9.412, 'train_loss': 0.03941202163696289, 'epoch': 1.0}





TrainOutput(global_step=2000, training_loss=0.03941202163696289, metrics={'train_runtime': 212.4913, 'train_samples_per_second': 37.649, 'train_steps_per_second': 9.412, 'train_loss': 0.03941202163696289, 'epoch': 1.0})

## Step10 模型评估

In [31]:
# Evaluate on the held-out test split; returns a dict of eval_* metrics
# (loss, accuracy, F1 and timing figures).
trainer.evaluate(tokenizer_examples["test"])

  predictions = [int(p > 0.5) for p in predictions]
100%|██████████| 32/32 [00:13<00:00,  2.36it/s]


{'eval_loss': 0.07564018666744232,
 'eval_accuracy': 0.913,
 'eval_f1': 0.8886043533930857,
 'eval_runtime': 14.1363,
 'eval_samples_per_second': 141.48,
 'eval_steps_per_second': 2.264,
 'epoch': 1.0}

## Step11 模型预测

In [32]:
from transformers import pipeline

In [41]:
# Display the model configuration (label mappings, problem_type, etc.).
model.config

BertConfig {
  "_name_or_path": "D:/pretrained_model/models--hfl--chinese-macbert-base",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "\u4e0d\u76f8\u4f3c",
    "1": "\u76f8\u4f3c"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "problem_type": "regression",
  "torch_dtype": "float32",
  "transformers_version": "4.30.0",
  "type

In [33]:
# Attach human-readable label names to the model config.
model.config.id2label = {0:'不相似', 1:'相似'}

# Use the first GPU when CUDA is available and fall back to CPU (-1) otherwise,
# so this cell also runs on machines without a GPU.
pipe = pipeline(
    'text-classification',
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [44]:
# function_to_apply="none" asks the pipeline for the raw model output as the
# score, without any post-processing function applied.
result = pipe(
    {"text": "我喜欢苹果", "text_pair": "苹果不是我喜欢的水果"},
    function_to_apply="none",
)
# Derive the label from the raw score with the 0.5 threshold, replacing the
# label reported by the pipeline.
if result["score"] > 0.5:
    result["label"] = "相似"
else:
    result["label"] = "不相似"
result

{'label': '不相似', 'score': -0.0076041752472519875}