# 相似性度量

In [1]:
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [2]:
dataset = load_dataset("json", data_files="../../datas/train_pair_1w.json", split="train")

In [3]:
dataset = dataset.train_test_split(0.2)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [5]:
def process_func(examples):
    inputs = tokenizer(
        examples["sentence1"], 
        examples["sentence2"],
        max_length=128,
        truncation=True,
        padding=True)
    
    inputs["labels"] = [float(label) for label in examples["label"]]
    return inputs

In [6]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("models/macbert-base", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at models/macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import evaluate

combined_metrics = evaluate.combine(["metric_accuracy.py", "metric_f1.py"])

In [9]:
def compute_metrics(pred):
    predictions, labels = pred
    prediction_level = 0.5
    predictions = [int(p > prediction_level) for p in predictions]
    labels = [int(l) for l in labels]
    print(len(predictions), len(labels))

    return combined_metrics.compute(
        predictions=predictions,
        references=labels
    )

In [10]:
args = TrainingArguments(
    output_dir="trained/models_for_seqcrossimilarity",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=200,
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    num_train_epochs=1,
    optim="adafactor",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    weight_decay=0.01
)

In [11]:
trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)

In [12]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
200,0.1338,0.088136,0.8825,0.854309


2000 2000


  predictions = [int(p > prediction_level) for p in predictions]


TrainOutput(global_step=250, training_loss=0.12803606605529785, metrics={'train_runtime': 915.8551, 'train_samples_per_second': 8.735, 'train_steps_per_second': 0.273, 'total_flos': 526217385984000.0, 'train_loss': 0.12803606605529785, 'epoch': 1.0})

In [66]:
from transformers import pipeline, TextClassificationPipeline

pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [74]:
result = pipe({"text": "我喜欢北京", "text_pair": "天气怎样"}, function_to_apply="none") # 回归任务所以用none, 不对结果做softmax

# 相似性判断

In [1]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
dataset = load_dataset("json", data_files="../../datas/train_pair_1w.json", split="train")

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
dataset = dataset.train_test_split(0.2)

In [4]:
tokenizer = AutoTokenizer.from_pretrained("models/macbert-base")

In [5]:
def process_func(examples):
    sentence_list = []
    labels = []
    for sen1, sen2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
        sentence_list.append(sen1)
        sentence_list.append(sen2)
        labels.append(1 if int(label) == 1 else -1) # CosineEmbeddingLoss要求相似1, 不相似-1
    
    inputs = tokenizer(sentence_list, max_length=128, truncation=True, padding=True)
    inputs = {
        k: [v[i: i + 2] for i in range(0, len(v), 2)]
        for k, v in inputs.items()
    }
    inputs["labels"] = labels
    return inputs

In [6]:
tokenized_data = dataset.map(process_func, batched=True, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
print(tokenized_data["train"][0])

{'input_ids': [[101, 852, 3221, 8024, 2769, 2553, 7557, 6382, 3209, 100, 100, 2772, 3221, 2933, 2372, 6432, 671, 1368, 8024, 3633, 1008, 3791, 6427, 2792, 6382, 100, 100, 2769, 3315, 782, 8024, 3345, 1046, 6930, 3345, 8024, 3295, 5307, 4638, 4288, 2360, 100, 100, 5318, 2190, 3221, 702, 794, 679, 1600, 6983, 4638, 782, 8039, 2769, 1403, 3341, 3766, 1600, 100, 100, 3717, 8013, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 6432, 679, 2137, 2110, 2110, 3345, 7027, 5101, 762, 4638, 3416, 948, 3341, 2533, 5473, 3209, 8024, 800, 6432, 784, 720, 3847, 6716, 1107, 2533, 1916, 1448, 8024, 3227, 4197, 3221, 1930, 1920, 1071, 6404, 8024, 1071, 2141, 800, 1525, 7027, 3221, 1358, 749, 7599, 2170, 2798, 7410, 1358, 4638, 8024, 1921, 4495, 2218, 3221, 6821, 3416, 8024, 1600, 784, 720, 5790, 5763, 6963, 679, 5052, 752, 8024, 6206, 3221, 5473, 3209

In [8]:
from transformers import BertPreTrainedModel, BertModel, BertForSequenceClassification
from typing import Optional
from torch.nn import CosineEmbeddingLoss, CosineSimilarity


class BertForSimilarity(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        A_input_ids, B_input_ids = input_ids[:, 0], input_ids[:, 1]
        A_attention_mask, B_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        A_token_type_ids, B_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        A_outputs = self.bert(
            A_input_ids,
            attention_mask=A_attention_mask,
            token_type_ids=A_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        A_pooled_output = A_outputs[1]

        B_outputs = self.bert(
            B_input_ids,
            attention_mask=B_attention_mask,
            token_type_ids=B_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        B_pooled_output = B_outputs[1]

        similarity = CosineSimilarity()(A_pooled_output, B_pooled_output)

        loss = None
        if labels is not None:
            loss_func = CosineEmbeddingLoss(0.3)
            loss = loss_func(A_pooled_output, B_pooled_output, labels)
        output = (similarity, )
        return ((loss,) + output) if loss is not None else output

In [9]:
model = BertForSimilarity.from_pretrained("models/macbert-base")

In [11]:
import evaluate

combined_metrics = evaluate.combine(["metric_accuracy.py", "metric_f1.py"])

In [None]:
def compute_metrics(pred):
    
    predictions, labels = pred
    prediction_level = 0.7
    predictions = [int(p > prediction_level) for p in predictions]
    labels = [int(l > 0) for l in labels]

    return combined_metrics.compute(
        predictions=predictions,
        references=labels
    )

In [22]:
args = TrainingArguments(
    output_dir="trained/models_for_seqAseqsimilarity",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    eval_strategy="steps",
    save_strategy="steps",
    logging_steps=200,
    eval_steps=200,
    save_steps=200,
    save_total_limit=3,
    num_train_epochs=3,
    optim="adafactor",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    learning_rate=2e-5,
    weight_decay=0.01
)

In [23]:
from transformers import DataCollatorWithPadding

trainer = Trainer(
    args=args,
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer)
)

In [24]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
200,0.1485,0.196703,0.756,0.68798
400,0.1687,0.183045,0.783,0.71671
600,0.1349,0.177431,0.795,0.731675


2000 2000
2000 2000
2000 2000


TrainOutput(global_step=750, training_loss=0.14339023081461588, metrics={'train_runtime': 151.167, 'train_samples_per_second': 158.765, 'train_steps_per_second': 4.961, 'total_flos': 3157275967488000.0, 'train_loss': 0.14339023081461588, 'epoch': 3.0})

In [25]:
class SimilarityPipeline:
    def __init__(self, model, tokenizer):
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device
    
    def preprocess(self, sentence1, sentence2):
        inputs = tokenizer([sentence1, sentence2], max_length=128, truncation=True, padding=True, return_tensors="pt")
        return inputs

    def predict(self, tokens):
        tokens = {k: v.to(self.device) for k, v in tokens.items()}
        return self.model(**tokens)[1]

    def postprocess(self, logits):
        cos = CosineSimilarity()(logits[None, 0, :], logits[None, 1, :])
        return cos.cpu().item()

    def __call__(self, sentence1, sentence2):
        tokens = self.preprocess(sentence1, sentence2)
        logits = self.predict(tokens)
        result = self.postprocess(logits)
        return result

In [26]:
pipe = SimilarityPipeline(model, tokenizer)

In [42]:
pipe("我喜欢北京", "明天不行")

0.2632889747619629