In [None]:
!git clone https://github.com/CLUEbenchmark/QBQTC.git

fatal: destination path 'QBQTC' already exists and is not an empty directory.


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from transformers.modeling_outputs import SequenceClassifierOutput

In [None]:
import os
import json
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, f1_score, classification_report
from transformers.modeling_outputs import SequenceClassifierOutput
import torch.nn.functional as F
from tqdm import tqdm
import random
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so actionable warnings are hidden as well.
warnings.filterwarnings('ignore')


def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Integer seed handed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

# Seed all RNGs once, before any data loading or model construction.
set_seed(42)

# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


In [None]:
# Directory holding the QBQTC JSON splits, resolved against the working directory
# the repository was cloned into. When the notebook runs from /content this
# evaluates to the previously hard-coded '/content/QBQTC/dataset'.
dataset_path = os.path.abspath(os.path.join('QBQTC', 'dataset'))

In [None]:
def read_json_file(file_path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank lines are skipped
    silently; a line that is not valid JSON is reported on stdout and skipped,
    so a single bad record does not abort the load.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON document per line.

    Returns:
        list: The parsed objects, in file order.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            payload = raw_line.strip()
            if not payload:
                # An empty or whitespace-only line is not a malformed record.
                continue
            try:
                records.append(json.loads(payload))
            except json.JSONDecodeError:
                print(f"Error parsing line: {raw_line}")
    return records

# Load every split that exists on disk into `data_dict`, keyed by the file name
# without its extension (for example 'test_public').
files = ['train.json', 'dev.json', 'test.json', 'test_public.json']
data_dict = {}

for file in files:
    file_path = os.path.join(dataset_path, file)
    if not os.path.exists(file_path):
        # A missing split is skipped here; the DataFrame construction below then
        # raises KeyError for that split.
        continue
    data = read_json_file(file_path)
    split_name = file.partition('.')[0]
    data_dict[split_name] = data
    print(f"read {file}，include {len(data)} ")

# One DataFrame per split.
train_df = pd.DataFrame(data_dict['train'])
dev_df = pd.DataFrame(data_dict['dev'])
test_public_df = pd.DataFrame(data_dict['test_public'])
test_df = pd.DataFrame(data_dict['test'])

# Preview the first rows of each split, in the same order as they were built.
for preview_df in (train_df, dev_df, test_public_df, test_df):
    print(preview_df.head())


read train.json，include 180000 
read dev.json，include 20000 
read test.json，include 10000 
read test_public.json，include 5000 
   id            query                           title label
0   0            应届生实习                    实习生招聘-应届生求职网     1
1   1  ln1+x-ln1+y=x-y  已知函数fx=1lnx+1-x则y=fx的图像高考吧百度贴吧     0
2   2         大秦之悍卒189                   起点中文网阅文集团旗下网站     0
3   3             出门经咒                     快快乐乐出门咒-豆丁网     1
4   4           盖中盖广告词              谁知道盖中盖所有的广告词急用百度知道     1
   id             query                                              title  \
0   0            小孩咳嗽感冒                              小孩感冒过后久咳嗽该吃什么药育儿问答宝宝树   
1   1      前列腺癌根治术后能活多久                    前列腺癌转移能活多久前列腺癌治疗方法盘点-家庭医生在线肿瘤频道   
2   2          英雄大作战022               英雄大作战v0.65无敌版英雄大作战v0.65无敌版小游戏4399小游戏   
3   3  如何将一个文件复制到另一个文件里                           怎么把布局里的图纸复制到另外一个文件中去百度文库   
4   4        gilneasart  gilneas-pictures&charactersart-worldofwarcraft...   

  label  
0     1  
1     1  
2     1 

In [None]:
def preprocess_data(df):
    """Return a copy of `df` with normalised column dtypes.

    `query` and `title` are cast to `str`; `label` is cast to `int` when the
    column exists. The input frame is left untouched, so re-running the cell
    or previewing the raw frame afterwards shows the original data.

    Args:
        df: DataFrame with `query` and `title` columns and an optional `label` column.

    Returns:
        pandas.DataFrame: A new frame with the same columns and converted dtypes.

    Raises:
        KeyError: If `query` or `title` is missing.
    """
    dtypes = {'query': str, 'title': str}
    if 'label' in df.columns:
        dtypes['label'] = int

    # DataFrame.astype returns a new object instead of assigning into `df`.
    return df.astype(dtypes)

# Normalise the column dtypes of the three labelled splits.
# NOTE(review): test_df is not passed through preprocess_data here; confirm that is
# intended before using it for prediction.
train_df = preprocess_data(train_df)
dev_df = preprocess_data(dev_df)
test_public_df = preprocess_data(test_public_df)



In [None]:
class TextMatchingDataset(Dataset):
    """Map-style dataset that encodes each (query, title) row as one sentence pair.

    The tokenizer is called as ``tokenizer(query, title, add_special_tokens=True,
    max_length=..., padding='max_length', truncation=True, return_tensors='pt')``
    and must return a mapping with ``input_ids``, ``attention_mask`` and
    ``token_type_ids`` tensors of shape [1, seq_len].

    Args:
        df: DataFrame with `query` and `title` columns; a `label` column is optional.
        tokenizer: Callable with the signature described above.
        max_length: Length every encoded pair is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=256):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Labels are optional so the same class can serve unlabeled data.
        self.has_label = 'label' in df.columns

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode row `idx` and return a dict of 1-D tensors (plus `label` when available)."""
        # Materialise the row once; each `DataFrame.iloc[idx]` call builds a new Series.
        row = self.df.iloc[idx]

        encoding = self.tokenizer(
            str(row['query']),
            str(row['title']),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # [1, seq_len] -> [seq_len]: drop the batch axis added by return_tensors='pt'.
        item = {
            key: encoding[key].squeeze(0)
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }

        if self.has_label:
            # Pin the dtype explicitly (as TextMatchingDataset_Dual does) instead of
            # relying on torch.tensor's inference from the pandas scalar.
            item['label'] = torch.tensor(int(row['label']), dtype=torch.long)

        return item


In [None]:
def train(model, train_dataloader, optimizer, scheduler, device):
    """Run one training epoch over single-tower batches and return the mean batch loss.

    Each batch must provide `input_ids`, `attention_mask`, `token_type_ids` and
    `label`. The model is called with those tensors plus `labels=` and must
    return an object exposing `.loss` (as BertForSequenceClassification and the
    custom models in this notebook do).

    NOTE: a later cell defines another `train` for the dual-tower batch layout,
    which replaces this function in the kernel namespace. Re-run this cell
    before training the single-tower models again.

    Args:
        model: Module to optimise; switched to train mode here.
        train_dataloader: Iterable of batch dicts as described above.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        float: Sum of the batch losses divided by the number of batches
        (0.0 when the dataloader yields no batches).
    """
    model.train()
    total_loss = 0.0

    progress_bar = tqdm(train_dataloader, desc="Training")
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels
        )

        # Passing `labels` makes the model compute the loss itself.
        loss = outputs.loss
        loss.backward()

        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    num_batches = len(train_dataloader)
    # Guard the average so an empty loader does not raise ZeroDivisionError.
    return total_loss / num_batches if num_batches else 0.0

# evaluate
def evaluate(model, dataloader, device):
    """Score `model` on a labelled single-tower dataloader.

    The model is run in eval mode without gradient tracking. Its output may be
    either an object with a `.logits` attribute or the logits tensor itself;
    the predicted class is the index of the largest logit along dim 1.

    NOTE: a later cell defines another `evaluate` for the dual-tower batch
    layout, which replaces this function in the kernel namespace.

    Returns:
        tuple: (accuracy, weighted F1, classification report string with four
        decimal places, list of predicted class ids).
    """
    model.eval()
    predictions, true_labels = [], []
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {key: batch[key].to(device) for key in feature_keys}
            labels = batch['label'].to(device)

            outputs = model(**inputs)

            # Accept both a ModelOutput-style object and a bare logits tensor.
            logits = getattr(outputs, 'logits', outputs)
            preds = torch.max(logits, dim=1).indices

            predictions += preds.cpu().tolist()
            true_labels += labels.cpu().tolist()

    return (
        accuracy_score(true_labels, predictions),
        f1_score(true_labels, predictions, average='weighted'),
        classification_report(true_labels, predictions, digits=4),
        predictions,
    )

# predict
def predict(model, dataloader, device):
    """Return the predicted class id for every sample of an unlabeled dataloader.

    The model is run in eval mode without gradient tracking. Its output may be
    either an object with a `.logits` attribute or the logits tensor itself;
    the prediction is the index of the largest logit along dim 1.

    Returns:
        list: Predicted class ids in dataloader order.
    """
    model.eval()
    predictions = []
    feature_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            inputs = {key: batch[key].to(device) for key in feature_keys}

            outputs = model(**inputs)

            # Accept both a ModelOutput-style object and a bare logits tensor.
            logits = getattr(outputs, 'logits', outputs)
            preds = torch.max(logits, dim=1).indices

            predictions += preds.cpu().tolist()

    return predictions


In [None]:
class TransformerMatchingModel(nn.Module):
    """Transformer encoder trained from scratch for sentence-pair classification.

    Token ids are embedded, summed with learned absolute position embeddings,
    passed through a stack of `nn.TransformerEncoderLayer`s, and the output at
    position 0 is fed to a two-layer classifier.

    Args:
        vocab_size: Number of rows in the token embedding table.
        embed_dim: Embedding / model width.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers.
        num_classes: Size of the output logits.
        max_length: Number of positions in the position embedding table; longer
            inputs are rejected in `forward`.
    """

    def __init__(self, vocab_size, embed_dim=128, num_heads=8, num_layers=3, num_classes=3, max_length=256):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # One learned vector per position index in [0, max_length).
        self.position_embedding = nn.Embedding(max_length, embed_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=embed_dim*4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(embed_dim, embed_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(embed_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Compute logits (and the cross-entropy loss when `labels` is given).

        Args:
            input_ids: Integer tensor of shape [batch_size, seq_len].
            attention_mask: Optional tensor of the same shape; positions equal to 0
                are treated as padding and masked out of attention.
            token_type_ids: Accepted for interface compatibility; not used.
            labels: Optional class-index tensor; when given, the loss is returned too.

        Returns:
            SequenceClassifierOutput with `loss` (or None) and `logits`.

        Raises:
            ValueError: If seq_len exceeds the size of the position embedding table.
        """
        batch_size, seq_len = input_ids.size()

        # Fail with a readable message instead of an out-of-range embedding lookup
        # (which surfaces as an opaque device-side assert on CUDA).
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the {max_positions} positions "
                "supported by the position embedding; truncate the input or build "
                "the model with a larger max_length."
            )

        positions = torch.arange(0, seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, -1)
        word_embeddings = self.embedding(input_ids)
        position_embeddings = self.position_embedding(positions)
        embeddings = word_embeddings + position_embeddings

        # src_key_padding_mask: shape [batch_size, seq_len], where True indicates padding
        src_key_padding_mask = (attention_mask == 0) if attention_mask is not None else None

        transformer_output = self.transformer(embeddings, src_key_padding_mask=src_key_padding_mask)
        # The first position acts as the sequence summary.
        cls_output = transformer_output[:, 0, :]
        logits = self.classifier(cls_output)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )


In [None]:
def train_transformer_model(epochs=5, batch_size=64, lr=5e-4):
    """Train the character-level Transformer baseline and report dev/test metrics.

    Builds a character vocabulary from the training queries and titles, trains a
    `TransformerMatchingModel` on `train_df`, evaluates on `dev_df` after every
    epoch, checkpoints the weights with the best weighted F1 to
    'best_transformer_model.pt', reloads them and evaluates on `test_public_df`.

    Reads the notebook globals `train_df`, `dev_df`, `test_public_df` and
    `device`, and calls the notebook-level `train` and `evaluate` functions, so
    their single-tower versions must be the ones currently defined.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Batch size for all dataloaders.
        lr: Peak learning rate for Adam.

    Returns:
        TransformerMatchingModel: The model carrying the best checkpoint's weights.
    """
    # Character vocabulary over training queries and titles. The characters are
    # collected straight into a set (no intermediate list of every character) and
    # sorted so the char -> id mapping is the same in every process; iterating a
    # bare set of strings depends on hash randomisation, which would make a saved
    # checkpoint unusable with a vocabulary rebuilt in a new session.
    charset = set()
    for column in ('query', 'title'):
        for text in train_df[column]:
            charset.update(str(text))

    vocab = {token: idx + 1 for idx, token in enumerate(sorted(charset))}
    vocab['<PAD>'] = 0  # pad
    vocab['<UNK>'] = len(vocab)  # unknown

    class SimpleTokenizer:
        """Character-level pair tokenizer with the call signature TextMatchingDataset uses.

        NOTE(review): the three special positions (start, separator, end) reuse
        the `<UNK>` id rather than dedicated tokens.
        """

        def __init__(self, vocab, max_length=128):
            self.vocab = vocab
            self.max_length = max_length

        def __call__(self, query, title, add_special_tokens=True, max_length=None, padding='max_length', truncation=True, return_tensors='pt'):
            if max_length is None:
                max_length = self.max_length

            unk_id = self.vocab['<UNK>']
            pad_id = self.vocab['<PAD>']

            query_ids = [self.vocab.get(ch, unk_id) for ch in query]
            title_ids = [self.vocab.get(ch, unk_id) for ch in title]

            if add_special_tokens:
                input_ids = [unk_id] + query_ids + [unk_id] + title_ids + [unk_id]
                # Segment 0 covers the start marker, the query and its separator.
                token_type_ids = [0] * (len(query_ids) + 2) + [1] * (len(title_ids) + 1)
            else:
                input_ids = query_ids + title_ids
                token_type_ids = [0] * len(query_ids) + [1] * len(title_ids)

            if truncation and len(input_ids) > max_length:
                input_ids = input_ids[:max_length]
                token_type_ids = token_type_ids[:max_length]

            attention_mask = [1] * len(input_ids)
            if padding == 'max_length':
                pad_length = max_length - len(input_ids)
                input_ids = input_ids + [pad_id] * pad_length
                attention_mask = attention_mask + [0] * pad_length
                token_type_ids = token_type_ids + [0] * pad_length

            if return_tensors == 'pt':
                # Leading batch axis of size 1, matching Hugging Face tokenizers.
                return {
                    'input_ids': torch.tensor([input_ids]),
                    'attention_mask': torch.tensor([attention_mask]),
                    'token_type_ids': torch.tensor([token_type_ids])
                }
            else:
                return {
                    'input_ids': input_ids,
                    'attention_mask': attention_mask,
                    'token_type_ids': token_type_ids
                }

    tokenizer = SimpleTokenizer(vocab)

    model = TransformerMatchingModel(
        vocab_size=len(vocab),
        embed_dim=128,
        num_heads=8,
        num_layers=3,
        num_classes=3  # 0, 1, 2
    )
    model.to(device)

    train_dataset = TextMatchingDataset(train_df, tokenizer)
    dev_dataset = TextMatchingDataset(dev_df, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_steps = len(train_dataloader) * epochs

    # The scheduler documents integer step counts; 0.1 * total_steps is a float.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Start below any reachable F1 so the first epoch always writes a checkpoint;
    # otherwise a run whose F1 never exceeds 0 would fail at load_state_dict below.
    best_f1 = -1.0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        train_loss = train(model, train_dataloader, optimizer, scheduler, device)
        print(f"training loss: {train_loss:.4f}")

        accuracy, f1, report, _ = evaluate(model, dev_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(f"Validation F1 Score: {f1:.4f}")
        print(report)

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), 'best_transformer_model.pt')
            print("save best model")

    # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
    model.load_state_dict(torch.load('best_transformer_model.pt', map_location=device))

    test_public_dataset = TextMatchingDataset(test_public_df, tokenizer)
    test_public_dataloader = DataLoader(test_public_dataset, batch_size=batch_size)

    accuracy, f1, report, _ = evaluate(model, test_public_dataloader, device)
    print(f"Test Set Accuracy: {accuracy:.4f}")
    print(f"Test Set F1 Score: {f1:.4f}")
    print(report)

    # predict
    # test_dataset = TextMatchingDataset(test_df, tokenizer)
    # test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # predictions = predict(model, test_dataloader, device)

    # test_df['predicted_label'] = predictions
    # test_df[['id', 'predicted_label']].to_csv('transformer_predictions.csv', index=False)
    # print("result: transformer_predictions.csv")

    return model


In [None]:
# Train the character-level Transformer baseline; the call prints per-epoch metrics.
print("\ntrain Transformer model...")
transformer_model = train_transformer_model()


train Transformer model...
Epoch 1/5


Training: 100%|██████████| 2813/2813 [01:50<00:00, 25.52it/s, loss=1.01]


training loss: 0.8529


Evaluating: 100%|██████████| 313/313 [00:08<00:00, 38.23it/s]


Validation Accuracy: 0.6457
Validation F1 Score: 0.5975
              precision    recall  f1-score   support

           0     0.5540    0.2432    0.3380      4894
           1     0.6693    0.8945    0.7657     12592
           2     0.4498    0.1834    0.2605      2514

    accuracy                         0.6457     20000
   macro avg     0.5577    0.4403    0.4547     20000
weighted avg     0.6135    0.6457    0.5975     20000

save best model
Epoch 2/5


Training: 100%|██████████| 2813/2813 [01:52<00:00, 24.92it/s, loss=0.659]


training loss: 0.8035


Evaluating: 100%|██████████| 313/313 [00:08<00:00, 38.30it/s]


Validation Accuracy: 0.6544
Validation F1 Score: 0.5901
              precision    recall  f1-score   support

           0     0.5815    0.2303    0.3299      4894
           1     0.6643    0.9299    0.7749     12592
           2     0.5793    0.1002    0.1709      2514

    accuracy                         0.6544     20000
   macro avg     0.6084    0.4201    0.4253     20000
weighted avg     0.6333    0.6544    0.5901     20000

Epoch 3/5


Training: 100%|██████████| 2813/2813 [01:50<00:00, 25.38it/s, loss=1.05]


training loss: 0.7766


Evaluating: 100%|██████████| 313/313 [00:08<00:00, 38.46it/s]


Validation Accuracy: 0.6579
Validation F1 Score: 0.6009
              precision    recall  f1-score   support

           0     0.5783    0.2460    0.3452      4894
           1     0.6702    0.9221    0.7762     12592
           2     0.5758    0.1360    0.2201      2514

    accuracy                         0.6579     20000
   macro avg     0.6081    0.4347    0.4472     20000
weighted avg     0.6359    0.6579    0.6009     20000

save best model
Epoch 4/5


Training: 100%|██████████| 2813/2813 [01:53<00:00, 24.88it/s, loss=0.567]


training loss: 0.7468


Evaluating: 100%|██████████| 313/313 [00:08<00:00, 38.58it/s]


Validation Accuracy: 0.6604
Validation F1 Score: 0.6255
              precision    recall  f1-score   support

           0     0.5624    0.3269    0.4135      4894
           1     0.6883    0.8781    0.7717     12592
           2     0.5050    0.2192    0.3057      2514

    accuracy                         0.6604     20000
   macro avg     0.5852    0.4747    0.4970     20000
weighted avg     0.6345    0.6604    0.6255     20000

save best model
Epoch 5/5


Training: 100%|██████████| 2813/2813 [01:49<00:00, 25.69it/s, loss=0.933]


training loss: 0.7171


Evaluating: 100%|██████████| 313/313 [00:08<00:00, 38.37it/s]


Validation Accuracy: 0.6609
Validation F1 Score: 0.6268
              precision    recall  f1-score   support

           0     0.5642    0.3259    0.4132      4894
           1     0.6895    0.8770    0.7720     12592
           2     0.5009    0.2303    0.3155      2514

    accuracy                         0.6609     20000
   macro avg     0.5848    0.4777    0.5002     20000
weighted avg     0.6351    0.6609    0.6268     20000

save best model


Evaluating: 100%|██████████| 79/79 [00:02<00:00, 38.40it/s]


Test Set Accuracy: 0.6588
Test Set F1 Score: 0.6277
              precision    recall  f1-score   support

           0     0.5637    0.3366    0.4215      1209
           1     0.6912    0.8664    0.7689      3159
           2     0.4717    0.2373    0.3158       632

    accuracy                         0.6588      5000
   macro avg     0.5755    0.4801    0.5021      5000
weighted avg     0.6326    0.6588    0.6277      5000



## bge + transformer
将 query 和 title 作为句对拼接后输入 BGE，得到每个 token 的 embedding（即 [query_tokens] 和 [title_tokens] 的 embedding），经线性层投影到 128 维后送入 Transformer 模型，取首个位置的输出做三分类。

In [None]:
class BGETransformerMatchingModel(nn.Module):
    """BGE token embeddings followed by a small Transformer encoder and classifier.

    The pretrained encoder produces per-token hidden states, which are linearly
    projected to `hidden_dim`, passed through a stack of
    `nn.TransformerEncoderLayer`s, and classified from the output at position 0.

    Args:
        num_heads: Attention heads per encoder layer.
        num_layers: Number of encoder layers stacked on top of BGE.
        num_classes: Size of the output logits.
        freeze_bge: When True, every BGE parameter has `requires_grad` disabled.
        hidden_dim: Width of the projection, encoder and classifier (default 128).
        bge_model_name: Checkpoint passed to `AutoModel.from_pretrained`.
    """

    def __init__(self, num_heads=8, num_layers=3, num_classes=3, freeze_bge=True,
                 hidden_dim=128, bge_model_name="BAAI/bge-base-zh-v1.5"):
        super().__init__()

        self.bge_model_name = bge_model_name
        self.bge_model = AutoModel.from_pretrained(self.bge_model_name)

        # Optionally use BGE as a fixed feature extractor.
        if freeze_bge:
            for param in self.bge_model.parameters():
                param.requires_grad = False

        embed_dim = self.bge_model.config.hidden_size

        self.projection = nn.Linear(embed_dim, hidden_dim)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim*4,
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_dim, num_classes)
        )

    def train(self, mode=True):
        """Set train/eval mode, keeping a fully frozen BGE encoder in eval mode.

        `nn.Module.train` switches every submodule, which would re-enable dropout
        inside the frozen encoder and feed noisy features to the trainable head.
        """
        super().train(mode)
        if not any(param.requires_grad for param in self.bge_model.parameters()):
            self.bge_model.eval()
        return self

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """Compute logits (and the cross-entropy loss when `labels` is given).

        Args:
            input_ids, attention_mask, token_type_ids: Tokenizer outputs, forwarded
                unchanged to the BGE encoder.
            labels: Optional class-index tensor; when given, the loss is returned too.

        Returns:
            SequenceClassifierOutput with `loss` (or None) and `logits`.
        """
        bge_outputs = self.bge_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        token_embeddings = bge_outputs.last_hidden_state  # [batch_size, seq_len, embed_dim]

        projected_embeddings = self.projection(token_embeddings)  # [batch_size, seq_len, hidden_dim]

        # True marks padding positions that attention must ignore.
        src_key_padding_mask = (attention_mask == 0) if attention_mask is not None else None

        transformer_output = self.transformer(projected_embeddings, src_key_padding_mask=src_key_padding_mask)

        # The first position acts as the sequence summary.
        cls_output = transformer_output[:, 0, :]

        logits = self.classifier(cls_output)

        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits
        )

def train_bge_transformer_model(epochs=5, batch_size=32, lr=5e-4, bge_lr=1e-5):
    """Train the BGE + Transformer model and report dev/test metrics.

    Trains a `BGETransformerMatchingModel` (BGE frozen) on `train_df`, evaluates
    on `dev_df` after every epoch, checkpoints the weights with the best weighted
    F1 to 'best_bge_transformer_model.pt', reloads them and evaluates on
    `test_public_df`.

    Reads the notebook globals `train_df`, `dev_df`, `test_public_df` and
    `device`, and calls the notebook-level `train` and `evaluate` functions, so
    their single-tower versions must be the ones currently defined.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Batch size for all dataloaders.
        lr: Peak learning rate for the projection, encoder and classifier.
        bge_lr: Peak learning rate for BGE parameters when they are trainable.

    Returns:
        BGETransformerMatchingModel: The model carrying the best checkpoint's weights.
    """
    model = BGETransformerMatchingModel(
        num_heads=8,
        num_layers=3,
        num_classes=3,  # 0, 1, 2
        freeze_bge=True
    )
    model.to(device)

    # Take the checkpoint name from the model so tokenizer and encoder cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(model.bge_model_name)

    train_dataset = TextMatchingDataset(train_df, tokenizer)
    dev_dataset = TextMatchingDataset(dev_df, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

    head_groups = [
        {'params': model.projection.parameters()},
        {'params': model.transformer.parameters()},
        {'params': model.classifier.parameters()}
    ]
    bge_trainable = any(param.requires_grad for param in model.bge_model.parameters())
    if bge_trainable:
        # Smaller learning rate for the pretrained encoder than for the new layers.
        param_groups = [{'params': model.bge_model.parameters(), 'lr': bge_lr}] + head_groups
    else:
        param_groups = head_groups
    optimizer = torch.optim.AdamW(param_groups, lr=lr)

    total_steps = len(train_dataloader) * epochs

    # The scheduler documents integer step counts; 0.1 * total_steps is a float.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Start below any reachable F1 so the first epoch always writes a checkpoint;
    # otherwise a run whose F1 never exceeds 0 would fail at load_state_dict below.
    best_f1 = -1.0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        train_loss = train(model, train_dataloader, optimizer, scheduler, device)
        print(f"training loss: {train_loss:.4f}")

        accuracy, f1, report, _ = evaluate(model, dev_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(f"Validation F1 Score: {f1:.4f}")
        print(report)

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), 'best_bge_transformer_model.pt')
            print("save best model")

    # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
    model.load_state_dict(torch.load('best_bge_transformer_model.pt', map_location=device))

    test_public_dataset = TextMatchingDataset(test_public_df, tokenizer)
    test_public_dataloader = DataLoader(test_public_dataset, batch_size=batch_size)

    accuracy, f1, report, _ = evaluate(model, test_public_dataloader, device)
    print(f"Test Set Accuracy: {accuracy:.4f}")
    print(f"Test Set F1 Score: {f1:.4f}")
    print(report)

    return model


In [None]:
# Train the BGE + Transformer model; the call prints per-epoch metrics.
# NOTE(review): this rebinds `transformer_model`, which an earlier cell assigned to the
# character-level Transformer, so that model is no longer reachable by this name.
print("\ntrain Transformer_bge model...")
transformer_model = train_bge_transformer_model()


train Transformer_bge model...
Epoch 1/5


Training: 100%|██████████| 5625/5625 [11:36<00:00,  8.08it/s, loss=0.621]


training loss: 0.7958


Evaluating: 100%|██████████| 625/625 [01:13<00:00,  8.55it/s]


Validation Accuracy: 0.6681
Validation F1 Score: 0.5845
              precision    recall  f1-score   support

           0     0.7288    0.2372    0.3579      4894
           1     0.6626    0.9676    0.7866     12592
           2     0.8500    0.0068    0.0134      2514

    accuracy                         0.6681     20000
   macro avg     0.7472    0.4039    0.3860     20000
weighted avg     0.7024    0.6681    0.5845     20000

save best model
Epoch 2/5


Training: 100%|██████████| 5625/5625 [11:35<00:00,  8.09it/s, loss=0.618]


training loss: 0.7376


Evaluating: 100%|██████████| 625/625 [01:12<00:00,  8.60it/s]


Validation Accuracy: 0.6916
Validation F1 Score: 0.6571
              precision    recall  f1-score   support

           0     0.6631    0.4142    0.5099      4894
           1     0.7041    0.9009    0.7904     12592
           2     0.5548    0.1834    0.2756      2514

    accuracy                         0.6916     20000
   macro avg     0.6406    0.4995    0.5253     20000
weighted avg     0.6753    0.6916    0.6571     20000

save best model
Epoch 3/5


Training: 100%|██████████| 5625/5625 [11:33<00:00,  8.11it/s, loss=0.737]


training loss: 0.7124


Evaluating: 100%|██████████| 625/625 [01:12<00:00,  8.64it/s]


Validation Accuracy: 0.6890
Validation F1 Score: 0.6535
              precision    recall  f1-score   support

           0     0.6723    0.3756    0.4819      4894
           1     0.7004    0.9064    0.7902     12592
           2     0.5443    0.2100    0.3031      2514

    accuracy                         0.6890     20000
   macro avg     0.6390    0.4973    0.5251     20000
weighted avg     0.6739    0.6890    0.6535     20000

Epoch 4/5


Training: 100%|██████████| 5625/5625 [11:33<00:00,  8.11it/s, loss=0.749]


training loss: 0.6922


Evaluating: 100%|██████████| 625/625 [01:13<00:00,  8.55it/s]


Validation Accuracy: 0.6955
Validation F1 Score: 0.6668
              precision    recall  f1-score   support

           0     0.7067    0.3796    0.4940      4894
           1     0.7077    0.8989    0.7919     12592
           2     0.5320    0.2912    0.3763      2514

    accuracy                         0.6955     20000
   macro avg     0.6488    0.5232    0.5541     20000
weighted avg     0.6853    0.6955    0.6668     20000

save best model
Epoch 5/5


Training: 100%|██████████| 5625/5625 [11:35<00:00,  8.09it/s, loss=0.734]


training loss: 0.6700


Evaluating: 100%|██████████| 625/625 [01:12<00:00,  8.63it/s]


Validation Accuracy: 0.6979
Validation F1 Score: 0.6786
              precision    recall  f1-score   support

           0     0.6958    0.4285    0.5303      4894
           1     0.7210    0.8722    0.7894     12592
           2     0.5017    0.3496    0.4121      2514

    accuracy                         0.6979     20000
   macro avg     0.6395    0.5501    0.5773     20000
weighted avg     0.6872    0.6979    0.6786     20000

save best model


Evaluating: 100%|██████████| 157/157 [00:18<00:00,  8.69it/s]

Test Set Accuracy: 0.7000
Test Set F1 Score: 0.6812
              precision    recall  f1-score   support

           0     0.7020    0.4326    0.5353      1209
           1     0.7245    0.8724    0.7916      3159
           2     0.4900    0.3497    0.4081       632

    accuracy                         0.7000      5000
   macro avg     0.6388    0.5516    0.5784      5000
weighted avg     0.6894    0.7000    0.6812      5000






# BERT

bert单塔

In [None]:
def train_bert_model(epochs=3, batch_size=64, lr=2e-5):
    """Fine-tune 'bert-base-chinese' as a single-tower (cross-encoder) classifier.

    Trains `BertForSequenceClassification` with three labels on `train_df`,
    evaluates on `dev_df` after every epoch, checkpoints the weights with the
    best weighted F1 to 'best_bert_model.pt', reloads them and evaluates on
    `test_public_df`.

    Reads the notebook globals `train_df`, `dev_df`, `test_public_df` and
    `device`, and calls the notebook-level `train` and `evaluate` functions, so
    their single-tower versions must be the ones currently defined.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Batch size for all dataloaders.
        lr: Peak learning rate for AdamW.

    Returns:
        BertForSequenceClassification: The model carrying the best checkpoint's weights.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-chinese',
        num_labels=3  # 0, 1, 2
    )
    model.to(device)

    train_dataset = TextMatchingDataset(train_df, tokenizer)
    dev_dataset = TextMatchingDataset(dev_df, tokenizer)

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)

    total_steps = len(train_dataloader) * epochs

    # The scheduler documents integer step counts; 0.1 * total_steps is a float.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    # Start below any reachable F1 so the first epoch always writes a checkpoint;
    # otherwise a run whose F1 never exceeds 0 would fail at load_state_dict below.
    best_f1 = -1.0
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")

        train_loss = train(model, train_dataloader, optimizer, scheduler, device)
        print(f"training loss: {train_loss:.4f}")

        # Validation after every epoch drives checkpoint selection.
        accuracy, f1, report, _ = evaluate(model, dev_dataloader, device)
        print(f"Validation Accuracy: {accuracy:.4f}")
        print(f"Validation F1 Score: {f1:.4f}")
        print(report)

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), 'best_bert_model.pt')
            print("save best model")

    # map_location lets a checkpoint written on GPU be restored on a CPU-only host.
    model.load_state_dict(torch.load('best_bert_model.pt', map_location=device))

    # Final report on the public test split.
    test_public_dataset = TextMatchingDataset(test_public_df, tokenizer)
    test_public_dataloader = DataLoader(test_public_dataset, batch_size=batch_size)

    accuracy, f1, report, _ = evaluate(model, test_public_dataloader, device)
    print(f"Test Set Accuracy: {accuracy:.4f}")
    print(f"Test Set F1 Score: {f1:.4f}")
    print(report)

    return model


In [None]:
# Fine-tune the single-tower BERT classifier; the call prints per-epoch metrics.
print("train BERT model...")
bert_model = train_bert_model()

train BERT model...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


Training: 100%|██████████| 2813/2813 [27:56<00:00,  1.68it/s, loss=0.371]


training loss: 0.6991


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.48it/s]


Validation Accuracy: 0.7220
Validation F1 Score: 0.7145
              precision    recall  f1-score   support

           0     0.6961    0.5922    0.6399      4894
           1     0.7594    0.8362    0.7959     12592
           2     0.5137    0.4029    0.4516      2514

    accuracy                         0.7220     20000
   macro avg     0.6564    0.6104    0.6292     20000
weighted avg     0.7130    0.7220    0.7145     20000

save best model
Epoch 2/3


Training: 100%|██████████| 2813/2813 [27:57<00:00,  1.68it/s, loss=0.593]


training loss: 0.5784


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.48it/s]


Validation Accuracy: 0.7331
Validation F1 Score: 0.7275
              precision    recall  f1-score   support

           0     0.7228    0.5936    0.6519      4894
           1     0.7710    0.8420    0.8049     12592
           2     0.5184    0.4594    0.4871      2514

    accuracy                         0.7331     20000
   macro avg     0.6707    0.6317    0.6480     20000
weighted avg     0.7274    0.7331    0.7275     20000

save best model
Epoch 3/3


Training: 100%|██████████| 2813/2813 [27:57<00:00,  1.68it/s, loss=0.493]


training loss: 0.4887


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.47it/s]


Validation Accuracy: 0.7308
Validation F1 Score: 0.7274
              precision    recall  f1-score   support

           0     0.7166    0.5979    0.6519      4894
           1     0.7759    0.8289    0.8015     12592
           2     0.5077    0.4980    0.5028      2514

    accuracy                         0.7308     20000
   macro avg     0.6668    0.6416    0.6521     20000
weighted avg     0.7277    0.7308    0.7274     20000



Evaluating: 100%|██████████| 79/79 [00:17<00:00,  4.52it/s]

Test Set Accuracy: 0.7408
Test Set F1 Score: 0.7345
              precision    recall  f1-score   support

           0     0.7227    0.5906    0.6500      1209
           1     0.7780    0.8531    0.8138      3159
           2     0.5383    0.4668    0.5000       632

    accuracy                         0.7408      5000
   macro avg     0.6797    0.6368    0.6546      5000
weighted avg     0.7343    0.7408    0.7345      5000






双塔bert

In [None]:
class TextMatchingDataset_Dual(Dataset):
    """Map-style dataset that tokenizes query and title separately for a dual-tower model.

    Args:
        df: DataFrame with `query` and `title` columns; a `label` column is optional.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that returns
            a mapping with `input_ids` and `attention_mask` tensors.
        max_length: Length each text is padded or truncated to.
    """

    def __init__(self, df, tokenizer, max_length=128):
        # Coerce to str (as TextMatchingDataset does) so numeric or missing cells
        # from a frame that skipped preprocessing do not reach the tokenizer.
        self.queries = df['query'].astype(str).tolist()
        self.titles = df['title'].astype(str).tolist()
        # Unlabeled frames get placeholder label 0 so every item has the same keys.
        self.labels = df['label'].tolist() if 'label' in df.columns else [0] * len(df)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (query, title) pairs."""
        return len(self.queries)

    def _encode(self, text):
        """Tokenize one text to fixed length and return the tokenizer's mapping."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return the tensors for pair `idx`: suffix `_1` is the query, `_2` the title."""
        query_enc = self._encode(self.queries[idx])
        title_enc = self._encode(self.titles[idx])

        # squeeze(0) drops the batch axis added by return_tensors='pt'.
        return {
            'input_ids_1': query_enc['input_ids'].squeeze(0),
            'attention_mask_1': query_enc['attention_mask'].squeeze(0),
            'input_ids_2': title_enc['input_ids'].squeeze(0),
            'attention_mask_2': title_enc['attention_mask'].squeeze(0),
            'label': torch.tensor(int(self.labels[idx]), dtype=torch.long)
        }


In [None]:
class DualTowerBERTModel(nn.Module):
    """Dual-tower classifier built on a single, shared BERT encoder.

    Both inputs go through the same ``self.bert`` instance (the towers share
    weights). Their ``pooler_output`` vectors are concatenated along the
    feature axis and fed to one linear layer that produces class logits.

    Args:
        model_name: Checkpoint name passed to ``BertModel.from_pretrained``.
        hidden_size: Width of one pooled vector. It is NOT read from the loaded
            checkpoint, so it must match the encoder's output width or the
            linear layer will reject the concatenated input.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name='bert-base-chinese', hidden_size=768, num_classes=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Input width is doubled because the two pooled vectors are concatenated.
        self.classifier = nn.Linear(hidden_size * 2, num_classes)

    def _pooled(self, input_ids, attention_mask):
        """Encode one side of the pair and return the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.pooler_output

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        """Return classification logits for a batch of (text 1, text 2) pairs.

        The ``*_1`` tensors are encoded first, then the ``*_2`` tensors; the
        concatenation keeps that order (``[side 1 | side 2]``).
        """
        first_vec = self._pooled(input_ids_1, attention_mask_1)
        second_vec = self._pooled(input_ids_2, attention_mask_2)
        return self.classifier(torch.cat((first_vec, second_vec), dim=1))


In [None]:
def train(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    For every batch: move the tensors to ``device``, zero the gradients, compute
    cross-entropy between the model's logits and ``batch['label']``,
    back-propagate, then step the optimizer and the scheduler (the scheduler is
    stepped once per batch, not once per epoch).

    Args:
        model: Module called as ``model(input_ids_1, attention_mask_1,
            input_ids_2, attention_mask_2)`` and returning class logits.
        dataloader: Sized iterable of dict batches with the four input keys
            plus ``'label'``.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler tied to ``optimizer``.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.

    Raises:
        ZeroDivisionError: If ``dataloader`` has length 0.
    """
    input_keys = ('input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2')
    criterion = nn.CrossEntropyLoss()
    running_loss = 0

    model.train()
    for batch in tqdm(dataloader, desc="Training"):
        # Keep the positional order the model's forward() expects.
        features = [batch[key].to(device) for key in input_keys]
        targets = batch['label'].to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(*features), targets)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss / len(dataloader)

In [None]:
def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    The model is switched to eval mode, every batch is moved to ``device``, and
    the predicted class is the argmax over dimension 1 of the model's output.

    Args:
        model: Module called as ``model(input_ids_1, attention_mask_1,
            input_ids_2, attention_mask_2)`` and returning class logits.
        dataloader: Iterable of dict batches with the four input keys plus
            ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, macro_f1, report, predictions)`` where ``report`` is
        the ``classification_report`` text (4 digits) and ``predictions`` is the
        flat list of predicted class ids in dataloader order.
    """
    input_keys = ('input_ids_1', 'attention_mask_1', 'input_ids_2', 'attention_mask_2')
    predictions, targets = [], []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Keep the positional order the model's forward() expects.
            features = [batch[key].to(device) for key in input_keys]
            gold = batch['label'].to(device)

            logits = model(*features)
            predicted = logits.argmax(dim=1)

            predictions.extend(predicted.cpu().numpy())
            targets.extend(gold.cpu().numpy())

    accuracy = accuracy_score(targets, predictions)
    macro_f1 = f1_score(targets, predictions, average='macro')
    report = classification_report(targets, predictions, digits=4)
    return accuracy, macro_f1, report, predictions


In [None]:
def train_bert_model_Dual(epochs=3, batch_size=64, lr=2e-5, warmup_ratio=0.1,
                          checkpoint_path='best_dualtower_model.pt'):
    """Fine-tune the dual-tower BERT matcher and report test-set metrics.

    Reads the notebook-level globals ``train_df``, ``dev_df``,
    ``test_public_df`` and ``device``, and uses the notebook's ``train`` and
    ``evaluate`` helpers. After each epoch the model is scored on the dev set;
    whenever the dev F1 returned by ``evaluate`` improves, the weights are
    written to ``checkpoint_path``. The best weights written during THIS run are
    reloaded before the final evaluation on ``test_public_df``.

    Args:
        epochs: Number of passes over the training set.
        batch_size: Batch size for the train, dev and test loaders.
        lr: AdamW learning rate.
        warmup_ratio: Fraction of all optimisation steps used for linear warmup.
        checkpoint_path: File the best ``state_dict`` is saved to and loaded from.

    Returns:
        The trained ``DualTowerBERTModel`` carrying the best dev-F1 weights
        (or the untrained model if ``epochs`` is 0).
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = DualTowerBERTModel().to(device)

    train_dataset = TextMatchingDataset_Dual(train_df, tokenizer)
    dev_dataset = TextMatchingDataset_Dual(dev_df, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_loader = DataLoader(dev_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The scheduler is stepped once per batch inside train(), so the horizon
    # is counted in batches, not epochs.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(warmup_ratio * total_steps),
        num_training_steps=total_steps
    )

    # Start below any reachable score so the first epoch always writes a
    # checkpoint. With a start of 0 and a strict '>', a dev F1 of exactly 0
    # would skip saving, and the load below would then fail or silently pick up
    # a stale file left by an earlier run.
    best_f1 = float('-inf')
    checkpoint_written = False
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}/{epochs}")
        train_loss = train(model, train_loader, optimizer, scheduler, device)
        print(f"training loss: {train_loss:.4f}")

        acc, f1, report, _ = evaluate(model, dev_loader, device)
        print(f"Validation Accuracy: {acc:.4f}")
        print(f"Validation F1 Score: {f1:.4f}")
        print(report)

        if f1 > best_f1:
            best_f1 = f1
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
            print("save best model")

    # Only restore weights this run produced; map_location keeps the load
    # working when the checkpoint was saved from a different device.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))

    test_public_dataset = TextMatchingDataset_Dual(test_public_df, tokenizer)
    test_public_dataloader = DataLoader(test_public_dataset, batch_size=batch_size)

    accuracy, f1, report, _ = evaluate(model, test_public_dataloader, device)
    print(f"Test Set Accuracy: {accuracy:.4f}")
    print(f"Test Set F1 Score: {f1:.4f}")
    print(report)
    return model


In [None]:
# Train the dual-tower model end to end and keep the returned model for later
# cells. This is a long-running cell: the logged run below took roughly 28
# minutes per training epoch.
model2 = train_bert_model_Dual()


Epoch 1/3


Training: 100%|██████████| 2813/2813 [27:44<00:00,  1.69it/s]


training loss: 0.7702


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.50it/s]


Validation Accuracy: 0.6807
Validation F1 Score: 0.5578
              precision    recall  f1-score   support

           0     0.6525    0.3964    0.4932      4894
           1     0.7183    0.8548    0.7806     12592
           2     0.4459    0.3620    0.3996      2514

    accuracy                         0.6807     20000
   macro avg     0.6056    0.5377    0.5578     20000
weighted avg     0.6679    0.6807    0.6624     20000

save best model

Epoch 2/3


Training: 100%|██████████| 2813/2813 [27:45<00:00,  1.69it/s]


training loss: 0.6762


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.48it/s]


Validation Accuracy: 0.6970
Validation F1 Score: 0.5695
              precision    recall  f1-score   support

           0     0.6751    0.4219    0.5193      4894
           1     0.7200    0.8784    0.7914     12592
           2     0.5155    0.3238    0.3978      2514

    accuracy                         0.6970     20000
   macro avg     0.6369    0.5414    0.5695     20000
weighted avg     0.6833    0.6970    0.6753     20000

save best model

Epoch 3/3


Training: 100%|██████████| 2813/2813 [27:44<00:00,  1.69it/s]


training loss: 0.5915


Evaluating: 100%|██████████| 313/313 [01:09<00:00,  4.50it/s]


Validation Accuracy: 0.6975
Validation F1 Score: 0.5903
              precision    recall  f1-score   support

           0     0.6701    0.4585    0.5445      4894
           1     0.7349    0.8492    0.7879     12592
           2     0.4819    0.4025    0.4387      2514

    accuracy                         0.6975     20000
   macro avg     0.6289    0.5701    0.5903     20000
weighted avg     0.6872    0.6975    0.6844     20000

save best model


Evaluating: 100%|██████████| 79/79 [00:17<00:00,  4.54it/s]

Test Set Accuracy: 0.6966
Test Set F1 Score: 0.5907
              precision    recall  f1-score   support

           0     0.6518    0.4582    0.5381      1209
           1     0.7389    0.8446    0.7882      3159
           2     0.4842    0.4130    0.4458       632

    accuracy                         0.6966      5000
   macro avg     0.6249    0.5719    0.5907      5000
weighted avg     0.6856    0.6966    0.6844      5000




