In [None]:
import torch
import numpy as np
from transformers import BertTokenizer
import json
import pandas as pd
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from tqdm import tqdm
from ipywidgets import FloatProgress
from torch.utils.tensorboard import SummaryWriter
import re
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn import metrics
import json
import pandas as pd

### Transform the original slots into tokenized slots

In [None]:
def tag_tokens(query, slots, tokens):
    """Project character-level BIO slot tags onto BERT tokens.

    Args:
        query: Raw query string; ``slots`` holds one tag per character.
        slots: Sequence of BIO tags aligned with the characters of ``query``.
        tokens: Tokenizer output for ``query`` including the leading and
            trailing special tokens (both are stripped before alignment).

    Returns:
        A list of tags for the tokens, with an 'O' added at both ends for the
        stripped special tokens.
    """
    tokens = tokens[1:-1]
    tagged_tokens = []
    token_index = 0
    for char, slot in zip(query, slots):
        if token_index >= len(tokens):
            break
        current_token = tokens[token_index]
        if char == current_token or char in current_token:
            tagged_tokens.append(slot)
            token_index += 1
        elif current_token.startswith('##'):
            # Word-piece continuation inherits the previous tag (or the
            # character's own tag when there is no previous token).
            tagged_tokens.append(tagged_tokens[-1] if tagged_tokens else slot)
            token_index += 1
        elif current_token == '[UNK]':
            # Placeholder resolved from the neighbouring tags below.
            tagged_tokens.append('U')
            token_index += 1

    for i, item in enumerate(tagged_tokens):
        if item != 'U':
            continue
        # Sequence edges are treated as 'O': that is the tag the surrounding
        # special tokens receive when they are re-attached below.
        prev_tag = tagged_tokens[i - 1] if i > 0 else 'O'
        next_tag = tagged_tokens[i + 1] if i + 1 < len(tagged_tokens) else 'O'

        # The rules are evaluated in order and the last matching one wins;
        # 'O' is used only when no rule matches at all.
        resolved = 'O'
        if 'city' in prev_tag and next_tag == 'I-district':
            resolved = 'B-district'
        if prev_tag == 'O' and next_tag == 'I-development':
            resolved = 'B-development'
        if next_tag == 'O' and 'development' in prev_tag:
            resolved = 'I-development'
        if 'development' in prev_tag and 'development' in next_tag:
            resolved = 'I-development'
        if 'company' in prev_tag and 'company' in next_tag:
            resolved = 'I-company'
        if 'I-district' in prev_tag and next_tag == 'I-development':
            resolved = 'B-development'
        tagged_tokens[i] = resolved

    tagged_tokens.insert(0, 'O')
    tagged_tokens.append('O')
    return tagged_tokens

In [None]:
# Directory holding the pretrained tokenizer files.
# NOTE(review): the prediction cells further down use '../model/bert-base-chinese'
# instead — confirm which relative location is correct.
BERT_PATH = './model/bert-base-chinese'
# Module-level tokenizer shared by json2dataframe and Dataset.
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

In [None]:
def pad_to_512(input_string, max_pad_lenth=512):
    """Pad (or truncate) a slot-id list in place to exactly ``max_pad_lenth``.

    Padding uses -100, the default ``ignore_index`` of ``nn.CrossEntropyLoss``.
    Over-long lists are truncated so every row has the same length as the
    token ids, which ``Dataset`` truncates/pads to 512.

    Args:
        input_string: List of integer slot ids (modified in place).
        max_pad_lenth: Target length.

    Returns:
        The same list object, now of length ``max_pad_lenth``.
    """
    del input_string[max_pad_lenth:]
    input_string.extend([-100] * (max_pad_lenth - len(input_string)))
    return input_string

In [None]:
# Intent name -> index into the multi-hot intent label vector (16 intents).
intents_num = {'企业性质查询': 0,
          '营业利润查询': 1,
          '企业负债查询': 2,
          '项目成交状况查询': 3,
          '企业营业成本查询': 4,
          '小区绿化率查询': 5,
          '建筑密度查询': 6,
          '小区成交均价查询': 7,
          '营业总收入查询': 8,
          '地块总价查询': 9,
          '地块成交时间查询': 10,
          '企业债务违约查询': 11,
          '容积率查询': 12,
          '企业风险查询': 13,
          '地块归属查询': 14,
          '项目开发商信息查询': 15
          }

# BIO slot tag -> class id used as the token-level label (13 classes).
slots_num = {'O': 0,
          'B-year': 1,
          'I-year': 2,
          'B-month': 3,
          'I-month': 4,
          'B-city': 5,
          'I-city': 6,
          'B-district': 7,
          'I-district': 8,
          'B-development': 9,
          'I-development': 10,
          'B-company': 11,
          'I-company': 12
          }

### Read the training, testing, and validation sets.

In [None]:
def json2dataframe(all_datas):
    """Convert raw annotated samples into the training DataFrame.

    Args:
        all_datas: Iterable of dicts with keys ``'query'`` (text),
            ``'slots'`` (comma-separated per-character BIO tags) and
            ``'intent'`` (one intent name, or several joined by ``'+'``).
            The input dicts are not modified.

    Returns:
        DataFrame with columns:
            category: multi-hot float list over ``intents_num``.
            text: the query string.
            intents: 1 when the sample has several intents, else 0.
            slots: token-level slot ids padded with -100 by ``pad_to_512``.
    """
    columns = ['category', 'text', 'intents', 'slots']
    records = []
    for data in all_datas:
        query = data['query']
        char_slots = data['slots'].split(',')
        encoded_text = tokenizer(query, return_tensors='pt')
        tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'][0])
        token_slots = tag_tokens(query, char_slots, tokens)
        numbered_slots = [slots_num[item] for item in token_slots]

        # A single intent simply yields a one-element list here.
        intent_names = data['intent'].split('+')
        intents_label = [0.0] * len(intents_num)
        for intent in intent_names:
            intents_label[intents_num[intent]] = 1.0

        records.append({
            'category': intents_label,
            'text': query,
            'intents': 1 if len(intent_names) > 1 else 0,
            'slots': pad_to_512(numbered_slots),
        })
    # Build the frame once instead of concatenating row by row (quadratic).
    return pd.DataFrame(records, columns=columns)

In [None]:
# Load each split from JSON and convert it with json2dataframe.
# `all_datas` is reused for every split, so after this cell it holds the test data.
train_path = '../dataset/train.json'
with open(train_path, 'r', encoding='utf-8') as f:
    all_datas = json.load(f)
df_train = json2dataframe(all_datas)

validation_path = '../dataset/validation.json'
with open(validation_path, 'r', encoding='utf-8') as f:
    all_datas = json.load(f)
df_val = json2dataframe(all_datas)

test_path = '../dataset/test.json'
with open(test_path, 'r', encoding='utf-8') as f:
    all_datas = json.load(f)
df_test = json2dataframe(all_datas)

# Sanity check: number of samples per split.
print(len(df_train), len(df_val), len(df_test))

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding (encoded text, intent vector, multi-intent flag, slot ids).

    Rows are addressed by position, so a frame whose index is not 0..n-1
    (for example after filtering or shuffling) is handled correctly.
    """

    def __init__(self, df):
        """Tokenize every text of ``df`` up front.

        Args:
            df: Frame with columns 'category', 'text', 'intents' and 'slots'.
        """
        self.labels = df['category']
        # Every text is padded/truncated to 512 tokens and kept as a
        # tokenizer output with tensors of shape (1, 512).
        self.texts = [tokenizer(text,
                                padding='max_length',
                                max_length=512,
                                truncation=True,
                                return_tensors="pt")
                      for text in df['text']]
        self.num_intents = df['intents']
        self.slots = df['slots']

    @staticmethod
    def _row(column, idx):
        """Return the ``idx``-th element of ``column`` by position."""
        positional = getattr(column, 'iloc', None)
        # pandas Series: ``column[idx]`` would be a label lookup, so use iloc.
        return column[idx] if positional is None else positional[idx]

    def classes(self):
        """Return the stored intent label column."""
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Multi-hot intent vector of sample ``idx`` as an array."""
        return np.array(self._row(self.labels, idx))

    def get_batch_texts(self, idx):
        """Tokenizer output of sample ``idx``."""
        return self.texts[idx]

    def get_batch_num_intents(self, idx):
        """Multi-intent flag (0 or 1) of sample ``idx`` as an array."""
        return np.array(self._row(self.num_intents, idx))

    def get_batch_slots(self, idx):
        """Padded slot ids of sample ``idx`` as an array."""
        return np.array(self._row(self.slots, idx))

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        batch_num = self.get_batch_num_intents(idx)
        batch_slots = self.get_batch_slots(idx)
        return batch_texts, batch_y, batch_num, batch_slots

### Build the neural network.

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder with three heads: intents, intent count and slot tags."""

    def __init__(self, dropout=0.5, num_intents=16, num_slots=13,
                 pretrained_model='bert-base-chinese'):
        """Build the encoder and the three linear heads.

        Args:
            dropout: Dropout probability applied before every head.
            num_intents: Size of the multi-label intent output.
            num_slots: Number of slot classes predicted per token.
            pretrained_model: Name or path passed to ``BertModel.from_pretrained``.
        """
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        # Take the width from the loaded checkpoint instead of assuming 768.
        hidden_size = self.bert.config.hidden_size
        self.dropout = nn.Dropout(dropout)
        self.linear1 = nn.Linear(hidden_size, num_intents)  # intent head
        self.linear2 = nn.Linear(hidden_size, 2)            # one vs. several intents
        self.linear3 = nn.Linear(hidden_size, num_slots)    # per-token slot head
        self.sigmoid = nn.Sigmoid()
        # Not used in forward(); kept so the attribute remains available.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_id, mask):
        """Run the encoder and all heads.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT.

        Returns:
            Tuple ``(intent_probability, num_intents, slot_probability)``:
            sigmoid probabilities per intent, raw 2-way logits for the
            single/multi-intent flag, and raw per-token slot logits
            (the last two are not normalized despite the names).
        """
        last_hidden_state, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear1_output = self.linear1(dropout_output)
        intent_probability = self.sigmoid(linear1_output)
        num_intents = self.linear2(dropout_output)
        last_hidden_state_output = self.dropout(last_hidden_state)
        slot_probability = self.linear3(last_hidden_state_output)
        return intent_probability, num_intents, slot_probability

### Train

In [None]:
def tensors_equal_ignore_order(tensor1, tensor2):
    """Compare two tensors row by row, ignoring the order inside each row.

    Args:
        tensor1: Tensor whose rows are compared.
        tensor2: Tensor compared against ``tensor1``.

    Returns:
        1-D bool tensor with one entry per compared row pair (empty when the
        inputs have no rows).
    """
    # torch.sort defaults to the last dimension, i.e. each row is sorted.
    sorted_tensor1, _ = torch.sort(tensor1)
    sorted_tensor2, _ = torch.sort(tensor2)
    results = [torch.equal(row1, row2)
               for row1, row2 in zip(sorted_tensor1, sorted_tensor2)]
    # Built once after the loop so empty input yields an empty tensor
    # instead of an unbound variable.
    return torch.tensor(results, dtype=torch.bool)

def compute_multi_label_acc(probility, label):
    """Count the rows whose top predicted intents match the labelled intents.

    Args:
        probility: Per-intent probability tensor, one row per sample.
        label: Multi-hot label tensor of the same layout.

    Returns:
        Number of rows where the top-2 predicted indices equal the top-2
        label indices as a set; for rows labelled with fewer than two
        intents only the top-1 index is compared.
    """
    pred_order = torch.sort(probility, descending=True)[1]
    sorted_label, label_order = torch.sort(label, descending=True)
    top_pred = pred_order[:, 0:2]
    top_true = label_order[:, 0:2]
    # Neutralize the second slot on both sides for single-intent rows.
    single_intent = sorted_label.sum(dim=1) < 2
    top_pred[single_intent, 1] = 0
    top_true[single_intent, 1] = 0
    return tensors_equal_ignore_order(top_pred, top_true).sum().item()

In [None]:
def _slu_batch_pass(model, batch, device, criterion, binary_criterion):
    """Run one batch through the model; return its loss and hit counts."""
    inputs, intent_label, num_label, slot_label = batch
    intent_label = intent_label.to(device)
    num_label = num_label.to(device)
    slot_label = slot_label.to(device)
    # The tokenizer output carries a (1, 512) tensor per sample; drop that axis.
    mask = inputs['attention_mask'].squeeze(1).to(device)
    input_id = inputs['input_ids'].squeeze(1).to(device)

    intent_probability, num_intents, slot_probability = model(input_id, mask)

    intent_loss = binary_criterion(intent_probability, intent_label.float())
    # Only positions covered by the attention mask take part in the slot loss.
    active = mask.view(-1) == 1
    active_logits = slot_probability.view(-1, slot_probability.size(-1))[active]
    active_labels = slot_label.view(-1)[active]
    slot_loss = criterion(active_logits, active_labels)
    num_loss = criterion(num_intents, num_label)
    loss = intent_loss + num_loss + slot_loss

    hits = {
        'intent': compute_multi_label_acc(intent_probability, intent_label),
        'num': (num_intents.argmax(dim=1) == num_label).sum().item(),
        'slot': (active_logits.argmax(dim=1) == active_labels).sum().item(),
        'tokens': active.sum().item(),
    }
    return loss, hits


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and log train/validation metrics after every epoch.

    Uses the module-level ``writer`` (TensorBoard SummaryWriter), which is
    closed once after the final epoch.

    Args:
        model: Network returning (intent probabilities, intent-count logits,
            slot logits) for ``(input_ids, attention_mask)``.
        train_data: DataFrame accepted by ``Dataset`` for training.
        val_data: DataFrame accepted by ``Dataset`` for validation.
        learning_rate: Adam learning rate.
        epochs: Number of passes over the training data.
    """
    # Wrap both frames in Dataset objects and batch them; only training is shuffled.
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=5, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=5)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    criterion = nn.CrossEntropyLoss()
    binary_criterion = nn.BCELoss()
    model = model.to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    def run_epoch(dataloader, training):
        """One pass over ``dataloader``; updates weights only when training."""
        totals = {'loss': 0.0, 'intent': 0, 'num': 0, 'slot': 0, 'tokens': 0}
        # Dropout must be on while training and off while validating.
        model.train(training)
        with torch.set_grad_enabled(training):
            for batch in dataloader:
                loss, hits = _slu_batch_pass(model, batch, device, criterion, binary_criterion)
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                totals['loss'] += loss.item()
                for key, value in hits.items():
                    totals[key] += value
        return totals

    for epoch_num in range(epochs):
        train_totals = run_epoch(tqdm(train_dataloader), training=True)
        val_totals = run_epoch(val_dataloader, training=False)

        # NOTE(review): the summed per-batch mean losses are divided by the
        # number of samples, as before, so the value is scaled by 1/batch_size.
        train_loss = train_totals['loss'] / len(train_data)
        train_intent_acc = train_totals['intent'] / len(train_data)
        train_num_acc = train_totals['num'] / len(train_data)
        train_slot_acc = train_totals['slot'] / train_totals['tokens']
        val_loss = val_totals['loss'] / len(val_data)
        val_intent_acc = val_totals['intent'] / len(val_data)
        val_num_acc = val_totals['num'] / len(val_data)
        val_slot_acc = val_totals['slot'] / val_totals['tokens']

        writer.add_scalar('Loss/train', train_loss, epoch_num)
        writer.add_scalar('Accuracy/train_intent', train_intent_acc, epoch_num)
        writer.add_scalar('Accuracy/train_num_intents', train_num_acc, epoch_num)
        writer.add_scalar('Accuracy/train_token_level_slot_acc', train_slot_acc, epoch_num)
        writer.add_scalar('Loss/val', val_loss, epoch_num)
        writer.add_scalar('Accuracy/val_intent', val_intent_acc, epoch_num)
        writer.add_scalar('Accuracy/val_num_intents', val_num_acc, epoch_num)
        writer.add_scalar('Accuracy/val_token_level_slot_acc', val_slot_acc, epoch_num)

        print(
            f'''Epochs: {epoch_num + 1} 
            | Train Loss: {train_loss: .3f} 
            | Train Intent Accuracy: {train_intent_acc: .3f}
            | Train Num of intents Accuracy: {train_num_acc: .3f} 
            | Train Token-level Slots Accuracy: {train_slot_acc: .3f} 
            | Val Loss: {val_loss: .3f} 
            | Val Intent Accuracy: {val_intent_acc: .3f}
            | Val Num of intents Accuracy: {val_num_acc: .3f}
            | Val Token-level Slots Accuracy: {val_slot_acc: .3f} ''')
    # Close the writer once, after the last epoch, rather than every epoch.
    writer.close()

### Begin train

In [None]:
# Training configuration and launch.
EPOCHS = 10
# TensorBoard writer; train() reads this module-level name.
writer = SummaryWriter('./runs')
model = BertClassifier()
# Learning rate passed to Adam inside train().
LR = 1e-6
train(model, df_train, df_val, LR, EPOCHS)

### Evaluate on the test set

In [None]:
def evaluate(model, test_data):
    """Print intent, intent-count and token-level slot accuracy on ``test_data``.

    Args:
        model: Trained network returning (intent probabilities, intent-count
            logits, slot logits) for ``(input_ids, attention_mask)``.
        test_data: DataFrame accepted by ``Dataset``.
    """
    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=4)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)
    # Switch off dropout so the reported accuracy is deterministic.
    model.eval()

    total_intent_acc_test = 0
    total_num_acc_test = 0
    total_tokens_test = 0
    total_slot_acc_test = 0
    with torch.no_grad():
        for test_input, test_intent_label, test_num_label, test_slot_label in test_dataloader:
            test_intent_label = test_intent_label.to(device)
            test_num_label = test_num_label.to(device)
            test_slot_label = test_slot_label.to(device)
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)
            intent_probability, num_intents, slot_probability = model(input_id, mask)

            # Only positions covered by the attention mask are scored.
            active_loss = mask.view(-1) == 1
            total_intent_acc_test += compute_multi_label_acc(intent_probability, test_intent_label)
            total_num_acc_test += (num_intents.argmax(dim=1) == test_num_label).sum().item()
            predicted_slots = slot_probability.argmax(dim=2).view(-1)[active_loss]
            total_slot_acc_test += (predicted_slots == test_slot_label.view(-1)[active_loss]).sum().item()
            total_tokens_test += active_loss.sum().item()

    print(f'Test Intent Accuracy: {total_intent_acc_test*100 / len(test_data): .2f}%')
    print(f'Test Num of Intent Accuracy: {total_num_acc_test*100 / len(test_data): .2f}%')
    print(f'Test Token-level Slots Accuracy: {total_slot_acc_test*100 / total_tokens_test: .2f}%')

In [None]:
# Report accuracy of the freshly trained model on the held-out test split.
evaluate(model, df_test)

In [None]:
# Persist the whole model object (pickled module, not just a state_dict).
# NOTE(review): the prediction cells load '../model/bert.pt' — confirm that
# this './model/bert.pt' path points at the same file.
model_path = './model/bert.pt'
torch.save(model, model_path)

# Predict

In [None]:
def top2_indices(tensor):
    """Return the indices of the two largest entries along dimension 0.

    Raises:
        ValueError: If ``tensor`` has fewer than two elements along dim 0.
    """
    if not len(tensor) >= 2:
        raise ValueError("Input tensor must have at least 2 elements")
    return torch.topk(tensor, k=2, dim=0).indices

def find_key(dictionary, value):
    """Return every key of ``dictionary`` whose value equals ``value``, in insertion order."""
    matches = []
    for name, stored in dictionary.items():
        if stored == value:
            matches.append(name)
    return matches

In [None]:
# Load the fine-tuned model and the tokenizer for inference.
model_path = '../model/bert.pt'
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# map_location lets a model saved on GPU be loaded on a CPU-only machine.
# NOTE(review): the file is a fully pickled module; torch.load unpickles it,
# so only load files from a trusted source (recent PyTorch releases also
# require weights_only=False for this kind of file).
model = torch.load(model_path, map_location=device)
model = model.to(device)
# The model was saved in training mode; disable dropout for prediction.
model.eval()
BERT_PATH = '../model/bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

### Transform slots into keywords

In [None]:
def _unk_span_length(query, next_token):
    """Return how many leading ``query`` characters an ``[UNK]`` token covers."""
    if not query:
        return 0
    if next_token is None or next_token == '[SEP]':
        # Nothing follows the unknown token: it covers the rest of the query.
        return len(query)
    anchor = next_token[2:] if next_token.startswith('##') else next_token
    if next_token == '[UNK]' or not anchor:
        return 1
    try:
        # The unknown token covers at least one character, so search from 1;
        # only the first character of a multi-character token is matched.
        return query.index(anchor[0], 1)
    except ValueError:
        return 1


def align_tokens_with_query(tokens, query, query_slot):
    """Map tokenizer output back onto the raw query and its per-character slots.

    Args:
        tokens: Token strings including ``[CLS]``/``[SEP]`` (both skipped).
        query: Raw query string.
        query_slot: Per-character slot tags, as a list or a space-separated
            string.

    Returns:
        ``(new_tokens, new_slot)``: the surface text of each kept token
        (``[UNK]`` replaced by the query characters it covers, ``##`` prefixes
        removed) and the slot tag of the first character of each token.
    """
    query = list(query)
    origin_slot = query_slot
    if isinstance(origin_slot, str):
        origin_slot = origin_slot.split(' ')
    new_tokens = []
    new_slot = []

    for i, token in enumerate(tokens):
        if token == '[CLS]' or token == '[SEP]':
            continue
        # Fall back to 'O' if the slot list has been exhausted.
        first_slot = origin_slot[0] if origin_slot else 'O'
        if query and token == query[0]:
            new_tokens.append(token)
            new_slot.append(first_slot)
            query = query[1:]
            origin_slot = origin_slot[1:]
        elif token == '[UNK]':
            next_token = tokens[i + 1] if i + 1 < len(tokens) else None
            span = _unk_span_length(query, next_token)
            new_tokens.append(''.join(query[:span]))
            new_slot.append(first_slot)
            # Advance text and slots by the same amount to keep them aligned.
            query = query[span:]
            origin_slot = origin_slot[span:]
        elif '##' in token or len(token) > 1:
            if '##' in token:
                token = token[2:]
            new_tokens.append(token)
            new_slot.append(first_slot)
            for t in token:
                if query and t == query[0]:
                    query = query[1:]
                    origin_slot = origin_slot[1:]
    return new_tokens, new_slot

def restore_keywords_from_tokens(tokens, token_slot):
    """Group tokens into ``(text, label)`` keywords using BIO tags.

    Args:
        tokens: Token texts without the special tokens.
        token_slot: BIO tags including one leading and one trailing entry
            for the special tokens (both are dropped before pairing).

    Returns:
        List of ``(keyword, label)`` tuples in order of appearance.
    """
    found = []
    open_span = None  # [label, pieces] while inside a keyword
    for piece, tag in zip(tokens, token_slot[1:-1]):
        if tag.startswith('B-'):
            if open_span is not None:
                found.append(open_span)
            open_span = [tag[2:], [piece]]
        elif tag.startswith('I-') and open_span is not None and open_span[0] == tag[2:]:
            open_span[1].append(piece)
        elif open_span is not None:
            # 'O' or a mismatching 'I-' tag ends the current keyword.
            found.append(open_span)
            open_span = None
    if open_span is not None:
        found.append(open_span)
    return [(''.join(pieces), label) for label, pieces in found]

def restore_keywords_from_query(query, slots):
    """Group query characters into ``(text, label)`` keywords using BIO tags.

    Args:
        query: Raw query string (one tag per character).
        slots: BIO tags; when the first entry is ``'[CLS]'`` the first and
            last entries are dropped before pairing.

    Returns:
        List of ``(keyword, label)`` tuples in order of appearance.
    """
    if slots[0] == '[CLS]':
        slots = slots[1:-1]

    found = []
    open_span = None  # [label, characters] while inside a keyword
    for char, tag in zip(query, slots):
        if tag.startswith('B-'):
            if open_span is not None:
                found.append(open_span)
            open_span = [tag[2:], [char]]
        elif tag.startswith('I-') and open_span is not None and open_span[0] == tag[2:]:
            open_span[1].append(char)
        elif open_span is not None:
            # 'O' or a mismatching 'I-' tag ends the current keyword.
            found.append(open_span)
            open_span = None
    if open_span is not None:
        found.append(open_span)
    return [(''.join(chars), label) for label, chars in found]

In [None]:
# Load the fine-tuned model and the tokenizer for inference.
# NOTE(review): this cell repeats an identical loading cell earlier in the notebook.
model_path = '../model/bert.pt'
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
# map_location lets a model saved on GPU be loaded on a CPU-only machine.
# NOTE(review): the file is a fully pickled module; torch.load unpickles it,
# so only load files from a trusted source (recent PyTorch releases also
# require weights_only=False for this kind of file).
model = torch.load(model_path, map_location=device)
model = model.to(device)
# The model was saved in training mode; disable dropout for prediction.
model.eval()
BERT_PATH = '../model/bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

In [None]:
def intent2label(intents_row):
    """Turn an intent string into a 16-element multi-hot list.

    Args:
        intents_row: One intent name, or several names joined by ``'+'``.

    Returns:
        List of ints with 1 at the position of every named intent.

    Raises:
        KeyError: If a name is not one of the known intents.
    """
    # Position in this tuple is the intent's index in the label vector.
    intent_order = (
        '企业性质查询',
        '营业利润查询',
        '企业负债查询',
        '项目成交状况查询',
        '企业营业成本查询',
        '小区绿化率查询',
        '建筑密度查询',
        '小区成交均价查询',
        '营业总收入查询',
        '地块总价查询',
        '地块成交时间查询',
        '企业债务违约查询',
        '容积率查询',
        '企业风险查询',
        '地块归属查询',
        '项目开发商信息查询',
    )
    position = {name: idx for idx, name in enumerate(intent_order)}
    multi_hot = [0] * len(intent_order)
    # Splitting a single intent simply yields a one-element list.
    for name in intents_row.split('+'):
        multi_hot[position[name]] = 1
    return multi_hot

def extract_keywords(tokens, token_slot):
    """Collect ``(keyword, slot_type)`` pairs from BIO-tagged tokens.

    The first and last entries of both sequences (the special tokens) are
    skipped. A 'B-' tag starts a keyword, a matching 'I-' tag extends it and
    an 'O' tag closes it; any other tag leaves the open keyword untouched.
    """
    found = []
    text, tag = '', ''

    def emit():
        # Store the keyword collected so far, if there is one.
        if text:
            found.append((text.strip(), tag.split('-')[1]))

    for piece, label in zip(tokens[1:-1], token_slot[1:-1]):
        if label.startswith('B-'):
            emit()
            text, tag = piece, label
        elif label.startswith('I-') and label[2:] == tag[2:]:
            text += piece
        elif label == 'O':
            emit()
            if text:
                text, tag = '', ''

    emit()
    return found

In [None]:
# Run the model over the test set and collect gold/predicted intents and slots.
data_file = '../dataset/test.json'
with open(data_file, 'r', encoding='utf-8') as f:
    datas = json.load(f)
pred_intent_num = []
true_intent_num = []
pred_intent_label = []
true_intent_label = []
true_key_words = []
pred_key_words = []
true_token_slots = []
pred_token_slots = []

# Reverse lookup built once instead of scanning slots_num for every token.
id_to_slot = {slot_id: name for name, slot_id in slots_num.items()}
# Make sure dropout is disabled while predicting.
model.eval()

for data in tqdm(datas):
    query = data['query']
    origin_slots = data['slots'].split(',')

    # Gold annotations.
    true_key_word = restore_keywords_from_query(query, origin_slots)
    true_key_words.append(true_key_word)
    intent = data['intent']
    true_intent_num.append(1 if '+' in intent else 0)
    intent_label = intent2label(intent)
    true_intent_label.append(list(intent_label))

    encoded_text = tokenizer(query, return_tensors='pt')
    tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'][0])
    new_tokens, new_slots = align_tokens_with_query(tokens, query, origin_slots)
    true_token_slots.extend(slots_num[item] for item in new_slots)

    # Model predictions.
    encoded_text_id = encoded_text['input_ids'].to(device)
    mask = encoded_text['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(encoded_text_id, mask)

    intent_probility = outputs[0].view(-1)
    _, intent_idx = torch.topk(intent_probility, k=2, dim=0)
    intent_idx = intent_idx.tolist()
    # Store plain ints so sklearn receives ordinary Python labels.
    intent_num_probility = outputs[1].argmax().item()
    pred_intent_num.append(intent_num_probility)
    if intent_num_probility == 0:
        # Single-intent prediction: keep only the best-scoring intent.
        intent_idx = intent_idx[:1]
    pred_intent = [1 if idx in intent_idx else 0 for idx in range(len(intents_num))]
    pred_intent_label.append(pred_intent)

    slots_probility = outputs[2].argmax(dim=2).view(-1).tolist()
    # Drop the [CLS]/[SEP] positions for the flat token-level list.
    pred_token_slots.extend(slots_probility[1:-1])
    pred_token_slot = [id_to_slot[slot_id] for slot_id in slots_probility]
    pred_key_word = restore_keywords_from_tokens(new_tokens, pred_token_slot)
    pred_key_words.append(pred_key_word)

# Calculate metrics

In [None]:
def metric_compute(trues: list, preds: list):
    """Compute precision, recall and F1 over per-sample label collections.

    Args:
        trues: One collection of gold labels per sample.
        preds: One collection of predicted labels per sample.

    Returns:
        ``(P, R, F1)`` as floats; a ratio with an empty denominator is
        reported as 0.0. If the two lists differ in length the string
        ``'Input lengthes not equal!'`` is returned instead.
    """
    if len(trues) != len(preds):
        return 'Input lengthes not equal!'
    precision = 0
    precision_all = 0
    recall = 0
    recall_all = 0
    for true_label, pred_label in zip(trues, preds):
        # Precision: share of predictions that are correct.
        for pred in pred_label:
            if pred in true_label:
                precision += 1
            precision_all += 1
        # Recall: share of gold labels that were predicted.
        for true in true_label:
            if true in pred_label:
                recall += 1
            recall_all += 1
    P = precision / precision_all if precision_all else 0.0
    R = recall / recall_all if recall_all else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) else 0.0
    return P, R, F1

In [None]:
# Per-class precision/recall/F1 for the single- vs. multi-intent prediction
# (label 0 = one intent, label 1 = several intents).
target_names = ['One Intent', 'Two Intent']
print(classification_report(true_intent_num , pred_intent_num, target_names=target_names))

In [None]:
# Per-slot exact-match statistics.
# For every slot type, a sample counts towards `<slot>_count` when the gold
# annotation contains that slot, and towards `<slot>_p` when the predicted
# values for that slot equal the gold values as a set.
slot_names = ('year', 'month', 'district', 'city', 'development', 'company')
slot_hits = dict.fromkeys(slot_names, 0)
slot_totals = dict.fromkeys(slot_names, 0)
acc_count = 0  # samples whose full keyword set is predicted exactly

for true_label, pred_label in zip(true_key_words, pred_key_words):
    for slot_name in slot_names:
        true_values = {key_word for key_word, slot in true_label if slot == slot_name}
        if not true_values:
            continue
        pred_values = {key_word for key_word, slot in pred_label if slot == slot_name}
        # Each slot type is counted once per sample (city used to be counted twice).
        slot_totals[slot_name] += 1
        if true_values == pred_values:
            slot_hits[slot_name] += 1
    if set(true_label) == set(pred_label):
        acc_count += 1

# Keep the individual counters available under their existing names;
# the 'district' slot is reported through the community_* counters.
city_count, city_p = slot_totals['city'], slot_hits['city']
development_count, development_p = slot_totals['development'], slot_hits['development']
company_count, company_p = slot_totals['company'], slot_hits['company']
community_count, community_p = slot_totals['district'], slot_hits['district']
year_count, year_p = slot_totals['year'], slot_hits['year']
month_count, month_p = slot_totals['month'], slot_hits['month']

In [None]:
# Keyword-level precision / recall / F1, macro- and micro-averaged.
slot_types = ['city', 'development', 'company', 'district', 'year', 'month']
precision = dict.fromkeys(slot_types + ['all'], 0)      # correct predictions
precision_all = dict.fromkeys(slot_types + ['all'], 0)  # all predictions
recall = dict.fromkeys(slot_types + ['all'], 0)         # gold keywords found
recall_all = dict.fromkeys(slot_types + ['all'], 0)     # all gold keywords


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


for true_label, pred_label in zip(true_key_words, pred_key_words):
    for pred in pred_label:
        slot = pred[1]
        if pred in true_label:
            precision['all'] += 1
            precision[slot] += 1
        precision_all['all'] += 1
        precision_all[slot] += 1
    for true in true_label:
        slot = true[1]
        if true in pred_label:
            recall['all'] += 1
            recall[slot] += 1
        recall_all['all'] += 1
        recall_all[slot] += 1

Macro = {'P': 0, 'R': 0, 'F1': 0}
Micro = {'P': 0, 'R': 0, 'F1': 0}

# Macro: average the per-slot ratios, every slot type weighing the same.
Macro['P'] = sum(_safe_ratio(precision[key], precision_all[key]) for key in slot_types) / len(slot_types)
Macro['R'] = sum(_safe_ratio(recall[key], recall_all[key]) for key in slot_types) / len(slot_types)
Macro['F1'] = _safe_ratio(2 * Macro['P'] * Macro['R'], Macro['P'] + Macro['R'])

# Micro: pool the counts of all slot types before dividing.
Micro['P'] = _safe_ratio(precision['all'], precision_all['all'])
Micro['R'] = _safe_ratio(recall['all'], recall_all['all'])
Micro['F1'] = _safe_ratio(2 * Micro['P'] * Micro['R'], Micro['P'] + Micro['R'])

Macro_P, Macro_R, Macro_F1 = Macro['P'], Macro['R'], Macro['F1']
Micro_P, Micro_R, Micro_F1 = Micro['P'], Micro['R'], Micro['F1']
print(f'Macro_P:{Macro_P}')
print(f'Macro_R:{Macro_R}')
print(f'Macro_F1:{Macro_F1}')
print(f'Micro_P:{Micro_P}')
print(f'Micro_R:{Micro_R}')
print(f'Micro_F1:{Micro_F1}')

### Save BERT's predicted SLU information

In [None]:
# Display the predicted keywords (one list of (keyword, slot) tuples per query).
# NOTE(review): this shows the whole list; consider slicing for large test sets.
pred_key_words

In [None]:
def _unk_span_length(query, next_token):
    """Return how many leading ``query`` characters an ``[UNK]`` token covers."""
    if not query:
        return 0
    if next_token is None or next_token == '[SEP]':
        # Nothing follows the unknown token: it covers the rest of the query.
        return len(query)
    anchor = next_token[2:] if next_token.startswith('##') else next_token
    if next_token == '[UNK]' or not anchor:
        return 1
    try:
        # The unknown token covers at least one character, so search from 1;
        # only the first character of a multi-character token is matched.
        return query.index(anchor[0], 1)
    except ValueError:
        return 1


def align_tokens_with_query(tokens, query):
    """Recover the surface text of every token from the raw query.

    NOTE: this two-argument definition replaces the three-argument
    ``align_tokens_with_query(tokens, query, query_slot)`` defined earlier
    in the notebook once this cell has run.

    Args:
        tokens: Token strings including ``[CLS]``/``[SEP]`` (both skipped).
        query: Raw query string.

    Returns:
        List of token texts, with ``[UNK]`` replaced by the query characters
        it covers and ``##`` prefixes removed.
    """
    query = list(query)
    new_tokens = []
    for i, token in enumerate(tokens):
        if token == '[CLS]' or token == '[SEP]':
            continue
        if query and token == query[0]:
            new_tokens.append(token)
            query = query[1:]
        elif token == '[UNK]':
            next_token = tokens[i + 1] if i + 1 < len(tokens) else None
            span = _unk_span_length(query, next_token)
            new_tokens.append(''.join(query[:span]))
            query = query[span:]
        elif '##' in token or len(token) > 1:
            if '##' in token:
                token = token[2:]
            new_tokens.append(token)
            for t in token:
                if query and t == query[0]:
                    query = query[1:]
    return new_tokens

# Display prefix written in front of each extracted keyword, keyed by slot label.
# Spans whose label is not listed here are left out of the result.
_SLOT_DISPLAY_PREFIX = {
    'city': '城市',
    'district': '区域',
    'development': '项目名称',
    'company': '企业名称',
    'year': '年份',
    'month': '月份',
    'land': '地块名称',
}


def _collect_keyword_pairs(pieces, slots):
    """Group BIO-tagged text pieces into spans and format them for output.

    A span starts at a 'B-<label>' tag and is extended by directly following
    'I-<label>' tags carrying the same label; any other tag closes the open
    span. ``pieces`` and ``slots`` are zipped, so the longer one is truncated.

    Returns:
        list[str]: '<prefix>:<text>' strings in order of appearance.
    """
    spans = []
    buffer = []
    label = None
    for piece, slot in zip(pieces, slots):
        if slot.startswith('B-'):
            if buffer:
                spans.append((''.join(buffer), label))
            label = slot[2:]
            buffer = [piece]
        elif slot.startswith('I-') and label == slot[2:]:
            buffer.append(piece)
        else:
            # 'O', an 'I-' tag without a matching open span, or anything else.
            if buffer:
                spans.append((''.join(buffer), label))
            buffer = []
            label = None
    if buffer:
        spans.append((''.join(buffer), label))

    return [
        f'{_SLOT_DISPLAY_PREFIX[span_label]}:{text}'
        for text, span_label in spans
        if span_label in _SLOT_DISPLAY_PREFIX
    ]


def restore_keywords_from_tokens(tokens, token_slot):
    """Rebuild '<prefix>:<keyword>' strings from tokens and per-token slot tags.

    Args:
        tokens: Text pieces without special tokens, one per labelled position.
        token_slot: Slot tags for the full token sequence; the first and last
            entries are always discarded before pairing with ``tokens``.

    Returns:
        list[str]: Formatted keywords in order of appearance.
    """
    return _collect_keyword_pairs(tokens, token_slot[1:-1])


def restore_keywords_from_query(query, slots):
    """Rebuild '<prefix>:<keyword>' strings from a query and per-character slot tags.

    Args:
        query: The query text; it is paired with ``slots`` character by character.
        slots: Slot tags, either as a sequence or as one space-separated string.
            If the first tag is '[CLS]', the first and last tags are dropped.

    Returns:
        list[str]: Formatted keywords in order of appearance. An empty ``slots``
        sequence yields an empty list (it used to raise IndexError).
    """
    if isinstance(slots, str):
        slots = slots.split(' ')
    if slots and slots[0] == '[CLS]':
        slots = slots[1:-1]
    return _collect_keyword_pairs(list(query), slots)

In [None]:
# Run the model over every test query and attach its predictions to the record:
#   data["BERT_pred_intent"] - predicted intent (see the intent section below)
#   data["BERT_pred_slots"]  - list of '<prefix>:<keyword>' strings
# `tokenizer`, `model`, `device`, `find_key`, `intents_num` and `slots_num` are
# defined earlier in the notebook.
json_file = '../dataset/test.json'
with open(json_file, 'r', encoding='utf-8') as f:
    datas = json.load(f)
# Known intent labels; a combined intent is written as '<intent A>+<intent B>'.
all_intents = ['小区绿化率查询', '营业利润查询', '小区成交均价查询+项目成交状况查询', '企业营业成本查询', '企业负债查询', '小区成交均价查询', '企业风险查询', '建筑密度查询', '地块总价查询+容积率查询', '容积率查询', '项目成交状况查询', '地块成交时间查询+地块总价查询', '地块成交时间查询', '企业债务违约查询', '营业总收入查询', '地块总价查询', '企业性质查询', '容积率查询+地块成交时间查询', '地块成交时间查询+容积率查询', '项目开发商信息查询', '小区绿化率查询+容积率查询', '地块归属查询', '企业债务违约查询+企业负债查询', '企业负债查询+企业风险查询']
for data in datas:
    query = data['query']
    encoded_text = tokenizer(query, return_tensors='pt')
    tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'][0])
    new_tokens = align_tokens_with_query(tokens, query)
    encoded_text_id = encoded_text['input_ids'].to(device)
    mask = encoded_text['attention_mask'].to(device)
    with torch.no_grad():
        outputs = model(encoded_text_id, mask)

    # --- intent ---
    # NOTE(review): outputs[0] is used as per-intent scores and outputs[1] as a
    # "how many intents" classifier (class 0 -> one intent, class 1 -> two);
    # confirm against the model definition.
    intent_probility = outputs[0].view(-1)
    _, intent_idx = torch.topk(intent_probility, k=2, dim=0)
    intent_idx = intent_idx.cpu()
    intent_num = outputs[1].argmax().cpu()
    if intent_num == 0:
        intent = find_key(intents_num, intent_idx[0])[0]
    elif intent_num == 1:
        first, second = [find_key(intents_num, i)[0] for i in intent_idx]
        # Use whichever ordering of the pair is a known combined label. This is
        # an if/elif on the two names: the old second `if` ran after `intent`
        # could already be the joined string and then compared characters.
        if first + '+' + second in all_intents:
            intent = first + '+' + second
        elif second + '+' + first in all_intents:
            intent = second + '+' + first
        else:
            # Neither ordering is known: keep both names as a list (as before).
            intent = [first, second]
    else:
        # Without this branch the previous sample's intent was silently reused.
        raise ValueError(
            f'Unexpected intent-count class {int(intent_num)} for query: {query!r}'
        )
    data["BERT_pred_intent"] = intent

    # --- slots ---
    slots_probility = outputs[2].argmax(dim=2).view(-1)
    token_slot = [find_key(slots_num, i)[0] for i in slots_probility]
    pred_key_word = restore_keywords_from_tokens(new_tokens, token_slot)
    data["BERT_pred_slots"] = pred_key_word

In [None]:
import os

# Write the test records, now carrying the BERT predictions, to disk.
save_json_file = './results/test-with-BERT_pred_intent+slots.json'
# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when './results' is missing.
os.makedirs(os.path.dirname(save_json_file), exist_ok=True)
with open(save_json_file, 'w', encoding='utf-8') as file:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    json.dump(datas, file, ensure_ascii=False, indent=4)