In [1]:
import jieba
import json
import os
from collections import OrderedDict,Counter,defaultdict,Counter
import copy
import random
import math
import re
import numpy as np
import zipfile
import torch
from transformers import BertTokenizer,BertModel,get_linear_schedule_with_warmup
import torch.nn as nn
from torch.utils.data import DataLoader,Dataset
from transformers import BertConfig,BertTokenizer,BertModel,AdamW
from tqdm import tqdm
import time
# Seed every random number generator used in this notebook (PyTorch CPU/GPU, NumPy, Python)
# with the same value so that repeated runs draw the same random numbers.
seed = 2021
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
# Request deterministic cuDNN kernels and turn off the cuDNN benchmark mode.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
import warnings
# NOTE(review): this hides every warning raised for the rest of the session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [2]:
def generate_tags(tokenizer,word_seq,slots):
    """Build a B/I/O tag sequence for ``word_seq`` from the given slot values.

    Every value of every slot is tokenized with ``tokenizer.tokenize``. The first
    position where those tokens occur contiguously in ``word_seq`` is tagged
    ``'B+<slot name>'`` and the remaining tokens of the match ``'I+<slot name>'``.
    Only the first occurrence of each value is tagged, and a later match
    overwrites tags written by an earlier one.

    :param tokenizer: object exposing ``tokenize(str) -> list``
    :param word_seq: list of tokens of the utterance
    :param slots: dict mapping a slot name to a list of values (one slot may carry
        several values in a single utterance)
    :return: list of tags with the same length as ``word_seq``; unmatched positions are ``'O'``
    """
    tag_seq=['O']*len(word_seq)
    for key,values in slots.items():
        for value in values:
            value_tokens=tokenizer.tokenize(value)
            span=len(value_tokens)
            if span==0:
                # An empty token list equals the empty slice at position 0, which
                # would tag the first word with a slot that has no surface form.
                continue
            # Windows that would run past the end can never match, so stop early.
            for start in range(len(word_seq)-span+1):
                if word_seq[start:start+span]==value_tokens:
                    tag_seq[start]='B+'+key
                    tag_seq[start+1:start+span]=['I+'+key]*(span-1)
                    break
    return tag_seq

In [3]:
def preprocess_data(file_dir,tokenizer,include_sys=False):
    """Convert the zipped raw dialogue files into NLU sample files and vocabularies.

    For each split in ``train``/``val``/``test`` the archive ``<split>.json.zip``
    inside ``file_dir`` is opened and its member ``<split>.json`` is parsed. Every
    kept message becomes one sample ``[word_seq, tag_seq, intents, golden, context]``:

    * ``word_seq``: ``tokenizer.tokenize`` of the message content
    * ``tag_seq``: result of ``generate_tags`` for the slot values found verbatim in the content
    * ``intents``: ``'intent+domain+slot+value'`` strings for the acts that are not tagged as slots
    * ``golden``: every dialog act as ``[intent, domain, slot, value]``
    * ``context``: contents of all earlier messages of the same session

    The samples of each split are written to ``formated_<split>_nlu_data.json``;
    ``intent_vocab.json`` and ``tag_vocab.json`` receive the distinct labels seen
    across all splits, in first-seen order. All files go to ``file_dir`` and each
    written path is printed.

    :param file_dir: directory holding the archives; outputs are written there too
    :param tokenizer: object exposing ``tokenize(str) -> list of str``
    :param include_sys: when False, messages whose role is ``"sys"`` produce no sample
    :return: None
    """
    data_key=['train','val','test']
    intent_vocab=[]
    tag_vocab=[]
    for key in data_key:
        file_name=os.path.join(file_dir,key+'.json.zip')
        # Context managers close the archive and the member stream even if parsing fails.
        with zipfile.ZipFile(file_name,'r') as zpf:
            with zpf.open(key+'.json') as raw:
                data=json.load(raw)
        sessions=[]
        for session in data.values():
            messages=session["messages"]
            for i,message in enumerate(messages):
                if message["role"]=="sys" and not include_sys:
                    # Skipped messages still count as context for later turns.
                    continue
                utterance=message["content"]
                word_seq=tokenizer.tokenize(utterance)
                slots={}
                intents=[]
                golden=[]
                for intent,domain,slot,value in message["dialog_act"]:
                    if intent in ['Inform','Recommend'] and '酒店设施' not in slot:
                        if value in utterance:
                            # Map the character offset of the value to a token offset by
                            # tokenizing the prefix, then rebuild the value from the tokens
                            # so the golden label matches what the tagger can reproduce.
                            char_start=utterance.index(value)
                            tok_start=len(tokenizer.tokenize(utterance[:char_start]))
                            tok_len=len(tokenizer.tokenize(value))
                            new_value=''.join(word_seq[tok_start:tok_start+tok_len]).replace('##','')
                            golden.append([intent,domain,slot,new_value])
                            slot_name="+".join([intent,domain,slot])
                            slots.setdefault(slot_name,[]).append(value)
                        else:
                            # Value is not spelled out in the utterance: keep it in the
                            # golden acts only, it is neither tagged nor an intent label.
                            golden.append([intent,domain,slot,value])
                    else:
                        intent_name='+'.join([intent,domain,slot,value])
                        intents.append(intent_name)
                        intent_vocab.append(intent_name)
                        golden.append([intent,domain,slot,value])
                tag_seq=generate_tags(tokenizer,word_seq,slots)
                tag_vocab+=tag_seq
                # The comprehension builds a fresh list, so later changes to one
                # sample's context cannot leak into another sample.
                current_context=[item["content"] for item in messages[0:i]]
                sessions.append([word_seq,tag_seq,intents,golden,current_context])
        output_path=os.path.join(file_dir,f'formated_{key}_nlu_data.json')
        with open(output_path,"w",encoding='utf-8') as g:
            json.dump(sessions,g,indent=2,ensure_ascii=False)
        print(output_path)
    intent_vocab_path=os.path.join(file_dir,'intent_vocab.json')
    with open(intent_vocab_path,"w",encoding='utf-8') as h:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        json.dump(list(dict.fromkeys(intent_vocab)),h,indent=2,ensure_ascii=False)
    print(intent_vocab_path)
    tag_vocab_path=os.path.join(file_dir,'tag_vocab.json')
    with open(tag_vocab_path,"w",encoding='utf-8') as j:
        json.dump(list(dict.fromkeys(tag_vocab)),j,indent=2,ensure_ascii=False)
    print(tag_vocab_path)

In [4]:
class Dataloader:
    """Loads preprocessed NLU samples and turns batches of them into padded tensors.

    After ``load_data`` every sample is a list with this layout:
    ``[tokens, tags, intents, golden dialog acts, context token ids, None,
    tokens, tag ids, intent ids]``. ``pad_batch`` reads the last five entries
    through negative indices.
    """

    def __init__(self, intent_vocab_path, tag_vocab_path, pretrained_weights, max_history=3):
        """
        :param intent_vocab_path: path of a JSON file holding the list of all intents
        :param tag_vocab_path: path of a JSON file holding the list of all tags
        :param pretrained_weights: name or path handed to ``BertTokenizer.from_pretrained``
        :param max_history: number of most recent context utterances kept per sample
        """
        with open(intent_vocab_path,'r',encoding='utf-8') as f:
            self.intent_vocab=json.load(f)
        with open(tag_vocab_path,'r',encoding='utf-8') as g:
            self.tag_vocab=json.load(g)
        self.intent_dim = len(self.intent_vocab)
        self.tag_dim = len(self.tag_vocab)
        self.id2intent = dict(enumerate(self.intent_vocab))
        self.intent2id = {x: i for i, x in enumerate(self.intent_vocab)}
        self.id2tag = dict(enumerate(self.tag_vocab))
        self.tag2id = {x: i for i, x in enumerate(self.tag_vocab)}
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
        self.data = {}
        # Placeholder until the train split is loaded; load_data replaces it with a tensor.
        self.intent_weight = [1] * len(self.intent2id)
        self.max_history=max_history
        self.max_sen_len=0
        self.max_context_len=0

    def load_data(self, data_path, data_key, cut_sen_len=0):
        """Read one preprocessed split and extend each sample with id sequences.

        Each sample read from ``data_path`` is expected to be
        ``[tokens, tags, intents, golden dialog acts, context (list of str)]``.
        The context is reduced to its last ``max_history`` utterances and replaced
        by token ids; ``None``, the tokens, the tag ids and the intent ids are
        appended. For the ``"train"`` split the per-intent ``intent_weight``
        tensor and the maximum lengths are (re)computed and printed.

        :param data_path: path of the JSON file written by the preprocessing step
        :param data_key: name under which the split is stored in ``self.data``
            (``"train"`` additionally triggers the weight computation)
        :param cut_sen_len: when > 0, tokens and tags are cut to this many items and
            each context utterance to this many whitespace-separated words
        :return: None
        """
        with open(data_path,'r',encoding='utf-8') as f:
            self.data[data_key]=json.load(f)
        samples = self.data[data_key]
        max_context_len=0
        max_sen_len=0
        for d in samples:
            # The history window is independent of sentence cutting. It is applied
            # unconditionally so that training contexts match predict_intent_slot.
            # NOTE(review): max_history == 0 keeps the whole context (slice [-0:]).
            history = d[4][-self.max_history:]
            if cut_sen_len > 0:
                d[0] = d[0][:cut_sen_len]
                d[1] = d[1][:cut_sen_len]
                history = [" ".join(s.split()[:cut_sen_len]) for s in history]

            d[4] = self.tokenizer.encode("[CLS] " + " [SEP] ".join(history))

            max_context_len = max(max_context_len, len(d[4]))
            word_seq = d[0]
            tag_seq = d[1]
            new2ori = None
            d.append(new2ori)
            d.append(word_seq)
            d.append(self.seq_tag2id(tag_seq))
            d.append(self.seq_intent2id(d[2]))
            # [CLS] and [SEP] are added later in pad_batch, hence the +2.
            max_sen_len = max(max_sen_len, len(word_seq)+2)
        if data_key == "train":
            # Counts start at 1 per intent and are rebuilt from scratch on every
            # call, so loading the train split again does not stack new counts on
            # top of already log-transformed weights.
            intent_counts = [1] * len(self.intent2id)
            for d in samples:
                for intent_id in d[-1]:
                    intent_counts[intent_id] += 1
            train_size = len(samples)
            # weight = log10(negatives / positives) per intent.
            # NOTE(review): an intent present in (almost) every sample yields -inf/nan here.
            self.intent_weight = torch.tensor(
                [np.log10((train_size - count) / count) for count in intent_counts]
            )
            self.max_context_len=max_context_len
            self.max_sen_len=max_sen_len
            print("max sen bert_policy len from train data", self.max_sen_len)
            print("max context bert_policy len from train data", self.max_context_len)

    def seq_tag2id(self, tags):
        """Map tags to ids; tags missing from the vocabulary are dropped."""
        return [self.tag2id[x] for x in tags if x in self.tag2id]

    def seq_id2tag(self, ids):
        """Map tag ids back to tag strings."""
        return [self.id2tag[x] for x in ids]

    def seq_intent2id(self, intents):
        """Map intents to ids; intents missing from the vocabulary are dropped."""
        return [self.intent2id[x] for x in intents if x in self.intent2id]

    def seq_id2intent(self, ids):
        """Map intent ids back to intent strings."""
        return [self.id2intent[x] for x in ids]

    def pad_batch(self, batch_data):
        """Turn a list of loaded samples into zero-padded tensors.

        :param batch_data: non-empty list of samples in the layout produced by ``load_data``
        :return: tuple ``(word_seq, word_mask, tag_seq, tag_mask, intent, context_seq,
            context_mask)``; the sequence tensors are ``long`` with one row per sample,
            ``intent`` is a ``float`` multi-hot matrix of width ``intent_dim``
        """
        batch_size = len(batch_data)
        # +2 leaves room for the [CLS] / [SEP] tokens wrapped around every sentence.
        max_sen_len = max(len(x[-3]) for x in batch_data) + 2
        max_context_len = max(len(x[-5]) for x in batch_data)
        word_mask_tensor = torch.zeros((batch_size, max_sen_len), dtype=torch.long)
        word_seq_tensor = torch.zeros((batch_size, max_sen_len), dtype=torch.long)
        tag_mask_tensor = torch.zeros((batch_size, max_sen_len), dtype=torch.long)
        tag_seq_tensor = torch.zeros((batch_size, max_sen_len), dtype=torch.long)
        intent_tensor = torch.zeros((batch_size, self.intent_dim), dtype=torch.float)
        context_mask_tensor = torch.zeros((batch_size, max_context_len), dtype=torch.long)
        context_seq_tensor = torch.zeros((batch_size, max_context_len), dtype=torch.long)
        for i, sample in enumerate(batch_data):
            context_ids = sample[-5]
            tags = sample[-2]
            intents = sample[-1]
            words = ["[CLS]"] + sample[-3] + ["[SEP]"]
            sen_len = len(words)
            word_seq_tensor[i, :sen_len] = torch.LongTensor(self.tokenizer.convert_tokens_to_ids(words))
            word_mask_tensor[i, :sen_len] = 1
            # Tags only cover the real tokens, not [CLS] / [SEP].
            tag_seq_tensor[i, 1 : sen_len - 1] = torch.LongTensor(tags)
            tag_mask_tensor[i, 1 : sen_len - 1] = 1
            for j in intents:
                intent_tensor[i, j] = 1.0
            context_len = len(context_ids)
            context_seq_tensor[i, :context_len] = torch.LongTensor(context_ids)
            context_mask_tensor[i, :context_len] = 1

        return word_seq_tensor,word_mask_tensor,tag_seq_tensor,tag_mask_tensor,intent_tensor,context_seq_tensor,context_mask_tensor

    def get_train_batch(self, batch_size):
        """Sample ``batch_size`` train samples with replacement and pad them."""
        batch_data = random.choices(self.data["train"], k=batch_size)
        return self.pad_batch(batch_data)

    def yield_batches(self, batch_size, data_key):
        """Yield ``(padded tensors, raw samples, sample count)`` over a split in order."""
        batch_num = math.ceil(len(self.data[data_key]) / batch_size)
        for i in range(batch_num):
            batch_data = self.data[data_key][i * batch_size : (i + 1) * batch_size]
            yield self.pad_batch(batch_data), batch_data, len(batch_data)

In [5]:
class JointWithBert(nn.Module):
    """BERT encoder with two heads: per-token slot tagging and multi-label intents.

    The same BERT module encodes the current utterance and the dialogue context.
    The pooled context vector is concatenated to every token vector (slot head)
    and to the pooled utterance vector (intent head).
    """

    def __init__(self, model_config, slot_dim, intent_dim):
        """
        :param model_config: object providing ``pretrained_weights``, ``dropout`` and ``hidden_units``
        :param slot_dim: number of slot tags
        :param intent_dim: number of intents
        """
        super().__init__()
        # Sizes of the two output layers.
        self.slot_num_labels = slot_dim
        self.intent_num_labels = intent_dim
        # Encoder and heads. The creation order is kept on purpose: layer
        # initialisation draws from the seeded random number generator.
        self.bert = BertModel.from_pretrained(model_config.pretrained_weights)
        self.dropout = nn.Dropout(model_config.dropout)
        self.hidden_units = model_config.hidden_units

        encoder_width = 2 * self.bert.config.hidden_size  # utterance vector + context vector
        self.intent_classifier = nn.Linear(self.hidden_units, self.intent_num_labels)
        self.slot_classifier = nn.Linear(self.hidden_units, self.slot_num_labels)
        self.intent_hidden = nn.Linear(encoder_width, self.hidden_units)
        self.slot_hidden = nn.Linear(encoder_width, self.hidden_units)

        for layer in (self.intent_hidden, self.slot_hidden, self.intent_classifier, self.slot_classifier):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self,word_seq_tensor,word_mask_tensor,context_seq_tensor,context_mask_tensor):
        """Return ``(slot_logits, intent_logits)`` for a batch.

        :param word_seq_tensor: token ids of the utterances
        :param word_mask_tensor: attention mask for ``word_seq_tensor``
        :param context_seq_tensor: token ids of the dialogue contexts
        :param context_mask_tensor: attention mask for ``context_seq_tensor``
        :return: tuple of slot logits (one vector per token) and intent logits (one vector per sample)
        """
        utterance_encoding = self.bert(input_ids=word_seq_tensor, attention_mask=word_mask_tensor)
        # Element 0 holds one vector per token (used for tagging),
        # element 1 the pooled vector of the whole sentence.
        token_states = utterance_encoding[0]
        sentence_state = utterance_encoding[1]

        # Encode the context with the same BERT and keep only its pooled vector.
        context_state = self.bert(input_ids=context_seq_tensor, attention_mask=context_mask_tensor)[1]
        # Attach the context vector to every token vector ...
        tiled_context = context_state.unsqueeze(1).repeat(1, token_states.size(1), 1)
        token_states = torch.cat([tiled_context, token_states], dim=-1)
        # ... and to the pooled sentence vector.
        sentence_state = torch.cat([context_state, sentence_state], dim=-1)

        # dropout -> linear -> relu for both branches (slot branch first).
        token_hidden = nn.functional.relu(self.slot_hidden(self.dropout(token_states)))
        sentence_hidden = nn.functional.relu(self.intent_hidden(self.dropout(sentence_state)))

        # dropout -> output layer, again slot branch first.
        slot_logits = self.slot_classifier(self.dropout(token_hidden))
        intent_logits = self.intent_classifier(self.dropout(sentence_hidden))

        return (slot_logits, intent_logits)

In [6]:
class Model_Config():
    """Paths and hyper-parameters shared by preprocessing, training and inference.

    Calling ``Model_Config()`` yields the defaults listed in ``__init__``. Any of
    them can be replaced per instance through keyword arguments, e.g.
    ``Model_Config(batch_size=8, max_step=100)``.
    """
    def __init__(self,**overrides):
        """
        :param overrides: optional ``name=value`` pairs replacing default settings
        :raises TypeError: if a name in ``overrides`` is not an existing setting
        """
        # Pretrained encoder and preprocessed data files.
        self.pretrained_weights='./hfl/chinese-bert-wwm-ext'
        self.train_data_path='./crosswoz_data/formated_train_nlu_data.json'
        self.test_data_path='./crosswoz_data/formated_test_nlu_data.json'
        self.dev_data_path='./crosswoz_data/formated_val_nlu_data.json'
        # Width of the hidden layer in front of both classifiers.
        self.hidden_units=1536
        # Optimizer settings; train() assigns bert_/other_learning_rate per parameter group.
        self.learning_rate=3.0e-5
        self.bert_learning_rate=3e-5
        self.other_learning_rate=3e-5
        self.weight_decay=0.01
        self.warmup_steps=0
        # Checkpoints: state dict (.pt) and whole pickled model (.pth).
        self.save_weight_path='./crosswoz_data/output/saved_model/my-pytorch-joint-with-bert.pt'
        self.save_model_path='./crosswoz_data/output/saved_model/my-pytorch-joint-with-bert.pth'
        self.device='cuda:0' if torch.cuda.is_available() else 'cpu'
        self.eps=1e-8
        self.batch_size=20
        # Total number of optimisation steps and the interval between validations.
        self.max_step=40000
        self.check_step=1000
        self.dropout=0.1
        # Sentences are cut to this many tokens when data is loaded.
        self.cut_sen_len=60
        self.intent_vocab_path='./crosswoz_data/intent_vocab.json'
        self.tag_vocab_path='./crosswoz_data/tag_vocab.json'
        # Number of previous utterances used as context.
        self.max_history=3
        # Whether the intent loss uses the per-intent positive weights.
        self.if_intent_weight=True
        # Whether the slot loss ignores padded positions.
        self.mask_loss=True
        for name,value in overrides.items():
            # Reject unknown names so a typo cannot silently create an unused setting.
            if not hasattr(self,name):
                raise TypeError('unknown config option: {}'.format(name))
            setattr(self,name,value)

In [7]:
def is_slot_da(da):
    """Tell whether a dialog act ``[intent, domain, slot, value]`` is a tagged slot.

    An act counts as a slot act when its intent is ``Inform`` or ``Recommend``
    and its slot name does not contain ``酒店设施``.
    """
    return da[0] in ('Inform','Recommend') and '酒店设施' not in da[2]
def get_score(predict_golden):
    """Compute micro-averaged precision, recall and F1 over dialog acts.

    :param predict_golden: iterable of dicts with the keys ``'predict'`` and
        ``'golden'``, each holding a list of dialog acts
    :return: tuple ``(precision, recall, F1)``; a ratio with an empty denominator is 0.0
    """
    true_pos=false_pos=false_neg=0
    for pair in predict_golden:
        predicted=pair['predict']
        gold=pair['golden']
        hits=sum(1 for act in predicted if act in gold)
        true_pos+=hits
        false_pos+=len(predicted)-hits
        false_neg+=sum(1 for act in gold if act not in predicted)
    n_predicted=true_pos+false_pos
    n_gold=true_pos+false_neg
    precision=1.0*true_pos/n_predicted if n_predicted else 0.0
    recall=1.0*true_pos/n_gold if n_gold else 0.0
    if precision+recall:
        F1=2.0*precision*recall/(precision+recall)
    else:
        F1=0.0
    return precision,recall,F1
def tag2das(word_seq,tag_seq):
    """Decode a B/I/O tag sequence into dialog acts.

    A tag ``'B+intent+domain+slot'`` opens a span; directly following tags
    ``'I+intent+domain+slot'`` with the same label extend it. The value of the
    act is the concatenation of the span's tokens with the word-piece prefix
    ``'##'`` removed from every token, including the first one, so the value
    is built the same way as the golden values written during preprocessing.

    :param word_seq: list of tokens
    :param tag_seq: list of tags, same length as ``word_seq``
    :return: list of ``[intent, domain, slot, value]``
    :raises AssertionError: if the two sequences differ in length
    """
    assert len(word_seq)==len(tag_seq)

    def _surface(token):
        # Word-piece continuation tokens carry a '##' marker that is not part of the text.
        return token[2:] if token.startswith('##') else token

    das=[]
    total=len(tag_seq)
    i=0
    while i<total:
        tag=tag_seq[i]
        if not tag.startswith('B'):
            # 'O' tags and 'I' tags without a preceding 'B' start no span.
            i+=1
            continue
        label=tag[2:]
        intent,domain,slot=label.split('+')
        pieces=[_surface(word_seq[i])]
        j=i+1
        while j<total and tag_seq[j].startswith('I') and tag_seq[j][2:]==label:
            pieces.append(_surface(word_seq[j]))
            j+=1
        das.append([intent,domain,slot,''.join(pieces)])
        # Continue right after the span that was just consumed.
        i=j
    return das
def recover_intent(dataloader,intent_logits,tag_logits,tag_mask_tensor,ori_word_seq,new2ori):
    """Turn the model outputs of one sample into a list of dialog acts.

    Intents whose logit is positive are split back into
    ``[intent, domain, slot, value]``. For every position between the first and
    the last one whose tag mask is 1, the highest scoring tag is taken and the
    resulting tag sequence is decoded with ``tag2das``.

    :param dataloader: object providing ``intent_dim``, ``id2intent`` and ``id2tag``
    :param intent_logits: 1-D tensor with one logit per intent
    :param tag_logits: 2-D tensor, one row of tag logits per position
    :param tag_mask_tensor: 1-D tensor marking the positions that carry a real token
    :param ori_word_seq: tokens of the utterance, aligned with the masked positions
    :param new2ori: unused, kept for interface compatibility
    :return: list of ``[intent, domain, slot, value]``
    """
    max_seq_len=tag_logits.size(0)
    das=[]
    for j in range(dataloader.intent_dim):
        if intent_logits[j]>0:
            # Intent names are 'intent+domain+slot+value'. Limiting the split to
            # three separators keeps a '+' inside the value intact; a plain str
            # split also avoids the invalid '\+' escape of the former regex.
            intent,domain,slot,value=dataloader.id2intent[j].split('+',3)
            das.append([intent,domain,slot,value])
    tags=[]
    # Positions 0 and max_seq_len-1 are skipped; the padded batch puts [CLS] at position 0.
    for j in range(1,max_seq_len-1):
        if tag_mask_tensor[j]==1:
            _,tag_id=torch.max(tag_logits[j],dim=-1)
            tags.append(dataloader.id2tag[tag_id.item()])
    das+=tag2das(ori_word_seq,tags)
    return das

In [8]:
def get_total_loss_func(dataloader,config,intent_logits,intent_tensor,slot_logits,tag_seq_tensor,tag_mask_tensor,intent_loss_fct,slot_loss_fct):
    """Compute the slot-tagging loss and the intent loss of one batch.

    The slot logits and labels are flattened to one row per token. With
    ``config.mask_loss`` set, only tokens whose mask equals 1 contribute;
    otherwise every position does, including padded ones (whose label is 0).

    :param dataloader: unused, kept for interface compatibility
    :param config: object providing the boolean ``mask_loss``
    :param intent_logits: intent scores handed to ``intent_loss_fct``
    :param intent_tensor: intent targets handed to ``intent_loss_fct``
    :param slot_logits: tag scores whose last dimension is the number of tags
    :param tag_seq_tensor: tag ids, one per position of ``slot_logits``
    :param tag_mask_tensor: 1 for positions that carry a real token
    :param intent_loss_fct: callable ``(logits, targets) -> loss``
    :param slot_loss_fct: callable ``(logits, labels) -> loss``
    :return: tuple ``(slot_loss, intent_loss)``
    """
    # Always flatten: a classification loss such as CrossEntropyLoss expects the
    # class dimension second, which (batch, seq, tags) logits do not satisfy.
    flat_logits = slot_logits.reshape(-1, slot_logits.size(-1))
    flat_labels = tag_seq_tensor.reshape(-1)
    if config.mask_loss:
        active = tag_mask_tensor.reshape(-1) == 1
        flat_logits = flat_logits[active]
        flat_labels = flat_labels[active]
    slot_loss = slot_loss_fct(flat_logits, flat_labels)
    intent_loss = intent_loss_fct(intent_logits, intent_tensor)
    return slot_loss,intent_loss

In [9]:
def evaluate(config,model,dataloader,data_key,slot_loss_fct,intent_loss_fct):
    """Run the model over one data split and report losses and act-level scores.

    The model is switched to eval mode, every batch of ``data_key`` is scored
    without gradients, and the predicted dialog acts are compared with the golden
    acts stored at index 3 of each sample. Precision, recall and F1 are printed
    for intent acts, slot acts and all acts together.

    :param config: object providing ``batch_size`` and ``device``
    :param model: network returning ``(slot_logits, intent_logits)``
    :param dataloader: loaded ``Dataloader`` holding the split ``data_key``
    :param data_key: name of the split to evaluate
    :param slot_loss_fct: loss callable for the slot head
    :param intent_loss_fct: loss callable for the intent head
    :return: ``(average slot loss, average intent loss, score_result)`` where
        ``score_result`` maps ``'intent'``/``'slot'``/``'overall'`` to ``[precision, recall, F1]``
    """
    model.eval()
    slot_loss_sum,intent_loss_sum=0,0
    predict_golden={'intent':[],'slot':[],'overall':[]}
    score_result={'intent':[],'slot':[],'overall':[]}
    batches=dataloader.yield_batches(config.batch_size,data_key)
    for index,(model_inputs,batch_data,num_data) in tqdm(enumerate(batches)):
        (word_seq_tensor,word_mask_tensor,tag_seq_tensor,tag_mask_tensor,
         intent_tensor,context_seq_tensor,context_mask_tensor)=tuple(t.to(config.device) for t in model_inputs)
        with torch.no_grad():
            slot_logits,intent_logits=model.forward(word_seq_tensor,word_mask_tensor,context_seq_tensor,context_mask_tensor)
            slot_loss,intent_loss=get_total_loss_func(
                dataloader,config,intent_logits,intent_tensor,
                slot_logits,tag_seq_tensor,tag_mask_tensor,intent_loss_fct,slot_loss_fct)

        # Batch losses are means, so weight them by the batch size before summing.
        slot_loss_sum+=slot_loss.item()*num_data
        intent_loss_sum+=intent_loss.item()*num_data

        for sample,sample_intent,sample_slot,sample_mask in zip(batch_data,intent_logits,slot_logits,tag_mask_tensor):
            predicts=recover_intent(dataloader,sample_intent,sample_slot,sample_mask,sample[0],sample[-4])
            labels=sample[3]
            predict_golden['overall'].append({'predict':predicts,'golden':labels})
            predict_golden['intent'].append({
                'predict':[act for act in predicts if not is_slot_da(act)],
                'golden':[act for act in labels if not is_slot_da(act)]})
            predict_golden['slot'].append({
                'predict':[act for act in predicts if is_slot_da(act)],
                'golden':[act for act in labels if is_slot_da(act)]})
    for group in ['intent','slot','overall']:
        precision,recall,F1=get_score(predict_golden[group])
        score_result[group]=[precision,recall,F1]
        print('-'*20+group+'-'*20)
        print('Precision:{},Recall:{},F1:{}'.format(precision,recall,F1))
    num_samples=len(dataloader.data[data_key])
    avg_slot_loss=slot_loss_sum/num_samples
    avg_intent_loss=intent_loss_sum/num_samples
    print('val_slot_loss:{}，val_intent_loss:{}'.format(avg_slot_loss,avg_intent_loss))
    return avg_slot_loss,avg_intent_loss,score_result

In [10]:
def predict_intent_slot(utterance:str,context:list,config,dataloader,model):
    """Predict the dialog acts of a single utterance given its dialogue history.

    :param utterance: text of the current turn
    :param context: earlier utterances, oldest first; only the last
        ``config.max_history`` entries are used
    :param config: object providing ``max_history`` and ``device``
    :param dataloader: ``Dataloader`` supplying the tokenizer, vocabularies and ``pad_batch``
    :param model: network returning ``(slot_logits, intent_logits)``; it is put into eval mode
    :return: list of ``[intent, domain, slot, value]``
    """
    model.eval()

    tokenizer=dataloader.tokenizer
    history=context[-config.max_history:]
    context_seq=tokenizer.encode("[CLS] " + " [SEP] ".join(history))

    ori_word_seq=tokenizer.tokenize(utterance)
    # No labels are known at prediction time: fill in 'O' tags and no intents.
    ori_tag_seq=["O"]*len(ori_word_seq)
    intents=[]
    da=[]
    new2ori=None

    # Same nine-field layout as a loaded sample, so pad_batch can be reused.
    sample=[
        ori_word_seq,
        ori_tag_seq,
        intents,
        da,
        context_seq,
        new2ori,
        ori_word_seq,
        dataloader.seq_tag2id(ori_tag_seq),
        dataloader.seq_intent2id(intents),
    ]
    batch_data=[sample]
    tensors=tuple(t.to(config.device) for t in dataloader.pad_batch(batch_data))
    (word_seq_tensor,word_mask_tensor,tag_seq_tensor,tag_mask_tensor,
     intent_tensor,context_seq_tensor,context_mask_tensor)=tensors

    with torch.no_grad():
        slot_logits,intent_logits=model.forward(word_seq_tensor,word_mask_tensor,context_seq_tensor,context_mask_tensor)
    return recover_intent(dataloader,intent_logits[0],slot_logits[0],tag_mask_tensor[0],sample[0],sample[-4])

In [11]:
def train(config,model,dataloader,slot_loss_fct,intent_loss_fct):
    """Fine-tune ``model`` for ``config.max_step`` steps on randomly sampled train batches.

    Parameters are split into four optimizer groups (BERT / other, with / without
    weight decay). Every ``config.check_step`` steps the average training losses
    since the previous check are printed, the ``'val'`` split is evaluated, and
    the state dict is saved to ``config.save_weight_path`` whenever the summed
    validation loss improves.

    :param config: object providing the optimizer, schedule, batch and path settings
    :param model: ``JointWithBert``-like module with a ``bert`` sub-module
    :param dataloader: ``Dataloader`` with the ``'train'`` and ``'val'`` splits loaded
    :param slot_loss_fct: loss callable for the slot head
    :param intent_loss_fct: loss callable for the intent head
    :return: None
    """
    print(config.device)
    bert_param_optimizer=list(model.bert.named_parameters())
    bert_params=list(map(id,model.bert.parameters()))
    other_param_optimizer=[(n,p) for n,p in model.named_parameters() if id(p) not in bert_params]
    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay=['bias','LayerNorm.bias','LayerNorm.weight']
    optimizer_grouped_parameters=[
        {'params':[p for n,p in bert_param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay':config.weight_decay,'lr':config.bert_learning_rate},
        {'params':[p for n,p in bert_param_optimizer if any(nd in n for nd in no_decay)],'weight_decay':0,'lr':config.bert_learning_rate},
        {'params':[p for n,p in other_param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay':config.weight_decay,'lr':config.other_learning_rate},
        {'params':[p for n,p in other_param_optimizer if any(nd in n for nd in no_decay)],'weight_decay':0,'lr':config.other_learning_rate}]
    optimizer=AdamW(optimizer_grouped_parameters,lr=config.learning_rate,eps=config.eps)
    scheduler=get_linear_schedule_with_warmup(optimizer,num_warmup_steps=config.warmup_steps,num_training_steps=config.max_step)

    # Sample-weighted loss sums and sample count since the last check.
    train_slot_loss,train_intent_loss=0,0
    total_train_samples=0
    best_dev_loss=float('inf')
    for step in tqdm(range(1,config.max_step+1)):
        # evaluate() switches to eval mode, so re-enable training mode every step.
        model.train()
        batched_data=dataloader.get_train_batch(config.batch_size)
        batched_data=tuple(item.to(config.device) for item in batched_data)
        word_seq_tensor,word_mask_tensor,tag_seq_tensor,tag_mask_tensor,intent_tensor,context_seq_tensor,context_mask_tensor=batched_data

        slot_logits,intent_logits=model.forward(word_seq_tensor,word_mask_tensor,context_seq_tensor,context_mask_tensor)

        slot_loss,intent_loss=get_total_loss_func(dataloader,config,intent_logits,intent_tensor,slot_logits,tag_seq_tensor,tag_mask_tensor,intent_loss_fct,slot_loss_fct)
        optimizer.zero_grad()
        total_loss=slot_loss+intent_loss
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)
        optimizer.step()
        train_slot_loss+=slot_loss.item()*word_seq_tensor.size(0)
        train_intent_loss+=intent_loss.item()*word_seq_tensor.size(0)
        total_train_samples+=word_seq_tensor.size(0)
        scheduler.step()
        if step%config.check_step==0:
            # Report in separate variables and then reset the running sums:
            # overwriting the sums with their own average (while the sample count
            # kept growing) made every later report meaningless.
            avg_train_slot_loss=train_slot_loss/total_train_samples
            avg_train_intent_loss=train_intent_loss/total_train_samples
            print('current_step{}/total_steps{},train_slot_loss:{},train_intent_loss:{}'.format(step,config.max_step,avg_train_slot_loss,avg_train_intent_loss))
            train_slot_loss,train_intent_loss=0,0
            total_train_samples=0
            avg_slot_loss,avg_intent_loss,score_result=evaluate(config,model,dataloader,'val',slot_loss_fct,intent_loss_fct)
            avg_dev_loss=avg_slot_loss+avg_intent_loss
            if avg_dev_loss<best_dev_loss:
                best_dev_loss=avg_dev_loss
                # torch.save does not create missing directories.
                save_dir=os.path.dirname(config.save_weight_path)
                if save_dir:
                    os.makedirs(save_dir,exist_ok=True)
                torch.save(model.state_dict(),config.save_weight_path)
                print('model is saved to:{}'.format(config.save_weight_path))

In [12]:
"""
train data:
max sen bert_policy len 95
max context bert_policy len 183
dev data:
max sen bert_policy len 48
max context bert_policy len 163
"""

'\ntrain data:\nmax sen bert_policy len 95\nmax context bert_policy len 183\ndev data:\nmax sen bert_policy len 48\nmax context bert_policy len 163\n'

In [13]:
config=Model_Config()
# Take the encoder path from the config so the tokenizer used for preprocessing
# is the same one the Dataloader and the model load later.
pretrained_weights=config.pretrained_weights
file_dir='./crosswoz_data'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
# Preprocessing is skipped only when every file it produces is already present.
required_files=(
    config.train_data_path,
    config.test_data_path,
    config.dev_data_path,
    config.intent_vocab_path,
    config.tag_vocab_path,
)
if all(os.path.exists(path) for path in required_files):
    print('raw data has already been preprocessed before')
else:
    preprocess_data(file_dir=file_dir,tokenizer=tokenizer,include_sys=True)
    print('raw data was just successfully preprocessed')

raw data has already been preprocessed before


In [14]:
# Build the loader from the saved vocabularies, then read the train and validation splits.
dataloader=Dataloader(
    intent_vocab_path=config.intent_vocab_path,
    tag_vocab_path=config.tag_vocab_path,
    pretrained_weights=config.pretrained_weights,
    max_history=config.max_history,
)
dataloader.load_data(data_path=config.train_data_path, data_key="train", cut_sen_len=config.cut_sen_len)
dataloader.load_data(data_path=config.dev_data_path, data_key="val", cut_sen_len=config.cut_sen_len)

max sen bert_policy len from train data 62
max context bert_policy len from train data 267


In [15]:
# Restore the model: prefer the fully pickled model, then a saved state dict, else start fresh.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source. Recent PyTorch releases may also
# refuse to unpickle a whole model object by default - confirm against the installed version.
if os.path.exists(config.save_model_path):
    model=torch.load(config.save_model_path,map_location=config.device)
    print('model trained from exist base model')
else:
    model=JointWithBert(model_config=config, slot_dim=dataloader.tag_dim, intent_dim=dataloader.intent_dim)
    if os.path.exists(config.save_weight_path):
        # map_location lets weights saved on a GPU be restored on a CPU-only machine,
        # matching how the full model is loaded above.
        model.load_state_dict(torch.load(config.save_weight_path,map_location=config.device))
        print('model trained from exist base weight')
    else:
        print('model trained from scratch')
if config.if_intent_weight:
    intent_loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=dataloader.intent_weight.to(config.device))
else:
    intent_loss_fct = torch.nn.BCEWithLogitsLoss()
slot_loss_fct = torch.nn.CrossEntropyLoss()
model.to(config.device)
# Set to True to score a restored checkpoint on the validation split right away.
evaluate_sign=False
if evaluate_sign and (os.path.exists(config.save_weight_path) or os.path.exists(config.save_model_path)):
    avg_slot_loss,avg_intent_loss,score_result=evaluate(config,model,dataloader,'val',slot_loss_fct,intent_loss_fct)

model trained from exist base model


In [16]:
"""
--------------------intent--------------------
Precision:0.9606143552311436,Recall:0.9696085955487337,F1:0.9650905202047209
--------------------slot--------------------
Precision:0.964492898579716,Recall:0.9147220641244546,F1:0.9389483933787731
--------------------overall--------------------
Precision:0.9629540243755279,Recall:0.9356862285278771,F1:0.9491243198239719
val_slot_loss:0.06734158956858625，val_intent_loss:0.002130140893944007
"""

'\n--------------------intent--------------------\nPrecision:0.9606143552311436,Recall:0.9696085955487337,F1:0.9650905202047209\n--------------------slot--------------------\nPrecision:0.964492898579716,Recall:0.9147220641244546,F1:0.9389483933787731\n--------------------overall--------------------\nPrecision:0.9629540243755279,Recall:0.9356862285278771,F1:0.9491243198239719\nval_slot_loss:0.06734158956858625，val_intent_loss:0.002130140893944007\n'

In [17]:
# Training switch: set train_sign to True to run the fine-tuning loop in train().
train_sign=False
if train_sign:
    train(config,model,dataloader,slot_loss_fct,intent_loss_fct)

In [18]:
# While train_sign is False, the whole in-memory model object is pickled to config.save_model_path.
# NOTE(review): train_sign is reassigned here, so the value set in the previous cell is ignored;
# the target directory must already exist for torch.save to succeed.
train_sign=False
if not train_sign:
    torch.save(model, config.save_model_path)

In [19]:
# Demo 1: a system reply that states a price and a rating, with the three earlier turns as context.
utterance = "价格比较贵805元，评分是4.7分。"
context = ["你好，帮我推荐一个能提供24小时热水和洗衣服务的高档型酒店，谢谢。","建议您去北京广电国际酒店。","行啊，北京广电国际酒店的价格贵吗？评分是多少呢？"] 
predict_intent_slot(utterance,context,config,dataloader,model)

[['Inform', '酒店', '价格', '805元'], ['Inform', '酒店', '评分', '4.7分']]

In [20]:
# Demo 2: a user turn asking for price and rating; the pre-segmented words are joined back into one string.
utterance = "".join(["行","啊","，","北京","广电","国际","酒店","的","价格","贵","吗","？","评分","是","多少","呢","？"])
context = ["你好，帮我推荐一个能提供24小时热水和洗衣服务的高档型酒店，谢谢。","建议您去北京广电国际酒店。"]
predict_intent_slot(utterance,context,config,dataloader,model)

[['Request', '酒店', '价格', ''],
 ['Request', '酒店', '评分', ''],
 ['Inform', '酒店', '名称', '北京广电国际酒店']]