In [1]:
import yaml
import json
import numpy as np

In [2]:
path = '../DM_model/smarthome_AC_story.yml'
with open(path, 'r', encoding='utf-8') as f:
    dataset = yaml.load(f.read(),Loader=yaml.Loader)

# 输入
    - Previous system action
    - System Action
    - Slots
    - User intent entities

In [19]:
for i in dataset['stories']:
    print(i['steps'])
    print(len(i['steps'])//2)
    print('/n')

[{'intent': 'Control-AC_Open', 'entities': [{'address': '主卧'}, {'device': '空调'}, {'operation': '打开'}]}, {'action': 'action_open_ac'}]
1
/n
[{'intent': 'Control-AC_Open', 'entities': [{'device': '空调'}, {'operation': '打开'}]}, {'action': 'action_open_ac'}, {'intent': 'inform_ac', 'entities': [{'address': '客厅'}, {'device': '空调'}]}, {'action': 'action_open_ac'}]
2
/n
[{'intent': 'Control-AC_Open', 'entities': [{'device': '空调'}, {'operation': '打开'}]}, {'action': 'action_open_ac'}, {'intent': 'inform_room', 'entities': [{'address': '客厅'}]}, {'action': 'action_open_ac'}]
2
/n
[{'intent': 'Control-AC_Open', 'entities': [{'device': '空调'}, {'operation': '打开'}]}, {'action': 'action_open_ac'}, {'intent': 'inform_range', 'entities': [{'range': '所有'}]}, {'action': 'action_open_ac'}]
2
/n
[{'intent': 'Control-AC_Open', 'entities': [{'range': '所有'}, {'device': '空调'}, {'operation': '打开'}]}, {'action': 'action_open_ac'}]
1
/n
[{'intent': 'Control-AC_Open', 'entities': [{'operation': '打开'}, {'device': '空调

# Construct dataset map

In [3]:
def construct_dataset(dataset):
    # 将数据集的 intent entities 以及action 的类别抽取出来
    intent_set = ['PAD']
    entities_set = ['PAD']
    action_set = ['PAD']
    for v in dataset['stories']:
        worth = v['steps']
        for i in worth:
            for key, val in i.items():
                if key  == 'intent':
                    intent_set.append(val)
                if key == "action":
                    action_set.append(val)
                if key == 'entities':
                    for i in val:
                        for key , value in i.items():
                            entities_set.append(key)
    return set(intent_set), set(entities_set), set(action_set)

In [4]:
intent, entities, action = construct_dataset(dataset)
print('intent: {} \n inetent_len : {}'.format(intent,len(intent)))
print('entities: {} \n entities_len : {}'.format(entities,len(entities)))
print('action: {} \n action_len : {}'.format(action, len(action)))

intent: {'PAD', 'inform_range', 'Control-AC_Temp', 'Control-AC_Open', 'Control-AC_Timing', 'latent_control_temp', 'deny', 'inform_ac', 'Control-AC_Mode', 'affirm', 'inform_room', 'Control-AC_Close', 'Control-AC_Wind'} 
 inetent_len : 13
entities: {'PAD', 'temperature', 'address', 'range', 'mode', 'device', 'sensorvalue', 'time', 'target', 'operation'} 
 entities_len : 10
action: {'PAD', 'action_close_ac', 'action_latent_control_temp', 'action_controlactemp', 'actiming_form', 'action_controlDeviceWind', 'action_deny_opensetaircondition', 'action_open_ac', 'action_affirm_opensetaircondition', 'acmode_form'} 
 action_len : 10


In [5]:
# 构建意图索引
intent2id = {}
for index, val in enumerate(intent):
    intent2id.update({val:index})

id2intent = {}
for index, val in enumerate(intent):
    id2intent.update({index:val})  

In [6]:
# 构建槽位索引
entities2id = {}
for index, val in enumerate(entities):
    entities2id.update({val:index})

id2entities = {}
for index, val in enumerate(entities):
    id2entities.update({index:val}) 

In [7]:
# 构建动作索引
action2id = {}
for index, val in enumerate(action):
    action2id.update({val:index})

id2action = {}
for index, val in enumerate(action):
    id2action.update({index:val})

In [8]:
# 将label2id 转为字典以json文件储存
char = {}
char.update({'action2id' : action2id})
char.update({'id2action' : id2action})
char.update({'intent2id' : intent2id})
char.update({'id2intent' : id2intent})
char.update({'entities2id' : entities2id})
char.update({'id2entities' : id2entities})

with open('./DM_char.json', mode='w', encoding='utf-8') as f:
    json.dump(char, f)

# extract dataset
    - max_history 3

In [9]:
def split_data(dataset):
    # 以对话历史最长为3分割，以数组形式储存数据
    data_set = []
    num = 0
    for val in dataset['stories']:
        data = val['steps']
        num += 1
#         print(num)
        for index in range(0,len(data),2):

            previous_action = []
            actions = []
            slots = []
            user_intent = []

            # 当前状态
            current_intent  =  data[index]['intent']
            if 'entities' in data[index].keys():
                current_slot = data[index]['entities']
            elif 'entities' not in data[index].keys():
                current_slot = []
            current_action = data[index+1]['action']

            # 前一时刻状态
            pre1_intent = []
            pre1_slot = []
            pre1_action = []

            # 前二时刻状态
            pre2_intent = []
            pre2_slot = []
            pre2_action = [] 

            pre_history_1 = index-2
            pre_history_2 = index-4

            #判断对话历史是否存在
            if pre_history_1 >= 0:
                pre1_intent = data[pre_history_1]['intent']
                if 'entities' in data[pre_history_1].keys():
                    pre1_slot = data[pre_history_1]['entities']
                pre1_action = data[pre_history_1+1]['action']

            if pre_history_2 >= 0:
                pre2_intent = data[pre_history_2]['intent']
                if 'entities' in data[pre_history_2].keys():
                    pre2_slot = data[pre_history_2]['entities']
                pre2_action = data[pre_history_2+1]['action'] 
        
            previous_action_sum = [pre1_action, pre2_action]
            for i in previous_action_sum:
                if i != []:
                    previous_action.append(i)
            if previous_action == []:
                previous_action = ['PAD']
#             print('previous_action: ', previous_action)

            
            actions = [current_action]
#             print('actions: ',actions)
            
            slots_sum = [current_slot , pre1_slot , pre2_slot]            
            for i in slots_sum:
                if i  != []:
                    for val in i:
                        for key, j in val.items():
                            if key not in slots:
                                slots.append((key))
            if slots == []:
                slots = ['PAD']
#             print('slots:',slots)
            
            
            user_intent_sum= [current_intent , pre1_intent , pre2_intent]
            for i in user_intent_sum:
                if i != []:
                    user_intent.append(i)
            if user_intent == []:
                user_intent = ['PAD']
#             print('user_intent: ',user_intent)

            data_set.append({'previous_action':previous_action, 'slots':slots,'user_intent':user_intent, 'action':actions})
    return data_set

In [10]:
data_set = split_data(predataset)

In [11]:
data_set

[{'previous_action': ['PAD'],
  'slots': ['address', 'device', 'operation'],
  'user_intent': ['Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['PAD'],
  'slots': ['device', 'operation'],
  'user_intent': ['Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['action_open_ac'],
  'slots': ['address', 'device', 'operation'],
  'user_intent': ['inform_ac', 'Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['PAD'],
  'slots': ['device', 'operation'],
  'user_intent': ['Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['action_open_ac'],
  'slots': ['address', 'device', 'operation'],
  'user_intent': ['inform_room', 'Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['PAD'],
  'slots': ['device', 'operation'],
  'user_intent': ['Control-AC_Open'],
  'action': ['action_open_ac']},
 {'previous_action': ['action_open_ac'],
  'slots': ['range', 'device', 'operation'],
  'user_i

In [12]:
def trans2labelid(vocab,x):
        max_len = len(vocab)
        labels = [vocab[label] for label in x]
        label_onehot = np.eye(max_len)[labels]
        values = sum(label_onehot)
        return values

In [18]:
def extract_conv_data(data_set):
    dataset_previous_action = []
    dataset_slots = []
    dataset_user_intent = [ValueError: 'previous_action_inputs ' is not a valid scope name]
    dataset_action = []
    for i in data_set:
        previous_action = i['previous_action']
        dataset_previous_action.append(trans2labelid(action2id,previous_action))
        slots = i['slots']
        dataset_slots.append(trans2labelid(entities2id,slots))
        user_intent = i['user_intent']
        dataset_user_intent.append(trans2labelid(intent2id,user_intent))
        action = i['action']
        dataset_action.append(trans2labelid(action2id,action))
    
    return np.array(dataset_previous_action), np.array(dataset_slots), \
            np.array(dataset_user_intent), np.array(dataset_action)

In [19]:
previous_action, slots, user_intent, action = extract_conv_data(data_set) 