In [1]:
import os
import json
import codecs
import numpy as np
import tensorflow as tf
# Expose only the CUDA device with index 1 to this process.
# NOTE(review): this is assigned after `import tensorflow`; it presumably still
# takes effect because the devices are first enumerated on the next line — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
# Enable memory growth on every GPU TensorFlow reports (per the TensorFlow docs
# this makes the runtime allocate GPU memory on demand instead of up front).
gpus = tf.config.experimental.list_physical_devices(device_type='GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [2]:
# Hyperparameter settings for this notebook.
params = {
    'batch_size': 64,
    'lr': 0.001,         # presumably the learning rate — confirm against the training code
    'max_sent_len': 20,  # copied into `max_sent_len` in the data-loading cell
    'epochs': 500,
    'drops': [0.1],      # presumably dropout rate(s) — confirm against the model code
}

In [3]:
def extract_data(data_path):
    """
    Load the JSON dataset and derive intent labels and per-character slot tags.

    Intent recognition uses each example's ``intent`` field as its label.
    Slot recognition/filling is treated as a named-entity-recognition problem:
    every character of the sentence is tagged using the B/I/E/S/O scheme
    (``B-``/``I-``/``E-`` for the first/inner/last character of a multi-character
    slot value, ``S-`` for a single-character value, ``O`` for everything else),
    with the slot name appended, e.g. ``B-date_time``.

    Spaces are removed from the sentence before tagging, so each returned text
    and its tag list always have the same length.

    Args:
        data_path: path to a UTF-8 JSON file containing a list of examples.
            Each example has ``text`` and ``intent`` keys and may have an
            ``entities`` mapping of slot name -> slot value.

    Returns:
        A tuple ``(texts, intent_labels, slots_ners)`` of three parallel lists:
        the space-stripped sentences, the intent labels, and one list of
        per-character tags for each sentence.
    """
    with codecs.open(data_path, "r", encoding="utf-8") as fp:
        data = json.load(fp)

    texts = []
    intent_labels = []
    slots_ners = []
    skipped = 0
    for example in data:
        # Tag the same space-stripped string that is returned in `texts`.
        text = example['text'].replace(" ", "")
        ner = ['O'] * len(text)
        for key, val in (example.get('entities') or {}).items():
            val = val.replace(" ", "")
            # Only the first occurrence of the value is tagged.
            start_idx = text.find(val) if val else -1
            if start_idx < 0:
                # Empty value or value absent from the text: using -1 as an
                # index would silently tag the wrong characters, so skip it.
                skipped += 1
                continue
            end_idx = start_idx + len(val) - 1
            if len(val) == 1:
                ner[start_idx] = 'S-' + key
            else:
                ner[start_idx] = 'B-' + key
                ner[end_idx] = 'E-' + key
                for idx in range(start_idx + 1, end_idx):
                    ner[idx] = 'I-' + key
        texts.append(text)
        intent_labels.append(example['intent'])
        slots_ners.append(ner)

    print('texts len: ', len(texts))
    print('intent_lables len: ',len(intent_labels))
    print('slots_ners len: ', len(slots_ners))
    if skipped:
        print('slots skipped (value not found in text): ', skipped)
    return texts, intent_labels, slots_ners

In [4]:
# Relative path of the JSON dataset handed to extract_data.
data_path ="../dataset/data_v2.json"
# Sentence-length setting taken from the hyperparameter dict (not used in this cell).
max_sent_len = params["max_sent_len"]
# Load the sentences, their intent labels and their per-character slot tags.
texts, intent_labels, slots_ners = extract_data(data_path)

texts len:  2517
intent_lables len:  2517
slots_ners len:  2517


In [5]:
# Build the character index for the input text

In [6]:
# Flatten every sentence into one list of characters (duplicates kept).
text_set = [ch for sentence in texts for ch in sentence]

# Vocabulary: index 0 is the 'PADL' entry, followed by the distinct characters.
# The characters are sorted so the ids are identical on every run; iterating a
# bare set of strings yields a different order per interpreter session because
# str hashing is randomized.
character = ['PADL'] + sorted(set(text_set))

# Forward (character -> id) and reverse (id -> character) lookup tables.
char2id = {val: index for index, val in enumerate(character)}
id2char = {index: val for index, val in enumerate(character)}

In [7]:
# Build the intent index

In [8]:
# Intent vocabulary: index 0 is the 'PADL' entry, followed by the distinct
# intent labels. The labels are sorted so the ids are identical on every run;
# iterating a bare set of strings yields a different order per interpreter
# session because str hashing is randomized.
# NOTE(review): sorting assumes all intent labels are mutually comparable
# (e.g. all strings) — confirm against the dataset.
intent = ['PADL'] + sorted(set(intent_labels))

# Forward (intent -> id) and reverse (id -> intent) lookup tables.
intent2id = {val: index for index, val in enumerate(intent)}
id2intent = {index: val for index, val in enumerate(intent)}

In [9]:
# Build the slot-tag index

In [10]:
# Flatten the per-sentence tag lists into one list of tags (duplicates kept).
slot_set = [tag for sentence_tags in slots_ners for tag in sentence_tags]

# Slot-tag vocabulary: index 0 is the 'PADL' entry, followed by the distinct
# tags. The tags are sorted so the ids are identical on every run; iterating a
# bare set of strings yields a different order per interpreter session because
# str hashing is randomized.
slot = ['PADL'] + sorted(set(slot_set))

# Forward (tag -> id) and reverse (id -> tag) lookup tables.
slot2id = {val: index for index, val in enumerate(slot)}
id2slot = {index: val for index, val in enumerate(slot)}

In [11]:
# Bundle all six lookup tables under one dict and persist them as JSON.
# Note: JSON object keys are always strings, so the integer keys of the
# id2* tables are written as strings ("0", "1", ...) in the file.
char = {
    'char2id': char2id,
    'id2char': id2char,
    'intent2id': intent2id,
    'id2intent': id2intent,
    'slot2id': slot2id,
    'id2slot': id2slot,
}

with open('./char_conv.json', mode='w', encoding='utf-8') as f:
    json.dump(char, f)