In [10]:
# -*- coding: utf-8 -*-
# @Author: Jie
# @Date:   2017-06-15 14:11:08
# @Last Modified by:   Jie Yang,     Contact: jieynlp@gmail.com
# @Last Modified time: 2019-01-18 22:29:08

import time
import sys
import argparse
import random
import copy

import gc
import cPickle as pickle

import numpy as np
from utils.metric import get_ner_fmeasure
from utils.data import Data

# Seed Python's `random` module and NumPy's global RNG with the same value so
# that random.shuffle / np.random based steps repeat across runs.
# NOTE(review): no torch seed is set in this cell, so torch-based code further
# down is presumably still non-deterministic -- confirm whether that is intended.
seed_num = 100
random.seed(seed_num)

np.random.seed(seed_num)

In [11]:

def data_initialization(data, gaz_file, train_file, dev_file, test_file):
    """Build every alphabet of ``data`` from the corpus files and the gazetteer.

    The calls are issued in a fixed order: alphabets for train/dev/test, then
    the gazetteer file, then the gazetteer alphabet for train/dev/test, and
    finally the alphabets are frozen.

    Args:
        data: object exposing build_alphabet, build_gaz_file,
            build_gaz_alphabet and fix_alphabet.
        gaz_file: path of the gazetteer (lexicon) file.
        train_file, dev_file, test_file: paths of the three corpus splits.

    Returns:
        The same ``data`` object that was passed in.
    """
    corpora = (train_file, dev_file, test_file)

    # Per the original notes: fills the word / biword / char / label alphabets.
    for corpus in corpora:
        data.build_alphabet(corpus)

    # Per the original notes: registers the gazetteer words in data.gaz.
    data.build_gaz_file(gaz_file)

    # Per the original notes: adds to data.gaz_alphabet every sub-word of each
    # sentence that can be matched in the gazetteer.
    for corpus in corpora:
        data.build_gaz_alphabet(corpus)

    # Per the original notes: stops the alphabets from growing (keep_growing=False).
    data.fix_alphabet()
    return data


def predict_check(pred_variable, gold_variable, mask_variable):
    """Count the correctly tagged, unmasked tokens of one batch.

    Args:
        pred_variable (batch_size, sent_len): predicted tag ids.
        gold_variable (batch_size, sent_len): gold tag ids.
        mask_variable (batch_size, sent_len): non-zero where a real token is.

    Returns:
        (right_token, total_token): number of masked-in positions where the
        prediction equals the gold tag, and the sum of the mask.
    """
    # Bring all three inputs to NumPy (via .cpu().data.numpy()).
    pred_arr, gold_arr, mask_arr = [
        var.cpu().data.numpy()
        for var in (pred_variable, gold_variable, mask_variable)
    ]
    # A position counts only when it matches AND is not padding.
    hits = (pred_arr == gold_arr) * mask_arr
    return np.sum(hits), mask_arr.sum()


def recover_label(pred_variable, gold_variable, mask_variable, label_alphabet, word_recover):
    """Turn predicted / gold tag ids back into label strings, sentence by sentence.

    Args:
        pred_variable (batch_size, sent_len): predicted tag ids.
        gold_variable (batch_size, sent_len): gold tag ids.
        mask_variable (batch_size, sent_len): non-zero where a real token is.
        label_alphabet: object whose get_instance(id) returns the label.
        word_recover: index used to reorder the batch rows before decoding.

    Returns:
        (pred_label, gold_label): two lists with one list of labels per
        sentence; masked-out positions are skipped.
    """
    # Reorder the rows of all three inputs with the same index.
    restored_pred = pred_variable[word_recover]
    restored_gold = gold_variable[word_recover]
    restored_mask = mask_variable[word_recover]
    seq_len = restored_gold.size(1)

    mask_arr = restored_mask.cpu().data.numpy()
    pred_arr = restored_pred.cpu().data.numpy()
    gold_arr = restored_gold.cpu().data.numpy()

    pred_label, gold_label = [], []
    for row in range(mask_arr.shape[0]):
        # Columns of this sentence that hold a real token.
        kept = [col for col in range(seq_len) if mask_arr[row][col] != 0]
        pred = [label_alphabet.get_instance(pred_arr[row][col]) for col in kept]
        gold = [label_alphabet.get_instance(gold_arr[row][col]) for col in kept]
        assert(len(pred)==len(gold))
        pred_label.append(pred)
        gold_label.append(gold)
    return pred_label, gold_label

#调用来自：save_data_setting(data, save_data_name)
def save_data_setting(data, save_file):
    new_data = copy.deepcopy(data)  #复制data到new_data
    ## remove input instances
    new_data.train_texts = []
    new_data.dev_texts = []
    new_data.test_texts = []
    new_data.raw_texts = []

    new_data.train_Ids = []
    new_data.dev_Ids = []
    new_data.test_Ids = []
    new_data.raw_Ids = []
    ## save data settings
    with open(save_file, 'w') as fp:
        pickle.dump(new_data, fp)
    print "Data setting saved to file: ", save_file


def load_data_setting(save_file):
    with open(save_file, 'r') as fp:
        data = pickle.load(fp)
    print "Data setting loaded from file: ", save_file
    data.show_data_summary()
    return data

def lr_decay(optimizer, epoch, decay_rate, init_lr):
    lr = init_lr * ((1-decay_rate)**epoch)
    print " Learning rate is setted as:", lr
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return optimizer



def evaluate(data, model, name):
    """Decode one data split with ``model`` and score it against the gold labels.

    Instances are decoded one at a time (batch size 1) with the model in
    eval mode.

    Args:
        data: object providing train_Ids / dev_Ids / test_Ids / raw_Ids,
            HP_gpu, label_alphabet and tagScheme.
        model: model object; ``model.eval()`` is called and the model is then
            invoked on every batch to obtain the tag sequence.
        name: which split to decode: "train", "dev", "test" or "raw".

    Returns:
        (speed, acc, p, r, f, pred_results): instances decoded per second,
        the four values returned by get_ner_fmeasure, and the predicted
        labels (one list per sentence).

    Raises:
        ValueError: if ``name`` is not one of the four supported splits.
    """
    if name == "train":
        instances = data.train_Ids
    elif name == "dev":
        instances = data.dev_Ids
    elif name == 'test':
        instances = data.test_Ids
    elif name == 'raw':
        instances = data.raw_Ids
    else:
        # Fail loudly here: merely printing would let the code run on and
        # crash later on the unbound `instances`.
        raise ValueError("Error: wrong evaluate name, %s" % (name,))
    pred_results = []
    gold_results = []
    ## set model in eval mode
    model.eval()
    batch_size = 1
    start_time = time.time()
    train_num = len(instances)
    # +1 so a trailing partial batch is visited; empty slices are skipped below.
    total_batch = train_num//batch_size+1
    for batch_id in range(total_batch):
        start = batch_id*batch_size
        end = (batch_id+1)*batch_size
        if end >train_num:
            end =  train_num
        instance = instances[start:end]
        if not instance:
            continue
        gaz_list,batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu, True)
        tag_seq = model(gaz_list,batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, mask)
        pred_label, gold_label = recover_label(tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    # A coarse clock can report zero elapsed time on a tiny split.
    speed = len(instances)/decode_time if decode_time > 0 else 0.0
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    return speed, acc, p, r, f, pred_results


def batchify_with_label(input_batch_list, gpu, volatile_flag=False):
    """
        Pad a list of instances into tensors sorted by decreasing sentence length.

        input:
            input_batch_list: list of instances, each indexed as
                [words, biwords, chars, gazs, labels] (positions 0..4).
                words / biwords / labels: id lists for one sentence.
                chars: per-word lists of char ids, various length.
                gazs: passed through untouched apart from the reordering.
            gpu: when true the returned tensors (except char_seq_lengths) are moved with .cuda().
            volatile_flag: forwarded as `volatile` to every autograd.Variable and
                appended as the last element of the returned gaz_list.
        output (in this order):
            gaz_list: gazs reordered by decreasing sentence length, plus volatile_flag at the end
            word_seq_tensor: (batch_size, max_sent_len) Variable, zero padded
            biword_seq_tensor: (batch_size, max_sent_len) Variable, zero padded
            word_seq_lengths: (batch_size,) LongTensor, sorted descending
            word_seq_recover: (batch_size,) index that undoes the sentence sort
            char_seq_tensor: (batch_size*max_sent_len, max_word_len) Variable, sorted by decreasing char length
            char_seq_lengths: (batch_size*max_sent_len,) LongTensor, sorted descending
            char_seq_recover: (batch_size*max_sent_len,) index that undoes the char sort
            label_seq_tensor: (batch_size, max_sent_len) Variable, zero padded
            mask: (batch_size, max_sent_len) byte Variable, 1 on real tokens
    """
    batch_size = len(input_batch_list)
    words = [sent[0] for sent in input_batch_list]
    biwords = [sent[1] for sent in input_batch_list]
    chars = [sent[2] for sent in input_batch_list]
    gazs = [sent[3] for sent in input_batch_list]
    labels = [sent[4] for sent in input_batch_list]
    # NOTE(review): relies on map() returning a list (Python 2) -- confirm before porting.
    word_seq_lengths = torch.LongTensor(map(len, words))
    max_seq_len = word_seq_lengths.max()
    # NOTE(review): the `volatile` keyword presumably needs an old PyTorch release -- confirm the pinned version.
    word_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)), volatile =  volatile_flag).long()
    biword_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)), volatile =  volatile_flag).long()
    label_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile =  volatile_flag).long()
    mask = autograd.Variable(torch.zeros((batch_size, max_seq_len)),volatile =  volatile_flag).byte()
    # Copy every sentence into the zero-initialised buffers; the tail stays 0 (padding).
    for idx, (seq, biseq, label, seqlen) in enumerate(zip(words, biwords, labels, word_seq_lengths)):
        word_seq_tensor[idx, :seqlen] = torch.LongTensor(seq)
        biword_seq_tensor[idx, :seqlen] = torch.LongTensor(biseq)
        label_seq_tensor[idx, :seqlen] = torch.LongTensor(label)
        mask[idx, :seqlen] = torch.Tensor([1]*seqlen)
    # Sort the batch by decreasing sentence length; word_perm_idx is the permutation applied.
    word_seq_lengths, word_perm_idx = word_seq_lengths.sort(0, descending=True)
    word_seq_tensor = word_seq_tensor[word_perm_idx]
    biword_seq_tensor = biword_seq_tensor[word_perm_idx]
    ## labels and mask are reordered with the same permutation as the words
    label_seq_tensor = label_seq_tensor[word_perm_idx]
    mask = mask[word_perm_idx]
    ### deal with char
    # pad_chars (batch_size, max_seq_len): missing words are filled with the one-char word [0]
    pad_chars = [chars[idx] + [[0]] * (max_seq_len-len(chars[idx])) for idx in range(len(chars))]
    length_list = [map(len, pad_char) for pad_char in pad_chars]
    max_word_len = max(map(max, length_list))
    char_seq_tensor = autograd.Variable(torch.zeros((batch_size, max_seq_len, max_word_len)), volatile =  volatile_flag).long()
    char_seq_lengths = torch.LongTensor(length_list)
    for idx, (seq, seqlen) in enumerate(zip(pad_chars, char_seq_lengths)):
        for idy, (word, wordlen) in enumerate(zip(seq, seqlen)):
            # print len(word), wordlen
            char_seq_tensor[idx, idy, :wordlen] = torch.LongTensor(word)
    # Apply the sentence permutation, then flatten so that every word is one row.
    char_seq_tensor = char_seq_tensor[word_perm_idx].view(batch_size*max_seq_len,-1)
    char_seq_lengths = char_seq_lengths[word_perm_idx].view(batch_size*max_seq_len,)
    # Sort the flattened words by decreasing char length.
    char_seq_lengths, char_perm_idx = char_seq_lengths.sort(0, descending=True)
    char_seq_tensor = char_seq_tensor[char_perm_idx]
    # Sorting a permutation ascending yields the indices that invert it.
    _, char_seq_recover = char_perm_idx.sort(0, descending=False)
    _, word_seq_recover = word_perm_idx.sort(0, descending=False)
    
    ## gaz_list follows the sorted sentence order (word_perm_idx), not the input order
    
    gaz_list = [ gazs[i] for i in word_perm_idx]
    # The flag travels with the list as its last element.
    gaz_list.append(volatile_flag)
    if gpu:
        # NOTE(review): char_seq_lengths is not moved to the GPU here -- confirm that is intended.
        word_seq_tensor = word_seq_tensor.cuda()
        biword_seq_tensor = biword_seq_tensor.cuda()
        word_seq_lengths = word_seq_lengths.cuda()
        word_seq_recover = word_seq_recover.cuda()
        label_seq_tensor = label_seq_tensor.cuda()
        char_seq_tensor = char_seq_tensor.cuda()
        char_seq_recover = char_seq_recover.cuda()
        mask = mask.cuda()
    return gaz_list, word_seq_tensor, biword_seq_tensor, word_seq_lengths, word_seq_recover, char_seq_tensor, char_seq_lengths, char_seq_recover, label_seq_tensor, mask

#调用来自：train(data, save_model_dir, seg)
def train(data, save_model_dir, seg=True):
    print "Training model..."
    data.show_data_summary()   #打印data信息
    save_data_name = save_model_dir +".dset"
    save_data_setting(data, save_data_name)  #保存数据：pickle.dump(new_data, fp)
	
    model = SeqModel(data)
	
    print "finished built model."
    loss_function = nn.NLLLoss()
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.SGD(parameters, lr=data.HP_lr, momentum=data.HP_momentum)
    best_dev = -1
    ## start training
    for idx in range(data.HP_iteration):   #HP_iteration=50
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" %(idx,data.HP_iteration))
        optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        instance_count = 0
        sample_id = 0
        sample_loss = 0
        batch_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
		
        ## set model in train model
        model.train()
        model.zero_grad()
        batch_size = 1 ## current only support batch size = 1 to compulate and accumulate to data.HP_batch_size update weights
        batch_id = 0
        train_num = len(data.train_Ids)
        total_batch = train_num//batch_size+1
        for batch_id in range(total_batch):
            start = batch_id*batch_size
            end = (batch_id+1)*batch_size 
            if end >train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            if not instance:
                continue
            gaz_list,  batch_word, batch_biword, batch_wordlen, batch_wordrecover, batch_char, batch_charlen, batch_charrecover, batch_label, mask  = batchify_with_label(instance, data.HP_gpu)
            # print "gaz_list:",gaz_list
            # exit(0)
            instance_count += 1
            loss, tag_seq = model.neg_log_likelihood_loss(gaz_list, batch_word, batch_biword, batch_wordlen, batch_char, batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            batch_loss += loss

            if end%500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))
                sys.stdout.flush()
                sample_loss = 0
            if end%data.HP_batch_size == 0:
                batch_loss.backward()
                optimizer.step()
                model.zero_grad()
                batch_loss = 0
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print("     Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"%(end, temp_cost, sample_loss, right_token, whole_token,(right_token+0.)/whole_token))       
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print("Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s,  total loss: %s"%(idx, epoch_cost, train_num/epoch_cost, total_loss))
        # exit(0)
        # continue
        speed, acc, p, r, f, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish

        if seg:
            current_score = f
            print("Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f"%(dev_cost, speed, acc))

        if current_score > best_dev:
            if seg:
                print "Exceed previous best f score:", best_dev
            else:
                print "Exceed previous best acc score:", best_dev
            model_name = save_model_dir +'.'+ str(idx) + ".model"
            torch.save(model.state_dict(), model_name)
            best_dev = current_score 
        # ## decode test
        speed, acc, p, r, f, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if seg:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f"%(test_cost, speed, acc))
        gc.collect() 


def load_model_decode(model_dir, data, name, gpu, seg=True):
    data.HP_gpu = gpu
    print "Load Model from file: ", model_dir
    model = SeqModel(data)
    ## load model need consider if the model trained in GPU and load in CPU, or vice versa
    # if not gpu:
    #     model.load_state_dict(torch.load(model_dir), map_location=lambda storage, loc: storage)
    #     # model = torch.load(model_dir, map_location=lambda storage, loc: storage)
    # else:
    model.load_state_dict(torch.load(model_dir))
        # model = torch.load(model_dir)
    
    print("Decode %s data ..."%(name))
    start_time = time.time()
    speed, acc, p, r, f, pred_results = evaluate(data, model, name)
    end_time = time.time()
    time_cost = end_time - start_time
    if seg:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"%(name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f"%(name, time_cost, speed, acc))
    return pred_results









In [21]:
# Run mode.
# NOTE(review): `status` is not read by any cell visible here -- confirm it is still needed.
status='train'
# Corpus splits, relative to the notebook's working directory.
train_file= 'data/bala_train'
dev_file= 'data/bala_dev'
test_file= 'data/bala_test'
# Path prefix for saved models.
# NOTE(review): `savemodel` is not used by any cell visible here.
savemodel= 'data/savemodel'

# Pretrained embedding files: single characters, character bigrams, and the
# gazetteer (lexicon) vectors. The commented-out lines are alternative gazetteers.
char_emb = "../SubwordEncoding_download_data/gigaword_chn.all.a2b.uni.ite50.vec"
bichar_emb = "../SubwordEncoding_download_data/gigaword_chn.all.a2b.bi.ite50.vec"
# gaz_file = "../data/ctb.50d.vec"
#gaz_file = "../SubwordEncoding_download_data/zh.wiki.bpe.op200000.d50.w2v.txt"
gaz_file = '../SubwordEncoding_download_data/ctb.50d.vec'

In [22]:
# Create the data-settings object and override a few of its attributes;
# everything not set here keeps the default chosen by Data().
data = Data()

data.HP_use_char = False
data.HP_batch_size = 1        # train() steps the optimizer every HP_batch_size instances
data.use_bigram = True
data.gaz_dropout = 0.5
data.HP_lr = 0.01             # initial learning rate handed to lr_decay()
data.HP_dropout = 0.5
data.HP_iteration = 50        # number of training epochs
data.norm_gaz_emb = True
data.HP_fix_gaz_emb = False

In [23]:
# Build all alphabets and the gazetteer from the three corpus files.
# The function returns `data`, which is why the cell echoes the object's repr.
data_initialization(data, gaz_file, train_file, dev_file, test_file)

Load gaz file:  ../SubwordEncoding_download_data/ctb.50d.vec  total size: 704368
gaz alphabet size: 667
gaz alphabet size: 681
gaz alphabet size: 984


<utils.data.Data instance at 0x000000000D535488>

In [102]:
# Convert each corpus file into instances for the named split.
# NOTE(review): presumably fills data.<split>_texts and data.<split>_Ids -- verify in utils/data.py.
data.generate_instance_with_gaz(train_file,'train')
data.generate_instance_with_gaz(dev_file,'dev')
data.generate_instance_with_gaz(test_file,'test')

In [103]:
# Character embeddings (per the original note: data.pretrain_word_embedding, data.word_emb_dim)
data.build_word_pretrain_emb(char_emb)
# Bigram embeddings (per the original note: data.pretrain_biword_embedding, data.biword_emb_dim)
data.build_biword_pretrain_emb(bichar_emb)
# Sub-word / gazetteer embeddings (per the original note: data.pretrain_gaz_embedding, data.gaz_emb_dim)
data.build_gaz_pretrain_emb(gaz_file)

build word pretrain emb...
Embedding:
     pretrain word:11327, prefect match:580, case_match:0, oov:1, oov%:0.00171821305842
build biword pretrain emb...
Embedding:
     pretrain word:3986686, prefect match:1788, case_match:0, oov:11, oov%:0.00611111111111
build gaz pretrain emb...
Embedding:
     pretrain word:704368, prefect match:982, case_match:0, oov:1, oov%:0.0010162601626


In [104]:
# Sanity check: shape of the text instances of every split
# (the recorded output shows one row per sentence and 5 fields per instance).
print np.array(data.train_texts).shape
print np.array(data.dev_texts).shape
print np.array(data.test_texts).shape
print np.array(data.raw_texts).shape

# Same check for the id-encoded instances.
print np.array(data.train_Ids).shape
print np.array(data.dev_Ids).shape
print np.array(data.test_Ids).shape
print np.array(data.raw_Ids).shape



(15L, 5L)
(2L, 5L)
(13L, 5L)
(0L,)
(15L, 5L)
(2L, 5L)
(13L, 5L)
(0L,)


In [105]:
# Dump the dev-split text instances to a file for offline inspection.
# The `with` block closes the handle, so the content is flushed to disk instead
# of staying in an open, never-closed file object.
with open('example_dev_texts.txt','w+') as f:
    print >>f, data.dev_texts

In [108]:

# Dump the id-encoded dev instances to a file for offline inspection.
# The `with` block closes the handle, so the content is flushed to disk instead
# of staying in an open, never-closed file object.
with open('example_dev_Ids.txt','w+') as f2:
    print >>f2, data.dev_Ids


In [107]:
# Show the id-encoded dev instances; each instance holds five parallel fields
# (word ids, biword ids, char ids, gaz ids, label ids).
# NOTE(review): this prints the whole structure -- consider printing a single instance.
print data.dev_Ids

[[[480, 480, 118, 19, 9, 55, 56, 416, 481, 23, 45, 23, 47, 87, 25], [1202, 1203, 1204, 75, 76, 77, 1205, 1206, 1207, 60, 61, 62, 1208, 1209, 59], [[480], [480], [118], [19], [9], [55], [56], [416], [481], [23], [45], [23], [47], [87], [25]], [[[667, 668], [3, 2]], [[669], [2]], [[670], [2]], [[59], [2]], [[60, 61], [3, 2]], [[62], [2]], [[671], [2]], [[672], [2]], [], [[53], [2]], [], [[54], [2]], [], [[673], [2]], []], [1, 2, 3, 3, 1, 4, 2, 1, 2, 1, 2, 1, 2, 3, 3]], [[72, 43, 187, 190, 237, 238, 288, 332, 397, 482, 155, 151, 152, 12, 12, 483, 484, 319, 472, 26], [1210, 293, 1211, 1212, 425, 1213, 680, 1214, 1215, 1216, 1217, 235, 1218, 12, 1219, 1220, 1221, 1222, 1223, 1224], [[72], [43], [187], [190], [237], [238], [288], [332], [397], [482], [155], [151], [152], [12], [12], [483], [484], [319], [472], [26]], [[], [[674, 191], [3, 2]], [], [], [[266], [2]], [], [[406], [2]], [[675], [2]], [[676], [2]], [[677], [2]], [[678], [2]], [[161], [2]], [], [[7], [2]], [], [[679], [2]], [], [[

In [67]:
# Show the bigram -> id mapping of the biword alphabet.
# NOTE(review): this prints the complete dictionary -- consider showing only a few entries.
print(data.biword_alphabet.instance2index)

{u'\u4e66\u8bb0': 38, u'\u5c11\u56f0': 671, u'\u6709\u53d1': 984, u'\u5728\u4e2d': 1311, u'\u5c11\u56fd': 1456, u'\u5168\u4e16': 1761, u'\u5e73\u5b89': 1478, u'\u52a8\u65e9': 1073, u'\u9047\u3002': 956, u'\u548c\u4eba': 881, u'\u4e3a\u4e0d': 1733, u'\u8ddd\u7ee7': 975, u'\u53d1\u751f': 459, u'\u627f\u548c': 1662, u'\u53f2\u4e0a': 188, u'\u8981\u9a76': 1256, u'\u3001\u76f8': 1638, u'\u3001\u56fd': 40, u'\u8c08\u5224': 919, u'\u5b81\u3002': 965, u'\u4e2d\u4e00': 832, u';\u4eba': 1025, u'\u5c31\u4f1a': 747, u'\u5b9a\u3001': 1511, u'\u5b9a\u3002': 281, u'\u91cd\u547c': 923, u'\u5173\u7cfb': 542, u'\u798f\u4e8e': 1578, u'\u8bbe\u8fdb': 1301, u'\u8c03,': 1122, u'\u4e16\u7eaa': 10, u'\u7eed\u6df1': 391, u'(\u4e00': 48, u'\u4f1a\u524d': 1251, u'\u7b2c\u4e00': 1290, u'\u4e2d\u4ecd': 1017, u'\u56fd\u613f': 1110, u'\u548c\u4e2d': 127, u'\u7b2c\u4e09': 1269, u'\u603b\u4e66': 37, u'\u5927\u6210': 689, u'\u5e94\u4e16': 618, u'\u6280\u672f': 407, u'\u653e\u548c': 345, u'\u60c5\u51b5': 483, u'\u754c!'

In [76]:
'''
调用来自：self.train_texts, self.train_Ids = read_instance_with_gaz(
input_file, self.gaz, self.word_alphabet, self.biword_alphabet, self.char_alphabet, self.gaz_alphabet,  
self.label_alphabet, self.number_normalized, self.MAX_SENTENCE_LENGTH)
'''
# The string above quotes the read_instance_with_gaz(...) call this cell imitates:
# the names below stand in for that function's arguments so that its logic can
# be executed step by step at notebook level.


# NOTE(review): hard-coded absolute Windows path; it presumably points at the same
# file as `dev_file` ('data/bala_dev') -- confirm and prefer the relative path.
input_file='D:\\workspace\\SubwordEncoding_comment\\data\\bala_dev'

gaz=data.gaz
word_alphabet=data.word_alphabet
biword_alphabet=data.biword_alphabet
char_alphabet=data.char_alphabet
gaz_alphabet=data.gaz_alphabet
label_alphabet=data.label_alphabet
number_normalized=data.number_normalized
# NOTE(review): 250 duplicates the value the quoted call takes from
# MAX_SENTENCE_LENGTH -- confirm they are meant to stay in sync.
max_sent_length=250
char_padding_size=-1              # values <= 0 disable char padding in the reader loop
char_padding_symbol = '</pad>'
    
    
    

In [111]:
# Step-by-step, notebook-level version of the instance reader: turns the
# one-token-per-line file `input_file` into text instances and id instances.
# A line longer than two characters is a "<token> ... <label>" line; any other
# line ends the current sentence. A final sentence that is not followed by such
# a terminating line is therefore never emitted.
# NOTE(review): normalize_word and NULLKEY are defined in a cell further down
# this notebook; on a fresh kernel that cell must be executed before this one.
with open(input_file,'r') as in_file:   # close the handle instead of leaking it
    in_lines = in_file.readlines()
instence_texts = []
instence_Ids = []
words = []     # token of every line of the current sentence
biwords = []    # each token concatenated with the token of the next line
chars = []    # per-token character lists of the current sentence
labels = []
word_Ids = []    # ids of `words` in word_alphabet
biword_Ids = []
char_Ids = []
label_Ids = []    # ids of `labels` in label_alphabet
for idx in xrange(len(in_lines)):
    line = in_lines[idx]
    if len(line) > 2:
        pairs = line.strip().split()   # token ... label
        word = pairs[0].decode('utf-8')
        if number_normalized:
            word = normalize_word(word)
        label = pairs[-1]
        # The bigram partner is the token on the next line, or NULLKEY at the
        # end of a sentence / file.
        if idx < len(in_lines) -1 and len(in_lines[idx+1]) > 2:
            biword = word + in_lines[idx+1].strip().split()[0].decode('utf-8')
        else:
            biword = word + NULLKEY   # NULLKEY = "-null-"
        biwords.append(biword)
        words.append(word)
        labels.append(label)
        word_Ids.append(word_alphabet.get_index(word))
        biword_Ids.append(biword_alphabet.get_index(biword))
        label_Ids.append(label_alphabet.get_index(label))

        char_list = []
        char_Id = []
        for char in word:
            char_list.append(char)   # char_list holds the characters of `word`
        if char_padding_size > 0:
            # Pad short tokens up to the fixed size; longer ones trip the assert.
            char_number = len(char_list)
            if char_number < char_padding_size:
                char_list = char_list + [char_padding_symbol]*(char_padding_size-char_number)
            assert(len(char_list) == char_padding_size)
        for char in char_list:
            char_Id.append(char_alphabet.get_index(char))
        chars.append(char_list)
        char_Ids.append(char_Id)

    # end of a sentence
    else:
        if ((max_sent_length < 0) or (len(words) < max_sent_length)) and (len(words)>0):
            gazs = []  # gazetteer matches per start position
            gaz_Ids = []
            w_length = len(words)
            # A dedicated name is used so the outer loop variable `idx` is not reused.
            for start in range(w_length):
                # Gazetteer entries matched against the sentence suffix that begins at `start`.
                matched_list = gaz.enumerateMatchList(words[start:])
                matched_length = [len(a) for a in matched_list]
                gazs.append(matched_list)
                matched_Id  = [gaz_alphabet.get_index(entity) for entity in matched_list]
                if matched_Id:
                    gaz_Ids.append([matched_Id, matched_length])
                else:
                    gaz_Ids.append([])

            instence_texts.append([words, biwords, chars, gazs, labels])
            instence_Ids.append([word_Ids, biword_Ids, char_Ids, gaz_Ids, label_Ids])
        # Start a fresh sentence (new list objects; the stored ones stay intact).
        words = []
        biwords = []
        chars = []
        labels = []
        word_Ids = []
        biword_Ids = []
        char_Ids = []
        label_Ids = []
        gazs = []
        gaz_Ids = []
#print instence_texts, instence_Ids
#print instence_texts, instence_Ids

In [80]:
def normalize_word(word):
    """Return ``word`` with every digit character replaced by '0'.

    Characters for which ``isdigit()`` is false are kept unchanged.
    """
    return "".join('0' if char.isdigit() else char for char in word)
# Placeholder appended to a token that has no following token to form a bigram with.
NULLKEY = "-null-"

In [112]:
# Print the current data / hyper-parameter configuration.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'char_embedding_dim' is not defined", presumably raised
# inside show_data_summary -- verify in utils/data.py.
data.show_data_summary()

DATA SUMMARY START:
     Tag          scheme: BMES
     MAX SENTENCE LENGTH: 250
     MAX   WORD   LENGTH: -1
     Number   normalized: True
     Use          bigram: True
     Word  alphabet size: 582
     Biword alphabet size: 1800
     Char  alphabet size: 582
     Gaz   alphabet size: 984
     Label alphabet size: 5
     Word embedding size: 50
     Biword embedding size: 50
     Char embedding size: 30
     Gaz embedding size: 50
     Norm     word   emb: True
     Norm     biword emb: True
     Norm     gaz    emb: True
     Norm   gaz  dropout: 0.5
     Train instance number: 15
     Dev   instance number: 2
     Test  instance number: 13
     Raw   instance number: 0
     Hyperpara  iteration: 50
     Hyperpara  batch size: 1
     Hyperpara          lr: 0.01
     Hyperpara    lr_decay: 0.05
     Hyperpara     HP_clip: 5.0
     Hyperpara    momentum: 0
     Hyperpara  hidden_dim: 200
     Hyperpara     dropout: 0.5
     Hyperpara  lstm_layer: 1
     Hyperpara      bilstm: True
 

NameError: name 'char_embedding_dim' is not defined