In [2]:
# Configure how TensorFlow uses GPU memory.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
# allow_growth is enabled instead of the fixed memory fraction commented out above.
config.gpu_options.allow_growth=True 
#config.gpu_options.visible_device_list = "0"
# Hand the configured session to the Keras TensorFlow backend.
set_session(tf.Session(config=config))


import time
import pickle
import numpy as np
from keras import backend as K
from keras import optimizers
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bert_serving.client import BertClient
from bratreader.repomodel import RepoModel



def get_words(doc, bc):
    '''Align the BERT tokens of a document with character spans and labels.

    Every sentence of ``doc`` is encoded with
    ``bc.encode([sentence], show_tokens=True)``. WordPiece continuations
    (tokens starting with ``##``) are merged into the preceding word and
    their vectors are added to that word's vector.

    Args:
        doc: document exposing ``sentences`` (items with a ``line`` string),
            ``text`` and ``getlabelinspan(start, end)``.
        bc: client whose ``encode([sentence], show_tokens=True)`` returns
            ``(embeddings, tokens)``, both indexed ``[sentence][token]``.

    Returns:
        Four parallel lists ``(words, wordsvec, spans, wordslabel)``: the
        merged token strings, their vectors, their ``[start, end]`` offsets
        into ``doc.text`` and, per word, the list of labels found in that
        span (``['NULL']`` when there is none).
    '''
    words = []
    wordsvec = []
    spans = []
    wordslabel = []

    # Tokens are looked up in the lower-cased text (assumes an uncased
    # tokenizer - TODO confirm); lower-case it once, not once per token.
    text_lower = doc.text.lower()

    def labels_for(start, end):
        # Distinct labels found in [start, end], or ['NULL'] when none.
        label = list(set(doc.getlabelinspan(start, end)))
        return label if label else ['NULL']

    for sent in doc.sentences:
        # Embeddings of each sentence/ sequence via BERT.
        vec = bc.encode([sent.line], show_tokens=True)
        for sent_vectors, sent_tokens in zip(vec[0], vec[1]):
            for token, vector in zip(sent_tokens, sent_vectors):
                if token.startswith('##'):
                    # WordPiece continuation: extend the previous word,
                    # accumulate its vector and re-read its labels.
                    words[-1] = words[-1] + token[2:]
                    wordsvec[-1] = wordsvec[-1] + vector
                    start = spans[-1][0]
                    spans[-1] = [start, start + len(words[-1])]
                    wordslabel[-1] = labels_for(spans[-1][0], spans[-1][1])
                    continue

                # End of the previous word: the earliest offset at which
                # this token can start.
                cursor = spans[-1][1] if spans else 0
                words.append(token)
                wordsvec.append(vector)

                if token.startswith('[CLS]') or token.startswith('[SEP]'):
                    # Special markers have no text: zero-width span.
                    spans.append([cursor, cursor])
                    wordslabel.append(['NULL'])
                    continue

                # Search after the previous word so that a repeated token
                # is not matched at the same offsets again.
                start = text_lower.find(token, cursor)
                if start < 0:
                    # Token absent from the text (e.g. '[UNK]'): keep the
                    # cursor in place so later tokens can still be located.
                    spans.append([cursor, cursor])
                    wordslabel.append(['NULL'])
                else:
                    end = start + len(token)
                    spans.append([start, end])
                    wordslabel.append(labels_for(start, end))
    return words, wordsvec, spans, wordslabel


def create_base_network(input_dim, nb_classes):
    '''Build and compile the dense softmax classifier.

    The network stacks three ``Dense(input_dim) -> relu -> Dropout(0.2)``
    stages followed by ``Dense(nb_classes) -> softmax``. It is compiled
    with categorical cross-entropy, an RMSprop optimizer and the
    ``accuracy`` metric.

    Args:
        input_dim: size of one input vector; also the width of every
            hidden layer.
        nb_classes: number of output classes.

    Returns:
        The compiled ``Sequential`` model.
    '''
    n_nodes = input_dim
    dropout_rate = 0.2
    n_hidden_layers = 3

    # RMSprop is the only optimizer the model is compiled with.
    rmsprop = optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)

    model_base = Sequential()
    for layer_idx in range(n_hidden_layers):
        if layer_idx == 0:
            # Only the first layer declares the input shape.
            model_base.add(Dense(n_nodes, input_shape=(input_dim,)))
        else:
            model_base.add(Dense(n_nodes))
        model_base.add(Activation('relu'))
        model_base.add(Dropout(dropout_rate))
    model_base.add(Dense(nb_classes))
    model_base.add(Activation('softmax'))
    model_base.compile(loss='categorical_crossentropy',
                       optimizer=rmsprop,
                       metrics=['accuracy'])
    #model_base.load_weights('model_base.h5')
    return model_base


def model_init(dir_data, name_file, bc):
    '''
    Build (without training) a classifier sized for one annotated file.

    Loads ``name_file`` from the corpus in ``dir_data``, extracts token
    embeddings and labels with ``get_words`` and creates a network whose
    input width is the embedding size and whose output width is the number
    of distinct first labels. Prints the label set and the model summary,
    then returns the model.
    '''
    document = RepoModel(dir_data).documents[name_file]

    _, embeddings, _, label_lists = get_words(document, bc)
    # list of vectors -> array
    embeddings = np.asarray(embeddings)
    # keep only the first label of every word
    label_set = set(labels[0] for labels in label_lists)
    print(label_set)

    network = create_base_network(embeddings[0].shape[0], len(label_set))
    network.summary()
    return network


def fit_on_data(dir_data, name_file, bc):
    '''
    Train a new classifier on one annotated file and return it.

    Loads ``name_file`` from the corpus in ``dir_data``, turns its words
    into embeddings and first labels with ``get_words``, one-hot encodes
    the labels and fits a network from ``create_base_network`` for 4 epochs
    with batch size 4. Prints the sample shape, the label set, the model
    summary and the elapsed training time.
    '''
    document = RepoModel(dir_data).documents[name_file]

    _, embeddings, _, label_lists = get_words(document, bc)
    features = np.asarray(embeddings)
    # keep only the first label of every word
    first_labels = [labels[0] for labels in label_lists]
    label_set = set(first_labels)
    print('samples:', features.shape)
    print('labels:', len(label_set), label_set)

    # string labels -> integer ids -> one-hot rows
    label_encoder = LabelEncoder()
    label_ids = label_encoder.fit(first_labels).transform(first_labels)
    targets = np_utils.to_categorical(label_ids)

    batch_size, n_epochs, verbosity = 4, 4, 1
    network = create_base_network(features.shape[1], len(label_set))
    network.summary()

    # No hold-out split: the same samples are used for fitting and as
    # validation data.
    started = time.time()
    network.fit(features, targets,
                batch_size=batch_size, epochs=n_epochs,
                verbose=verbosity, validation_data=(features, targets))
    finished = time.time()
    print('time elapse training:\t', finished - started, 'sec')
    return network


def probs_on_data_ann(dir_data, name_file, bc, model):
    '''
    Run ``model.predict`` on every word of one annotated file.

    Args:
        dir_data: corpus directory passed to ``RepoModel``.
        name_file: key of the document inside ``corpus.documents``.
        bc: BERT client forwarded to ``get_words``.
        model: object exposing ``predict(x, verbose=...)``.

    Returns:
        The result of ``model.predict`` on the array of word embeddings
        produced by ``get_words`` (one input row per word).
    '''
    corpus = RepoModel(dir_data) # load corpus
    doc = corpus.documents[name_file] # get document with key

    # Prediction only needs the embeddings; the labels are not used here.
    _, wordsvec, _, _ = get_words(doc, bc)
    # wordsvec from list to array
    wordsvec = np.asarray(wordsvec)

    # model testing
    start = time.time()
    probs = model.predict(wordsvec, verbose=1)
    end = time.time()
    print('time elapse predicting:\t', end - start, 'sec')
    return probs


###############################################################################################
## setting up

# Corpus directory handed to RepoModel by the helpers above.
DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"
# NOTE: this second assignment overrides the file name set on the previous line.
NAME_FILE = 'text_1'
BERT_CLIENT = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True)# bert model as service
# Build (but do not train) a model sized for the selected file.
model = model_init(DIR_DATA, NAME_FILE, BERT_CLIENT)

server config:
                        client	=	6659ea0b-c5f1-4447-9089-53bf28e041ec
                   num_process	=	2                             
          ventilator -> worker	=	['ipc://tmpGcaEYy/socket', 'ipc://tmpBc6wgh/socket', 'ipc://tmp4JeryZ/socket', 'ipc://tmpjYTlQH/socket', 'ipc://tmpc62g8p/socket', 'ipc://tmplCZcq8/socket', 'ipc://tmpCIt9HQ/socket', 'ipc://tmpfhu6Zy/socket']
                worker -> sink	=	ipc://tmpiqZ5KQ/socket        
           ventilator <-> sink	=	ipc://tmpPDiMGQ/socket        
           server_current_time	=	2019-08-07 16:26:17.541847    
                     statistic	=	{'num_data_request': 16, 'num_total_seq': 16, 'num_sys_request': 11, 'num_total_request': 27, 'num_total_client': 11, 'num_active_client': 6, 'avg_request_per_client': 2.4545454545454546, 'min_request_per_client': 1, 'max_request_per_client': 3, 'num_min_request_per_client': 2, 'num_max_request_per_client': 7, 'avg_size_per_request': 1.0, 'min_size_per_request': 1, 'max_size_per_re

In [2]:
DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"
# Train on the selected file, then predict on that same file with the trained model.
MODEL_Trigger = fit_on_data(dir_data=DIR_DATA, name_file=NAME_FILE, bc=BERT_CLIENT)
probs = probs_on_data_ann(dir_data=DIR_DATA, name_file=NAME_FILE, bc=BERT_CLIENT, model=MODEL_Trigger)
print(probs.shape)

samples: (381, 768)
labels: 4 {'Company', 'QuarterlyResults', 'SalesVolume', 'NULL'}
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_5 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_6 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 768)               0         
_________________________________________________________

In [None]:
# event trigger

import time
import numpy as np
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"

corpus = RepoModel(DIR_DATA) # load corpus
doc = corpus.documents[NAME_FILE] # get document with key
bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True) # bert model as service

print(doc)

# Word strings, embeddings, character spans and label lists of the document,
# produced by the get_words helper defined in this file.
words, wordsvec, spans, wordslabel = get_words(doc, bc)

# wordsvec from list to array
wordsvec = np.asarray(wordsvec)

# label encoder
# keep only the first label of every word
wordslabel = [label[0] for label in wordslabel]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(wordslabel)
Y_encoder = encoder.transform(wordslabel)
# convert integers to dummy variables (i.e. one hot encoded)
Y_encoder = np_utils.to_categorical(Y_encoder)

# Hold out part of the samples for validation (fixed random_state for repeatability).
X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)

# model define
N_batch = 1
N_epoch = 4
en_verbose = 1
input_dim = wordsvec.shape[1]
N_classes = len(set(wordslabel))

model = create_base_network(input_dim, N_classes)
model.summary()

# model training
print('='*65,'\n>>training')
start   = time.time()
history = model.fit(X_train, Y_train,
                    batch_size=N_batch, epochs=N_epoch,
                    verbose=en_verbose, validation_data=(X_test, Y_test))
end     = time.time()
print('time elapse training:\t', end - start, 'sec') 

# model test
print('='*65,'\n>>testing')
# NOTE: this result is overwritten by the prediction on the full data below.
probs = model.predict(X_test, verbose=1)

# model eval
print('='*65,'\n>>evaluating')
# Predict and evaluate on ALL samples (training part included), not only the hold-out split.
probs = model.predict(wordsvec, verbose=1)
#Returns the loss value & metrics values for the model in test mode.
[loss, metrics] = model.evaluate(x=wordsvec, y=Y_encoder, verbose=1)
print('loss : ', loss)
print(model.metrics[0], ':', metrics)

In [None]:
# error analysis and Retrain
# Compare predicted and true class ids; a non-zero difference marks a misclassified word.
label_pred =  np.argmax(probs, axis=1)
label_true = np.argmax(Y_encoder, axis=1)
predict_diff = abs(label_pred - label_true)
idx_diff = np.where(predict_diff>0)
print(idx_diff[0])
for idx in idx_diff[0]:
    print(idx, '\t', spans[idx], '\t', wordslabel[idx], '\t', words[idx])

# Repeat every misclassified sample Times_wrong times and append the copies
# to the original data before retraining.
Times_wrong =  40
data_wrong = np.tile(wordsvec[idx_diff[0]], (Times_wrong, 1))
label_wrong = np.tile(Y_encoder[idx_diff[0]], (Times_wrong, 1))
data_retrain = np.append(wordsvec, data_wrong, axis = 0)
label_retrain = np.append(Y_encoder, label_wrong, axis =0)
print(data_retrain.shape, label_retrain.shape)



# NOTE(review): the duplicated samples are split after duplication, so copies
# of the same sample can land in both the train and the test part.
X_train, X_test, Y_train, Y_test = train_test_split(data_retrain, label_retrain, random_state=0)

# model define
N_batch = 8
N_epoch = 100
en_verbose = 0
input_dim = wordsvec.shape[1]
N_classes = len(set(wordslabel))

# A new network is created; the previously trained model object is replaced.
model = create_base_network(input_dim, N_classes)
model.summary()

# model training
print('='*65,'\n>>training')
start   = time.time()
history = model.fit(X_train, Y_train,
                    batch_size=N_batch, epochs=N_epoch,
                    verbose=en_verbose, validation_data=(X_test, Y_test))
end     = time.time()
print('time elapse training:\t', end - start, 'sec') 

# model test
print('='*65,'\n>>testing')
# NOTE: this result is overwritten by the prediction on the original data below.
probs = model.predict(X_test, verbose=1)

# model eval
print('='*65,'\n>>evaluating')
# Evaluate on the original (non-duplicated) samples.
probs = model.predict(wordsvec, verbose=1)
#Returns the loss value & metrics values for the model in test mode.
[loss, metrics] = model.evaluate(x=wordsvec, y=Y_encoder, verbose=1)
print('loss : ', loss)
print(model.metrics[0], ':', metrics)


# error analysis
# List the words that are still misclassified after retraining.
label_pred = np.argmax(probs, axis=1)
label_true = np.argmax(Y_encoder, axis=1)
predict_diff = abs(label_pred - label_true)
idx_diff = np.where(predict_diff>0)
for idx in idx_diff[0]:
    print(idx, '\t', spans[idx], '\t', wordslabel[idx], '\t', words[idx])

In [2]:
def get_embdinspan(embds, spans, wordslabel, span):
    """Return the embeddings and labels of the words covered by ``span``.

    The first word is the first entry of ``spans`` whose inclusive
    ``[start, end]`` range contains ``span[0]``; the last word is found the
    same way for ``span[1]`` and is included in the result.

    NOTE: when no entry contains an endpoint, the search runs to the end
    and the last index of ``spans`` is used for that endpoint.

    Returns:
        ``(embdsin, labelsin)``: two parallel lists taken from ``embds``
        and ``wordslabel``.
    """
    # index of the first word whose range contains the span start
    for first, word_span in enumerate(spans):
        if word_span[0] <= span[0] <= word_span[1]:
            break
    # index of the first word whose range contains the span end
    for last, word_span in enumerate(spans):
        if word_span[0] <= span[1] <= word_span[1]:
            break

    covered = range(first, last + 1)
    embdsin = [embds[i] for i in covered]
    labelsin = [wordslabel[i] for i in covered]
    return embdsin, labelsin

def get_embdoutspan(embds, spans, wordslabel, span):
    """Return the embeddings and labels of the words NOT covered by ``span``.

    The covered range is located like in ``get_embdinspan``: it runs from
    the first entry of ``spans`` whose inclusive ``[start, end]`` range
    contains ``span[0]`` up to and including the first entry containing
    ``span[1]``. Everything before and after that range is returned.

    NOTE: when no entry contains an endpoint, the last index of ``spans``
    is used for that endpoint.

    Returns:
        ``(embdsout, labelsout)``: two parallel lists taken from ``embds``
        and ``wordslabel``.
    """
    if not spans:
        # Nothing can be inside the span, so every word is outside.
        n_words = len(wordslabel)
        return [embds[i] for i in range(n_words)], list(wordslabel)

    for idxs in range(len(spans)):
        s = spans[idxs]
        if s[0] <= span[0] <= s[1]:
            break
    for idxe in range(len(spans)):
        s = spans[idxe]
        if s[0] <= span[1] <= s[1]:
            break

    # idxe is the last word INSIDE the span, so the tail starts one past it.
    outside = list(range(0, idxs)) + list(range(idxe + 1, len(wordslabel)))
    embdsout = [embds[idx] for idx in outside]
    labelsout = [wordslabel[idx] for idx in outside]
    return embdsout, labelsout

def get_embdintype(embds, spans, wordslabel, labelType='NULL'):
    """Select the embeddings and labels of the words labelled ``labelType``.

    Only the first ``len(spans)`` positions are inspected; ``spans`` is not
    used for anything else.

    Returns:
        ``(embeddings, labels)``: two parallel lists holding the entries of
        ``embds`` and ``wordslabel`` whose label equals ``labelType``.
    """
    matching = [i for i in range(len(spans)) if wordslabel[i] == labelType]
    selected_embds = [embds[i] for i in matching]
    selected_labels = [wordslabel[i] for i in matching]
    return selected_embds, selected_labels


def get_events(doc, bc):
    '''Collect embedding samples for event triggers, arguments and NULL words.

    The document is tokenised with ``get_words`` and every word is reduced
    to its first label. For each event in ``doc.events`` the embeddings of
    the words inside the trigger span are stored with the event's trigger
    label, and the embeddings of the words inside each argument span are
    stored with that argument's label. The words labelled ``'NULL'`` are
    returned separately.

    Args:
        doc: document exposing ``events`` plus what ``get_words`` needs;
            every event exposes ``trigger_spans``, ``trigger_label``,
            ``args``, ``args_spans`` and ``args_labels``.
        bc: BERT client forwarded to ``get_words``.

    Returns:
        ``(triggers, triggers_embedding, triggers_labels, args,
        args_embedding, args_labels, embdsNULL, labelsNULL)``.
        ``triggers`` and ``args`` are always empty lists; only embeddings
        and labels are collected.
    '''
    words, wordsvec, spans, wordslabel = get_words(doc, bc)
    # keep only the first label of every word
    wordslabel = [label[0] for label in wordslabel]
    #print('labels:', len(set(wordslabel)), set(wordslabel))

    triggers, triggers_embedding, triggers_labels = [], [], []
    args, args_embedding, args_labels = [], [], []

    for event in doc.events:
        # One sample per word inside the trigger span.
        embdsin, _ = get_embdinspan(wordsvec, spans, wordslabel, event.trigger_spans)
        triggers_embedding.extend(embdsin)
        triggers_labels.extend([event.trigger_label] * len(embdsin))

        # One sample per word inside every argument span.
        for idx in range(len(event.args)):
            span = event.args_spans[idx]
            label = event.args_labels[idx]
            embdsin, _ = get_embdinspan(wordsvec, spans, wordslabel, span)
            args_embedding.extend(embdsin)
            args_labels.extend([label] * len(embdsin))

    # The NULL words do not depend on the events: select them once, which
    # also keeps the result defined for a document without events.
    embdsNULL, labelsNULL = get_embdintype(wordsvec, spans, wordslabel)

    return triggers, triggers_embedding, triggers_labels, args, args_embedding, args_labels, embdsNULL, labelsNULL


# event arguments extraction

import time
import numpy as np
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "text_2"
#NAME_FILE = "agm_briefing_unilever_11-05-2005"

corpus = RepoModel(DIR_DATA) # load corpus
doc = corpus.documents[NAME_FILE] # get document with key
bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=False) # bert model as service

triggers, triggers_embedding, triggers_labels, args, args_embedding, args_labels, embdsNULL, labelsNULL = get_events(doc, bc)
#print(triggers_labels, args_labels, labelsNULL)

# Trigger data set: trigger samples followed by the NULL samples.
# NOTE: wordsvec / wordslabel alias triggers_embedding / triggers_labels,
# so the appends below also grow those lists.
words, wordsvec, wordslabel = triggers, triggers_embedding, triggers_labels
for idx_temp in range(len(labelsNULL)):
    #triggers.append(event.trigger)
    wordsvec.append(embdsNULL[idx_temp])
    wordslabel.append(labelsNULL[idx_temp])

# wordsvec from list to array
wordsvec = np.asarray(wordsvec)
#print(wordsvec.shape, wordslabel)

# label encoder
# NOTE(review): get_events already reduces word labels to their first
# element, so for string labels this keeps only the first character
# (presumably 'NULL' becomes 'N') - confirm this is intended.
wordslabel = [label[0] for label in wordslabel]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(wordslabel)
Y_encoder = encoder.transform(wordslabel)
# convert integers to dummy variables (i.e. one hot encoded)
Y_encoder = np_utils.to_categorical(Y_encoder)

X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)

# model define
N_batch = 4
N_epoch = 4
en_verbose = 1
input_dim = wordsvec.shape[1]
N_classes = len(set(wordslabel))

model = create_base_network(input_dim, N_classes)
model.summary()
# Persist the network dimensions so the model can be rebuilt before loading weights.
with open('./model.pkl', 'wb') as f:
    pickle.dump([input_dim, N_classes], f, protocol=2)

# model training
print('='*65,'\n>>training')
start   = time.time()
history = model.fit(X_train, Y_train,
                    batch_size=N_batch, epochs=N_epoch,
                    verbose=en_verbose, validation_data=(X_test, Y_test))
end     = time.time()
print('time elapse training:\t', end - start, 'sec')

# model test
print('='*65,'\n>>testing')
probs = model.predict(X_test, verbose=1)

# model eval
print('='*65,'\n>>evaluating')
#Returns the loss value & metrics values for the model in test mode.
[loss, metrics] = model.evaluate(x=X_test, y=Y_test, verbose=1)
print('loss : ', loss)
print(model.metrics[0], ':', metrics)

# Save the trained trigger classifier weights.
model.save_weights('model_trigger.h5')






# Argument data set: argument samples followed by the NULL samples.
# NOTE: wordsvec / wordslabel alias args_embedding / args_labels, so the
# appends below also grow those lists.
words, wordsvec, wordslabel = args, args_embedding, args_labels
for idx_temp in range(len(labelsNULL)):
    #triggers.append(event.trigger)
    wordsvec.append(embdsNULL[idx_temp])
    wordslabel.append(labelsNULL[idx_temp])

# wordsvec from list to array
wordsvec = np.asarray(wordsvec)
print(wordsvec.shape)

# label encoder
# NOTE(review): for string labels this keeps only the first character of
# each label - confirm this is intended.
wordslabel = [label[0] for label in wordslabel]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(wordslabel)
Y_encoder = encoder.transform(wordslabel)
# convert integers to dummy variables (i.e. one hot encoded)
Y_encoder = np_utils.to_categorical(Y_encoder)

X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)

# model define
N_batch = 4
N_epoch = 4
en_verbose = 1
input_dim = wordsvec.shape[1]
N_classes = len(set(wordslabel))

# NOTE(review): unlike the trigger model, these dimensions are not written
# to './model.pkl' - confirm the argument model has the same class count
# wherever its weights are reloaded.
model = create_base_network(input_dim, N_classes)
model.summary()

# model training
print('='*65,'\n>>training')
start   = time.time()
history = model.fit(X_train, Y_train,
                    batch_size=N_batch, epochs=N_epoch,
                    verbose=en_verbose, validation_data=(X_test, Y_test))
end     = time.time()
print('time elapse training:\t', end - start, 'sec')

# model test
print('='*65,'\n>>testing')
probs = model.predict(X_test, verbose=1)

# model eval
print('='*65,'\n>>evaluating')
#Returns the loss value & metrics values for the model in test mode.
[loss, metrics] = model.evaluate(x=X_test, y=Y_test, verbose=1)
print('loss : ', loss)
print(model.metrics[0], ':', metrics)

# Save the trained argument classifier weights.
model.save_weights('model_arg.h5')



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_5 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_6 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 768)               590592    
__________

In [3]:
#test

# Configure how TensorFlow uses GPU memory.
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
# allow_growth is enabled instead of the fixed memory fraction commented out above.
config.gpu_options.allow_growth=True 
#config.gpu_options.visible_device_list = "0"
# Hand the configured session to the Keras TensorFlow backend.
set_session(tf.Session(config=config))


import time
import pickle
import numpy as np
from keras import backend as K
from keras import optimizers
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bert_serving.client import BertClient
from bratreader.repomodel import RepoModel



# Raw Chinese sample paragraph that the following statements send to the BERT service.
text = '中广网北京11月15日消息 (记者陈欣)据中国之声《央广新闻》报道，经中国和巴基斯坦两军协商同意，“友谊－2011”中巴反恐联合训练于昨天(14日)起在巴基斯坦举行。一时间，印度媒体热炒这一话题，《印度时报》报道称，此次中巴军演是为了加强沙漠战能力，在边境地区向印度施加压力。事实果真如此吗？中巴军演的目标是针对印度吗，印度媒体为何会如此解读，下面中国之声连线中央台军事记者陈欣。　　主持人：对于印度媒体热炒中巴联合军演，你有什么样的观点和评价？　　记者：首先，正像中国国防部回应所说，这次中巴联演是中巴两军的年度交流计划，并不是刻意针对某个第三国进行的。这次的代号是友谊2011，实际上此前已经有了友谊2004、友谊2006、友谊2010三次联演，分别在中巴举行，目的也很明确就是提升中巴两军的反恐作战能力。　　中国要面对东突等分裂的恐怖势力，巴基斯坦更是要应对基地和塔利班的组织。一方面，两军确实需要互相学习的，我军近年来在信息化、现代化上有很大进步可以与巴方进行交流，同时巴基斯坦参与反恐作战很多丰富的实践经验，有许多甚至是用血的代价换回来的实战经验，实战取得的经验是非常值得我军学习的。　　另一方面，反恐作战往往是依靠单独一个国家是无法完成的。很多恐怖势力的互相的联系中，中国和巴基斯坦很多恐怖势力是有彼此的联系的，需要两军联合作战。印度媒体的猜测我觉得是没有道理的，首先，从这次演习派出的人员的数量、装备包括这次科目可以看出，典型的针对防控作战需求而不是针对一个国家的。　　印度常说自己是世界大国，但如果中巴两军派出200多人，各自派出200多人的演习，印度媒体就觉得是针对它们，我觉得有点反应过渡，要么是它们信心不足，要么是它们借题发挥。印度媒体还是应该更多的尊重和互信，而不是过分的炒作对方对自己的威胁。'

bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, 
                show_server_config=False)
# The whole paragraph is sent as a single sequence; with show_tokens=True
# the result is indexed as vec[0] (embeddings) and vec[1] (tokens) below.
vec = bc.encode([text],
                show_tokens=True)
print(vec[0].shape, vec[1])
for idx_sentence in range(len(vec[1])):
    print('\n', vec[1][idx_sentence])
    #for idx_token in range(len(vec[1][idx_sentence])):
        #print(vec[1][idx_sentence][idx_token],'\t', vec[0][idx_sentence][idx_token][0:5])


# NOTE: relies on idx_sentence keeping its last value after the loop above,
# i.e. the embeddings of the last encoded sequence are used.
X_test = vec[0][idx_sentence]
#print(X_test.shape, X_test)


# Network dimensions ([input_dim, N_classes]) written by the training cell.
# NOTE: pickle must only be loaded from trusted files; './model.pkl' is
# expected to be the file produced by this notebook.
with open('./model.pkl', 'rb') as f:
    structure = pickle.load(f)
    input_dim, N_classes = structure[0], structure[1]


model = create_base_network(input_dim, N_classes)

model.load_weights('model_trigger.h5')
# model test
print('='*65,'\n>>testing')
probs = model.predict(X_test, verbose=1)
# One predicted class id per input row; without axis=1 argmax would return
# a single index into the flattened probability matrix.
print(np.argmax(probs, axis=1))



# NOTE(review): the same network (built from the dimensions in model.pkl)
# is reused for the argument weights - confirm both models have the same
# number of classes.
model.load_weights('model_arg.h5')
# model test
print('='*65,'\n>>testing')
probs = model.predict(X_test, verbose=1)
print(np.argmax(probs, axis=1))




(1, 128, 768) [['[CLS]', '中', '广', '网', '北', '京', '11', '月', '15', '日', '消', '息', '(', '记', '者', '陈', '欣', ')', '据', '中', '国', '之', '声', '《', '央', '广', '新', '闻', '》', '报', '道', '，', '经', '中', '国', '和', '巴', '基', '斯', '坦', '两', '军', '协', '商', '同', '意', '，', '[UNK]', '友', '谊', '－', '2011', '[UNK]', '中', '巴', '反', '恐', '联', '合', '训', '练', '于', '昨', '天', '(', '14', '日', ')', '起', '在', '巴', '基', '斯', '坦', '举', '行', '。', '一', '时', '间', '，', '印', '度', '媒', '体', '热', '炒', '这', '一', '话', '题', '，', '《', '印', '度', '时', '报', '》', '报', '道', '称', '，', '此', '次', '中', '巴', '军', '演', '是', '为', '了', '加', '强', '沙', '漠', '战', '能', '力', '，', '在', '边', '境', '地', '区', '向', '印', '度', '[SEP]']]

 ['[CLS]', '中', '广', '网', '北', '京', '11', '月', '15', '日', '消', '息', '(', '记', '者', '陈', '欣', ')', '据', '中', '国', '之', '声', '《', '央', '广', '新', '闻', '》', '报', '道', '，', '经', '中', '国', '和', '巴', '基', '斯', '坦', '两', '军', '协', '商', '同', '意', '，', '[UNK]', '友', '谊', '－', '2011', '[UNK]', '中', '巴', '反', '恐', '联', '合', '训', '

In [None]:

import os

# Locations of the Stanford NLP jars and model archives.
path_standford = '/home/linbo/workspace/Datasets/Standford-coreNLP/'
path_segmenter = path_standford + 'stanford-segmenter-2018-10-16/stanford-segmenter.jar'
path_postagger = path_standford + 'stanford-postagger-full-2018-10-16/stanford-postagger.jar'
path_ner = path_standford + 'stanford-ner-2018-10-16/stanford-ner.jar'
path_parser = path_standford + 'stanford-parser-full-2018-10-17/stanford-parser.jar'
path_parser_model = path_standford + 'stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'
# No trailing separator here: this value is also exported as STANFORD_PARSER.
path_corenlp = path_standford + 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar'
path_model = path_standford + 'stanford-english-corenlp-2018-10-05-models.jar'
#path_model = path_standford + 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2-models.jar'
path_api = path_standford + 'stanford-corenlp-full-2018-10-05/slf4j-api.jar'

# Start from the entries already present in the environment: Python does
# not expand a literal "$CLASSPATH" inside a string.
_classpath_entries = [
    _entry for _entry in os.environ.get('CLASSPATH', '').split(os.pathsep) if _entry
]
# Append each jar once, so re-running this cell does not duplicate entries.
for _jar in (path_segmenter, path_postagger, path_ner, path_parser,
             path_parser_model, path_corenlp, path_model, path_api):
    if _jar not in _classpath_entries:
        _classpath_entries.append(_jar)
CLASSPATH = os.pathsep.join(_classpath_entries)

print(CLASSPATH)

os.environ["CLASSPATH"] = CLASSPATH
os.environ['STANFORD_PARSER'] = path_corenlp
os.environ['STANFORD_MODELS'] = path_model

In [None]:
# Sample sentence for a quick check of the Stanford tokenizer wrapper.
sent = "Kalla, it\'s a dog!"

from nltk.tokenize.stanford import StanfordTokenizer

# No arguments are passed; presumably the jar is located through the
# environment variables set earlier - verify.
tokenizer = StanfordTokenizer()
print(tokenizer.tokenize(sent))

In [None]:

from nltk.parse.stanford import StanfordParser

class MyParser(StanfordParser):
    def raw_parse_sents(self, sentences, verbose=False):
        """
        Parse several raw sentences with a single parser invocation.

        The sentences are joined with newlines and passed to the parser
        with ``-sentences newline``. The requested output format is `penn`.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :param verbose: accepted for signature compatibility; it is not
            used, ``True`` is always passed to ``_execute``
        :rtype: iter(iter(Tree))
        """
        # conll, conll2007 and penn are the output formats noted for -outputFormat
        parser_options = [
            '-model', self.model_path,
            '-outputFormat', 'penn',
            '-sentences', 'newline',
        ]
        command = [self._MAIN_CLASS] + parser_options
        joined_input = '\n'.join(sentences)
        raw_output = self._execute(command, joined_input, True)
        return self._parse_trees_output(raw_output)
# English PCFG model located below the Stanford base directory.
myparser = MyParser(model_path= path_standford + 'stanford-english-corenlp-2018-10-05-models/' 
                    + "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

# Set to a truthy value to call draw() on every parsed tree.
en_GUI = 0
sent = "the quick brown fox jumps over the \" lazy \" dog ."
print(sent)
# The same sentence is parsed twice to exercise the multi-sentence call.
res = list(myparser.raw_parse_sents([sent, sent]))
for row in res:
    for t in row:
        print(type(t),'\n',t)
        if  en_GUI:
            t.draw()

In [None]:
from nltk.corpus import BracketParseCorpusReader

# Read bracketed parse trees from ./data/temp.txt and print each of them.
reader = BracketParseCorpusReader("./data/", "temp.txt")
for sent in reader.parsed_sents():
    print(sent)
# Keep the last tree read (sent still holds the final loop value).
t = sent
# Print the subtrees of height 2 (presumably the tag-over-word nodes - verify).
for s in t.subtrees(lambda t: t.height() == 2): print(s)


from nltk.tree import Tree

# Same height-2 filter on a tree built from a literal bracketed string.
t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
for s in t.subtrees(lambda t: t.height() == 2): print(s)

In [None]:
from collections import OrderedDict
from nltk.tree import Tree
from nltk.parse.stanford import StanfordParser

class MyParser(StanfordParser):
    def raw_parse_sents(self, sentences, verbose=False):
        """
        Run the parser once over a batch of raw sentences.

        Each sentence is placed on its own line of the parser input
        (``-sentences newline``) and the trees are requested in `penn`
        output format.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :param verbose: kept so the signature matches callers; it is
            ignored and ``True`` is always forwarded to ``_execute``
        :rtype: iter(iter(Tree))
        """
        newline_separated = '\n'.join(sentences)
        # conll, conll2007 and penn are the output formats noted for -outputFormat
        cmd = [self._MAIN_CLASS]
        cmd += ['-model', self.model_path]
        cmd += ['-outputFormat', 'penn']
        cmd += ['-sentences', 'newline']
        parser_stdout = self._execute(cmd, newline_separated, True)
        return self._parse_trees_output(parser_stdout)
# English PCFG model located below the Stanford base directory
# (path_standford is defined in the CLASSPATH setup cell).
myparser = MyParser(model_path= path_standford + 'stanford-english-corenlp-2018-10-05-models/' 
                    + "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

def load_tags(file_tags):
    """Read a tag list (one tag per line) into an ordered ``tag -> id`` map.

    Ids are assigned in order of first appearance, starting at 0. Lines are
    stripped of surrounding whitespace; blank lines are skipped.

    Args:
        file_tags: path of a UTF-8 text file with one tag per line.

    Returns:
        An ``OrderedDict`` mapping every distinct tag to its integer id.
    """
    tags = OrderedDict()
    with open(file_tags, encoding='utf-8') as ft:
        for line in ft:
            tag = line.strip()
            # A repeated tag keeps its first id: re-assigning len(tags)
            # would give it the same id as the next new tag.
            if tag and tag not in tags:
                tags[tag] = len(tags)
    return tags

# Tag vocabulary: one tag per line in tags.csv, mapped to integer ids.
tags = load_tags('tags.csv')

sent = "the quick brown fox jumps over the \" lazy \" dog ."
print(sent)
# Each sentence is prefixed with a number before parsing.
res = list(myparser.raw_parse_sents(['1 ' + sent, '2 ' + sent]))
for row in res:
    for t in row: 
        # Map the first child of every height-2 subtree to the id of that
        # subtree's label; a label missing from tags raises KeyError.
        x = {s[0]:tags[s.label()] for s in t.subtrees(lambda t: t.height() == 2)}
        print(x)