In [4]:
import time
import numpy as np
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bert_serving.client import BertClient
from bratreader.repomodel import RepoModel

# 配置tensorflow利用显存方式
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth=True 
#config.gpu_options.visible_device_list = "0"
set_session(tf.Session(config=config))


def create_base_network(input_dim, nb_classes):
    '''Base network to be shared (eq. to feature extraction).
    '''
    N_nodes = input_dim
    r_droupout = 0.2
    model_base = Sequential()
    model_base.add(Dense(N_nodes, input_shape=(input_dim,)))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(N_nodes))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(N_nodes))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(nb_classes))
    model_base.add(Activation('softmax'))
    model_base.compile(loss='categorical_crossentropy',
                       optimizer=RMSprop(),
                       metrics=['accuracy'])
    #model_base.load_weights('model_base.h5')
    return model_base

def words_vec_label(doc, bc): 
    '''get: words, embedding, spans, labels
    '''    
    words = []
    wordsvec = []
    spans = []
    wordslabel = []
    
    for str_sent in doc.text.splitlines():
        
        # Embeddings of each sentence/ sequence via BERT.
        vec = bc.encode([str_sent], show_tokens=True)
        print(type(vec))
        for idx_sentence in range(len(vec[1])):
            #print('\n',vec[1][idx_sentence])
            for idx_token in range(len(vec[1][idx_sentence])):
                #print(vec[1][idx_sentence][idx_token],'\t', vec[0][idx_sentence][idx_token][0:5])
                
                if( vec[1][idx_sentence][idx_token].find('[CLS]', 0, 5)==0 ):
                    # [CLS]
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])
                    if len(spans)>0:
                        spans.append([spans[-1][1],spans[-1][1]])
                    else:
                        spans.append([0,0])
                    wordslabel.append(['NULL'])
                elif( vec[1][idx_sentence][idx_token].find('[SEP]', 0, 5)==0 ):
                    # [SEP]
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])
                    if len(spans)>0:
                        spans.append([spans[-1][1],spans[-1][1]])
                    else:
                        spans.append([0,0])
                    wordslabel.append(['NULL'])
                elif( vec[1][idx_sentence][idx_token].find('##', 0, 2)<0 ):
                    # Token in BERT table
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])                      
                    start = doc.text.lower().find(words[-1], spans[-1][0])
                    end = start + len(words[-1])
                    spans.append([start, end])
                    label = list(set(doc.getlabelinspan(start, end)))
                    if len(str(label))>2:
                        wordslabel.append(label)
                    else:
                        wordslabel.append(['NULL'])                    
                else:
                    # Token started with '##' in BERT
                    words[-1] = words[-1] + vec[1][idx_sentence][idx_token][2:]
                    wordsvec[-1] = wordsvec[-1] + vec[0][idx_sentence][idx_token][0:]
                    spans[-1] = ([spans[-1][0], spans[-1][0]+len(words[-1])])
                    label = list(set(doc.getlabelinspan(spans[-1][0], spans[-1][1])))
                    if len(str(label))>2:
                        wordslabel[-1] = label
                    else:
                        wordslabel[-1] = ['NULL']
                #print(spans[-1], wordslabel[-1], words[-1], wordsvec[-1])
    return words, wordsvec, spans, wordslabel

DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"
def fit_on_data(dir_data=DIR_DATA, name_file=NAME_FILE):
    '''
    fit the model on given file with annotation 
    '''
    corpus = RepoModel(dir_data) # load corpus
    doc = corpus.documents[name_file] # get document with key
    bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True) # bert model as service
    
    words, wordsvec, spans, wordslabel = words_vec_label(doc, bc)
    
    # wordsvec from list to array
    wordsvec = np.asarray(wordsvec)
    
    # label encoder
    wordslabel = [label[0] for label in wordslabel]
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(wordslabel)
    Y_encoder = encoder.transform(wordslabel)
    # convert integers to dummy variables (i.e. one hot encoded)
    Y_encoder = np_utils.to_categorical(Y_encoder)
    
    #X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)
    X_train, X_test, Y_train, Y_test  = wordsvec, wordsvec, Y_encoder, Y_encoder
    
    # model define
    N_batch = 4
    N_epoch = 4
    en_verbose = 1
    input_dim = wordsvec.shape[1]
    N_classes = len(set(wordslabel))
    
    model = create_base_network(X_train[0].shape[0], len(np.unique(wordslabel)))
    model.summary()
    
    # model training
    start   = time.time()
    history = model.fit(X_train, Y_train,
                        batch_size=N_batch, epochs=N_epoch,
                        verbose=en_verbose, validation_data=(X_test, Y_test))
    end     = time.time()
    print('time elapse training:\t', end - start, 'sec')
    return model

def probs_on_data_ann(dir_data=DIR_DATA, name_file=NAME_FILE):
    '''
    test the model on given file with annotation 
    '''
    # model test
    corpus = RepoModel(dir_data) # load corpus
    doc = corpus.documents[name_file] # get document with key
    bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True) # bert model as service
    
    words, wordsvec, spans, wordslabel = words_vec_label(doc, bc)
    
    # wordsvec from list to array
    wordsvec = np.asarray(wordsvec)
    
    # label encoder
    wordslabel = [label[0] for label in wordslabel]
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(wordslabel)
    Y_encoder = encoder.transform(wordslabel)
    # convert integers to dummy variables (i.e. one hot encoded)
    Y_encoder = np_utils.to_categorical(Y_encoder)
    
    #X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)
    X_train, X_test, Y_train, Y_test  = wordsvec, wordsvec, Y_encoder, Y_encoder
    
    # model define
    N_batch = 4
    N_epoch = 4
    en_verbose = 1
    input_dim = wordsvec.shape[1]
    N_classes = len(set(wordslabel))
    
    model = create_base_network(X_train[0].shape[0], len(np.unique(wordslabel)))
    model.summary()
    
    # model training
    start   = time.time()
    probs = model.predict(X_test, verbose=1)
    end     = time.time()
    print('time elapse training:\t', end - start, 'sec')
    return probs

In [5]:
DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"
fit_on_data(dir_data=DIR_DATA, name_file=NAME_FILE)
probs_on_data_ann(dir_data=DIR_DATA, name_file=NAME_FILE)

server config:
                        client	=	94b3be19-1e3a-412c-98c4-faaba292632d
                   num_process	=	2                             
          ventilator -> worker	=	['ipc://tmpRqVyMN/socket', 'ipc://tmpH2y51z/socket', 'ipc://tmp97xChm/socket', 'ipc://tmpniZ9w8/socket', 'ipc://tmpRR3HMU/socket', 'ipc://tmp5jwg2G/socket', 'ipc://tmpl1GPht/socket', 'ipc://tmp3zvpxf/socket']
                worker -> sink	=	ipc://tmpDQBp4U/socket        
           ventilator <-> sink	=	ipc://tmpvpP2w1/socket        
           server_current_time	=	2019-03-18 08:42:29.300647    
                     statistic	=	{'num_data_request': 1, 'num_total_seq': 1, 'num_sys_request': 2, 'num_total_request': 3, 'num_total_client': 2, 'num_active_client': 1, 'avg_request_per_client': 1.5, 'min_request_per_client': 1, 'max_request_per_client': 2, 'num_min_request_per_client': 1, 'num_max_request_per_client': 1, 'avg_size_per_request': 1.0, 'min_size_per_request': 1, 'max_size_per_request': 1, 'num_min_

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_5 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 768)               590592    
_________________________________________________________________
activation_6 (Activation)    (None, 768)               0         
_________________________________________________________________
dropout_5 (Dropout)          (No

array([[0.24751453, 0.23398344, 0.22957015, 0.28893185],
       [0.2055532 , 0.13576895, 0.38066113, 0.2780167 ],
       [0.18024075, 0.11403263, 0.35785243, 0.3478742 ],
       ...,
       [0.20988485, 0.1656967 , 0.2577377 , 0.36668074],
       [0.26215678, 0.06955403, 0.19814733, 0.4701419 ],
       [0.24212164, 0.24264325, 0.24866417, 0.26657093]], dtype=float32)

In [None]:
# 配置tensorflow利用显存方式
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0.3
config.gpu_options.allow_growth=True 
#config.gpu_options.visible_device_list = "0"
set_session(tf.Session(config=config))

import matplotlib.pyplot as plt
from bert_serving.client import BertClient
from bratreader.repomodel import RepoModel


def create_base_network(input_dim, nb_classes):
    '''Base network to be shared (eq. to feature extraction).
    '''
    N_nodes = input_dim
    r_droupout = 0.2
    model_base = Sequential()
    model_base.add(Dense(N_nodes, input_shape=(input_dim,)))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(N_nodes))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(N_nodes))
    model_base.add(Activation('relu'))
    model_base.add(Dropout(r_droupout))
    model_base.add(Dense(nb_classes))
    model_base.add(Activation('softmax'))
    model_base.compile(loss='categorical_crossentropy',
                       optimizer=RMSprop(),
                       metrics=['accuracy'])
    #model_base.load_weights('model_base.h5')    
    return model_base

def words_vec_label(doc, bc): 
    '''get: words, embedding, spans, labels
    '''    
    words = []
    wordsvec = []
    spans = []
    wordslabel = []
    
    for str_sent in doc.text.splitlines():
        
        # Embeddings of each sentence/ sequence via BERT.
        vec = bc.encode([str_sent], show_tokens=True)
        for idx_sentence in range(len(vec[1])):
            #print('\n',vec[1][idx_sentence])
            for idx_token in range(len(vec[1][idx_sentence])):
                #print(vec[1][idx_sentence][idx_token],'\t', vec[0][idx_sentence][idx_token][0:5])
                
                if( vec[1][idx_sentence][idx_token].find('[CLS]', 0, 5)==0 ):
                    # [CLS]
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])
                    if len(spans)>0:
                        spans.append([spans[-1][1],spans[-1][1]])
                    else:
                        spans.append([0,0])
                    wordslabel.append(['NULL'])
                elif( vec[1][idx_sentence][idx_token].find('[SEP]', 0, 5)==0 ):
                    # [SEP]
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])
                    if len(spans)>0:
                        spans.append([spans[-1][1],spans[-1][1]])
                    else:
                        spans.append([0,0])
                    wordslabel.append(['NULL'])
                elif( vec[1][idx_sentence][idx_token].find('##', 0, 2)<0 ):
                    # Token in BERT table
                    words.append(vec[1][idx_sentence][idx_token])
                    wordsvec.append(vec[0][idx_sentence][idx_token][0:])                      
                    start = doc.text.lower().find(words[-1], spans[-1][0])
                    end = start + len(words[-1])
                    spans.append([start, end])
                    label = list(set(doc.getlabelinspan(start, end)))
                    if len(str(label))>2:
                        wordslabel.append(label)
                    else:
                        wordslabel.append(['NULL'])                    
                else:
                    # Token started with '##' in BERT
                    words[-1] = words[-1] + vec[1][idx_sentence][idx_token][2:]
                    wordsvec[-1] = wordsvec[-1] + vec[0][idx_sentence][idx_token][0:]
                    spans[-1] = ([spans[-1][0], spans[-1][0]+len(words[-1])])
                    label = list(set(doc.getlabelinspan(spans[-1][0], spans[-1][1])))
                    if len(str(label))>2:
                        wordslabel[-1] = label
                    else:
                        wordslabel[-1] = ['NULL']
                print(spans[-1], wordslabel[-1], words[-1], wordsvec[-1])
    return words, wordsvec, spans, wordslabel


In [None]:
# dataset generation

import time
import numpy as np
from keras import backend as K
from keras.utils import np_utils
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Lambda, Activation
from keras.optimizers import SGD, Adam, RMSprop
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

DIR_DATA = "./dataset/tmpbratfiles/"
NAME_FILE = "agm_briefing_unilever_11-05-2005"

corpus = RepoModel(DIR_DATA) # load corpus
doc = corpus.documents[NAME_FILE] # get document with key
bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True) # bert model as service

words, wordsvec, spans, wordslabel = words_vec_label(doc, bc)

# wordsvec from list to array
wordsvec = np.asarray(wordsvec)

# label encoder
wordslabel = [label[0] for label in wordslabel]
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(wordslabel)
Y_encoder = encoder.transform(wordslabel)
# convert integers to dummy variables (i.e. one hot encoded)
Y_encoder = np_utils.to_categorical(Y_encoder)

X_train, X_test, Y_train, Y_test = train_test_split(wordsvec, Y_encoder, random_state=0)

# model define
N_batch = 4
N_epoch = 4
en_verbose = 1
input_dim = wordsvec.shape[1]
N_classes = len(set(wordslabel))

model = create_base_network(X_train[0].shape[0], len(np.unique(wordslabel)))
model.summary()

# model training
start   = time.time()
history = model.fit(X_train, Y_train,
                    batch_size=N_batch, epochs=N_epoch,
                    verbose=en_verbose, validation_data=(X_test, Y_test))
end     = time.time()
print('time elapse training:\t', end - start, 'sec') 

# model test
probs = model.predict(X_test, verbose=1)
print(probs)

In [None]:

import os

CLASSPATH = "$CLASSPATH:"
path_standford = '/home/linbo/workspace/Datasets/Standford-coreNLP/'
path_segmenter = path_standford + 'stanford-segmenter-2018-10-16/stanford-segmenter.jar'
CLASSPATH = CLASSPATH + path_segmenter

path_postagger = path_standford + 'stanford-postagger-full-2018-10-16/stanford-postagger.jar'
CLASSPATH = CLASSPATH + ':' + path_postagger

path_ner = path_standford + 'stanford-ner-2018-10-16/stanford-ner.jar'
CLASSPATH = CLASSPATH + ':' + path_ner

path_parser = path_standford + 'stanford-parser-full-2018-10-17/stanford-parser.jar'
CLASSPATH = CLASSPATH + ':' + path_parser

path_parser_model = path_standford + 'stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'
CLASSPATH = CLASSPATH + ':' + path_parser_model

path_corenlp = path_standford + 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar:' 
CLASSPATH = CLASSPATH + ':' + path_corenlp

path_model = path_standford + 'stanford-english-corenlp-2018-10-05-models.jar'
#path_model = path_standford + 'stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2-models.jar'
CLASSPATH = CLASSPATH + ':' + path_model

path_api = path_standford + 'stanford-corenlp-full-2018-10-05/slf4j-api.jar'
CLASSPATH = CLASSPATH + ':' + path_api

print(CLASSPATH)

os.environ["CLASSPATH"] = CLASSPATH
os.environ['STANFORD_PARSER'] = path_corenlp
os.environ['STANFORD_MODELS'] = path_model

In [None]:
sent = "Kalla, it\'s a dog!"

from nltk.tokenize.stanford import StanfordTokenizer

tokenizer = StanfordTokenizer()
print(tokenizer.tokenize(sent))

In [None]:

from nltk.parse.stanford import StanfordParser

class MyParser(StanfordParser):
    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.
        The output format is `wordsAndTags`.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            '-model', self.model_path,
            '-outputFormat', 'penn', # conll, conll2007, penn
            '-sentences', 'newline'
        ]
        return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), True ))
myparser = MyParser(model_path= path_standford + 'stanford-english-corenlp-2018-10-05-models/' 
                    + "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

en_GUI = 0
sent = "the quick brown fox jumps over the \" lazy \" dog ."
print(sent)
res = list(myparser.raw_parse_sents([sent, sent]))
for row in res:
    for t in row:
        print(type(t),'\n',t)
        if  en_GUI:
            t.draw()

In [None]:
from nltk.corpus import BracketParseCorpusReader

reader = BracketParseCorpusReader("./data/", "temp.txt")
for sent in reader.parsed_sents():
    print(sent)
t = sent
for s in t.subtrees(lambda t: t.height() == 2): print(s)


from nltk.tree import Tree

t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))")
for s in t.subtrees(lambda t: t.height() == 2): print(s)

In [None]:
from collections import OrderedDict
from nltk.tree import Tree
from nltk.parse.stanford import StanfordParser

class MyParser(StanfordParser):
    def raw_parse_sents(self, sentences, verbose=False):
        """
        Use StanfordParser to parse multiple sentences. Takes multiple sentences as a
        list of strings.
        Each sentence will be automatically tokenized and tagged by the Stanford Parser.
        The output format is `penn`.

        :param sentences: Input sentences to parse
        :type sentences: list(str)
        :rtype: iter(iter(Tree))
        """
        cmd = [
            self._MAIN_CLASS,
            '-model', self.model_path,
            '-outputFormat', 'penn', # conll, conll2007, penn
            '-sentences', 'newline'
        ]
        return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), True ))
myparser = MyParser(model_path= path_standford + 'stanford-english-corenlp-2018-10-05-models/' 
                    + "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

def load_tags(file_tags):
    tags = OrderedDict()
    with open(file_tags, encoding='utf-8') as ft:
        for line in ft.readlines():
            line = line.strip()
            tags[line] = len(tags)
    return tags

tags = load_tags('tags.csv')

sent = "the quick brown fox jumps over the \" lazy \" dog ."
print(sent)
res = list(myparser.raw_parse_sents(['1 ' + sent, '2 ' + sent]))
for row in res:
    for t in row: 
        x = {s[0]:tags[s.label()] for s in t.subtrees(lambda t: t.height() == 2)}
        print(x)