In [1]:
import matplotlib

matplotlib.use('Agg')
import train
import dataset as ds
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
from lstm_crf import EntityLSTM
import utils
import os
import conll2brat
import glob
import codecs
import shutil
import time
import copy
import evaluate
import random
import pickle
import brat2conll
import numpy as np
import utils_nlp
import distutils.util as distutils_util
import configparser
from pprint import pprint

# Experiment configuration shared by every cell below: paths, model sizes,
# optimisation settings and feature switches, one option per line.
parameters = {
    'pretrained_model_folder': '../../../ML_EntityData/model/vi',
    'dataset_text_folder': '../../../ML_EntityData/data/vi/cv',
    'character_embedding_dimension': 25,
    'character_lstm_hidden_state_dimension': 25,
    'check_for_digits_replaced_with_zeros': True,
    'check_for_lowercase': True,
    'debug': False,
    'dropout_rate': 0.5,
    'experiment_name': 'test',
    'freeze_token_embeddings': False,
    'gradient_clipping_value': 5.0,
    'learning_rate': 0.005,
    'load_only_pretrained_token_embeddings': False,
    'load_all_pretrained_token_embeddings': False,
    'main_evaluation_mode': 'conll',
    'maximum_number_of_epochs': 300,
    'number_of_cpu_threads': 8,
    'number_of_gpus': 0,
    'optimizer': 'sgd',
    'output_folder': '../../../ML_EntityData/output',
    'patience': 10,
    'plot_format': 'pdf',
    'reload_character_embeddings': True,
    'reload_character_lstm': True,
    'reload_crf': True,
    'reload_feedforward': True,
    'reload_token_embeddings': True,
    'reload_token_lstm': True,
    'remap_unknown_tokens_to_unk': True,
    'spacylanguage': 'en',
    'tagging_format': 'bioes',
    'token_embedding_dimension': 300,
    'token_lstm_hidden_state_dimension': 300,
    'token_pretrained_embedding_filepath': '../../../ML_EntityData/embedding/vi/embedding.txt',
    'tokenizer': 'spacy',
    'train_model': True,
    'use_character_lstm': True,
    'use_crf': True,
    'use_pretrained_model': False,
    'verbose': False,
}



### Load Dataset

In [2]:

# Load dataset
# Resolve the dataset files described by `parameters`; the second return value
# (the BRAT folders) is kept for the currently disabled BRAT export.
dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(parameters)

# Take the verbosity/debug switches from the configuration instead of
# hard-coding them, so that editing `parameters` is enough to change them.
dataset = ds.Dataset(verbose=parameters['verbose'], debug=parameters['debug'])

# The returned token_to_vector is handed to
# model.load_pretrained_token_embeddings in a later cell.
token_to_vector = dataset.load_dataset(dataset_filepaths, parameters)

Formatting train set from CONLL to BRAT... Done.
Converting CONLL from BIO to BIOES format... Done.
Formatting valid set from CONLL to BRAT... Done.
Converting CONLL from BIO to BIOES format... Done.
Formatting test set from CONLL to BRAT... Done.
Converting CONLL from BIO to BIOES format... Done.
Load dataset... done (101.57 seconds)


### Create Model

In [3]:
# Build the TensorFlow session and the LSTM+CRF model that runs in it
session_conf = tf.ConfigProto(
    device_count={'CPU': 1, 'GPU': parameters['number_of_gpus']},
    intra_op_parallelism_threads=parameters['number_of_cpu_threads'],
    inter_op_parallelism_threads=parameters['number_of_cpu_threads'],
    # Let TensorFlow fall back to an existing, supported device whenever the
    # requested one is not available.
    allow_soft_placement=True,
    log_device_placement=False
)
sess = tf.Session(config=session_conf)

with sess.as_default():
    # Every hyper-parameter listed here is forwarded to EntityLSTM as a keyword
    # argument of the same name, with the value taken from `parameters`.
    model = EntityLSTM(
        dataset=dataset,
        **{option: parameters[option] for option in (
            'token_embedding_dimension',
            'character_lstm_hidden_state_dimension',
            'token_lstm_hidden_state_dimension',
            'character_embedding_dimension',
            'use_crf',
            'use_character_lstm',
            'gradient_clipping_value',
            'learning_rate',
            'freeze_token_embeddings',
            'optimizer',
            'maximum_number_of_epochs',
        )}
    )

# Initialise all graph variables before embeddings are loaded or a checkpoint is restored
sess.run(tf.global_variables_initializer())

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


### Initialize Model

In [4]:
# Load the pretrained token embeddings into the freshly initialised model
model.load_pretrained_token_embeddings(
    sess,
    dataset,
    embedding_filepath=parameters['token_pretrained_embedding_filepath'],
    check_lowercase=parameters['check_for_lowercase'],
    check_digits=parameters['check_for_digits_replaced_with_zeros'],
    token_to_vector=token_to_vector,
)

# Random starting values for the transition parameters, used until training
# (or a restore) replaces them.
# NOTE(review): the "+ 2" presumably accounts for two extra padding labels
# (sequence start/end) - confirm against lstm_crf.EntityLSTM.
transition_params_trained = np.random.rand(len(dataset.unique_labels) + 2, len(dataset.unique_labels) + 2)

# The restore cell further down also consumes token_to_vector (and deletes it
# itself), so only release the memory here when no pretrained model will be
# restored. Deleting it unconditionally makes that cell fail with a NameError.
if not parameters['use_pretrained_model']:
    del token_to_vector

Load token embeddings... done (0.10 seconds)
number_of_token_original_case_found: 54
number_of_token_lowercase_found: 69
number_of_token_digits_replaced_with_zeros_found: 0
number_of_token_lowercase_and_digits_replaced_with_zeros_found: 0
number_of_loaded_word_vectors: 123
dataset.vocabulary_size: 148


### Restore Model

In [None]:
# Restore a previously trained model, but only when the configuration asks for
# it, so that running the notebook top to bottom works in both modes.
if parameters['use_pretrained_model']:
    # token_to_vector may already have been deleted by an earlier cell; fail
    # with an actionable message instead of a bare NameError.
    if 'token_to_vector' not in globals():
        raise RuntimeError(
            "token_to_vector is no longer defined; re-run the dataset loading cell "
            "before restoring the pretrained model")

    # Both files live in the pretrained model folder; build both paths the same way.
    transition_params_trained = model.restore_from_pretrained_model(
        dataset,
        sess,
        model_pathfile=os.path.join(parameters['pretrained_model_folder'], 'model.ckpt'),
        dataset_pathfile=os.path.join(parameters['pretrained_model_folder'], 'dataset.pickle'),
        embedding_filepath=parameters['token_pretrained_embedding_filepath'],
        character_dimension=parameters['character_embedding_dimension'],
        token_dimension=parameters['token_embedding_dimension'],
        token_to_vector=token_to_vector,
    )
    # The mapping is not read again by the cells below; release it.
    del token_to_vector
else:
    print("Skipping restore: parameters['use_pretrained_model'] is False")

In [5]:
# Create the output folder for this run and get the timestamp that identifies it
stats_graph_folder, experiment_timestamp = utils.create_stats_graph_folder(parameters)

# Initialize and save execution details
start_time = time.time()
results = {
    'epoch': {},
    'execution_details': {
        'train_start': start_time,
        'time_stamp': experiment_timestamp,
        'early_stop': False,
        'keyboard_interrupt': False,
        'num_epochs': 0,
    },
    # Shallow snapshot of the configuration used for this run
    'model_options': copy.copy(parameters),
}

model_folder = os.path.join(stats_graph_folder, 'model')
utils.create_folder_if_not_exists(model_folder)

# Store the dataset next to the checkpoints. The context manager guarantees
# the pickle file is flushed and closed even if dumping fails part-way.
with open(os.path.join(model_folder, 'dataset.pickle'), 'wb') as dataset_file:
    pickle.dump(dataset, dataset_file)

### Train Model

In [10]:
# Training loop. Epoch 0 skips training and only evaluates the freshly
# initialised model; every later epoch first runs one shuffled pass over the
# training sequences. Evaluation and a checkpoint follow after every epoch.
# bad_counter / previous_best_valid_f1_score are only read by the early-stop
# logic, which is currently disabled (see the commented block in the loop).
bad_counter = 0  # number of epochs with no improvement on the validation set in terms of F1-score
previous_best_valid_f1_score = 0
epoch_number = -1
try:
    while True:

        step = 0
        epoch_number += 1
        print('\nStarting epoch {0}'.format(epoch_number))

        epoch_start_time = time.time()

        if epoch_number != 0:
            # Train model: loop over all sequences of training set with shuffling
            sequence_numbers = list(range(len(dataset.token_indices['train'])))
            random.shuffle(sequence_numbers)
            for sequence_number in sequence_numbers:
                # Each step returns the latest transition parameters, which the
                # prediction call below needs.
                transition_params_trained = train.train_step(sess, dataset, sequence_number, model, parameters['dropout_rate'])
                step += 1
                if step % 10 == 0:
                    # '\r' rewrites the same console line instead of flooding the output
                    print('Training {0:.2f}% done'.format(step / len(sequence_numbers) * 100), end='\r', flush=True)

        epoch_elapsed_training_time = time.time() - epoch_start_time
        print('Training completed in {0:.2f} seconds'.format(epoch_elapsed_training_time), flush=True)

        # Predict labels with the current model; judging by the cell output this
        # also prints a classification report for the train, valid and test sets.
        y_pred, y_true, output_filepaths = train.predict_labels_lite(
            sess=sess,
            model=model,
            transition_params_trained=transition_params_trained,
            dataset=dataset,
            epoch_number=epoch_number,
            stats_graph_folder=stats_graph_folder,
            dataset_filepaths=dataset_filepaths,
            tagging_format=parameters['tagging_format'],
            main_evaluation_mode=parameters['main_evaluation_mode'],
            use_crf=parameters['use_crf'],
        )

        # --- Disabled: result aggregation / BRAT export ---
        # # Evaluate model: save and plot results
        # evaluate.evaluate_model(results, dataset, y_pred, y_true, stats_graph_folder, epoch_number,
        #                                 epoch_start_time, output_filepaths, parameters)
        #
        # if parameters['use_pretrained_model'] and not parameters['train_model']:
        #     conll2brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder)
        #     break
        #
        # Save model: one checkpoint per epoch, named after the epoch number
        model.saver.save(sess, os.path.join(model_folder, 'model_{0:05d}.ckpt'.format(epoch_number)))
        #
        # --- Disabled: TensorBoard logging (needs `writers`, which no visible cell defines) ---
        # # Save TensorBoard logs
        # summary = sess.run(model.summary_op, feed_dict=None)
        # writers['train'].add_summary(summary, epoch_number)
        # writers['train'].flush()
        # utils.copytree(writers['train'].get_logdir(), model_folder)
        #
        # --- Disabled: early stopping (needs results filled in by evaluate.evaluate_model) ---
        # # Early stop
        # valid_f1_score = results['epoch'][epoch_number][0]['valid']['f1_score']['micro']
        # if valid_f1_score > previous_best_valid_f1_score:
        #     bad_counter = 0
        #     previous_best_valid_f1_score = valid_f1_score
        #     conll2brat.output_brat(output_filepaths, dataset_brat_folders, stats_graph_folder,
        #                                       overwrite=True)
        #     transition_params_trained = transition_params_trained
        # else:
        #     bad_counter += 1
        # print("The last {0} epochs have not shown improvements on the validation set.".format(bad_counter))
        #
        # if bad_counter >= parameters['patience']:
        #     print('Early Stop!')
        #     results['execution_details']['early_stop'] = True
        #     break

        # Stop condition, driven by the configured epoch budget rather than a
        # second hard-coded copy of it.
        # NOTE(review): the factor 3 ends training after one third of
        # maximum_number_of_epochs (epoch 100 for the configured 300) - confirm
        # that this shortened schedule is intended.
        if 3 * epoch_number >= parameters['maximum_number_of_epochs']:
            break


except KeyboardInterrupt:
    # Interrupting the kernel ends training gracefully and is recorded in the results
    results['execution_details']['keyboard_interrupt'] = True
    print('Training interrupted')

print('Finishing the experiment')
end_time = time.time()
# Wall-clock seconds since start_time, which was set in the bookkeeping cell
print(end_time-start_time)
# --- Disabled: persisting the results and closing the TensorBoard writers ---
# results['execution_details']['train_duration'] = end_time - start_time
# results['execution_details']['train_end'] = end_time
# evaluate.save_results(results, stats_graph_folder)
# for dataset_type in dataset_filepaths.keys():
#     writers[dataset_type].close()


Starting epoch 0
Training completed in 0.00 seconds
Evaluate model on the train set


  .format(len(labels), len(target_names))


             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8000    1.0000    0.8889         8
      B-Sex     1.0000    0.5000    0.6667         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    0.9412    0.9697        17
 E-Location     0.8000    1.0000    0.8889         8
   E-Person     1.0000    0.7500    0.8571         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8000    1.0000    0.8889         8
  I-Country     0.0000    0.0000    0.0000         3
 I-Location     1.0000    0.8000    0.8889         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9447    0.9362    0.9351       141

Evaluate model on the valid set


  .format(len(labels), len(target_names))
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set


  .format(len(labels), len(target_names))


             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    1.0000    1.0000        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     1.0000    0.9818    0.9879        55


Starting epoch 1
Training completed in 2.00 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     1.0000    0.8750    0.9333         8
      B-Se


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 8
Training completed in 1.87 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8750    0.8750    0.8750         8
      B-Sex     1.0000    0.5000    0.6667         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    0.9412    0.9697        17
 E-Location     0.8750    0.8750    0.8750         8
   E-Person     1.0000    0.7500    0.8571         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8750    0.8750    0.8750         8
  I-Country     0.0000    0.0000    0.0000         0
 I-Location     1.0000    0.8000    0.8889         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9783    0.9348    0.9530       138

Evaluate m


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    1.0000    1.0000        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     1.0000    0.9818    0.9879        55


Starting epoch 12
Training completed in 2.04 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8750    0.87


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 19
Training completed in 1.97 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8750    0.8750    0.8750         8
      B-Sex     1.0000    0.5000    0.6667         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    0.9412    0.9697        17
 E-Location     0.8750    0.8750    0.8750         8
   E-Person     1.0000    0.7500    0.8571         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8750    0.8750    0.8750         8
  I-Country     0.0000    0.0000    0.0000         0
 I-Location     1.0000    0.8000    0.8889         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9783    0.9348    0.9530       138

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 23
Training completed in 2.39 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    0.5000    0.6667         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8750    0.87


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 30
Training completed in 2.09 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.0000    0.9412         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    1.0000    1.0000        17
 E-Location     0.8889    1.0000    0.9412         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8889    1.0000    0.9412         8
  I-Country     0.0000    0.0000    0.0000         4
 I-Location     0.8333    1.0000    0.9091         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9472    0.9718    0.9587       142

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 34
Training completed in 2.03 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     1.0000    0.87


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 41
Training completed in 1.94 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     0.8889    1.0000    0.9412         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    0.9412    0.9697        17
 E-Location     0.8889    1.0000    0.9412         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8889    1.0000    0.9412         8
  I-Country     0.0000    0.0000    0.0000         3
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9598    0.9645    0.9614       141

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    1.0000    1.0000        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     1.0000    0.9818    0.9879        55


Starting epoch 45
Training completed in 1.98 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.00


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 52
Training completed in 1.83 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.0000    0.9412         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    1.0000    1.0000        17
 E-Location     0.8889    1.0000    0.9412         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8889    1.0000    0.9412         8
  I-Country     0.0000    0.0000    0.0000         3
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9598    0.9787    0.9687       141

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    1.0000    1.0000        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     1.0000    0.9818    0.9879        55


Starting epoch 56
Training completed in 2.17 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     1.0000    0.87


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 63
Training completed in 2.08 seconds
Evaluate model on the train set


  .format(len(labels), len(target_names))


             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     1.0000    1.0000    1.0000         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    1.0000    1.0000        17
 E-Location     1.0000    1.0000    1.0000         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     1.0000    1.0000    1.0000         8
  I-Country     1.0000    1.0000    1.0000         5
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         6

avg / total     1.0000    1.0000    1.0000       138

Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Countr


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 67
Training completed in 2.03 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     1.0000    1.00


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 74
Training completed in 2.05 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    0.9412    0.9697        17
   B-Person     1.0000    0.8750    0.9333         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    0.9412    0.9697        17
 E-Location     1.0000    0.8750    0.9333         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     1.0000    0.8750    0.9333         8
  I-Country     0.0000    0.0000    0.0000         0
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     1.0000    0.9638    0.9809       138

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 78
Training completed in 2.15 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.00


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 85
Training completed in 1.84 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.0000    0.9412         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    1.0000    1.0000        17
 E-Location     0.8889    1.0000    0.9412         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8889    1.0000    0.9412         8
  I-Country     0.0000    0.0000    0.0000         3
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9598    0.9787    0.9687       141

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 89
Training completed in 2.08 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     1.0000    1.00


Evaluate model on the valid set
             precision    recall  f1-score   support

 B-BirthDay     0.0000    0.0000    0.0000         1
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     0.0000    0.0000    0.0000         1
   B-Person     0.0000    0.0000    0.0000         1
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     0.0000    0.0000    0.0000         1
  E-Country     0.0000    0.0000    0.0000         1
 E-Location     1.0000    1.0000    1.0000         4
   E-Person     0.0000    0.0000    0.0000         1
      E-Sex     0.0000    0.0000    0.0000         0
 I-BirthDay     1.0000    1.0000    1.0000         1

avg / total     0.6000    0.6000    0.6000        15

Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.000



Starting epoch 96
Training completed in 1.85 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.0000    0.9412         8
      B-Sex     1.0000    1.0000    1.0000         2
 E-BirthDay     1.0000    1.0000    1.0000         2
  E-Country     1.0000    1.0000    1.0000        17
 E-Location     0.8889    1.0000    0.9412         8
   E-Person     1.0000    1.0000    1.0000         4
      E-Sex     1.0000    1.0000    1.0000        52
 I-BirthDay     0.8889    1.0000    0.9412         8
  I-Country     0.0000    0.0000    0.0000         3
 I-Location     1.0000    1.0000    1.0000         5
   I-Person     1.0000    1.0000    1.0000         5
      I-Sex     1.0000    1.0000    1.0000         6

avg / total     0.9598    0.9787    0.9687       141

Evaluate 


Evaluate model on the test set
             precision    recall  f1-score   support

 B-BirthDay     0.9000    1.0000    0.9474         9
  B-Country     1.0000    1.0000    1.0000         3
 B-Location     1.0000    1.0000    1.0000         9
   B-Person     1.0000    1.0000    1.0000         3
      B-Sex     1.0000    0.9545    0.9767        22
 E-BirthDay     1.0000    1.0000    1.0000         3
  E-Country     0.0000    0.0000    0.0000         0
 E-Location     1.0000    0.5000    0.6667         2
   E-Person     1.0000    1.0000    1.0000         3
      E-Sex     1.0000    1.0000    1.0000         1

avg / total     0.9836    0.9636    0.9700        55


Starting epoch 100
Training completed in 1.99 seconds
Evaluate model on the train set
             precision    recall  f1-score   support

 B-BirthDay     1.0000    1.0000    1.0000         2
  B-Country     1.0000    1.0000    1.0000         2
 B-Location     1.0000    1.0000    1.0000        17
   B-Person     0.8889    1.0

In [11]:
# Index used to name the temporary deploy file and forwarded to
# train.prediction_step. Nothing in this cell ever changes it, so every call
# writes 'temp_00000.txt'.
prediction_count = 0


def predict(text):
    """Tag a single piece of text with the trained model and return its entities.

    The text is written to a temporary brat "deploy" file, the deploy split of
    the global ``dataset`` is refreshed from it, ``train.prediction_step`` is
    run on that split, the result is converted back to brat, and the entities
    are read from the generated annotation file.

    Relies on names defined in earlier notebook cells: ``parameters``,
    ``dataset``, ``sess``, ``model``, ``transition_params_trained`` and
    ``prediction_count``.

    Side effects:
        * Overwrites ``parameters['dataset_text_folder']`` with
          ``../data/temp`` (the global dict is mutated, not copied).
        * Removes every file or folder in that directory whose name starts
          with ``deploy``.
        * Calls ``utils.create_stats_graph_folder`` on every invocation
          (presumably creating a fresh output folder each time -- TODO confirm).

    Args:
        text: The raw text to annotate.

    Returns:
        The ``entities`` value produced by ``brat2conll.get_entities_from_brat``.

    Raises:
        AssertionError: If the text read back from the brat output differs
            from the text that was passed in.
    """
    parameters['dataset_text_folder'] = os.path.join('..', 'data', 'temp')
    stats_graph_folder, _ = utils.create_stats_graph_folder(parameters)

    dataset_type = 'deploy'
    deploy_root = parameters['dataset_text_folder']

    # Delete all deployment data left behind by previous calls.
    for stale_path in glob.glob(os.path.join(deploy_root, '{0}*'.format(dataset_type))):
        if os.path.isdir(stale_path):
            shutil.rmtree(stale_path)
        else:
            os.remove(stale_path)

    # Create the brat folder and write the input text into it.
    dataset_brat_deploy_folder = os.path.join(deploy_root, dataset_type)
    utils.create_folder_if_not_exists(dataset_brat_deploy_folder)
    dataset_brat_deploy_filepath = os.path.join(
        dataset_brat_deploy_folder,
        'temp_{0}.txt'.format(str(prediction_count).zfill(5)))
    with codecs.open(dataset_brat_deploy_filepath, 'w', 'UTF-8') as f:
        f.write(text)

    # Resolve the deploy file paths and refresh the deploy split of the dataset.
    # (The original also called `.update()` on each returned dict with itself,
    # which has no effect and has been dropped.)
    dataset_filepaths, dataset_brat_folders = utils.get_valid_dataset_filepaths(
        parameters, dataset_types=[dataset_type])
    dataset.update_dataset(dataset_filepaths, [dataset_type])

    # Predict labels and output brat.
    prediction_output = train.prediction_step(
        sess, dataset, dataset_type, model,
        transition_params_trained, stats_graph_folder,
        prediction_count, dataset_filepaths,
        parameters['tagging_format'],
        parameters['main_evaluation_mode'])
    # Only the third element of the result (the output file path) is needed.
    output_filepaths = {dataset_type: prediction_output[2]}
    conll2brat.output_brat(output_filepaths, dataset_brat_folders,
                           stats_graph_folder, overwrite=True)

    # Read the entities back from the brat files written under the stats folder.
    brat_output_folder = os.path.join(stats_graph_folder, 'brat', 'deploy')
    text_filepath = os.path.join(
        brat_output_folder, os.path.basename(dataset_brat_deploy_filepath))
    annotation_filepath = os.path.join(
        brat_output_folder,
        '{0}.ann'.format(
            utils.get_basename_without_extension(dataset_brat_deploy_filepath)))
    text2, entities = brat2conll.get_entities_from_brat(
        text_filepath, annotation_filepath, verbose=True)

    # Sanity check: the pipeline must hand back exactly the text it was given,
    # otherwise the entity offsets would not line up with the caller's string.
    assert text == text2, 'Text read back from the brat output differs from the input text'
    return entities


In [12]:
# Run predict on a long free-text sentence.
predict("Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_Dương sang Thái_Bình_Dương , chiếm đến lượng hàng_hoá lưu_thông đường_biển của thế_giới , đó là hải_trình lớn nhất từ tây sang đông với 50.000 lượt tàu_bè qua_lại mỗi năm ...")

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_Dương sang Thái_Bình_Dương , chiếm đến lượng hàng_hoá lưu_thông đường_biển của thế_giới , đó là hải_trình lớn nhất từ tây sang đông với 50.000 lượt tàu_bè qua_lại mỗi năm ...

entity: {'id': 'T1', 'type': 'Location', 'start': 0, 'end': 109, 'text': 'Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_Dương sang Thái_Bình_Dương , chiếm đến lượng hàng_hoá lưu_thông'}
entity: {'id': 'T2', 'type': 'Person', 'start': 110, 'end': 133, 'text': 'đường_biển của thế_giới'}
entity: {'id': 'T3', 'type': 'Location', 'start': 136, 'end': 160, 'text': 'đó là hải_trình lớn nhất'}
entity: {'id': 'T4', 'type': 'Person', 'start': 161, 'end': 172, 'text': 'từ tây sang'}
entity: {'id': 'T5', 'type': 'Location', 'start': 173, 'end': 200, 'text': 'đông với 50.000 lượt tàu_bè'}

[{'end': 109,
  'id': 'T1',
  'start': 0,
  'text': 'Đó là con đường biển ngắn nhất để đi từ Ấn_Độ_Dương sang Thái_Bình_Dương , chiếm đến lượng hàng_hoá lưu_thông',
  'type': 'Location'},
 {'end': 133,
  'id': 'T2',
  'start': 110,
  'text': 'đường_biển của thế_giới',
  'type': 'Person'},
 {'end': 160,
  'id': 'T3',
  'start': 136,
  'text': 'đó là hải_trình lớn nhất',
  'type': 'Location'},
 {'end': 172,
  'id': 'T4',
  'start': 161,
  'text': 'từ tây sang',
  'type': 'Person'},
 {'end': 200,
  'id': 'T5',
  'start': 173,
  'text': 'đông với 50.000 lượt tàu_bè',
  'type': 'Location'},
 {'end': 220,
  'id': 'T6',
  'start': 209,
  'text': 'mỗi năm ...',
  'type': 'Person'}]

In [13]:
# Run predict on a very short input.
predict('Hà Nội')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
Hà Nội

entity: {'id': 'T1', 'type': 'Location', 'start': 0, 'end': 6, 'text': 'Hà Nội'}





[{'end': 6, 'id': 'T1', 'start': 0, 'text': 'Hà Nội', 'type': 'Location'}]

In [29]:
# Run predict on a form-style record (name, sex, birth date and address fields).
# NOTE(review): the sample contains a full name and a birth date -- confirm it
# is synthetic before sharing the notebook or its outputs.
predict('Họ và tên: Phạm Ngọc Linh , giới tính : Nam , sinh ngày 25/11/1996 , địa chỉ : Hà trung Thanh Hóa')

Formatting deploy set from BRAT to CONLL... Done.
Converting CONLL from BIO to BIOES format... Done.
Predict labels for the deploy set
Formatting 000_deploy set from CONLL to BRAT... Done.

text:
Họ và tên: Phạm Ngọc Linh , giới tính : Nam , sinh ngày 25/11/1996 , địa chỉ : Hà trung Thanh Hóa

entity: {'id': 'T1', 'type': 'Person', 'start': 11, 'end': 25, 'text': 'Phạm Ngọc Linh'}
entity: {'id': 'T2', 'type': 'Sex', 'start': 40, 'end': 43, 'text': 'Nam'}
entity: {'id': 'T3', 'type': 'BirthDay', 'start': 44, 'end': 45, 'text': ','}
entity: {'id': 'T4', 'type': 'BirthDay', 'start': 46, 'end': 50, 'text': 'sinh'}
entity: {'id': 'T5', 'type': 'BirthDay', 'start': 51, 'end': 55, 'text': 'ngày'}
entity: {'id': 'T6', 'type': 'BirthDay', 'start': 56, 'end': 66, 'text': '25/11/1996'}
entity: {'id': 'T7', 'type': 'BirthDay', 'start': 67, 'end': 68, 'text': ','}
entity: {'id': 'T8', 'type': 'Location', 'start': 79, 'end': 97, 'text': 'Hà trung Thanh Hóa'}





[{'end': 25,
  'id': 'T1',
  'start': 11,
  'text': 'Phạm Ngọc Linh',
  'type': 'Person'},
 {'end': 43, 'id': 'T2', 'start': 40, 'text': 'Nam', 'type': 'Sex'},
 {'end': 45, 'id': 'T3', 'start': 44, 'text': ',', 'type': 'BirthDay'},
 {'end': 50, 'id': 'T4', 'start': 46, 'text': 'sinh', 'type': 'BirthDay'},
 {'end': 55, 'id': 'T5', 'start': 51, 'text': 'ngày', 'type': 'BirthDay'},
 {'end': 66,
  'id': 'T6',
  'start': 56,
  'text': '25/11/1996',
  'type': 'BirthDay'},
 {'end': 68, 'id': 'T7', 'start': 67, 'text': ',', 'type': 'BirthDay'},
 {'end': 97,
  'id': 'T8',
  'start': 79,
  'text': 'Hà trung Thanh Hóa',
  'type': 'Location'}]