In [9]:
import os
import sys

from tensorflow import keras

import numpy as np
import conlleval

from common import encode, label_encode, write_result
from common import load_pretrained, viterbi_probabilities
from common import create_ner_model, create_optimizer, argument_parser
from common import read_conll, process_sentences, get_labels
from common import save_ner_model, save_viterbi_probabilities


In [11]:
# Heavy step: loading the pretrained model and tokenizer (done by load_pretrained in the following cell).

In [10]:
# Emulate the command line of ner_v2.py. The first argv entry stands in for
# the script path and is dropped before parsing.
script_path = '/users/htoivone/keras-bert-ner/scripts/../ner_v2.py'

# Option flag -> value, in the order they are passed to the parser.
cli_options = {
    '--vocab_file': '/scratch/project_2001426/models/biobert_v1.1_pubmed_std_naming/vocab.txt',
    '--bert_config_file': '/scratch/project_2001426/models/biobert_v1.1_pubmed_std_naming/bert_config.json',
    '--init_checkpoint': '/scratch/project_2001426/models/biobert_v1.1_pubmed_std_naming/bert_model.ckpt',
    '--learning_rate': '5e-5',
    '--num_train_epochs': '3',
    '--max_seq_length': '256',
    '--batch_size': '4',
    '--train_data': '/users/htoivone/links/august/data/chemdner-smaller/conll/train.tsv',
    '--test_data': '/users/htoivone/links/august/data/chemdner-smaller/conll/test.tsv',
    '--ner_model_dir': '/users/htoivone/keras-bert-ner/scripts/../ner-models/testi_1',
}

argv = [script_path]
for flag, value in cli_options.items():
    argv.extend([flag, value])

argparser = argument_parser()
args = argparser.parse_args(argv[1:])

# Short alias used throughout the rest of the notebook.
seq_len = args.max_seq_length

# Load the pretrained model and its tokenizer as configured in args.
pretrained_model, tokenizer = load_pretrained(args)

In [12]:
# Load the CoNLL-formatted training and test files named in args. Each call
# yields a pair: the sentences as lists of words, and the matching lists of
# tags (see the printed samples in the following cells).
train_words, train_tags = read_conll(args.train_data)
test_words, test_tags = read_conll(args.test_data)


In [13]:
# One entry per training sentence.
np.shape(train_words)

(2972,)

In [32]:
# Show a few raw training sentences (indices 6-9) as lists of words.
for sentence_words in train_words[6:10]:
    print(sentence_words)

['Chronic', 'administration', 'of', 'haloperidol', 'increased', 'Dpp6', 'expression', 'in', 'mouse', 'brains', '.']
['DPP6', 'is', 'an', 'auxiliary', 'subunit', 'of', 'Kv4', 'and', 'regulates', 'the', 'properties', 'of', 'Kv4', ',', 'which', 'regulates', 'the', 'activity', 'of', 'dopaminergic', 'neurons', '.']
['The', 'findings', 'of', 'this', 'study', 'indicate', 'that', 'an', 'altered', 'response', 'of', 'Kv4', '/', 'DPP6', 'to', 'long', '-', 'term', 'neuroleptic', 'administration', 'is', 'involved', 'in', 'neuroleptic', '-', 'induced', 'TD', '.']
['Nanosilver', 'effects', 'on', 'growth', 'parameters', 'in', 'experimental', 'aflatoxicosis', 'in', 'broiler', 'chickens', '.']


In [15]:
# Show the tag sequences of sentences 6 and 7; each has one tag per word.
for sentence_tags in train_tags[6:8]:
    print(sentence_tags)

['O', 'O', 'O', 'B-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [16]:
# Tokenize both data sets with the model's tokenizer, using seq_len as the
# maximum sequence length. The result is accessed below both by position
# (train_data[0], train_data[1], ...) and by field name (train_data.labels,
# train_data.combined_tokens, ...).
train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

In [29]:
# Field 0, example 6: the original (untokenized) words of the sentence.
train_data[0][6]

['Chronic',
 'administration',
 'of',
 'haloperidol',
 'increased',
 'Dpp6',
 'expression',
 'in',
 'mouse',
 'brains',
 '.']

In [27]:
# Field 1 holds the word-piece tokens and field 2 the per-token labels;
# print them side by side for example 6.
for token, label in zip(train_data[1][6], train_data[2][6]):
    print(token + "\t\t\t\t" + label)

Ch				O
##ronic				O
administration				O
of				O
ha				B-Chemical
##lop				I-Chemical
##eri				I-Chemical
##do				I-Chemical
##l				I-Chemical
increased				O
D				O
##pp				O
##6				O
expression				O
in				O
mouse				O
brains				O
.				O


In [30]:
# Field 3, example 6.
# NOTE(review): the meaning of this value is not evident from the notebook;
# presumably this is the `lengths` field later passed to write_result — confirm.
train_data[3][6]

4

In [48]:
# Number of tokens in field 4 for example 6 (the [SEP]-joined token sequence
# printed in the following cell).
len(train_data[4][6])

246

In [31]:
# Print example 6 from each of fields 4, 5 and 6 of the processed data.
for field in train_data[4:7]:
    print(field[6])

['Ch', '##ronic', 'administration', 'of', 'ha', '##lop', '##eri', '##do', '##l', 'increased', 'D', '##pp', '##6', 'expression', 'in', 'mouse', 'brains', '.', '[SEP]', 'D', '##PP', '##6', 'is', 'an', 'auxiliary', 'subunit', 'of', 'K', '##v', '##4', 'and', 'regulate', '##s', 'the', 'properties', 'of', 'K', '##v', '##4', ',', 'which', 'regulate', '##s', 'the', 'activity', 'of', 'do', '##pa', '##mine', '##rg', '##ic', 'neurons', '.', '[SEP]', 'The', 'findings', 'of', 'this', 'study', 'indicate', 'that', 'an', 'altered', 'response', 'of', 'K', '##v', '##4', '/', 'D', '##PP', '##6', 'to', 'long', '-', 'term', 'ne', '##uro', '##le', '##ptic', 'administration', 'is', 'involved', 'in', 'ne', '##uro', '##le', '##ptic', '-', 'induced', 'TD', '.', '[SEP]', 'Nan', '##os', '##il', '##ver', 'effects', 'on', 'growth', 'parameters', 'in', 'experimental', 'a', '##f', '##lat', '##ox', '##ico', '##sis', 'in', 'br', '##oil', '##er', 'chickens', '.', '[SEP]', 'A', '##f', '##lat', '##ox', '##ico', '##sis', '

In [36]:
# Collect the label inventory from the training labels. The order of this
# list determines the integer ids assigned in tag_map.
label_list = get_labels(train_data.labels)
label_list

['O', 'I-Chemical', 'B-Chemical', '[SEP]', '[PAD]']

In [35]:
# Give every label its position in label_list as an integer id.
tag_map = {label: index for index, label in enumerate(label_list)}
tag_map

{'O': 0, 'I-Chemical': 1, 'B-Chemical': 2, '[SEP]': 3, '[PAD]': 4}

In [37]:
# Reverse lookup: integer id -> label string.
inv_tag_map = {index: label for label, index in tag_map.items()}
inv_tag_map

{0: 'O', 1: 'I-Chemical', 2: 'B-Chemical', 3: '[SEP]', 4: '[PAD]'}

In [38]:
# Compute the initial and transition probability tables from the training
# label sequences; both are saved alongside the NER model further down.
init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)


In [39]:
# Initial probabilities, one entry per tag id (see tag_map).
init_prob

array([0.94448183, 0.        , 0.05551817, 0.        , 0.        ])

In [40]:
# Transition table with one row and one column per tag id; each row sums to 1
# in the output below.
# NOTE(review): presumably rows are the "from" tag and columns the "to" tag — confirm.
trans_prob

array([[0.97498724, 0.        , 0.02501276, 0.        , 0.        ],
       [0.23771207, 0.76044595, 0.00184198, 0.        , 0.        ],
       [0.09586535, 0.90413465, 0.        , 0.        , 0.        ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ],
       [0.2       , 0.2       , 0.2       , 0.2       , 0.2       ]])

In [41]:
# Turn the combined token sequences into fixed-length (seq_len) model inputs.
train_x = encode(train_data.combined_tokens, tokenizer, seq_len)

In [57]:
# (inputs, examples, seq_len): train_x holds two parallel input arrays.
np.shape(train_x)

(2, 2972, 256)

In [44]:
# A single example in the first input array has seq_len entries.
np.shape(train_x[0][6])

(256,)

In [46]:
# Same check as the previous cell, via len().
len(train_x[0][6])

256

In [51]:
# Integer ids of example 6 in the first input array.
train_x[0][6]

array([  101, 20394, 26003,  3469,  1104,  5871, 13200,  9866,  2572,
        1233,  2569,   141,  8661,  1545,  2838,  1107, 10322, 16570,
         119,   102,   141, 20923,  1545,  1110,  1126, 13817, 27555,
        1104,   148,  1964,  1527,  1105, 16146,  1116,  1103,  4625,
        1104,   148,  1964,  1527,   117,  1134, 16146,  1116,  1103,
        3246,  1104,  1202,  4163,  9685, 10805,  1596, 16993,   119,
         102,  1109,  9505,  1104,  1142,  2025,  5057,  1115,  1126,
        8599,  2593,  1104,   148,  1964,  1527,   120,   141, 20923,
        1545,  1106,  1263,   118,  1858, 24928, 11955,  1513,  8956,
        3469,  1110,  2017,  1107, 24928, 11955,  1513,  8956,   118,
       10645, 15439,   119,   102, 20689,  2155,  2723,  4121,  3154,
        1113,  3213, 11934,  1107,  6700,   170,  2087, 16236, 10649,
       10658,  4863,  1107,  9304, 20708,  1200, 26199,   119,   102,
         138,  2087, 16236, 10649, 10658,  4863,  1110,   170,  2612,
        1104,  2670,

In [52]:
# Encode the test set the same way as the training set.
test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

In [53]:
# Encode the combined training labels with tag_map for sequences of length
# seq_len. train_weights is later passed to fit() as sample_weight.
train_y, train_weights = label_encode(
    train_data.combined_labels, tag_map, seq_len)

In [56]:
# (examples, seq_len, 1): one label entry per token position.
np.shape(train_y)

(2972, 256, 1)

In [None]:





# Train, save and evaluate the NER model using the data prepared above.

# Encode the test labels; test_weights weight the validation loss below.
test_y, test_weights = label_encode(
    test_data.combined_labels, tag_map, seq_len)

ner_model = create_ner_model(pretrained_model, len(tag_map))
# len(train_x[0]) is the number of training examples (train_x stacks the
# input arrays along its first axis).
optimizer = create_optimizer(len(train_x[0]), args)

ner_model.compile(
    optimizer,
    loss='sparse_categorical_crossentropy',
    sample_weight_mode='temporal',
    metrics=['sparse_categorical_accuracy']
)

# The checkpoint callback needs an output directory, so it is only added when
# one is configured. Building the path unconditionally raised a TypeError for
# ner_model_dir=None, even though that value is explicitly supported below.
callbacks_list = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=1),
]
if args.ner_model_dir is not None:
    callbacks_list.append(
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(args.ner_model_dir, 'model.hdf5'),
            monitor='val_loss',
            save_best_only=True,
        )
    )

if args.early_stopping:
    print("NER:Using early stopping.")
    # Pass test_weights so the monitored val_loss is weighted the same way as
    # the training loss.
    # NOTE(review): validation runs on the test set, so early stopping and
    # checkpoint selection are tuned on the data that is evaluated below;
    # a separate development set would avoid this.
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size,
        callbacks=callbacks_list,
        validation_data=(test_x, test_y, test_weights),
    )
else:
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size,
    )

if args.ner_model_dir is not None:
    # Labels ordered by their integer id.
    label_list = [label for _, label in sorted(inv_tag_map.items())]
    save_ner_model(ner_model, tokenizer, label_list, args)
    save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

# Predict per-position label distributions and take the most likely label id.
probs = ner_model.predict(test_x, batch_size=args.batch_size)
preds = np.argmax(probs, axis=-1)

# Map ids back to label strings, keeping only the positions that belong to
# sentence i. Position 0 is skipped (presumably the [CLS] token — TODO confirm).
pred_tags = []
for i, pred in enumerate(preds):
    pred_tags.append([inv_tag_map[t]
                      for t in pred[1:len(test_data.tokens[i])+1]])

# write_result receives the output path and returns the lines that conlleval
# scores below.
lines = write_result(
    args.output_file, test_data.words, test_data.lengths,
    test_data.tokens, test_data.labels, pred_tags
)

c = conlleval.evaluate(lines)
conlleval.report(c)


In [17]:
# Settings for the Finnish BERT / AnatEM run, collected as a plain dict.
# NOTE(review): the rest of the notebook reads options as attributes
# (args.max_seq_length, ...), which a dict does not support; the Namespace
# cell that follows rebinds args to an object that does.
args = dict(
    batch_size=4,
    bert_config_file='/users/htoivone/links/august/scripts/../models/bert-base-finnish-cased-v1/bert_config.json',
    dev_data=None,
    do_lower_case=False,
    early_stopping=False,
    init_checkpoint='/users/htoivone/links/august/scripts/../models/bert-base-finnish-cased-v1/bert_model.ckpt',
    learning_rate=5e-05,
    max_seq_length=128,
    ner_model_dir='/users/htoivone/links/august/scripts/../ner-models/turku-ner2-model',
    num_train_epochs=4,
    output_file='output.tsv',
    test_data='/users/htoivone/links/august/scripts/../data/AnatEM-1.0.2/conll_single_class//test.tsv',
    train_data='/users/htoivone/links/august/scripts/../data/AnatEM-1.0.2/conll_single_class//train.tsv',
    viterbi=False,
    vocab_file='/users/htoivone/links/august/scripts/../models/bert-base-finnish-cased-v1/vocab.txt',
    warmup_proportion=0.1,
)

In [19]:
from argparse import Namespace

# Shared path prefixes. data_dir keeps its trailing slash so the resulting
# paths are character-for-character the ones used originally.
project_root = '/users/htoivone/links/august/scripts/..'
bert_dir = project_root + '/models/bert-base-finnish-cased-v1'
data_dir = project_root + '/data/AnatEM-1.0.2/conll_single_class/'

# Settings for the Finnish BERT / AnatEM run, exposed as attributes in the
# same way as the result of argparse's parse_args.
args = Namespace(
    batch_size=4,
    bert_config_file=bert_dir + '/bert_config.json',
    dev_data=None,
    do_lower_case=False,
    early_stopping=False,
    init_checkpoint=bert_dir + '/bert_model.ckpt',
    learning_rate=5e-05,
    max_seq_length=128,
    ner_model_dir=project_root + '/ner-models/turku-ner2-model',
    num_train_epochs=4,
    output_file='output.tsv',
    test_data=data_dir + '/test.tsv',
    train_data=data_dir + '/train.tsv',
    viterbi=False,
    vocab_file=bert_dir + '/vocab.txt',
    warmup_proportion=0.1,
)

In [21]:
# Sanity check: options are reachable as attributes on args.
args.viterbi

False

In [22]:
# Short alias for the maximum sequence length.
seq_len = args.max_seq_length    # abbreviation

# Load the pretrained model and its tokenizer as configured in args.
# NOTE(review): the recorded run of this cell ended with
# "IndexError: list index out of range"; the cause is not visible in this
# notebook — check the model paths in args before re-running.
pretrained_model, tokenizer = load_pretrained(args)

IndexError: list index out of range

In [None]:


# End-to-end run for the settings in args: load data, train, save, evaluate.

# Load the CoNLL files and tokenize them for sequences of length seq_len.
train_words, train_tags = read_conll(args.train_data)
test_words, test_tags = read_conll(args.test_data)
train_data = process_sentences(train_words, train_tags, tokenizer, seq_len)
test_data = process_sentences(test_words, test_tags, tokenizer, seq_len)

# Label inventory and the label <-> integer id lookups derived from it.
label_list = get_labels(train_data.labels)
tag_map = {label: index for index, label in enumerate(label_list)}
inv_tag_map = {index: label for label, index in tag_map.items()}

# Initial/transition tables from the training labels; saved with the model.
init_prob, trans_prob = viterbi_probabilities(train_data.labels, tag_map)

train_x = encode(train_data.combined_tokens, tokenizer, seq_len)
test_x = encode(test_data.combined_tokens, tokenizer, seq_len)

# The *_weights arrays weight the loss (training: sample_weight, test:
# validation weights).
train_y, train_weights = label_encode(
    train_data.combined_labels, tag_map, seq_len)
test_y, test_weights = label_encode(
    test_data.combined_labels, tag_map, seq_len)

ner_model = create_ner_model(pretrained_model, len(tag_map))
# len(train_x[0]) is the length of the first input array along its first
# axis, i.e. presumably the number of training examples.
optimizer = create_optimizer(len(train_x[0]), args)

ner_model.compile(
    optimizer,
    loss='sparse_categorical_crossentropy',
    sample_weight_mode='temporal',
    metrics=['sparse_categorical_accuracy']
)

# The checkpoint callback needs an output directory, so it is only added when
# one is configured. Building the path unconditionally raised a TypeError for
# ner_model_dir=None, even though that value is explicitly supported below.
callbacks_list = [
    keras.callbacks.EarlyStopping(monitor='val_loss', patience=1),
]
if args.ner_model_dir is not None:
    callbacks_list.append(
        keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(args.ner_model_dir, 'model.hdf5'),
            monitor='val_loss',
            save_best_only=True,
        )
    )

if args.early_stopping:
    print("NER:Using early stopping.")
    # Pass test_weights so the monitored val_loss is weighted the same way as
    # the training loss.
    # NOTE(review): validation runs on the test set although args carries a
    # dev_data option; early stopping is therefore tuned on the data that is
    # evaluated below.
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size,
        callbacks=callbacks_list,
        validation_data=(test_x, test_y, test_weights),
    )
else:
    ner_model.fit(
        train_x,
        train_y,
        sample_weight=train_weights,
        epochs=args.num_train_epochs,
        batch_size=args.batch_size,
    )

if args.ner_model_dir is not None:
    # Labels ordered by their integer id.
    label_list = [label for _, label in sorted(inv_tag_map.items())]
    save_ner_model(ner_model, tokenizer, label_list, args)
    save_viterbi_probabilities(init_prob, trans_prob, inv_tag_map, args)

# Predict per-position label distributions and take the most likely label id.
probs = ner_model.predict(test_x, batch_size=args.batch_size)
preds = np.argmax(probs, axis=-1)

# Map ids back to label strings, keeping only the positions that belong to
# sentence i. Position 0 is skipped (presumably the [CLS] token — TODO confirm).
pred_tags = []
for i, pred in enumerate(preds):
    pred_tags.append([inv_tag_map[t]
                      for t in pred[1:len(test_data.tokens[i])+1]])

# write_result receives the output path and returns the lines that conlleval
# scores below.
lines = write_result(
    args.output_file, test_data.words, test_data.lengths,
    test_data.tokens, test_data.labels, pred_tags
)

c = conlleval.evaluate(lines)
conlleval.report(c)
