References

https://www.kaggle.com/akshay235/bert-implementation-on-ner-corpus/notebook

https://androidkt.com/name-entity-recognition-with-bert-in-tensorflow/

https://colab.research.google.com/drive/1ptxQIRWIHH7sMO097KQeih1nSnrofz7g

https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html



## Load Data

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [1]:
import torch
print("num of cuda device: ", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print("name of cuda device: ", torch.cuda.get_device_name(i))

num of cuda device:  1
name of cuda device:  Quadro P1000


In [None]:
!pip install transformers
!pip install seqeval

In [2]:
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
import tensorflow as tf
print("tensorflow version:", tf.__version__)
import transformers
print("transformers version:", transformers.__version__)

tensorflow version: 2.0.0
transformers version: 2.8.0


In [3]:
import os, time, gc
import numpy as np
import pandas as pd
from tqdm import tqdm, trange

import sklearn
import seqeval
from seqeval import metrics as seq_metrics
from seqeval.metrics import f1_score

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda
from tensorflow.keras.layers import Bidirectional, LSTM, Add
from tensorflow.keras.callbacks import Callback, EarlyStopping, ModelCheckpoint
from transformers import *

class FlatScore(object):
    """ Compute sklearn-allowable metric, e.g. average_score, f1_score.
    flat_metric = FlatScore('accuracy', 'macro')
    flat_metric.score(trues, preds)
    """
    def __init__(self, scorer='accuracy', average='macro', trues=None, preds=None):
        self.scorer = scorer
        self.scorer_fn = self.get_scorer()
        self.average = average
        self.ensure_inputs(trues, preds)

    def get_scorer(self):
        self.scorer = self.scorer if self.scorer.endswith('_score') else self.scorer + '_score'
        scorer_fn = getattr(sklearn.metrics, self.scorer)
        return scorer_fn

    def ensure_inputs(self, trues, preds):
        # convert probs & logits into real preds
        if trues is None or preds is None:
            self.trues, self.preds = None, None
        else:
            not_int = not (preds.astype(int) == preds).all()
            preds = np.argmax(preds, axis=2) if not_int else preds
            self.trues = trues.flatten()
            self.preds = preds.flatten()

    def score(self, scorer=None, average=None, trues=None, preds=None):
        if preds is not None:
            self.ensure_inputs(trues, preds)
        if scorer is not None:
            self.scorer = scorer
            self.scorer_fn = self.get_scorer()
        self.average = average if average else self.average
        # compute metric score
        if 'average' in self.scorer_fn.__code__.co_varnames:
            ans = self.scorer_fn(self.trues, self.preds, average=self.average)
        else:
            ans = self.scorer_fn(self.trues, self.preds)
        return ans

    def report(self, trues=None, preds=None):
        if preds is not None:
            self.ensure_inputs(trues, preds)
        report = sklearn.metrics.classification_report(self.trues, self.preds, digits=4)
        self.report = report
        print(report)

def get_column(groups, i):
    return [[x[i] for x in group] for group in groups]

def ensure_length(targets, labels):
    return [[y] * len(x) for x, y in zip(targets, labels)]

def tokenize_words_with_tags(words_tags):
    """
    words_tags is tuple of (word, tag1, tag2, ...)
    """
    ans = []
    tokens = [tokenizer.tokenize(x[0]) for x in words_tags]
    ans.append(tokens)
    for i in range(len(words_tags[0]) - 1):
        ys = ensure_length(tokens, [x[i+1] for x in words_tags])
        ys = [x for l in ys for x in l]
        ans.append(ys)
    ans[0] = [x for l in ans[0] for x in l]
    ans = tuple(zip(*ans))
    return ans


Using TensorFlow backend.


In [4]:
def print_params():
    full_data = 'full_data' if include_paragraph and include_table else 'paragraph' if include_paragraph else 'table'
    message = "max_len={max_len}, {full_data}, {data_version}, chunk_data={chunk_data}, " \
              "epoch={epoch}, source='{source}', shuffle={shuffle}"
    print(
        message.format(
            max_len=MAX_LEN, 
            full_data=full_data, 
            data_version='data_' + version, 
            chunk_data=chunk_data,
            epoch=evaluator.best_epoch,
            source=source,
            shuffle=shuffle
            )
        )

def ner_scorer(model, params):
    probs = model.predict([params['inputs']], batch_size=params['batch_size'])
    preds = probs.argmax(2)
    preds2 = [[params['unique_labels'][i] for i in pred] for pred in preds]
    trues2 = [[params['unique_labels'][i] for i in true] for true in params['tags']]
    out = [(tag[:len(label)], pred[:len(label)]) for label, tag, pred in zip(params['labels'], trues2, preds2)]
    trues3, preds3 = zip(*out)
    dic = {'val_f1_score': f1_score(trues3, preds3)}
    return dic 


class ModelEvaluator(Callback):
    def __init__(self, model, eval_func, eval_params, monitor='val_f1_score',
                 patience=1e5, restore_best_weights=True, baseline=None, min_delta=1e-5, 
                 pathfile='', save_best_only=True, 
                 ):
        self.model = model
        self.eval_func = eval_func
        self.eval_params = eval_params
        self.monitor = monitor
        self.min_delta = min_delta
        self.patience = patience
        self.baseline = baseline
        self.restore_best_weights = restore_best_weights
        self.pathfile = pathfile
        self.save_best_only = save_best_only
        #
        self.metrics = []
        self.wait = 0
        self.epoch = 0
        self.best_epoch = 0
        self.early_stopped_epoch = 0
        self.best_score = 0.0
        self.best_weights = None
    
    def on_epoch_begin(self, epoch, logs):
        print('\n')
    
    def on_epoch_end(self, epoch, logs):
        # counting epochs - starting at 1 instead of 0
        self.epoch += 1
        
        # compute evaluation metrics
        metrics = self.eval_func(self.model, self.eval_params)
        self.metrics.append(metrics)
        
        # update best results
        current = metrics[self.monitor]
        if current - self.min_delta > self.best_score:
            self.wait = 0
            self.best_epoch = self.epoch
            self.best_score = current
            self.best_weights = self.model.get_weights()
        else:
            self.wait += 1
            if self.wait >= self.patience:  # trigger early-stopping
                self.early_stopped_epoch = self.epoch
                self.model.stop_training = True
        
        # save weights at every epoch, if save_best_only=False
        if self.pathfile and not self.save_best_only:
            # TODO: assert pathfile is formattable like 'weights_{epoch:02d}_{val_loss:.4f}.hdf5'
            pathfile = self.pathfile.format(epoch=self.epoch, **metrics)
            self.model.save_weights(pathfile)
        
        print()
        print('%s: %.4f' % (self.monitor, current))
    
    def on_train_end(self, epoch=None, logs=None):
        # must run code in the exact sequence below:
        if self.pathfile and self.save_best_only and not self.restore_best_weights:
            if self.best_epoch < self.epoch:
                last_weights = self.model.get_weights()
        
        if self.pathfile and self.save_best_only or self.restore_best_weights:
            if self.best_epoch < self.epoch:
                self.model.set_weights(self.best_weights)
        
        if self.pathfile and self.save_best_only:
            pathfile = self.pathfile.format(epoch=self.best_epoch, **self.metrics[self.best_epoch - 1])
            self.model.save_weights(pathfile)
        
        if self.pathfile and self.save_best_only and not self.restore_best_weights:
            if self.best_epoch < self.epoch:
                self.model.set_weights(last_weights)
        
        # print messages
        print('\n')
        if self.early_stopped_epoch:  # early-stopped
            print('early stopped at Epoch %d' % (self.early_stopped_epoch))
        
        print("Epoch %d gives the best '%s' of %0.4f" % (self.best_epoch, self.monitor, self.best_score))
        
        if self.restore_best_weights and self.best_epoch < self.epoch: 
            print('Restoring model weights from Epoch %d' % (self.best_epoch))

In [5]:
def chunkup(lst, max_len=200, overlap=30):
    if len(lst) <= max_len:
        lsts = [lst]
    else:
        lsts = []
        start_idx = 0
        while start_idx + overlap <= len(lst):
            lsts.append(lst[start_idx:start_idx + max_len])
            start_idx += max_len - overlap
    return lsts

    xdat = pd.DataFrame()
    for i in range(len(sdat)):
        row = sdat.iloc[i]
        groups = chunkup(row['group'], max_len=MAX_LEN, overlap=overlap)
        is_tag = np.array([any([x[1] != 'O' for x in group]) for group in groups])
        if window_size:
            # keep only tags and neighbors
            idxes = np.argwhere(is_tag).flatten()
            idxes = np.array(list(set(idxes.tolist() + (idxes + window_size).tolist() + (idxes - window_size).tolist())))
            idxes = idxes[(0 <= idxes) & (idxes < len(is_tag))]
            groups = [groups[i] for i in idxes]
        # create save-worthy columns
        tdat = pd.DataFrame()
        filename = row['file_name']
        tdat['file_name'] = np.repeat(filename, len(groups))
        tdat['file_id'] = i
        tdat['group'] = groups
        tdat['is_tag'] = [is_tag[i] for i in idxes]
        # combine iterations
        xdat = pd.concat([xdat, tdat], axis=0)
    xdat.index = range(len(xdat))
    return xdat


def create_inputs(groups, labels, is_idx=None, chunk_data=True, MAX_LEN=200, overlap=0, pad_separate=False):
    
    groups = groups[is_idx] if is_idx is not None else groups
    labels = labels[is_idx] if is_idx is not None else labels
    
    if chunk_data:
        out = [chunkup(x, max_len=MAX_LEN, overlap=overlap) for x in groups]
        groups = [x for lst in out for x in lst]
        out = [chunkup(x, max_len=MAX_LEN, overlap=overlap) for x in labels]
        labels = [x for lst in out for x in lst]

    inputs = pad_sequences([[tokenizer.convert_tokens_to_ids(x) for x in doc] for doc in groups],
                              maxlen=MAX_LEN, value=0, padding="post", truncating="post", dtype='int32')
    masks = (inputs != tokenizer.pad_token_id).astype(np.int32)

    if pad_separate:
        tags = pad_sequences([[lab2idx.get(x) for x in tag] for tag in labels],
                                maxlen=MAX_LEN, value=lab2idx["<pad>"], padding="post", truncating="post", dtype="int32")
    else:
        tags = pad_sequences([[lab2idx.get(x) for x in tag] for tag in labels],
                                maxlen=MAX_LEN, value=lab2idx["O"], padding="post", truncating="post", dtype="int32")
    return (inputs, masks, labels, tags)

In [6]:
import joblib
# from google.colab import drive
# drive.mount('/content/drive')

## Prepare Data

In [7]:
# TODO: use cased version
model_fullname = 'distilbert-base-uncased'
lower = model_fullname.endswith('-uncased')
BertishTokenizer = DistilBertTokenizer
BertishConfig = DistilBertConfig
BertishModel = TFDistilBertModel
BertishClassifier = TFDistilBertForTokenClassification

tokenizer = BertishTokenizer.from_pretrained(model_fullname, do_lower_case=lower)

### Chicago Version

In [18]:
shuffle = False # if False, training on doc-level; otherwise block-level
chunk_data = True

include_paragraph = True
include_table = True

source = 'can'
version = 'v0'

path = "./datasets/"
file = 'i2k_' + source + '_' + version + '.pickle'
with open(path+file, 'rb') as f:
    sdat = joblib.load(f)
print('full_data file name:', sdat.file_name)
print("full_data shape:", sdat.shape)

full_data file name: 0       173942162.htm
1       173942162.htm
2       173942162.htm
3       173942162.htm
4       173942162.htm
            ...      
7502    223852392.htm
7503    223852392.htm
7504    223852392.htm
7505    223852392.htm
7506    223852392.htm
Name: file_name, Length: 7507, dtype: object
full_data shape: (7507, 7)


In [26]:
sdat[0:10]

Unnamed: 0,file_name,file_id,block_id,group,text,is_tag,is_table
0,173942162.htm,0,0,"((mclean, O), (asset, O), (management, O), (lt...","[McLEAN ASSET MANAGEMENT LTD., \nFUND FACTS\nJ...",True,False
1,173942162.htm,0,1,"(([PAD], O), ([PAD], O), (this, O), (document,...",[\n\nThis document contains key information yo...,True,True
2,173942162.htm,0,2,"((what, O), (does, O), (the, O), (fund, O), (i...","[What does the fund invest in?, ROMC Trust's i...",False,False
3,173942162.htm,0,4,"(([PAD], O), ([PAD], O), ([PAD], O), (top, O),...","[\n\n\nTop 10 Investments (April 30, 2018)\n\n...",False,True
4,173942162.htm,0,5,"(([PAD], O), ([PAD], O), ([PAD], O), (investme...","[\n\n\nInvestment Mix (April 30, 2018)\nTechno...",True,True
5,173942162.htm,0,6,"(([PAD], O), (how, O), (risky, O), (is, O), (i...",[\nHow risky is it?\nThe value of the fund can...,False,False
6,173942162.htm,0,21,"((the, O), (following, O), (tables, O), (show,...",[The following tables show the fees and expens...,False,False
7,173942162.htm,0,22,"(([PAD], O), (1, O), (., O), (sales, O), (char...","[\n1., Sales Charges, There are no sales charg...",True,False
8,173942162.htm,0,23,"((2, O), (., O), (fund, O), (expenses, O), (yo...","[2. Fund expenses, You don't pay these expense...",False,False
9,173942162.htm,0,24,"(([PAD], O), (fee, O), (for, O), (service, O),...","[\nFee for Service, \nSwitch Fee]",False,False


In [10]:
if not shuffle:
    tr_files, val_files = train_test_split(sdat.file_name, random_state=2018, test_size=0.1, shuffle=shuffle)

    # known = ['221794582.htm', '220021854.htm', '218286484.htm', '216588855.htm', '214893347.htm']
    # unkwn = ['222695709.htm', '222695715.htm', '222828866.htm', '223830364.htm', '223852392.htm']
    # val_files = known + unkwn
    # tr_files = list(set(sdat.file_name).difference(val_files))

len(tr_files), len(val_files)

(6756, 751)

In [11]:
MAX_LEN = 200
overlap = 30

tr_dat = chunkup_data(sdat[sdat['file_name'].isin(tr_files)], MAX_LEN, overlap=overlap, window_size=1)
val_dat = chunkup_data(sdat[sdat['file_name'].isin(val_files)], MAX_LEN, overlap=0, window_size=1)
print("chunked-up train data shape:", tr_dat.shape)
print("chunked-up valid data shape:", val_dat.shape)

chunked-up train data shape: (3333, 4)
chunked-up valid data shape: (400, 4)


In [12]:
pad_separate = False

labels = np.array(get_column(tr_dat['group'], 1) + get_column(val_dat['group'], 1))
unique_labels = sorted(set([x for label in labels for x in label]))
unique_labels.remove('O')
if pad_separate:
    unique_labels = ['<pad>', 'O'] + sorted(unique_labels)
else:
    unique_labels = ['O'] + sorted(unique_labels)

lab2idx = {t: i for i, t in enumerate(unique_labels)}
print("number of labels:", len(lab2idx))

number of labels: 17


In [13]:
groups = np.array(get_column(tr_dat['group'], 0))
labels = np.array(get_column(tr_dat['group'], 1))
tr_inputs, tr_masks, tr_labels, tr_tags = create_inputs(groups, labels, None, 
                                                        False, MAX_LEN, 30, 
                                                        pad_separate)
groups = np.array(get_column(val_dat['group'], 0))
labels = np.array(get_column(val_dat['group'], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)
print("train data:", tr_inputs.shape) 
print("valid data:", val_inputs.shape)

train data: (3333, 200)
valid data: (400, 200)


#### known vs. unknown data

In [None]:
# training data
groups = np.array(get_column(tr_dat['group'], 0))
labels = np.array(get_column(tr_dat['group'], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# validation data
groups = np.array(get_column(val_dat['group'], 0))
labels = np.array(get_column(val_dat['group'], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# unknown valiation data
is_idx = val_dat['file_name'].isin(unkwn)

groups = np.array(get_column(val_dat['group'][is_idx], 0))
labels = np.array(get_column(val_dat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known valiation data
is_idx = val_dat['file_name'].isin(known)

groups = np.array(get_column(val_dat['group'][is_idx], 0))
labels = np.array(get_column(val_dat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# well-known validation data
is_idx = val_dat['file_name'].isin(known[:2])

groups = np.array(get_column(val_dat['group'][is_idx], 0))
labels = np.array(get_column(val_dat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-somewhat validation data
is_idx = val_dat['file_name'].isin(known[2:])

groups = np.array(get_column(val_dat['group'][is_idx], 0))
labels = np.array(get_column(val_dat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            False, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# output training and validation files
# trn_files = xdat.loc[is_train, 'file_name'].unique()
# val_files = xdat.loc[is_valid, 'file_name'].unique()
# len(trn_files), len(val_files)
# trn_files = pd.DataFrame([trn_files, np.repeat(1, len(trn_files))], index=['filename', 'is_training']).T
# val_files = pd.DataFrame([val_files, np.repeat(0, len(val_files))], index=['filename', 'is_training']).T
# all_files = pd.concat([trn_files, val_files], axis=0, ignore_index=True)
# all_files.to_csv(path + 'train_val_split.cvs', index=False)

### Shenzhen Version

In [None]:
shuffle = False # if False, training on doc-level; otherwise block-level
override = False 

include_paragraph = True
include_table = True

source = 'uk'
version = 'v2'
path = "/content/drive/My Drive/Colab Notebooks/datasets/"

if (not shuffle) and override:  # shuffle/split docs by assignment
    file = 'i2k_' + source + '_' + version + '.pickle'
    with open(path+file, 'rb') as f:
        xdat = joblib.load(f)
    print("full_data    shape:", xdat.shape)
    
    # val_files = ['221794582.htm', '221902556.htm', '222466057.htm', '222466194.htm', '222466913.htm',
    #             '222695709.htm', '222695715.htm', '222828866.htm', '223830364.htm', '223852392.htm']

    known = ['221794582.htm', '220021854.htm', '218286484.htm', '216588855.htm', '214893347.htm']
    unkwn = ['222695709.htm', '222695715.htm', '222828866.htm', '223830364.htm', '223852392.htm']
    val_files = known + unkwn
    
    is_valid = xdat['file_name'].isin(val_files)
    is_train = ~is_valid
else:
    file = 'i2k_' + source + '_' + 'v2' + '.pickle'
    with open(path+file, 'rb') as f:
        xdat = joblib.load(f)
    print("full_data    shape:", xdat.shape)

    if include_paragraph and not include_table:
        xdat = xdat[~xdat['is_table']]
    elif not include_paragraph and include_table:
        xdat = xdat[xdat['is_table']]
    print("current_data shape:", xdat.shape)

    if version == 'v2':
        tr_idx, val_idx = train_test_split(xdat.index, random_state=2018, test_size=0.1, shuffle=shuffle)
        is_train = xdat.index.isin(tr_idx)
        is_valid = xdat.index.isin(val_idx)

    if version == 'v3':  # expand v2 to v3
        tdat = xdat
        version = 'v3'
        file = 'i2k_' + source + '_' + 'v3' + '.pickle'
        with open(path+file, 'rb') as f:
            xdat = joblib.load(f)
        print("full_data    shape:", xdat.shape)
        
        tr_idx, val_idx = train_test_split(tdat.index, random_state=2018, test_size=0.1, shuffle=shuffle)
        is_train = xdat['rid'].isin(tr_idx)
        is_valid = xdat['rid'].isin(val_idx)

print("training data:", is_train.sum())
print("test     data:",  is_valid.sum())

In [None]:
groups = np.array(get_column(xdat['group'], 0))
labels = np.array(get_column(xdat['group'], 1))

pad_separate = False
unique_labels = sorted(set([x for label in labels for x in label]))
unique_labels.remove('O')
if pad_separate:
    unique_labels = ['<pad>', 'O'] + sorted(unique_labels)
else:
    unique_labels = ['O'] + sorted(unique_labels)

lab2idx = {t: i for i, t in enumerate(unique_labels)}
print("number of labels:", len(lab2idx))

In [None]:
chunk_data = True
MAX_LEN = 200
overlap = 30

tr_inputs, tr_masks, tr_labels, tr_tags = create_inputs(groups, labels, is_train, 
                                                        chunk_data, MAX_LEN, overlap, 
                                                        pad_separate)
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, is_valid, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

print("train, test before chunking:", len(groups[is_train]), len(groups[is_valid]))
print("train, test after  chunking:", len(tr_inputs), len(val_inputs))
print("training cases: ", tr_inputs.shape)
print("validation cases:", val_inputs.shape)

#### known vs. unknown data

##### v0

In [None]:
# training data
is_idx = xdat['file_name'].isin(set(xdat['file_name']).difference(val_files))

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# validation data
is_idx = xdat['file_name'].isin(val_files)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# unknown validation data
is_idx = xdat['file_name'].isin(unkwn)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known validation data
is_idx = xdat['file_name'].isin(known)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-well validation data
is_idx = xdat['file_name'].isin(known[:2])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-somewhat validation data
is_idx = xdat['file_name'].isin(known[2:])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

##### v2

In [None]:
# training data
is_idx = xdat['file_name'].isin(set(xdat['file_name']).difference(val_files))

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# validation data
is_idx = xdat['file_name'].isin(val_files)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# unknown validation data
is_idx = xdat['file_name'].isin(unkwn)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known validation data
is_idx = xdat['file_name'].isin(known)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-well validation data
is_idx = xdat['file_name'].isin(known[:2])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-somewhat validation data
is_idx = xdat['file_name'].isin(known[2:])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

##### v3

In [None]:
# training data
is_idx = xdat['file_name'].isin(set(xdat['file_name']).difference(val_files))

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# validation data
is_idx = xdat['file_name'].isin(val_files)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# unknown validation data
is_idx = xdat['file_name'].isin(unkwn)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known validation data
is_idx = xdat['file_name'].isin(known)

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-well validation data
is_idx = xdat['file_name'].isin(known[:2])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# known-somewhat validation data
is_idx = xdat['file_name'].isin(known[2:])

groups = np.array(get_column(xdat['group'][is_idx], 0))
labels = np.array(get_column(xdat['group'][is_idx], 1))
val_inputs, val_masks, val_labels, val_tags = create_inputs(groups, labels, None, 
                                                            chunk_data, MAX_LEN, 0, 
                                                            pad_separate)

val_ntop = len(val_inputs)
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print_params()
print(f1_score(trues3, preds3), '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

## Bert Model

### Model Training

build model

In [14]:
num_classes = len(lab2idx)
customize = True

# build model
if customize:
    inputs = Input(shape=(MAX_LEN,), dtype=tf.int32, name='inputs')
    layer = BertishModel.from_pretrained(model_fullname)
    layer = layer([inputs])
    layer = Lambda(lambda x: x[0])(layer)
    layer = Dropout(rate=0.2)(layer)
    output = Dense(num_classes, activation='softmax')(layer)
    model = Model([inputs], output)
else:
    config = BertishConfig.from_pretrained(model_fullname, 
                                           attention_dropout=0.1,  # somehow matters
                                           dropout=0.1,  # matters greatly
                                           num_labels=num_classes)
    model = BertishClassifier.from_pretrained(model_fullname, config=config)
    model.layers[-1].activation = tf.keras.activations.softmax

# compile model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.00003)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

train model

In [15]:
gc.collect()
# bs = 16
bs = 4
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)

eval_params = {
    'batch_size': bs,
    'inputs': val_inputs[:val_ntop],
    'tags': val_tags[:val_ntop],
    'labels': val_labels[:val_ntop],
    'unique_labels': unique_labels
}

pathfile = ''
# pathfile = path + 'weights_best_model.hdf5'
# pathfile = path + 'weights_{epoch:02d}_{val_f1_score:.4f}.hdf5'
evaluator = ModelEvaluator(model, eval_func=ner_scorer, eval_params=eval_params, monitor='val_f1_score',
                           patience=5, restore_best_weights=True,
                           pathfile=pathfile, save_best_only=True)

print_params()

max_len=200, full_data, data_v0, chunk_data=True, epoch=0, source='can', shuffle=False


In [16]:
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    callbacks=[evaluator],
                    epochs=100,
                    verbose=1)

print_params() 

Train on 3333 samples, validate on 400 samples


Epoch 1/100
val_f1_score: 0.6717


Epoch 2/100
val_f1_score: 0.7691


Epoch 3/100
val_f1_score: 0.7249


Epoch 4/100
val_f1_score: 0.8034


Epoch 5/100
val_f1_score: 0.7487


Epoch 6/100
val_f1_score: 0.7415


Epoch 7/100
val_f1_score: 0.7571


Epoch 8/100
val_f1_score: 0.7997


Epoch 9/100
val_f1_score: 0.7908


early stopped at Epoch 9
Epoch 4 gives the best 'val_f1_score' of 0.8034
Restoring model weights from Epoch 4
max_len=200, full_data, data_v0, chunk_data=True, epoch=4, source='can', shuffle=False


In [None]:
path + source + '_' + version + '_unshuffle.h5'  

In [None]:
save_model = True
if save_model:
    model.save_weights(path + source + '_' + version + '_unshuffle.h5')

In [None]:
[x for x in os.listdir(path) if x.endswith('h5') or x.endswith('hdf5')]

In [None]:
load_model = True
if load_model:
    model.load_weights(path + source + '_' + version + '_unshuffle.h5')

In [None]:
clean_models = False
if clean_models:
    [os.remove(path + x) for x in os.listdir(path) if x.endswith('h5') or x.endswith('hdf5')]

source = CAN

In [None]:
gc.collect()
bs = 16
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)

eval_params = {
    'batch_size': bs,
    'inputs': val_inputs[:val_ntop],
    'tags': val_tags[:val_ntop],
    'labels': val_labels[:val_ntop],
    'unique_labels': unique_labels
}

pathfile = ''
# pathfile = path + 'weights_best_model.hdf5'
# pathfile = path + 'weights_{epoch:02d}_{val_f1_score:.4f}.hdf5'
evaluator = ModelEvaluator(model, eval_func=ner_scorer, eval_params=eval_params, monitor='val_f1_score',
                           patience=5, restore_best_weights=True,
                           pathfile=pathfile, save_best_only=True)

history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    callbacks=[evaluator],
                    epochs=100,
                    verbose=1)

print_params()

source = UK

In [None]:
# max_len=40, full_labels, bert, full_data
gc.collect()
bs = 16
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    epochs=1,
                    verbose=1)

source = US

In [None]:
# max_len=80, bert, source='can'
gc.collect()
bs = 16
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    epochs=3,
                    verbose=1)

### Model Prediction

In [None]:
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

In [None]:
print(evaluator.best_score)
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))

In [None]:
probs = model.predict([val_inputs[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

print(evaluator.best_score)
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))

### Model Evaluation - B2

- compare Chicago vs. SZ versions with chunked-up full data
- different chunking strategies with varying max_length and overlap
- examine use of varying chunking strategies at each epoch to prevent overfitting
- paragraphs vs. tables
- remove 'support < 50' from test and/or from training data 

#### CAN Dataset

In [None]:
print_params()  # 10 vs. 10
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 10 vs. 10
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 10 vs. 10
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 10 vs. 10
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 1 vs. 9
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 157 sec/epoch
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 225 sec/epoch (16G)
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 225 sec/epoch
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 171 sec/epoch (16G)
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

#### UK Dataset

In [None]:
print_params()  # 186 sec/epoch (16G)
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # natural split
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # natural split
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # natural split
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 186 sec/epoch (16G)
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 285 sec/epoch (16G)
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

#### US Dataset

In [None]:
print_params()  # 139 (16G) sec/epoch
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 142 (16G) sec/epoch 304 (P4)
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()  # 276 sec/epoch (16G)
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

### Model Evaluation - B1

#### CAN Dataset

##### max_len = 300

In [None]:
print_params() 
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report() 

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=300, paragraph, data_v2, epoch=4
print(f1_score(trues3, preds3))
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=300, table, data_v2, epoch=3
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))

metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

##### max_len = 200

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params() 
print(evaluator.best_score, '\n') 
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report() 

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=200, paragraph, data_v2, epoch=4
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))

metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

##### max_len = 120

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=120, paragraph, data_v2, epoch=4
print(f1_score(trues3, preds3))
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=120, table, data_v2, epoch=7
print(f1_score(trues3, preds3))
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

##### max_len = 80

In [None]:
# max_len=80, full_data, data_v2, epoch=4-6
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=80, paragraph, data_v2, epoch=3-4
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

##### max_len = 50

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
# max_len=50, full_data, data_v2, epoch=3-4
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

In [None]:
print_params()
print(evaluator.best_score, '\n')
metric = FlatScore('precision', 'macro', trues=trues, preds=preds); metric.report()

#### UK Dataset

In [None]:
# max_len=120, bert, source='uk'
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

#### US Dataset

In [None]:
# max_len=120, bert, source='us'
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

In [None]:
# max_len=80, bert, source='uk'
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

### Bert + BiLSTM

build model

In [None]:
num_classes = len(lab2idx)

# build model
inputs = Input(shape=(MAX_LEN,), dtype=tf.int32, name='inputs')
layer = BertishModel.from_pretrained(model_fullname)
layer = layer([inputs])
layer = Lambda(lambda x: x[0])(layer)
layer = Dropout(rate=0.2)(layer)
layer = Bidirectional(LSTM(units=512, return_sequences=True,
                        recurrent_dropout=0.2, dropout=0.2))(layer)
layer2 = Bidirectional(LSTM(units=512, return_sequences=True,
                        recurrent_dropout=0.2, dropout=0.2))(layer)
layer = Add()([layer, layer2])  # residual connection to the first biLSTM
output = Dense(num_classes, activation='softmax')(layer)
model = Model([inputs], output)

# compile model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.00003)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

train model

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, 10000_data
gc.collect()
bs = 32
tr_ntop = 10000
val_ntop = 1000
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    epochs=1,
                    verbose=1)

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
gc.collect()
bs = 32
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    epochs=1,
                    verbose=1)

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
gc.collect()
bs = 32
tr_ntop = len(tr_inputs)
val_ntop = len(val_inputs)
history = model.fit(x=[tr_inputs[:tr_ntop]], 
                    y=tr_tags[:tr_ntop], 
                    validation_data=([val_inputs[:val_ntop]], val_tags[:val_ntop]),
                    batch_size=bs,
                    epochs=1,
                    verbose=1)

prediction

In [None]:
probs = model.predict([val_inputs[:val_ntop], val_masks[:val_ntop]], batch_size=bs)

preds = probs.argmax(2)
trues = val_tags[:val_ntop]
labes = val_labels[:val_ntop]
preds2 = [[unique_labels[i] for i in pred] for pred in preds]
trues2 = [[unique_labels[i] for i in true] for true in trues]
out = [(t[:len(l)], p[:len(l)]) for l,t,p in zip(labes, trues2, preds2)]
trues3, preds3 = zip(*out)

if collapse_labels:
    # unique_labels = sorted(set(xdat["tag"]))
    # unique_labels.remove('O')
    # unique_labels = ['O'] + sorted(unique_labels, key=lambda x: x.split('-')[1])
    # lab2idx = {t: i for i, t in enumerate(unique_labels)}
    lab2collapse = {x: x.split('-')[1] if x.find('-')>=0 else x for x in lab2idx}
    trues4 = [[lab2collapse[x] for x in true] for true in trues3]
    preds4 = [[lab2collapse[x] for x in pred] for pred in preds3]

evaluation

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, 10000_data
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))
if collapse_labels:
    print(f1_score(trues4, preds4))

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, 10000_data
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))
if collapse_labels:
    print(f1_score(trues4, preds4))

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))
if collapse_labels:
    print(f1_score(trues4, preds4))

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
print(f1_score(trues2, preds2))
print(f1_score(trues3, preds3))
if collapse_labels:
    print(f1_score(trues4, preds4))

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

In [None]:
# max_len=40, collapsed_labels, bert+BiLSTM, full_data
metric = FlatScore('precision', 'macro', trues=trues, preds=preds)
metric.report()

inspect samples

In [None]:
i = 22
print("truth tags",'\n', trues[i], '\n')
print("predictions", '\n', preds[i], '\n')
print("truth labels", '\n', np.array(labes[i]), '\n')

### ELMo + BiLSTM

In [None]:
https://colab.research.google.com/drive/1jNrJQIpwZaJoiq-Y_t9ALPc_g1QchqZB

## John's Sandbox

In [None]:
https://colab.research.google.com/drive/1AnrBf23MibMx1KbWlk8KcOYMKnyOn09v

## Scrap Code

In [None]:

agg_func = lambda s: tuple([(w, p, t) for w, p, t in zip(s["word"], s['pos'], s["tag"])])
groups = xdat.groupby("sentence").apply(agg_func)
sentences = [" ".join([s[0] for s in sent]) for sent in groups]
labels = [[s[2] for s in sent] for sent in groups]


# encode y labels
labels_unique = sorted(set(xdat["tag"].values))
lab2idx = {t: i for i, t in enumerate(labels_unique)}

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
attention_masks = (input_ids > 0).astype(int)

tags = pad_sequences([[lab2idx.get(l) for l in lab] for lab in labels],
                    maxlen=MAX_LEN, value=lab2idx["O"], padding="post",
                    dtype="long", truncating="post")
