# Load dataset

Data format:

|id|word_seq|tag_seq|
|:--|:--|:--|
|index of the sentence|tokenized words|corresponding NER tags|
|0|`["protection", "calves", ...]`|`["O", "LIVESTOCK", ...]`|
|1|`["prevent", "diarrhea",...]` |`["O", "DISEASE_OR_SYNDROME", ...]`|
|...|...|...|



There are 64 categories of NER tags (plus 1 padding token).

The ground-truth tags are provided for the training and validation sets, but are omitted from the test set.

In [1]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl

def _load_pickle(path):
    """Load one pickled dataset split and close the file handle afterwards.

    :param path: location of the pickle file, type: str
    return the unpickled object
    """
    # NOTE(security): pickle.load can execute arbitrary code while unpickling;
    # only point this at dataset files from a trusted source.
    with open(path, "rb") as handle:
        return pkl.load(handle)


train_dict = _load_pickle("data/train.pkl")
val_dict = _load_pickle("data/val.pkl")
test_dict = _load_pickle("data/test.pkl")
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [2]:
# Show the first training sentence as (word, tag) pairs
first_id = train_dict["id"][0]
print("index:", first_id)
first_pairs = zip(train_dict["word_seq"][0], train_dict["tag_seq"][0])
print(*first_pairs)

index: 0
('Protection', 'O') ('of', 'O') ('calves', 'LIVESTOCK') ('against', 'O') ('fatal', 'O') ('enteric', 'DISEASE_OR_SYNDROME') ('colibacillosis', 'DISEASE_OR_SYNDROME') ('by', 'O') ('orally', 'GENE_OR_GENOME') ('administered', 'GENE_OR_GENOME') ('Escherichia', 'GENE_OR_GENOME') ('coli', 'GENE_OR_GENOME') ('K99', 'GENE_OR_GENOME') ('-', 'O') ('specific', 'CARDINAL') ('monoclonal', 'CARDINAL') ('antibody', 'CARDINAL') ('.', 'O') ('A', 'O') ('monoclonal', 'CHEMICAL') ('antibody', 'CHEMICAL') ('(', 'O') ('MCA', 'GENE_OR_GENOME') (')', 'O') ('to', 'O') ('enterotoxigenic', 'CHEMICAL') ('Escherichia', 'CHEMICAL') ('coli', 'CHEMICAL') ('K99', 'O') ('antigen', 'O') ('agglutinated', 'O') ('K99+', 'GENE_OR_GENOME') ('enterotoxigenic', 'GENE_OR_GENOME') ('E', 'GENE_OR_GENOME') ('.', 'O') ('coli', 'CHEMICAL') ('strains', 'CHEMICAL') ('B44', 'CHEMICAL') ('(', 'O') ('O9', 'O') (':', 'O') ('K30', 'O') (';', 'O') ('K99', 'O') (';', 'O') ('F41', 'O') (':', 'O') ('H-', 'O') (')', 'O') ('and', 'O') (

In [3]:
# all the NER tags:
from itertools import chain
# distinct tag strings over every training sentence (built once, reported twice)
train_tag_set = set(chain.from_iterable(train_dict["tag_seq"]))
print("count of the NER tags:", len(train_tag_set))
print("all the NER tags:", train_tag_set)

count of the NER tags: 65
all the NER tags: {'MACHINE_ACTIVITY', 'CELL', 'DISEASE_OR_SYNDROME', 'INJURY_OR_POISONING', 'EUKARYOTE', 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE', 'MONEY', 'PERCENT', 'GENE_OR_GENOME', 'EDUCATIONAL_ACTIVITY', 'LOC', 'WORK_OF_ART', 'ARCHAEON', 'RESEARCH_ACTIVITY', 'CARDINAL', 'LABORATORY_PROCEDURE', 'CELL_COMPONENT', 'EVOLUTION', 'LABORATORY_OR_TEST_RESULT', 'DAILY_OR_RECREATIONAL_ACTIVITY', 'BODY_SUBSTANCE', 'LANGUAGE', 'BACTERIUM', 'TISSUE', 'SOCIAL_BEHAVIOR', 'DIAGNOSTIC_PROCEDURE', 'CELL_OR_MOLECULAR_DYSFUNCTION', 'BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'O', 'EVENT', 'MOLECULAR_FUNCTION', 'CELL_FUNCTION', 'SUBSTRATE', 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY', 'VIRUS', 'PERSON', 'FAC', 'GPE', 'LAW', 'LIVESTOCK', 'TIME', 'EXPERIMENTAL_MODEL_OF_DISEASE', 'PHYSICAL_SCIENCE', 'DATE', 'QUANTITY', 'VIRAL_PROTEIN', 'PRODUCT', 'FOOD', 'CHEMICAL', 'CORONAVIRUS', 'SIGN_OR_SYMPTOM', 'GROUP_ATTRIBUTE', 'NORP', 'GROUP', 'ORGAN_OR_TISSUE_FUNCTION', 'MATERIAL', 'IMMUNE_RESPONSE',

# Prepare the data for training

In [4]:
# Build the word vocabulary and the tag vocabulary from the training split.

# ids 0 and 1 are taken by the unknown-word and word-padding entries
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for sentence in train_dict['word_seq']:
    for token in sentence:
        # a new word receives the next free id; a known word is left untouched
        vocab_dict.setdefault(token, len(vocab_dict))

# id 0 is taken by the tag-padding entry
tag_dict = {'_t_pad_': 0}
for labels in train_dict['tag_seq']:
    for label in labels:
        tag_dict.setdefault(label, len(tag_dict))

# forward and reverse lookup tables
word2idx = vocab_dict
tag2idx = tag_dict
idx2word = {index: token for token, index in word2idx.items()}
idx2tag = {index: label for label, index in tag2idx.items()}

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

size of word vocab: 82275 size of tag_dict: 65


In [5]:
# The maximum length of a sentence is set to 128
max_sent_length = 128


def _encode_words(sentences, lookup):
    """Turn each sentence into a list of word ids using ``lookup`` and stack them."""
    return np.array([[lookup(word) for word in sentence] for sentence in sentences])


def _known_or_unk(word):
    """Return the id of ``word``, or 0 when it is not in the vocabulary."""
    return word2idx.get(word, 0)


def _encode_tags(tag_sequences):
    """Turn each tag sequence into a one-hot matrix over the tag vocabulary and stack them."""
    n_classes = len(tag_dict)
    return np.array([
        to_categorical([tag2idx[tag] for tag in sequence], num_classes=n_classes)
        for sequence in tag_sequences
    ])


# training words are looked up directly; validation/test words fall back to id 0
train_tokens = _encode_words(train_dict['word_seq'], word2idx.__getitem__)
val_tokens = _encode_words(val_dict['word_seq'], _known_or_unk)
test_tokens = _encode_words(test_dict['word_seq'], _known_or_unk)

train_tags = _encode_tags(train_dict['tag_seq'])
val_tags = _encode_tags(val_dict['tag_seq'])

# the test split carries no tags, so there is nothing to encode for it

In [6]:
# report the token-array and tag-array shapes of both labelled splits
split_shapes = [
    ("training size:", train_tokens.shape, train_tags.shape),
    ("validating size:", val_tokens.shape, val_tags.shape),
]
for size_label, token_shape, tag_shape in split_shapes:
    print(size_label, token_shape, "tag size:", tag_shape)

training size: (23600, 128) tag size: (23600, 128, 65)
validating size: (2950, 128) tag size: (2950, 128, 65)


In [7]:
# first ten word ids of the first training sentence, next to their tag ids
first_token_ids = train_tokens[0, :10]
first_tag_ids = train_tags[0, :10, :].argmax(axis=1)  # undo the one-hot encoding
print(first_token_ids, first_tag_ids)

[ 2  3  4  5  6  7  8  9 10 11] [1 1 2 1 1 3 3 1 4 4]


In [8]:
# dataset dimensions taken from the encoded training array
num_training_data, sequence_length = train_tokens.shape[0], train_tokens.shape[1]
# vocabulary sizes taken from the lookup dictionaries
vocabulary_size, num_tags = len(vocab_dict), len(tag_dict)

In [9]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
    Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed

In [10]:
def build_RNN(input_length, vocab_size, embedding_size,
              hidden_size, output_size,
              num_rnn_layers, num_mlp_layers,
              rnn_type="lstm",
              bidirectional=False,
              activation="tanh",
              dropout_rate=0.0,
              batch_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.001,
              metric="accuracy"):
    """
    Build and compile an RNN that predicts one tag distribution per token.

    The network is: embedding -> ``num_rnn_layers`` recurrent layers (each
    followed by dropout) -> ``num_mlp_layers - 1`` hidden dense layers ->
    softmax output layer. Every recurrent layer returns its full output
    sequence, so the model output has one softmax vector per time step.

    :param input_length: the maximum length of sentences, type: int
    :param vocab_size: the vocabulary size, type: int
    :param embedding_size: the dimension of word representations, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param output_size: the dimension of the prediction, type: int
    :param num_rnn_layers: the number of layers of the RNN, type: int
    :param num_mlp_layers: the number of layers of the MLP (the softmax output
        layer counts as one, so ``num_mlp_layers - 1`` hidden layers are added), type: int
    :param rnn_type: the type of RNN, one of "rnn", "lstm", "gru", type: str
    :param bidirectional: whether to use bidirectional rnn, type: bool
    :param activation: the activation type of the hidden dense layers, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization in the hidden dense layers, type: bool
    :param l2_reg: the weight for the L2 regularizer of the hidden dense layers, type: float
    :param loss: the training loss, type: str
    :param optimizer: the optimizer, one of "SGD", "RMSprop", "Adam", type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    :raises NotImplementedError: if ``rnn_type`` or ``optimizer`` is not supported
    return a compiled RNN for sequence tagging,
    # activation document: https://keras.io/activations/
    # dropout document: https://keras.io/layers/core/#dropout
    # embedding document: https://keras.io/layers/embeddings/#embedding
    # recurrent layers document: https://keras.io/layers/recurrent
    # batch normalization document: https://keras.io/layers/normalization/
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_length,))

    ################################
    ###### Word Representation #####
    ################################
    # word representation layer
    emb = Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    input_length=input_length,
                    embeddings_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=0))(x)

    ################################
    ####### Recurrent Layers #######
    ################################
    # recurrent layers
    # Reference: https://keras.io/api/layers/#recurrent-layers
    if rnn_type == "rnn":
        fn = SimpleRNN
    elif rnn_type == "lstm":
        fn = LSTM
    elif rnn_type == "gru":
        fn = GRU
    else:
        raise NotImplementedError("unsupported rnn_type: %r" % (rnn_type,))

    h = emb
    for i in range(num_rnn_layers):
        # return_sequences=True keeps the whole sequence [h_1, h_2, ..., h_n]
        # instead of only h_n. Tagging needs one output per token, so every
        # layer (including the last one) must return the full sequence, in the
        # unidirectional case just like in the bidirectional case.
        rnn_layer = fn(hidden_size,
                       kernel_initializer=keras.initializers.glorot_uniform(seed=0),
                       recurrent_initializer=keras.initializers.Orthogonal(gain=1.0, seed=0),
                       return_sequences=True)
        if bidirectional:
            rnn_layer = Bidirectional(rnn_layer)
        h = rnn_layer(h)
        h = Dropout(dropout_rate, seed=0)(h)

    ################################
    #### Fully Connected Layers ####
    ################################
    # multi-layer perceptron
    for i in range(num_mlp_layers-1):
        new_h = Dense(hidden_size,
                      kernel_initializer=keras.initializers.he_normal(seed=0),
                      bias_initializer="zeros",
                      kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
        # add batch normalization layer
        if batch_norm:
            new_h = BatchNormalization()(new_h)
        # add residual connection (not for the first hidden layer, whose input
        # width may differ from hidden_size)
        if i == 0:
            h = new_h
        else:
            h = Add()([h, new_h])
        # add activation
        h = Activation(activation)(h)
    y = Dense(output_size,
              activation="softmax",
              kernel_initializer=keras.initializers.he_normal(seed=0),
              bias_initializer="zeros")(h)

    # set the loss, the optimizer, and the metric
    if optimizer == "SGD":
        # `learning_rate` is the supported keyword; `lr` is deprecated
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer == "RMSprop":
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError("unsupported optimizer: %r" % (optimizer,))
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    return model

In [None]:
# Train a single-layer bidirectional LSTM tagger, keep the best checkpoint,
# and report loss/accuracy on the training and validation splits.
embedding_size = 64
hidden_size = 64
num_rnn_layers = 1
num_mlp_layers = 1
os.makedirs("models", exist_ok=True)
# With num_mlp_layers = 1 no hidden dense layer is built, so activation,
# batch_norm and l2_reg are accepted by build_RNN but have no effect here.
model = build_RNN(max_sent_length, vocabulary_size, embedding_size,
              hidden_size, num_tags,
              num_rnn_layers, num_mlp_layers,
              rnn_type="lstm",
              bidirectional=True,
              activation="relu",
              dropout_rate=0.3,
              batch_norm=True,
              l2_reg=0.3,
              loss="categorical_crossentropy",
              optimizer="Adam",
              learning_rate=0.01,
              metric="accuracy")

# keep only the weights of the epoch with the best held-out accuracy
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)
# stop once the held-out loss has not improved for 5 epochs
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=0)

np.random.seed(0)
tf.random.set_seed(0)
# validation_split carves the monitoring data out of the training arrays;
# val_tokens / val_tags are not seen during fitting.
rnn_history = model.fit(train_tokens, train_tags,
                    validation_split=0.1,
                    epochs=10, batch_size=100, verbose=1,
                    callbacks=[checkpointer, earlystopping])
# reload the best checkpoint written by the ModelCheckpoint callback
model = keras.models.load_model(os.path.join("models", "weights.hdf5"))

train_score = model.evaluate(train_tokens, train_tags,
                             batch_size=100)
# This score comes from the validation split (the test split has no tags),
# so it is named and reported as validation rather than test.
val_score = model.evaluate(val_tokens, val_tags,
                           batch_size=100)
test_score = val_score  # old name kept for any cell that still reads it
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("validation loss:", val_score[0], "validation accuracy", val_score[1])

Epoch 1/10

In [13]:
# validation split: per-token probabilities -> tag ids -> tag strings
val_preds = model.predict(val_tokens)
val_preds_id = val_preds.argmax(axis=2)
val_preds_labels = np.array([[idx2tag[tag_id] for tag_id in row] for row in val_preds_id])

In [14]:
# test split: per-token probabilities -> tag ids -> tag strings
preds = model.predict(test_tokens)
preds_id = preds.argmax(axis=2)
preds_labels = np.array([[idx2tag[tag_id] for tag_id in row] for row in preds_id])

In [15]:
# gold validation tags: undo the one-hot encoding, then map ids to tag strings
val_tags_by_idx = val_tags.argmax(axis=2)
val_labels = np.array([[idx2tag[tag_id] for tag_id in row] for row in val_tags_by_idx])

In [16]:
# Provided function to test accuracy
# You could check the validation accuracy to select the best of your models
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """
        Token-level accuracy over the positions whose gold tag is not padding.

        Input:
            preds (np.ndarray): (num_data, length_sentence), predicted tags
            tags  (np.ndarray): (num_data, length_sentence), gold tags
            padding_id: the value in `tags` that marks a padding position
        Output:
            Proportion of correct prediction. The padding tokens are filtered out.
            Returns 0.0 when `tags` contains no non-padding position.
        Raises:
            ValueError: if `preds` and `tags` do not hold the same number of elements.
    """
    preds_flatten = np.asarray(preds).ravel()
    tags_flatten = np.asarray(tags).ravel()
    if preds_flatten.size != tags_flatten.size:
        raise ValueError(
            "preds and tags must have the same number of elements, got %d and %d"
            % (preds_flatten.size, tags_flatten.size))

    # positions are selected by the gold tag only; what was predicted there is irrelevant
    non_padding = tags_flatten != padding_id
    num_scored = np.count_nonzero(non_padding)
    if num_scored == 0:
        # nothing to score; avoid dividing by zero
        return np.float64(0.0)

    # vectorised count instead of Python's sum() over a NumPy array
    num_correct = np.count_nonzero(preds_flatten[non_padding] == tags_flatten[non_padding])
    return np.float64(num_correct) / num_scored

In [17]:
# padding-filtered accuracy of the validation predictions
val_pred_accuracy = calc_accuracy(val_preds_labels, val_labels)
print("Pred Acc:", val_pred_accuracy)

Pred Acc: 0.9048193468136191


In [18]:
# test set
import json
import pandas as pd

# one JSON-encoded list of predicted tags per test sentence
label_column = [json.dumps(np.asarray(row).tolist()) for row in preds_labels]
df = pd.DataFrame({'id': test_dict["id"], 'labels': label_column})
df.to_csv('test_preds.csv', index=False)

In [19]:
# Read test_preds.csv back from disk and display it to inspect the saved predictions.
pd.read_csv("test_preds.csv")

Unnamed: 0,id,labels
0,0,"[""O"", ""O"", ""IMMUNE_RESPONSE"", ""IMMUNE_RESPONSE..."
1,1,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""_t_pad_"", ""_t_..."
2,2,"[""O"", ""O"", ""O"", ""RESEARCH_ACTIVITY"", ""RESEARCH..."
3,3,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""CHEMICAL"", ""O"", ""..."
4,4,"[""O"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ""O"",..."
...,...,...
2945,2945,"[""DATE"", ""O"", ""CORONAVIRUS"", ""O"", ""O"", ""O"", ""O..."
2946,2946,"[""VIRUS"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ..."
2947,2947,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""O"", ""O"", ""GENE_OR..."
2948,2948,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""CHEMICAL"", ""CH..."
