# Load dataset

Data format:

|id|word_seq|tag_seq|
|:--|:--|:--|
|index of the sentence|tokenized words|corresponding NER tags|
|0|`["protection", "calves", ...]`|`["O", "LIVESTOCK", ...]`|
|1|`["prevent", "diarrhea",...]` |`["O", "DISEASE_OR_SYNDROME", ...]`|
|...|...|...|



There are 64 categories of NER tags (plus 1 padding token).

The ground-truth tags are provided for the training and validation sets, while being omitted in the testing set.

In [1]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl

# Load the three dataset splits from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
# Context managers make sure each file handle is closed right after loading.
with open("data/train.pkl", "rb") as train_file:
    train_dict = pkl.load(train_file)
with open("data/val.pkl", "rb") as val_file:
    val_dict = pkl.load(val_file)
with open("data/test.pkl", "rb") as test_file:
    test_dict = pkl.load(test_file)

# Show which fields each split provides.
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [2]:
# Display one training example: its id, followed by (word, tag) pairs.
first_words = train_dict["word_seq"][0]
first_tags = train_dict["tag_seq"][0]
print("index:", train_dict["id"][0])
print(*zip(first_words, first_tags))

index: 0
('Protection', 'O') ('of', 'O') ('calves', 'LIVESTOCK') ('against', 'O') ('fatal', 'O') ('enteric', 'DISEASE_OR_SYNDROME') ('colibacillosis', 'DISEASE_OR_SYNDROME') ('by', 'O') ('orally', 'GENE_OR_GENOME') ('administered', 'GENE_OR_GENOME') ('Escherichia', 'GENE_OR_GENOME') ('coli', 'GENE_OR_GENOME') ('K99', 'GENE_OR_GENOME') ('-', 'O') ('specific', 'CARDINAL') ('monoclonal', 'CARDINAL') ('antibody', 'CARDINAL') ('.', 'O') ('A', 'O') ('monoclonal', 'CHEMICAL') ('antibody', 'CHEMICAL') ('(', 'O') ('MCA', 'GENE_OR_GENOME') (')', 'O') ('to', 'O') ('enterotoxigenic', 'CHEMICAL') ('Escherichia', 'CHEMICAL') ('coli', 'CHEMICAL') ('K99', 'O') ('antigen', 'O') ('agglutinated', 'O') ('K99+', 'GENE_OR_GENOME') ('enterotoxigenic', 'GENE_OR_GENOME') ('E', 'GENE_OR_GENOME') ('.', 'O') ('coli', 'CHEMICAL') ('strains', 'CHEMICAL') ('B44', 'CHEMICAL') ('(', 'O') ('O9', 'O') (':', 'O') ('K30', 'O') (';', 'O') ('K99', 'O') (';', 'O') ('F41', 'O') (':', 'O') ('H-', 'O') (')', 'O') ('and', 'O') (

In [4]:
# all the NER tags:
from itertools import chain

# Build the set of distinct tags once and reuse it for both the count and the listing.
ner_tags = set(chain.from_iterable(train_dict["tag_seq"]))
print("count of the NER tags:", len(ner_tags))
print("all the NER tags:", ner_tags)

count of the NER tags: 65
all the NER tags: {'ORDINAL', 'DISEASE_OR_SYNDROME', 'WILDLIFE', 'MACHINE_ACTIVITY', 'FOOD', 'CELL_OR_MOLECULAR_DYSFUNCTION', 'MATERIAL', 'PERSON', 'SIGN_OR_SYMPTOM', 'INDIVIDUAL_BEHAVIOR', 'RESEARCH_ACTIVITY', 'ORGAN_OR_TISSUE_FUNCTION', 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE', 'VIRUS', 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS', 'CORONAVIRUS', 'VIRAL_PROTEIN', 'INJURY_OR_POISONING', 'LABORATORY_OR_TEST_RESULT', 'TIME', 'EUKARYOTE', 'ANATOMICAL_STRUCTURE', 'ORGANISM', 'EVENT', 'DATE', 'PERCENT', '_t_pad_', 'CELL', 'NORP', 'EVOLUTION', 'IMMUNE_RESPONSE', 'GENE_OR_GENOME', 'TISSUE', 'LIVESTOCK', 'CARDINAL', 'ARCHAEON', 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY', 'PRODUCT', 'PHYSICAL_SCIENCE', 'SOCIAL_BEHAVIOR', 'EXPERIMENTAL_MODEL_OF_DISEASE', 'LOC', 'LAW', 'BACTERIUM', 'CHEMICAL', 'O', 'MOLECULAR_FUNCTION', 'SUBSTRATE', 'MONEY', 'CELL_FUNCTION', 'QUANTITY', 'GROUP_ATTRIBUTE', 'LABORATORY_PROCEDURE', 'BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'WORK_OF_ART', 'GPE', 'BODY_SUBSTANC

# Prepare the data for training

In [5]:
# Build the word and tag vocabularies from the training split only.

# Ids 0 and 1 are reserved for the unknown-word and word-padding tokens.
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for sentence in train_dict['word_seq']:
    for token in sentence:
        # setdefault inserts only unseen words, so ids follow first-occurrence order
        vocab_dict.setdefault(token, len(vocab_dict))

# Id 0 is reserved for the tag padding token.
tag_dict = {'_t_pad_': 0}
for sentence_tags in train_dict['tag_seq']:
    for label in sentence_tags:
        tag_dict.setdefault(label, len(tag_dict))

# Forward (token -> id) and reverse (id -> token) lookup tables.
word2idx, tag2idx = vocab_dict, tag_dict
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

size of word vocab: 82275 size of tag_dict: 65


In [6]:
# The maximum length of a sentence is set to 128
max_sent_length = 128


def _one_hot_tags(tag_seqs):
    """Map every tag to its id via tag2idx and one-hot encode each sequence."""
    return np.array([
        to_categorical([tag2idx[label] for label in seq], num_classes=len(tag_dict))
        for seq in tag_seqs
    ])


# Training words are always in the vocabulary (it was built from them);
# words unseen in training fall back to id 0 ('_unk_') for val/test.
train_tokens = np.array([[word2idx[token] for token in sent] for sent in train_dict['word_seq']])
val_tokens = np.array([[word2idx.get(token, 0) for token in sent] for sent in val_dict['word_seq']])
test_tokens = np.array([[word2idx.get(token, 0) for token in sent] for sent in test_dict['word_seq']])

train_tags = _one_hot_tags(train_dict['tag_seq'])
val_tags = _one_hot_tags(val_dict['tag_seq'])

# we don't have test tags

In [6]:
# Report the array shapes of the encoded inputs and one-hot targets per split.
for split_name, split_tokens, split_tags in (
    ("training", train_tokens, train_tags),
    ("validating", val_tokens, val_tags),
):
    print(split_name + " size:", split_tokens.shape, "tag size:", split_tags.shape)

training size: (23600, 128) tag size: (23600, 128, 65)
validating size: (2950, 128) tag size: (2950, 128, 65)


In [7]:
# an example of training instance and training tags.
print(train_tokens[0,:10], np.argmax(train_tags[0, :10, :], axis=1))

[ 2  3  4  5  6  7  8  9 10 11] [1 1 2 1 1 3 3 1 4 4]


In [8]:
# Sizes derived from the prepared data; the model definition reads these.
vocabulary_size = len(vocab_dict)
num_tags = len(tag_dict)
num_training_data, sequence_length = train_tokens.shape[0], train_tokens.shape[1]

In [9]:
# Training / model hyperparameters.
drop = 0.3  # dropout rate passed to every Dropout layer of the model
epochs = 20  # maximum number of training epochs passed to model.fit
batch_size = 128  # mini-batch size passed to model.fit
embedding_dim = 64  # output dimension of the word Embedding layer

hidden_size = 64  # units of each LSTM (per direction inside Bidirectional)

In [10]:
import keras
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Input, Add, Concatenate,\
    Bidirectional, SimpleRNN, LSTM, GRU, TimeDistributed
from keras.models import Sequential, Model

In [11]:
# Sequence tagger: Embedding -> Dropout -> BiLSTM -> Dropout -> BiLSTM -> Dropout -> per-token softmax.
inputs = Input(shape=(sequence_length,), dtype='int32')
emb_layer = Embedding(input_dim=vocabulary_size, 
                    output_dim=embedding_dim, 
                    input_length=sequence_length)
embedding = emb_layer(inputs)
# output: embedding -> [batch_size, sequence_length, embedding_dim]

drop_embed = Dropout(drop)(embedding)

lstm_out1 = Bidirectional(LSTM(units=hidden_size, return_sequences=True))(drop_embed)
# output: lstm_out1 -> [batch_size, sequence_length, 2 * hidden_size]
# (the model summary reports 128 features for hidden_size = 64)

drop_lstm1 = Dropout(drop)(lstm_out1)

lstm_out2 = Bidirectional(LSTM(units=hidden_size, return_sequences=True))(drop_lstm1)
# output: lstm_out2 -> [batch_size, sequence_length, 2 * hidden_size]

dropout_lstm2 = Dropout(drop)(lstm_out2)


# The same Dense softmax layer is applied at every time step, giving one tag distribution per token.
outputs = TimeDistributed(Dense(units=num_tags, activation='softmax'))(dropout_lstm2)
# output: outputs -> [batch_size, sequence_length, num_tags]

In [12]:
# Assemble the functional model from the input/output tensors defined in the previous cell.
model = Model(inputs=inputs, outputs=outputs)

# Targets are one-hot encoded per token, hence categorical cross-entropy.
adam = keras.optimizers.Adam()
model.compile(loss='categorical_crossentropy', optimizer=adam,metrics=["accuracy"])

# summary() prints the table itself and returns None, so it must not be wrapped in
# print() (that would append a stray "None" line to the output).
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 128, 64)           5265600   
_________________________________________________________________
dropout (Dropout)            (None, 128, 64)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 128, 128)          66048     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 128)          98816     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128, 128)          0     

In [13]:
# Keep only the weights of the epoch with the best validation accuracy.
weights_path = os.path.join("models", "weights.hdf5")
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=weights_path,
    monitor="val_accuracy",
    save_best_only=True,
    verbose=0,
)

# Stop training once the validation loss has not improved for 5 consecutive epochs.
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, verbose=0
)

In [14]:
print("Traning Model...")

# Training settings gathered in one place; validation_split=0.1 makes Keras hold out
# 10% of the training arrays for the per-epoch validation metrics the callbacks monitor.
fit_options = dict(
    validation_split=0.1,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=[checkpointer, earlystopping],
)
history = model.fit(train_tokens, train_tags, **fit_options)

Traning Model...
Epoch 1/20

ResourceExhaustedError:  [_Derived_]  OOM when allocating tensor with shape[128,128,64] and type float on /job:localhost/replica:0/task:0/device:CPU:0 by allocator cpu
	 [[{{node gradients/TensorArrayUnstack/TensorListFromTensor_grad/TensorListStack}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[StatefulPartitionedCall_3]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.
 [Op:__inference_train_function_11100]

Function call stack:
train_function -> train_function -> train_function


In [None]:
# Evaluate the trained model; each score is [loss, accuracy] as configured in compile().
train_score = model.evaluate(train_tokens, train_tags,
                             batch_size=100)
# The variable keeps its original name `test_score`, but the data evaluated here is the
# validation split (the test split has no tags), so the printed labels say "validation".
test_score = model.evaluate(val_tokens, val_tags,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("validation loss:", test_score[0], "validation accuracy", test_score[1])

In [None]:
# Run the model on the test sentences; the softmax output layer yields one
# distribution over the tags for every token position.
preds = model.predict(test_tokens)

In [46]:
# Sanity check: the argmax over the tag axis (axis=2) leaves one tag id per token.
np.argmax(preds, axis=2).shape

(2950, 128)

In [47]:
# Most probable tag id for every token of every test sentence.
preds_id = np.argmax(preds, axis=2)

In [48]:
# Translate the predicted tag ids of the test sentences back into tag strings.
preds_labels = np.array([
    [idx2tag[tag_id] for tag_id in sentence_ids]
    for sentence_ids in preds_id
])

In [40]:
# Run the model on the validation sentences (one tag distribution per token position).
val_preds = model.predict(val_tokens)

In [41]:
# Sanity check: the argmax over the tag axis (axis=2) leaves one tag id per token.
np.argmax(val_preds, axis=2).shape

(2950, 128)

In [42]:
# Most probable tag id for every token of every validation sentence.
val_preds_id = np.argmax(val_preds, axis=2)

In [43]:
# Translate the predicted tag ids of the validation sentences back into tag strings.
val_preds_labels = np.array([
    [idx2tag[tag_id] for tag_id in sentence_ids]
    for sentence_ids in val_preds_id
])

In [37]:
# Recover the ground-truth tag ids from the one-hot validation targets,
# then translate them into tag strings.
val_tags_by_idx = np.argmax(val_tags, axis=2)
val_labels = np.array([
    [idx2tag[tag_id] for tag_id in sentence_ids]
    for sentence_ids in val_tags_by_idx
])

In [38]:
# Provided function to test accuracy
# You could check the validation accuracy to select the best of your models
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """
        Token-level accuracy that ignores padding positions.

        Input:
            preds (np.ndarray): (num_data, length_sentence) predicted labels
            tags  (np.ndarray): (num_data, length_sentence) ground-truth labels
            padding_id: label that marks padding in `tags`; those positions are skipped
        Output:
            Proportion of correct prediction. The padding tokens are filtered out.
            Returns 0.0 when `tags` contains no non-padding position.
    """
    # asarray lets nested lists be passed as well as arrays
    preds_flatten = np.asarray(preds).ravel()
    tags_flatten = np.asarray(tags).ravel()
    non_padding_idx = np.flatnonzero(tags_flatten != padding_id)

    # Nothing to score (empty input or padding only): avoid dividing by zero
    if non_padding_idx.size == 0:
        return 0.0

    # Count matches in NumPy instead of summing element by element in Python
    num_correct = np.count_nonzero(
        preds_flatten[non_padding_idx] == tags_flatten[non_padding_idx])
    return num_correct / non_padding_idx.size

In [49]:
# NOTE(review): `preds_labels` holds the predictions for the *test* sentences, while
# `val_labels` is the ground truth of the *validation* sentences, so this figure does
# not measure accuracy on either split — presumably a leftover; confirm before relying on it.
print("Pred Acc:", calc_accuracy(preds_labels, val_labels))

Pred Acc: 0.5555786976771095


In [44]:
# Padding-aware token accuracy of the model on the validation split.
val_acc = calc_accuracy(val_preds_labels, val_labels)
print("Val Acc:", val_acc)

Val Acc: 0.8905204084584454


In [None]:
# Write the model's test-set predictions in the submission format:
# one row per sentence with its id and a JSON-encoded list of predicted tags.

import json
import pandas as pd

test_labels_json = [json.dumps(np.array(sentence_tags).tolist()) for sentence_tags in preds_labels]
df = pd.DataFrame({'id': test_dict["id"], 'labels': test_labels_json})
df.to_csv('test_preds.csv', index=False)

In [None]:
# Read the file back to eyeball the written format.
pd.read_csv("test_preds.csv")

Unnamed: 0,id,labels
0,0,"[""O"", ""O"", ""IMMUNE_RESPONSE"", ""IMMUNE_RESPONSE..."
1,1,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""_t_pad_"", ""_t_..."
2,2,"[""O"", ""O"", ""O"", ""RESEARCH_ACTIVITY"", ""RESEARCH..."
3,3,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMI..."
4,4,"[""O"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ""O"",..."
...,...,...
2945,2945,"[""DATE"", ""O"", ""CORONAVIRUS"", ""O"", ""O"", ""O"", ""O..."
2946,2946,"[""VIRUS"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ..."
2947,2947,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""O"", ""O"", ""DISEASE..."
2948,2948,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""CHEMICAL""..."


In [50]:
# Write the model's validation-set predictions in the submission format:
# one row per sentence with its id and a JSON-encoded list of predicted tags.

import json
import pandas as pd

val_labels_json = [json.dumps(np.array(sentence_tags).tolist()) for sentence_tags in val_preds_labels]
df = pd.DataFrame({'id': val_dict["id"], 'labels': val_labels_json})
df.to_csv('val_preds.csv', index=False)

In [54]:
# Read the file back to eyeball the written format.
pd.read_csv("val_preds.csv")

Unnamed: 0,id,labels
0,0,"[""O"", ""O"", ""O"", ""O"", ""GENE_OR_GENOME"", ""O"", ""C..."
1,1,"[""ORGANISM"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O""..."
2,2,"[""O"", ""O"", ""O"", ""O"", ""DISEASE_OR_SYNDROME"", ""D..."
3,3,"[""O"", ""O"", ""O"", ""VIRUS"", ""O"", ""WILDLIFE"", ""WIL..."
4,4,"[""EUKARYOTE"", ""VIRUS"", ""O"", ""O"", ""GENE_OR_GENO..."
...,...,...
2945,2945,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""GENE..."
2946,2946,"[""O"", ""O"", ""IMMUNE_RESPONSE"", ""IMMUNE_RESPONSE..."
2947,2947,"[""O"", ""DATE"", ""CORONAVIRUS"", ""O"", ""O"", ""O"", ""O..."
2948,2948,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ..."


# Two simple models and codes for evaluation

1. Predict all the tags as "O".
2. Random guess

You could use the `calc_accuracy` function to evaluate the accuracy of your predictions.

In [23]:
# Provided function to test accuracy
# You could check the validation accuracy to select the best of your models
def calc_accuracy(preds, tags, padding_id="_t_pad_"):
    """
        Token-level accuracy that ignores padding positions.

        Input:
            preds (np.ndarray): (num_data, length_sentence) predicted labels
            tags  (np.ndarray): (num_data, length_sentence) ground-truth labels
            padding_id: label that marks padding in `tags`; those positions are skipped
        Output:
            Proportion of correct prediction. The padding tokens are filtered out.
            Returns 0.0 when `tags` contains no non-padding position.
    """
    # asarray lets nested lists be passed as well as arrays
    preds_flatten = np.asarray(preds).ravel()
    tags_flatten = np.asarray(tags).ravel()
    non_padding_idx = np.flatnonzero(tags_flatten != padding_id)

    # Nothing to score (empty input or padding only): avoid dividing by zero
    if non_padding_idx.size == 0:
        return 0.0

    # Count matches in NumPy instead of summing element by element in Python
    num_correct = np.count_nonzero(
        preds_flatten[non_padding_idx] == tags_flatten[non_padding_idx])
    return num_correct / non_padding_idx.size

In [24]:
# Check accuracy on the training set
train_tags_by_idx = np.argmax(train_tags, axis=2)
# Lookup array: position i holds the tag string of tag id i (ids are assigned
# contiguously from 0 when the tag vocabulary is built), so whole id arrays can be
# translated by indexing instead of nested Python loops.
tag_lookup = np.array([idx2tag[i] for i in range(len(idx2tag))])
train_labels = tag_lookup[train_tags_by_idx]

# Comparing the labels with themselves must give 1.0 (sanity check of calc_accuracy).
print(calc_accuracy(train_labels, train_labels))

# Predict all labels as "O"
# The tag is named explicitly instead of indexing idx2tag with the float values of
# np.ones and relying on "O" happening to have id 1.
baseline1_train_preds = np.full(train_labels.shape, "O")
print("baseline 1, make all predictions as 1. Acc:", 
      calc_accuracy(baseline1_train_preds, 
                    train_labels))

# Randomly guess labels.
# Ids are drawn from 1..len(tag_dict)-1, so the padding tag (id 0) is never guessed.
random_tag_ids = np.random.randint(1, len(tag_dict), train_labels.shape)
baseline2_train_preds = tag_lookup[random_tag_ids]
print("baseline 2, Random guess. Acc:", 
      calc_accuracy(baseline2_train_preds,
                    train_labels))

1.0
baseline 1, make all predictions as 1. Acc: 0.7562260387120905
baseline 2, Random guess. Acc: 0.015568455525377604


# Output format

In this project, you should predict the NER tags for the test set tokens.

The index of test set starts from 0 and ends with 2949.

You should write the predictions into a .csv file, where the first column is the test indexes in ascending order, and the second column is a json format prediction list.

E.g.

|id|labels|
|:--:|:--:|
|0|`['O', 'O', 'CHEMICAL', 'VIRUS', ...]`|
|1|`['O', 'O', 'GENE_OR_GENOME', ...]`|
|...|...|

Format requirements:
1. The first column `id` should be an integer, in ascending order, starting from 0 and corresponding to the index in test_dict.
2. The second column `labels` should be a string dumped using json, storing your predictions for each token. The size of the list should be exactly 128, including padding tokens.

### For example, this is your prediction for the test set:

In [25]:
# Example predictions: a random non-padding tag id (1..len(tag_dict)-1) for every
# token slot of every test sentence, translated into tag strings.
num_test_sentences = len(test_dict["id"])
test_preds_numerical = np.random.randint(
    1, len(tag_dict), (num_test_sentences, max_sent_length))
test_preds = np.array([
    [idx2tag[tag_id] for tag_id in sentence_ids]
    for sentence_ids in test_preds_numerical
])
print(test_preds.shape)
print(test_preds[0])

(2950, 128)
['LOC' 'EVOLUTION' 'GROUP_ATTRIBUTE' 'TISSUE' 'NORP'
 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY' 'CELL_OR_MOLECULAR_DYSFUNCTION'
 'PERCENT' 'CELL_FUNCTION' 'MATERIAL' 'LIVESTOCK' 'PRODUCT' 'DATE'
 'WORK_OF_ART' 'CELL_OR_MOLECULAR_DYSFUNCTION' 'MATERIAL'
 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS'
 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY' 'ORDINAL' 'QUANTITY'
 'MOLECULAR_FUNCTION' 'EVENT' 'CELL_OR_MOLECULAR_DYSFUNCTION' 'ORDINAL'
 'EVOLUTION' 'CELL_FUNCTION' 'NORP' 'MATERIAL'
 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS' 'PERSON' 'PERCENT'
 'INJURY_OR_POISONING' 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS' 'EVOLUTION'
 'IMMUNE_RESPONSE' 'ORDINAL' 'CELL_COMPONENT' 'PRODUCT'
 'DIAGNOSTIC_PROCEDURE' 'PERSON' 'O' 'CHEMICAL' 'GPE' 'O'
 'MOLECULAR_FUNCTION' 'DATE' 'LANGUAGE' 'INJURY_OR_POISONING'
 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY' 'PHYSICAL_SCIENCE' 'QUANTITY'
 'CELL_COMPONENT' 'EVENT' 'CELL_OR_MOLECULAR_DYSFUNCTION' 'VIRAL_PROTEIN'
 'TIME' 'INDIVIDUAL_BEHAVIOR' 'SIGN_OR_SYMPTOM' 'BACTERIUM' 'PERSON'
 'EXP

In [31]:
# Write the test-set predictions held in `preds_labels` in the submission format:
# one row per sentence with its id and a JSON-encoded list of predicted tags.

import json
import pandas as pd

test_labels_json = [json.dumps(np.array(sentence_tags).tolist()) for sentence_tags in preds_labels]
df = pd.DataFrame({'id': test_dict["id"], 'labels': test_labels_json})
df.to_csv('test_preds.csv', index=False)

In [32]:
# Read the file back to eyeball the written format.
pd.read_csv("test_preds.csv")

Unnamed: 0,id,labels
0,0,"[""O"", ""O"", ""IMMUNE_RESPONSE"", ""IMMUNE_RESPONSE..."
1,1,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""_t_pad_"", ""_t_..."
2,2,"[""O"", ""O"", ""O"", ""RESEARCH_ACTIVITY"", ""RESEARCH..."
3,3,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMI..."
4,4,"[""O"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ""O"",..."
...,...,...
2945,2945,"[""DATE"", ""O"", ""CORONAVIRUS"", ""O"", ""O"", ""O"", ""O..."
2946,2946,"[""VIRUS"", ""CHEMICAL"", ""CHEMICAL"", ""CHEMICAL"", ..."
2947,2947,"[""O"", ""O"", ""O"", ""CHEMICAL"", ""O"", ""O"", ""DISEASE..."
2948,2948,"[""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""O"", ""CHEMICAL""..."


# Please make your output-format exactly the same as above

You could check it by playing around with the validation set with our evaluation codes `evaluate.py`:

In [53]:
# Alternative inputs for trying out the evaluation script (random guess / all-ones baseline):
# val_preds_numerical = np.random.randint(1, len(tag_dict), 
#                                          (len(val_dict["id"]), max_sent_length))
# val_preds = np.array([[idx2tag[p] for p in preds] for preds in np.ones((len(val_dict["id"]), max_sent_length))])

import json
import pandas as pd

from evaluate import evaluate

# Dump the validation predictions in the submission format ...
val_labels_json = [json.dumps(np.array(sentence_tags).tolist()) for sentence_tags in val_preds_labels]
df = pd.DataFrame({'id': val_dict["id"], 'labels': val_labels_json})
df.to_csv('val_preds.csv', index=False)

# ... and score the written file against the validation pickle with the provided script.
val_accuracy = evaluate('val_preds.csv', "data/val.pkl")
print("val accuracy", val_accuracy)

val accuracy 0.8905204084584454
