In [81]:
import keras
from keras.utils import to_categorical
import numpy as np
import os
import pickle as pkl
import tensorflow as tf
from nltk.stem import PorterStemmer
from numpy.random import seed
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Embedding, Dropout, BatchNormalization, Activation, Input, \
    Conv1D, MaxPool1D, Flatten, Concatenate, Add, Average
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [2]:
# Load the pickled train / validation / test splits.
# Each file is opened in a `with` block so the handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("data/train.pkl", "rb") as train_file:
    train_dict = pkl.load(train_file)
with open("data/val.pkl", "rb") as val_file:
    val_dict = pkl.load(val_file)
with open("data/test.pkl", "rb") as test_file:
    test_dict = pkl.load(test_file)

# Show which fields each split provides.
print("keys in train_dict:", train_dict.keys())
print("keys in val_dict:", val_dict.keys())
print("keys in test_dict:", test_dict.keys())

keys in train_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in val_dict: dict_keys(['id', 'word_seq', 'tag_seq'])
keys in test_dict: dict_keys(['id', 'word_seq'])


In [3]:
def get_onehot_vector(feats, feats_dict):
    """
    Build a multi-hot indicator vector for a list of features.

    :param feats: a list of features, type: list
    :param feats_dict: a dict from features to indices, type: dict
    return a float64 vector of length len(feats_dict) whose entries are 1
    at the indices of the features found in feats_dict and 0 elsewhere;
    features missing from feats_dict are ignored, type: numpy.ndarray
    """
    # initialize the vector as all zeros
    # (np.float64 instead of the removed alias np.float, which raises
    # AttributeError on NumPy >= 1.24)
    vector = np.zeros(len(feats_dict), dtype=np.float64)
    for f in feats:
        # get the feature index, return -1 if the feature does not exist
        f_idx = feats_dict.get(f, -1)
        if f_idx != -1:
            # set the corresponding element as 1
            vector[f_idx] = 1
    return vector
def stem(tokens):
    """
    Apply the module-level stemmer ``ps`` to every token, keeping the order.

    :param tokens: a list of tokens, type: list
    return a list with one stemmed word per input token, type: list
    e.g.
    Input: ['Text', 'mining', 'is', 'to', 'identify', 'useful', 'information', '.']
    Output: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.']
    """
    stemmed = list()
    for tok in tokens:
        stemmed.append(ps.stem(tok))
    return stemmed
def n_gram(tokens, n=1):
    """
    Turn a token list into space-joined n-grams.

    :param tokens: a list of tokens, type: list
    :param n: the size of each n-gram, type: int
    return a list of n-gram strings; for n == 1 the input list itself is
    returned (same object, not a copy), type: list
    e.g.
    Input: ['text', 'mine', 'is', 'to', 'identifi', 'use', 'inform', '.'], 2
    Output: ['text mine', 'mine is', 'is to', 'to identifi', 'identifi use', 'use inform', 'inform .']
    """
    # unigrams are the tokens themselves
    if n == 1:
        return tokens

    # one window per start position; empty when the list is shorter than n
    window_count = len(tokens) - n + 1
    return [" ".join(tokens[start:start + n]) for start in range(window_count)]


In [84]:
def build_CNN(input_length, vocab_size, embedding_size,
              hidden_size, output_size,
              kernel_sizes, num_filters, num_mlp_layers,
              padding="valid",
              strides=1,
              activation="relu",
              dropout_rate=0.0,
              batch_norm=False,
              l2_reg=0.0,
              loss="categorical_crossentropy",
              optimizer="SGD",
              learning_rate=0.1,
              metric="accuracy"):
    """
    Build and compile a multi-kernel 1D CNN followed by an MLP and a softmax layer.

    :param input_length: the maximum length of sentences, type: int
    :param vocab_size: the vocabulary size, type: int
    :param embedding_size: the dimension of word representations, type: int
    :param hidden_size: the dimension of the hidden states, type: int
    :param output_size: the dimension of the prediction, type: int
    :param kernel_sizes: the kernel sizes of convolutional layers, type: list
    :param num_filters: the number of filters for each kernel, type: int
    :param num_mlp_layers: the number of layers of the MLP, type: int
    :param padding: the padding method in convolutional layers, type: str
    :param strides: the strides in convolutional layers, type: int
    :param activation: the activation type, type: str
    :param dropout_rate: the probability of dropout, type: float
    :param batch_norm: whether to enable batch normalization, type: bool
    :param l2_reg: the weight for the L2 regularizer, type: float
    :param loss: the training loss, type: str
    :param optimizer: the optimizer name, one of "SGD", "RMSprop", "Adam", type: str
    :param learning_rate: the learning rate for the optimizer, type: float
    :param metric: the metric, type: str
    return a compiled keras Model mapping the token-index input to the softmax output
    raises NotImplementedError if the optimizer name is not supported
    # activation document: https://keras.io/activations/
    # dropout document: https://keras.io/layers/core/#dropout
    # embedding document: https://keras.io/layers/embeddings/#embedding
    # convolutional layers document: https://keras.io/layers/convolutional
    # pooling layers document: https://keras.io/layers/pooling/
    # batch normalization document: https://keras.io/layers/normalization/
    # losses document: https://keras.io/losses/
    # optimizers document: https://keras.io/optimizers/
    # metrics document: https://keras.io/metrics/
    """
    x = Input(shape=(input_length,))
    # echo the main size hyper-parameters
    print(input_length,vocab_size,embedding_size,output_size)
    ################################
    ###### Word Representation #####
    ################################
    # word representation layer
    emb = Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    input_length=input_length,
                    embeddings_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=0))(x)

    ################################
    ########### Conv-Pool ##########
    ################################
    # convolutional and pooling layers: one independent branch per kernel size
    cnn_results = list()
    for kernel_size in kernel_sizes:
        # add convolutional layer
        conv = Conv1D(filters=num_filters,
                      kernel_size=(kernel_size,),
                      padding=padding,
                      strides=strides)(emb)
        # add batch normalization layer
        if batch_norm:
            conv = BatchNormalization()(conv)
        # add activation
        conv = Activation(activation)(conv)
        # add max-pooling over the whole convolution output
        # NOTE(review): this pool size equals the conv output length for
        # padding="valid"; other padding modes would give a different length -- confirm.
        maxpool = MaxPool1D(pool_size=(input_length-kernel_size)//strides+1)(conv)
        cnn_results.append(Flatten()(maxpool))

    ################################
    ##### Fully Connected Layer ####
    ################################
    # Average needs at least two inputs, so a single branch is used directly
    h = Average()(cnn_results) if len(kernel_sizes) > 1 else cnn_results[0]
    h = Dropout(dropout_rate, seed=0)(h)
    # NOTE(review): this passes the pooled (float) CNN features through a second
    # Embedding layer, which is documented to take integer indices. It presumably
    # exists to obtain a per-position output for sequence tagging -- confirm that
    # this is intended before relying on it.
    h = Embedding(input_dim=vocab_size,
                    output_dim=embedding_size,
                    input_length=1,
                    embeddings_initializer=keras.initializers.TruncatedNormal(mean=0.0, stddev=0.1, seed=0))(h)
    # multi-layer perceptron
    for i in range(num_mlp_layers-1):
        new_h = Dense(hidden_size,
                      kernel_regularizer=keras.regularizers.l2(l2_reg))(h)
        # add batch normalization layer
        if batch_norm:
            new_h = BatchNormalization()(new_h)
        # add skip connection (the first layer has nothing to add to)
        if i == 0:
            h = new_h
        else:
            h = Add()([h, new_h])
        # add activation
        h = Activation(activation)(h)
    y = Dense(output_size,
              activation="softmax")(h)

    # set the loss, the optimizer, and the metric
    # Every branch must rebind `optimizer`; previously the RMSprop/Adam branches
    # assigned to a misspelled name, so the plain string was passed to compile()
    # and learning_rate was silently ignored for them.
    if optimizer == "SGD":
        optimizer = keras.optimizers.SGD(learning_rate=learning_rate)
    elif optimizer == "RMSprop":
        optimizer = keras.optimizers.RMSprop(learning_rate=learning_rate)
    elif optimizer == "Adam":
        optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    else:
        raise NotImplementedError
    model = Model(x, y)
    model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

    return model

In [6]:
# Show the first training entry: its words, its tags, its id, and the aligned (word, tag) pairs
first_words = train_dict["word_seq"][0]
first_tags = train_dict["tag_seq"][0]
print(first_words)
print(first_tags)
print("index:", train_dict["id"][0])
# zip yields one (word, tag) tuple per position; printing it consumes the iterator
zipped = zip(first_words, first_tags)
print(*zipped)

['Protection', 'of', 'calves', 'against', 'fatal', 'enteric', 'colibacillosis', 'by', 'orally', 'administered', 'Escherichia', 'coli', 'K99', '-', 'specific', 'monoclonal', 'antibody', '.', 'A', 'monoclonal', 'antibody', '(', 'MCA', ')', 'to', 'enterotoxigenic', 'Escherichia', 'coli', 'K99', 'antigen', 'agglutinated', 'K99+', 'enterotoxigenic', 'E', '.', 'coli', 'strains', 'B44', '(', 'O9', ':', 'K30', ';', 'K99', ';', 'F41', ':', 'H-', ')', 'and', 'B41', '(', 'O101', ':', 'K99', ';', 'F41', ':', 'H-', ')', 'grown', 'at', '37', 'degrees', 'C', 'but', 'not', 'at', '18', 'degrees', 'C.', 'The', 'MCA', ',', 'which', 'was', 'characterized', 'as', 'immunoglobulin', 'G1', ',', 'reacted', 'specifically', 'with', 'K99', 'antigen', 'in', 'an', 'enzyme-linked', 'immunosorbent', 'assay', 'and', 'precipitated', 'radiolabeled', 'K99', 'antigen', '.', 'A', 'total', 'of', '45', 'colostrum', '-fed', 'and', 'colostrum', '-deprived', 'calves', 'were', 'used', 'in', 'three', 'separate', 'trials', 'to', '

In [7]:
# Collect the distinct NER tags that occur in the training tag sequences
from itertools import chain
ner_tag_set = set(chain(*train_dict["tag_seq"]))
print("count of the NER tags:", len(ner_tag_set))
print("all the NER tags:", ner_tag_set)

count of the NER tags: 65
all the NER tags: {'ORG', 'IMMUNE_RESPONSE', 'CELL_FUNCTION', 'ARCHAEON', 'LABORATORY_PROCEDURE', 'EVENT', 'BODY_SUBSTANCE', 'CELL_COMPONENT', 'MACHINE_ACTIVITY', 'MONEY', 'LABORATORY_OR_TEST_RESULT', 'THERAPEUTIC_OR_PREVENTIVE_PROCEDURE', 'WORK_OF_ART', 'PHYSICAL_SCIENCE', 'PERSON', 'CELL_OR_MOLECULAR_DYSFUNCTION', 'ORGAN_OR_TISSUE_FUNCTION', 'BODY_PART_ORGAN_OR_ORGAN_COMPONENT', 'NORP', 'HUMAN-CAUSED_PHENOMENON_OR_PROCESS', 'LOC', 'CELL', 'ORDINAL', 'EDUCATIONAL_ACTIVITY', 'TIME', 'LAW', 'GOVERNMENTAL_OR_REGULATORY_ACTIVITY', 'MATERIAL', 'DISEASE_OR_SYNDROME', 'CHEMICAL', 'QUANTITY', 'EVOLUTION', 'DAILY_OR_RECREATIONAL_ACTIVITY', 'GENE_OR_GENOME', 'DIAGNOSTIC_PROCEDURE', 'GROUP', 'VIRUS', 'DATE', 'FAC', 'O', 'VIRAL_PROTEIN', 'RESEARCH_ACTIVITY', 'CARDINAL', 'ANATOMICAL_STRUCTURE', 'SUBSTRATE', 'SOCIAL_BEHAVIOR', 'EUKARYOTE', 'TISSUE', 'SIGN_OR_SYMPTOM', 'LANGUAGE', 'INDIVIDUAL_BEHAVIOR', 'INJURY_OR_POISONING', 'ORGANISM', 'CORONAVIRUS', 'WILDLIFE', 'EXPERIME

In [8]:
# Stem every token of every document in the three splits, overwriting "word_seq" in place.
# Note: the result replaces the input, so running this cell again stems the stemmed tokens.
for _split_dict in (train_dict, val_dict, test_dict):
    _split_dict["word_seq"] = [stem(tokens) for tokens in _split_dict["word_seq"]]

In [9]:
# Word vocabulary: two special entries first, then every training word in order of first appearance
vocab_dict = {'_unk_': 0, '_w_pad_': 1}
for word in chain.from_iterable(train_dict['word_seq']):
    # setdefault only inserts unseen words; the next free index is the current size
    vocab_dict.setdefault(word, len(vocab_dict))

# Tag dictionary, built the same way from the training tag sequences
tag_dict = {'_t_pad_': 0} # add a padding token
for tag in chain.from_iterable(train_dict['tag_seq']):
    tag_dict.setdefault(tag, len(tag_dict))

# Forward and reverse lookup tables (the forward ones alias the dicts above)
word2idx = vocab_dict
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
tag2idx = tag_dict
idx2tag = dict(zip(tag2idx.values(), tag2idx.keys()))

print("size of word vocab:", len(vocab_dict), "size of tag_dict:", len(tag_dict))

size of word vocab: 55469 size of tag_dict: 65


In [66]:
# The maximum length of a sentence is set to 128
# NOTE(review): no padding/truncation happens in this cell -- presumably the
# sequences already have this length; confirm against the data.
max_sent_length = 128


def _to_index_array(sequences, lookup):
    """Map every item of every sequence through `lookup` and wrap the result in an array."""
    return np.array([[lookup(item) for item in seq] for seq in sequences])


def _to_onehot_array(index_sequences, num_classes):
    """One-hot encode each index sequence separately and stack the results."""
    return np.array([to_categorical(seq, num_classes=num_classes) for seq in index_sequences])


# Training words are all in the vocabulary (it was built from them), so a strict lookup is used;
# validation/test words that are missing fall back to index 0.
train_tokens = _to_index_array(train_dict['word_seq'], lambda w: word2idx[w])
val_tokens = _to_index_array(val_dict['word_seq'], lambda w: word2idx.get(w, 0))
test_tokens = _to_index_array(test_dict['word_seq'], lambda w: word2idx.get(w, 0))

# Tags: first to integer ids (strict lookup, unseen tags raise KeyError), then to one-hot rows
train_tag_ids = _to_index_array(train_dict['tag_seq'], lambda t: tag2idx[t])
train_tags = _to_onehot_array(train_tag_ids, len(tag_dict))

val_tag_ids = _to_index_array(val_dict['tag_seq'], lambda t: tag2idx[t])
val_tags = _to_onehot_array(val_tag_ids, len(tag_dict))

In [67]:
# Report the array shapes of the encoded tokens and tags for both labelled splits
for _label, _tokens, _tags in (("training size:", train_tokens, train_tags),
                               ("validating size:", val_tokens, val_tags)):
    print(_label, _tokens.shape, "tag size:", _tags.shape)

training size: (23600, 128) tag size: (23600, 128, 65)
validating size: (2950, 128) tag size: (2950, 128, 65)


In [60]:
# Display the first entry of train_tags.
# NOTE(review): the saved output shows integer tag ids rather than one-hot rows, so it
# looks like it was produced by an earlier run of the encoding cell -- confirm and re-run.
train_tags[0]

array([1, 1, 2, 1, 1, 3, 3, 1, 4, 4, 4, 4, 4, 1, 5, 5, 5, 1, 1, 6, 6, 1,
       4, 1, 1, 6, 6, 6, 1, 1, 1, 4, 4, 4, 1, 6, 6, 6, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 6, 1, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1, 8, 8, 1, 1,
       1, 1, 8, 8, 1, 1, 4, 1, 1, 1, 1, 1, 4, 4, 1, 1, 1, 1, 6, 6, 1, 1,
       6, 6, 6, 1, 1, 1, 6, 6, 1, 1, 1, 1, 1, 6, 1, 1, 6, 1, 2, 1, 1, 1,
       5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 4])

In [85]:
# Directory that receives the checkpoint file
os.makedirs("models", exist_ok=True)

# Seed NumPy and TensorFlow before the model is built
seed(0)
tf.random.set_seed(0)

model = build_CNN(input_length=max_sent_length, vocab_size=len(vocab_dict),
                  embedding_size=128, hidden_size=128, output_size=len(tag_dict),
                  kernel_sizes=[1,2,3,4], num_filters=128, num_mlp_layers=3,
                  activation="relu",
                  dropout_rate=0.3, l2_reg=0.05, batch_norm=True)
# Write a checkpoint only when the monitored validation accuracy improves
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "cnn1_weights.hdf5"),
    monitor="val_accuracy",
    verbose=0,
    save_best_only=True)

# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output
model.summary()

# validation_split=0.1 holds out 10% of the training arrays for the monitored metric
cnn_history = model.fit(x=train_tokens, y=train_tags,
                    validation_split=0.1,
                    epochs=128, batch_size=128, verbose=0,
                    callbacks=[checkpointer])
# Reload the best checkpoint written during training
model = keras.models.load_model(os.path.join("models", "cnn1_weights.hdf5"))

train_score = model.evaluate(train_tokens, train_tags,
                             batch_size=100)
# Note: the "test" numbers below are computed on the validation split (val_tokens / val_tags)
test_score = model.evaluate(val_tokens, val_tags,
                            batch_size=100)
print("training loss:", train_score[0], "training accuracy", train_score[1])
print("test loss:", test_score[0], "test accuracy", test_score[1])

128 55469 128 65
Model: "functional_39"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           [(None, 128)]        0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 128, 128)     7100032     input_26[0][0]                   
__________________________________________________________________________________________________
conv1d_94 (Conv1D)              (None, 128, 128)     16512       embedding_28[0][0]               
__________________________________________________________________________________________________
conv1d_95 (Conv1D)              (None, 127, 128)     32896       embedding_28[0][0]               
_____________________________________________________________________

training loss: 1.4689805507659912 training accuracy 0.7280279994010925
test loss: 1.480031132698059 test accuracy 0.7227463126182556
