<a href="https://colab.research.google.com/github/hieu-le-2412/extraction/blob/master/ner_elmo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import dependencies

In [1]:
import numpy as np
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Conv1D
from keras.layers import Bidirectional, concatenate, SpatialDropout1D, GlobalMaxPooling1D
# from keras_contrib.layers import CRF

from keras import backend as K
from keras.models import load_model

#Elmo
import tensorflow as tf
import tensorflow_hub as hub
# from keras import backend as K
# from keras.models import Model, Input
from keras.layers.merge import add
# from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional, Lambda
from keras.layers import Lambda

Using TensorFlow backend.


Initialize model

In [0]:

def ner_elmo_model_initial(n_tags, max_len=50, batch_size=32):
    """Build and compile an ELMo + residual bidirectional-LSTM sequence tagger.

    A new TensorFlow session is created and registered with the Keras backend,
    the ELMo TF-Hub module is loaded as trainable, and the global variable and
    table initializers are run in that session before the Keras graph is built.

    Args:
        n_tags: Number of output classes of the per-token softmax layer.
        max_len: Number of tokens per input row; also the value repeated in the
            ``sequence_len`` vector handed to ELMo.
        batch_size: Number of entries in the constant ``sequence_len`` vector.

    Returns:
        The compiled Keras ``Model`` (its summary is printed as a side effect).

    Notes:
        * ``custom_sparse_categorical_accuracy`` is not defined in this
          function; it must exist at module level before this function is
          called.
        * NOTE(review): ``sequence_len`` is a constant with exactly
          ``batch_size`` entries, so presumably every batch fed to the model
          must contain exactly ``batch_size`` rows -- confirm.
    """
    # ELMo residual LSTM model
    sess = tf.Session()
    K.set_session(sess)

    elmo_model = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)
    # Initialize variables and lookup tables in the session registered above.
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    def ElmoEmbedding(in_text):
        # Calls the hub module with its "tokens" signature and keeps only the
        # "elmo" entry of the returned dict. Every row is declared to have
        # max_len tokens via the constant sequence_len vector.
        # NOTE(review): tf.squeeze drops every size-1 dimension, so a batch of
        # one row (or max_len == 1) would presumably lose a dimension -- confirm.
        return elmo_model(inputs={
                                "tokens": tf.squeeze(tf.cast(in_text, "string")),
                                "sequence_len": tf.constant(batch_size*[max_len])
                        },
                        signature="tokens",
                        as_dict=True)["elmo"]

    # String-typed input: one row of max_len tokens per sample.
    input_text = Input(shape=(max_len,), dtype="string")
    embedding = Lambda(lambda row: ElmoEmbedding(row), output_shape=(None, 1024))(input_text)
    # Two stacked bidirectional LSTMs (512 units per direction each).
    x = Bidirectional(LSTM(units=512, return_sequences=True,
                        recurrent_dropout=0.2, dropout=0.2))(embedding)
    x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                            recurrent_dropout=0.2, dropout=0.2))(x)
    x = add([x, x_rnn])  # residual connection to the first biLSTM
    # Per-timestep softmax over the n_tags classes.
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
    model = Model(input_text, out)
    # Sparse loss: targets are integer class ids rather than one-hot vectors.
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=[custom_sparse_categorical_accuracy])
    model.summary()
    return model



Create check model function

In [0]:
def check_elmo_model(model, X_tr, y_tr, X_val, y_val, batch_size=32, epochs=5, verbose=1):
    """Fit ``model`` on the training split and validate on the validation split.

    ``X_tr`` and ``X_val`` are converted to NumPy arrays before being handed
    to ``model.fit``; ``y_tr`` and ``y_val`` are passed through unchanged.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_tr: Training inputs.
        y_tr: Training targets.
        X_val: Validation inputs.
        y_val: Validation targets.
        batch_size: Forwarded to ``fit``.
        epochs: Forwarded to ``fit``.
        verbose: Forwarded to ``fit``.

    Returns:
        Whatever ``model.fit`` returns.
    """
    train_inputs = np.array(X_tr)
    validation_pair = (np.array(X_val), y_val)
    return model.fit(
        train_inputs,
        y_tr,
        validation_data=validation_pair,
        batch_size=batch_size,
        epochs=epochs,
        verbose=verbose,
    )



Create NER utilities

In [0]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

class SentenceGetter(object):
    """Group a token-level DataFrame into per-sentence lists of tuples.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``. Rows sharing a ``"Sentence #"`` value form one sentence,
    represented as a list of ``(word, pos, tag)`` tuples.
    """

    @staticmethod
    def _rows_to_tuples(group):
        """Turn the rows of one sentence group into ``(word, pos, tag)`` tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def __init__(self, data):
        # 1-based number of the sentence that get_next() will return next.
        self.n_sent = 1
        # The DataFrame as passed in by the caller.
        self.data = data
        # Set once here and never updated by this class.
        self.empty = False
        # Series indexed by the "Sentence #" label, one list of tuples per label.
        self.grouped = self.data.groupby("Sentence #").apply(self._rows_to_tuples)
        # All sentences, in the iteration order of `grouped`.
        # NOTE(review): groupby sorts its keys by default, so this is presumably
        # the string order of the labels ("Sentence: 10" before "Sentence: 2"),
        # not the numeric order -- confirm if ordering matters to callers.
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence labelled ``"Sentence: <n_sent>"`` and advance.

        Returns:
            The list of ``(word, pos, tag)`` tuples for the current sentence,
            or ``None`` when no sentence with that label exists (the counter
            is not advanced in that case).
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing label means "no more sentences"; any other error
            # is a real problem and is allowed to propagate.
            return None
        self.n_sent += 1
        return s

          

def input_ner_preprocessing_elmo(data, max_len=50, test_size=0.1, random_state=2018, batch_size=32, val_size=0.1):
    """Turn a token-level NER DataFrame into padded train/test/validation splits.

    Sentences are cut or padded to exactly ``max_len`` tokens. Words are kept
    as raw strings (padded with ``"__PAD__"``); tags are mapped to integer ids
    taken from the sorted tag list (padded with the id of ``"O"``).

    Args:
        data: DataFrame with ``"Sentence #"``, ``"Word"``, ``"POS"`` and
            ``"Tag"`` columns. The tag ``"O"`` must be present.
        max_len: Number of tokens kept per sentence.
        test_size: Forwarded to ``train_test_split`` for the held-out test set.
        random_state: Forwarded to ``train_test_split``.
        batch_size: The training and validation splits are trimmed to whole
            multiples of this value.
        val_size: Fraction of the whole training batches that is set aside
            for validation (rounded to the nearest whole batch).

    Returns:
        Tuple ``(n_tags, X_tr, X_te, y_tr, y_te, X_val, y_val)`` where the
        ``X_*`` values are lists of token lists, ``y_tr`` / ``y_val`` are
        arrays reshaped to ``(n, max_len, 1)`` and ``y_te`` is left 2-D.
        ``X_te`` / ``y_te`` are not trimmed to a multiple of ``batch_size``.
    """
    tags = sorted(set(data["Tag"].values))
    n_tags = len(tags)
    tag2idx = {t: i for i, t in enumerate(tags)}

    sentences = SentenceGetter(data).sentences

    # Words: keep the first max_len tokens and right-pad short sentences.
    X = []
    for s in sentences:
        tokens = [w[0] for w in s][:max_len]
        tokens.extend(["__PAD__"] * (max_len - len(tokens)))
        X.append(tokens)

    # Tags: same treatment. truncating="post" is required so that long
    # sentences keep their FIRST max_len tags, matching the words above
    # (pad_sequences drops from the front by default).
    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      truncating="post", value=tag2idx["O"])

    # Split train and test
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Carve a validation set off the end of the training data. Both parts are
    # whole multiples of batch_size and never overlap, whatever the data size.
    n_batches = len(X_tr) // batch_size
    n_val_batches = int(round(n_batches * val_size))
    n_train = (n_batches - n_val_batches) * batch_size
    val_start = len(X_tr) - n_val_batches * batch_size

    X_tr, X_val = X_tr[:n_train], X_tr[val_start:]
    y_tr, y_val = y_tr[:n_train], y_tr[val_start:]

    # Add a trailing axis of size 1 to the integer targets.
    y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
    y_val = y_val.reshape(y_val.shape[0], y_val.shape[1], 1)

    return n_tags, X_tr, X_te, y_tr, y_te, X_val, y_val




Test NER

In [6]:
import pandas as pd
import numpy as np
from google.colab import files
# Ask for a file through the Colab upload helper; the result is indexed by
# file name below.
uploaded = files.upload()
import io
# Dataset is now stored in a Pandas Dataframe
# The uploaded bytes are wrapped in BytesIO so read_csv can treat them as a
# file; the CSV is decoded as latin1.
data = pd.read_csv(io.BytesIO(uploaded['ner_dataset.csv']), encoding="latin1")


Saving ner_dataset.csv to ner_dataset (1).csv


In [7]:
# Forward-fill missing cells with the last value seen above them in the same
# column. ffill() is the non-deprecated spelling of fillna(method="ffill").
# NOTE(review): presumably needed because "Sentence #" is only filled in on
# the first token row of each sentence -- confirm against the raw CSV.
data = data.ffill()
data.tail(10)


Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [0]:
# Build the train / test / validation splits with 50 tokens per sentence;
# all other preprocessing parameters keep their defaults.
n_tags, X_tr, X_te, y_tr, y_te, X_val, y_val = input_ner_preprocessing_elmo(data, max_len=50)

In [27]:
# Sorted tag vocabulary. Defined here (rather than relying on a later cell)
# so this cell runs on a fresh kernel; sorting gives the same index order as
# the tag -> id mapping built in input_ner_preprocessing_elmo.
tags = sorted(set(data["Tag"].values))
tags

['B-art',
 'B-eve',
 'B-geo',
 'B-gpe',
 'B-nat',
 'B-org',
 'B-per',
 'B-tim',
 'I-art',
 'I-eve',
 'I-geo',
 'I-gpe',
 'I-nat',
 'I-org',
 'I-per',
 'I-tim',
 'O']

In [0]:
def custom_sparse_categorical_accuracy(y_true, y_pred):
    """Element-wise accuracy for integer targets against class scores.

    The target is reduced with a max over its last axis, the prediction with
    an argmax over its last axis, and the two are compared for equality. The
    result is a backend tensor of 0/1 values in the backend float type.

    NOTE(review): presumably ``y_true`` carries a trailing axis of size 1
    (so the max just removes that axis) and ``y_pred`` holds one score per
    class on its last axis -- confirm against the caller.
    """
    float_type = K.floatx()
    true_ids = K.max(y_true, axis=-1)
    predicted_ids = K.cast(K.argmax(y_pred, axis=-1), float_type)
    matches = K.equal(true_ids, predicted_ids)
    return K.cast(matches, float_type)


In [10]:
# Build and compile the tagger; max_len and batch_size keep the function's
# defaults (50 and 32).
model = ner_elmo_model_initial(n_tags)



W0620 07:43:33.651772 140503551743872 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0620 07:43:33.653326 140503551743872 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0620 07:43:34.133530 140503551743872 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0620 07:43:35.196481 140503551743872 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0620 07:43:35.205962 

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50)           0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               (None, None, 1024)   0           input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, None, 1024)   6295552     lambda_1[0][0]                   
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, None, 1024)   6295552     bidirectional_1[0][0]            
__________________________________________________________________________________________________
add_1 (Add

In [11]:
# Train for the function's default of 5 epochs. batch_size=32 equals the
# default batch_size of ner_elmo_model_initial used to build the model.
history = check_elmo_model(model, X_tr, y_tr, X_val, y_val, batch_size=32)

W0620 07:43:41.269915 140503551743872 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 38816 samples, validate on 4320 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [12]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive 
from google.colab import auth 
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the application-default credentials
# to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default() 
drive = GoogleDrive(gauth)

# NOTE(review): folder_id is assigned but never used below, so it has no
# effect on where the file is created -- confirm the intended destination.
folder_id = '1eZpMjVIqZFbF32gATJob2feV4lXEC5pB'
# Save the whole model locally, then upload that file.
# NOTE(review): the Drive title ('model.h5') differs from the local file
# name ('model_20190620.h5') -- confirm this is intended.
model.save('model_20190620.h5')
model_file = drive.CreateFile({'title' : 'model.h5'})  
model_file.SetContentFile('model_20190620.h5')                       
model_file.Upload()

W0620 08:30:24.070194 140503551743872 __init__.py:44] file_cache is unavailable when using oauth2client >= 4.0.0 or google-auth
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/__init__.py", line 36, in autodetect
    from google.appengine.api import memcache
ModuleNotFoundError: No module named 'google.appengine'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 33, in <module>
    from oauth2client.contrib.locked_file import LockedFile
ModuleNotFoundError: No module named 'oauth2client.contrib.locked_file'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/googleapiclient/discovery_cache/file_cache.py", line 37, in <module>
    from oauth2client.locked_file import Lock

In [0]:
# Save only the weights locally, then upload that file to Drive under the
# same name. Relies on `drive` created in the authentication cell.
model.save_weights('model_weight_20190620.h5')
model_file = drive.CreateFile({'title' : 'model_weight_20190620.h5'})  
model_file.SetContentFile('model_weight_20190620.h5')                       
model_file.Upload()

In [15]:
# Show the concrete class of the trained model object.
type(model)

keras.engine.training.Model

In [17]:
# Serialize the model architecture to a JSON string and show the result's type.
# NOTE(review): the saved output of this cell is a TypeError, so model_json
# may never be assigned -- presumably the Lambda layer cannot be serialized;
# confirm before relying on model_json in later cells.
model_json = model.to_json()
type(model_json)


TypeError: ignored

In [0]:
# Upload the architecture JSON to Drive.
# NOTE(review): the title ends in '.h5' although the content is JSON text --
# confirm the intended file name.
model_file = drive.CreateFile({'title' : 'model_json_20190620.h5'})
# model_json is the JSON text itself, not a path on disk, so it must be set
# as string content; SetContentFile would try to open it as a file name.
model_file.SetContentString(model_json)
model_file.Upload()

Predictions

In [13]:

# Index -> tag lookup. It must be sorted: the tag ids in y_te were assigned
# from the sorted tag list, and the iteration order of a set is arbitrary.
tags = sorted(set(data["Tag"].values))
i = 19
# Predict on 32 consecutive test sentences and keep only the first one.
# NOTE(review): 32 is presumably needed because the model was built with a
# fixed batch_size of 32 -- confirm.
p = model.predict(np.array(X_te[i:i+32]))[0]
# Most probable tag id per token.
p = np.argmax(p, axis=-1)
print("{:15} {:5}: ({})".format("Word", "Pred", "True"))
print("="*30)
for w, true, pred in zip(X_te[i], y_te[i], p):
    # Skip the padding tokens added during preprocessing.
    if w != "__PAD__":
        print("{:15}:{:5} ({})".format(w, tags[pred], tags[true]))



Word            Pred : (True)
Meanwhile      :O     (O)
,              :O     (O)
in             :O     (O)
Belgrade       :B-geo (B-geo)
,              :O     (O)
Serbia         :B-geo (B-geo)
's             :O     (O)
extreme        :O     (O)
nationalist    :O     (O)
Radical        :B-org (B-org)
Party          :I-org (I-org)
has            :O     (O)
filed          :O     (O)
a              :O     (O)
motion         :O     (O)
of             :O     (O)
no-confidence  :O     (O)
in             :O     (O)
the            :O     (O)
government     :O     (O)
of             :O     (O)
Prime          :B-per (B-per)
Minister       :I-per (O)
Vojislav       :B-per (B-per)
Kostunica      :I-per (I-per)
to             :O     (O)
protest        :O     (O)
the            :O     (O)
extradition    :O     (O)
of             :O     (O)
11             :O     (O)
suspects       :O     (O)
to             :O     (O)
the            :O     (O)
court          :O     (O)
since          :B-tim (B-tim)
Oc

Plot model

In [0]:
import matplotlib.pyplot as plt

def plot_trained_history(history, val=['acc', 'val_acc'], loss=['loss', 'val_loss']):
    """Plot per-epoch curves stored in ``history.history``.

    One figure is drawn for the keys listed in ``val`` (titled as accuracy)
    and one for the keys listed in ``loss``; a figure is skipped when its key
    list is empty. Each figure is shown with ``plt.show()``.

    Args:
        history: Object whose ``history`` attribute maps a key to a sequence
            of per-epoch values.
        val: Keys plotted in the accuracy figure.
        loss: Keys plotted in the loss figure.
    """
    def _draw_figure(keys, title, y_label):
        # One line per key, then the shared labelling and legend.
        for key in keys:
            plt.plot(history.history[key])
        plt.title(title)
        plt.ylabel(y_label)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Test'], loc='upper left')
        plt.show()

    # Training & validation accuracy values
    if len(val) > 0:
        _draw_figure(val, 'Model accuracy', 'Accuracy')

    # Training & validation loss values
    if len(loss) > 0:
        _draw_figure(loss, 'Model loss', 'Loss')

