# Setup

In [0]:
%tensorflow_version 2.x

In [0]:
!pip install seqeval transformers



In [0]:
!mkdir data
!mkdir models
# Pre-trained word embeddings for English language
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz -P data/ 

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘models’: File exists
--2020-04-24 04:23:02--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 2606:4700:10::6816:4a8e, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1325960915 (1.2G) [binary/octet-stream]
Saving to: ‘data/cc.en.300.vec.gz.3’


2020-04-24 04:23:46 (29.0 MB/s) - ‘data/cc.en.300.vec.gz.3’ saved [1325960915/1325960915]



# utils.py

In [0]:
"""
Utilities.
"""
import string
import gensim
import numpy as np
import pandas as pd

def load_dataset(filename, encoding='utf-8'):
    """Loads data and label from a file.
    Args:
        filename (str): path to the file.
        encoding (str): file encoding format.
        The file format is tab-separated values.
        A blank line is required at the end of a sentence.
        For example:
        ```
        EU	B-ORG
        rejects	O
        German	B-MISC
        call	O
        to	O
        boycott	O
        British	B-MISC
        lamb	O
        .	O
        Peter	B-PER
        Blackburn	I-PER
        ...
        ```
    Returns:
        tuple(numpy array, numpy array): data and labels.
    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> data, labels = load_data_and_labels(filename)
    """
    sentences, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split('\t')
                words.append(word)
                tags.append(tag)
            else:
                sentences.append(words)
                labels.append(tags)
                words, tags = [], []
        if words:
            sentences.append(words)
            labels.append(tags)

    return sentences, labels

def filter_embeddings(embeddings, vocab, num_words, dim=300):
    """Filter word vectors.

    Args:
        embeddings: a dictionary like object.
        vocab: word-index lookup table.
        num_words: the number of words.
        dim: dimension.

    Returns:
        numpy array: an array of word embeddings.
    """
    _embeddings = np.zeros((num_words, dim))
    for word in vocab:
        if word in embeddings:
            word_id = vocab[word]
            # Get the word embedding of word whose id is less than num_words. 
            # What is this? 
            if word_id >= num_words:
                continue
            _embeddings[word_id] = embeddings[word]

    return _embeddings


def load_fasttext(filepath, binary=False):
    """Loads fastText vectors.

    Args:
        filepath (str): a path to a fastText file.

    Return:
        model: KeyedVectors
    """
    model = gensim.models.KeyedVectors.load_word2vec_format(filepath, binary=binary)
    return model

# preprocessing.py

In [0]:
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences


class Vocab:

    def __init__(self, num_words=None, lower=True, oov_token=None):
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=num_words, # max size of vocabulary
            oov_token=oov_token,
            filters='',
            lower=lower,
            split='\t'
        )

    def fit(self, sequences):
        texts = self._texts(sequences)
        # Create vocabulary. 
        self.tokenizer.fit_on_texts(texts)
        return self

    def encode(self, sequences):
        """ Convert words to ids """
        texts = self._texts(sequences)
        print('texts in encode():', texts[:5]) # list of strings (one string per sentence)
        return self.tokenizer.texts_to_sequences(texts) # For one string, change string to list of ids. 

    def decode(self, sequences):
        texts = self.tokenizer.sequences_to_texts(sequences)
        return [text.split(' ') for text in texts]

    def _texts(self, sequences):
        return ['\t'.join(words) for words in sequences]

    def get_index(self, word):
        return self.tokenizer.word_index.get(word)

    @property
    def size(self):
        """Return vocabulary size."""
        return len(self.tokenizer.word_index) + 1

    def save(self, file_path):
        with open(file_path, 'w') as f:
            config = self.tokenizer.to_json()
            f.write(config)

    @classmethod
    def load(cls, file_path):
        with open(file_path) as f:
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())
            vocab = cls()
            vocab.tokenizer = tokenizer
        return vocab


def normalize_number(text, reduce=True):
    """ Replace numbers with 0. """
    if reduce:
        normalized_text = re.sub(r'\d+', '0', text)
    else:
        # Keep the length same. 
        normalized_text = re.sub(r'\d', '0', text)
    return normalized_text


def preprocess_dataset(sequences):
    sequences = [[normalize_number(w) for w in words] for words in sequences]
    return sequences


def create_dataset(sequences, vocab):
    print('before encode:', sequences[:5])
    sequences = vocab.encode(sequences)
    print('after encode:', sequences[:5])
    # Padding
    sequences = pad_sequences(sequences, padding='post')
    return sequences

# models.py

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM
from tensorflow.keras.layers import Bidirectional


class UnidirectionalModel:

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        self.input = Input(shape=(None,), name='input')
        if embeddings is None:
            self.embedding = Embedding(input_dim=input_dim,
                                       output_dim=emb_dim,
                                       mask_zero=True,
                                       name='embedding')
        else:
            self.embedding = Embedding(input_dim=embeddings.shape[0],
                                       output_dim=embeddings.shape[1],
                                       mask_zero=True,
                                       weights=[embeddings],
                                       name='embedding')
        self.lstm = LSTM(hid_dim,
                         return_sequences=True, # Point! True: Sequence Labeling
                         name='lstm')
        # output_dim: label_vocab.size()
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        x = self.input
        embedding = self.embedding(x)
        lstm = self.lstm(embedding)
        y = self.fc(lstm)
        return Model(inputs=x, outputs=y)


class BidirectionalModel:

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        self.input = Input(shape=(None,), name='input')
        if embeddings is None:
            self.embedding = Embedding(input_dim=input_dim,
                                       output_dim=emb_dim,
                                       mask_zero=True,
                                       name='embedding')
        else:
            self.embedding = Embedding(input_dim=embeddings.shape[0],
                                       output_dim=embeddings.shape[1],
                                       mask_zero=True,
                                       weights=[embeddings],
                                       name='embedding')
        lstm = LSTM(hid_dim,
                    return_sequences=True,
                    name='lstm')
        # Wrap the LSTM with Bidirectional. 
        self.bilstm = Bidirectional(lstm, name='bilstm')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        x = self.input
        embedding = self.embedding(x)
        bilstm = self.bilstm(embedding)
        y = self.fc(bilstm)
        return Model(inputs=x, outputs=y)

# inference.py

In [0]:
"""
Inference API.
"""
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


class InferenceAPI:
    """A model API that generates output sequence.

    Attributes:
        model: Model
        words_vocab: vocabulary of words
        labels_vocab: vocabulary of labels
    """

    def __init__(self, model, words_vocab, labels_vocab):
        self.model = model
        self.words_vocab = words_vocab
        self.labels_vocab = labels_vocab

    def predict_from_sequences(self, sequences):
        lengths = map(len, sequences)
        # Convert words to ids. 
        sequences = self.words_vocab.encode(sequences)
        sequences = pad_sequences(sequences, padding='post')
        # Predict. 
        y_pred = self.model.predict(sequences)
        print('y_pred after predict:', y_pred[:5])
        y_pred = np.argmax(y_pred, axis=-1)
        print('y_pred after argmax:', y_pred[:5])
        # Convert ids of labels to labels.
        y_pred = self.labels_vocab.decode(y_pred)  
        print('y_pred after decode:', y_pred[:5])
        # ??
        y_pred = [y[:l] for y, l in zip(y_pred, lengths)]
        print('y_pred after the last line:', y_pred[:5])
        return y_pred

# train.py

In [0]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import load_model
from seqeval.metrics import classification_report
from google.colab import files 
import io 


def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model_{}.h5'
    # model_path = 'models/bidirectional_model_{}.h5'
    # num_words = 15000 # Max size of vocabulary
    num_words = 30522 # Max size of vocabulary
    # num_words = 150000 # Max size of vocabulary

    # Load data. 
    # x: sentences, y: labels
    x, y = load_dataset('./test_empty_line_inserted.tsv')    
    
    # Upload file from local. 
#     uploaded = files.upload() 
# 　　　　　　　ner_labeled_data = pd.read_csv(io.BytesIO(uploaded['test_empty_line_inserted.tsv'])) 

    # Pre-process data. 
    x = preprocess_dataset(x) # Normalize numbers. 
    # Split into train and test. 
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # Create vocabularies. 
    words_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
    labels_vocab = Vocab(lower=False).fit(y_train)
    
    # Convert words to ids. 
    x_train = create_dataset(x_train, words_vocab)
    print('train words:', x_train[:5])
    y_train = create_dataset(y_train, labels_vocab)
    print('train labels:', y_train[:5])

    # Prepare word embedding.
    print('loading fastText...')
    wv = load_fasttext('data/cc.en.300.vec.gz')
    print('filtering embeddings...')
    wv = filter_embeddings(wv, words_vocab.tokenizer.word_index, num_words)

    # Build models.
    models = [
        # LSTM
        # UnidirectionalModel(num_words, labels_vocab.size).build(),
        # UnidirectionalModel(num_words, labels_vocab.size, embeddings=wv).build(),

        # Bi-LSTM
        # BidirectionalModel(num_words, labels_vocab.size).build(),
        BidirectionalModel(num_words, labels_vocab.size, embeddings=wv).build(),
    ]

    for i, model in enumerate(models):
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

        # Preparing callbacks.
        callbacks = [
            EarlyStopping(patience=3),
            ModelCheckpoint(model_path.format(i), save_best_only=True)
        ]

        # Train the model.
        model.fit(x=x_train,
                  y=y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_split=0.1,
                  callbacks=callbacks,
                  shuffle=True)

        # Inference.
        model = load_model(model_path.format(i))
        api = InferenceAPI(model, words_vocab, labels_vocab)
        y_pred = api.predict_from_sequences(x_test)
        print(classification_report(y_test, y_pred, digits=4))


if __name__ == '__main__':
    main()

before encode: [['The', '0-year-old', 'former', 'analyst', 'says', 'he', 'provided', 'information', 'to', 'an', 'official', 'at', 'the', 'Israeli', 'embassy', 'and', 'to', 'two', 'members', 'of', 'a', 'lobbying', 'group', 'called', 'the', 'American', 'Israel', 'Public', 'Affairs', 'Committee', '.'], ['But', 'he', 'said', 'he', 'will', 'not', 'accept', 'any', 'Pakistani', 'proposal', 'that', 'involves', 'redrawing', 'the', 'line', 'of', 'control', 'that', 'separates', 'Indian-', 'from', 'Pakistani-controlled', 'Kashmir', '.'], ['The', 'Swiss', 'star', 'was', 'upset', 'Wednesday', 'by', 'German', 'Tommy', 'Haas', 'in', 'the', 'opening', 'match', 'of', 'the', 'Kooyong', 'Classic', 'in', 'Melbourne', '.'], ['After', 'taking', 'office', 'in', '0', ',', 'the', 'SPENCER', 'government', 'adopted', 'an', 'ambitious', 'fiscal', 'reform', 'program', ',', 'and', 'was', 'successful', 'in', 'reducing', 'its', 'public', 'debt-to-GDP', 'ratio', 'from', '0', '%', 'to', 'about', '0', '%', 'in', '0', '.'

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


filtering embeddings...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
texts in encode(): ["In\tTehran\t,\tthe\tchief\tof\tIran\t's\tRevolutionary\tGuards\t,\tGeneral\tYahya\tRahim\tSafavi\t,\tsaid\tSaturday\this\tcountry\twould\tuse\tballistic\tmissiles\tto\tdefend\titself\tif\tattacked\t.", 'Even\tthough\tboth\tsites\tare\tnow\tfunctioning\t,\tTwitter\tsays\tusers\twill\tcontinue\tto\texperience\tlonger\tload\ttimes\tand\tslow\tresponse\t.', 'Suspected\tU.S.\tdrones\thave\tcarried\tout\tat\tleast\t0\tmissile\tstrikes\ton\tmilitant\ttargets\tin\tnorthwest\tPakistan\tover\tthe\tpast\tyear\t.', "President\tBarack\tObama\thas\treaffirmed\this\tbelief\tin\ta\twoman\t's\tright\tto\tchoose\twhether\tto\thave\tan\tabortion\tas\ttens\tof\tthousands\tof\tabortion\topponents\theld\ttheir\tannual\trally\tin\tWashington\t.", "They\tall\tdecided\tthat\tone\tperson\tshould\tget\toff\t,\tbecause\tif\tthey\tdid\tn't\t,\tthe\trope\twould\tbreak\tand\teveryone\twould\tdie\t."]
