# Setup

In [0]:
# Colab line magic: request the TensorFlow 2.x release line for this runtime.
%tensorflow_version 2.x

In [0]:
# Type of GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Apr 15 00:34:04 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Memory
# Report the runtime's total RAM (in units of 1e9 bytes) and whether it
# reaches the 20 GB threshold used here for a "high-RAM" runtime.
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [0]:
!pip install seqeval transformers

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 6.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.1MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 37.7MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonho

In [0]:
!mkdir data
!mkdir models
# !wget https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/ja.wikipedia.conll -P data/

--2020-04-15 00:34:17--  https://raw.githubusercontent.com/Hironsan/IOB2Corpus/master/ja.wikipedia.conll
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1297592 (1.2M) [text/plain]
Saving to: ‘data/ja.wikipedia.conll’


2020-04-15 00:34:17 (20.0 MB/s) - ‘data/ja.wikipedia.conll’ saved [1297592/1297592]



# utils.py

In [0]:
"""
Utilities.
"""
from seqeval.metrics import classification_report


def load_dataset(filename, encoding='utf-8'):
    """Load sentences and their tag sequences from a tab-separated file.

    Every non-blank line holds one token and its tag separated by a single
    tab. One or more blank lines end the current sentence; leading blank
    lines and runs of blank lines never produce empty sentences. A final
    sentence that is not followed by a blank line is still returned.

    For example (``<TAB>`` stands for a tab character):
        ```
        EU<TAB>B-ORG
        rejects<TAB>O
        German<TAB>B-MISC
        call<TAB>O
        .<TAB>O

        Peter<TAB>B-PER
        Blackburn<TAB>I-PER
        ```

    Args:
        filename (str): path to the file.
        encoding (str): file encoding format.

    Returns:
        tuple(list[list[str]], list[list[str]]): the sentences (lists of
        tokens) and the matching lists of tags, in file order.

    Raises:
        ValueError: if a non-blank line does not split into exactly two
            tab-separated fields.

    Example:
        >>> sentences, labels = load_dataset('conll2003/en/ner/train.txt')
    """
    sentences, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line in f:
            line = line.rstrip()
            if line:
                word, tag = line.split('\t')
                words.append(word)
                tags.append(tag)
            elif words:
                # A blank line closes the sentence, but only if there is one:
                # repeated or leading blank lines must not add empty sentences.
                sentences.append(words)
                labels.append(tags)
                words, tags = [], []
        if words:
            # The file did not end with a blank line; keep the last sentence.
            sentences.append(words)
            labels.append(tags)

    return sentences, labels

# For BERT
def evaluate(model, labels_vocab, features, label_ids_true):
    """Predict label ids for ``features`` and print a classification report.

    The predicted id at each position is the argmax over the last axis of the
    first element returned by ``model.predict``. Positions whose true label
    id is 0 are left out of both the predictions and the references before
    the ids are decoded with ``labels_vocab`` and scored.

    Args:
        model: object whose ``predict(features)`` result is indexable, with
            the per-position scores at index 0.
        labels_vocab: object whose ``decode`` turns lists of ids into labels.
        features: model inputs, passed to ``model.predict`` unchanged.
        label_ids_true: true label ids, indexable as ``[sentence][position]``.
    """
    # Predict.
    raw_output = model.predict(features)
    print('label_ids_pred after predict:\n', raw_output)
    label_ids_pred = np.argmax(raw_output[0], axis=-1)
    print('label_ids_pred after argmax:\n', label_ids_pred)
    print('label_ids_true:\n', label_ids_true)

    y_pred, y_true = [], []
    for i in range(label_ids_pred.shape[0]):
        kept_pred, kept_true = [], []
        for j in range(label_ids_pred.shape[1]):
            true_id = label_ids_true[i][j]
            # Id 0 marks a position that must not be scored.
            if true_id != 0:
                kept_pred.append(label_ids_pred[i][j])
                kept_true.append(true_id)
        y_pred.append(kept_pred)
        y_true.append(kept_true)

    y_pred = labels_vocab.decode(y_pred)
    y_true = labels_vocab.decode(y_true)
    print(classification_report(y_true, y_pred, digits=4))


# preprocessing.py

In [0]:
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences


class Vocab:
    """Vocabulary backed by a Keras ``Tokenizer``.

    Sequences of tokens are joined with tabs before being handed to the
    tokenizer, which is configured to split on tabs and to apply no character
    filtering.
    """

    def __init__(self, num_words=None, lower=True, oov_token=None):
        tokenizer_options = dict(
            num_words=num_words,  # max size of vocabulary
            oov_token=oov_token,
            filters='',
            lower=lower,
            split='\t',
        )
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(**tokenizer_options)

    def fit(self, sequences):
        """Build the vocabulary from ``sequences`` and return ``self``."""
        self.tokenizer.fit_on_texts(self._texts(sequences))
        return self

    def encode(self, sequences):
        """Convert each sequence of words into a list of ids."""
        joined = self._texts(sequences)
        return self.tokenizer.texts_to_sequences(joined)

    def decode(self, sequences):
        """Convert each sequence of ids back into a list of words."""
        decoded = []
        for text in self.tokenizer.sequences_to_texts(sequences):
            # The decoded text is split on single spaces to recover the words.
            decoded.append(text.split(' '))
        return decoded

    def _texts(self, sequences):
        """Join every sequence into one tab-separated string."""
        joined = []
        for words in sequences:
            joined.append('\t'.join(words))
        return joined

    def get_index(self, word):
        """Return the id of ``word``, or ``None`` when it is not in the vocabulary."""
        word_index = self.tokenizer.word_index
        return word_index.get(word)

    @property
    def size(self):
        """Return vocabulary size."""
        return 1 + len(self.tokenizer.word_index)

    def save(self, file_path):
        """Write the tokenizer's JSON configuration to ``file_path``."""
        with open(file_path, 'w') as f:
            f.write(self.tokenizer.to_json())

    @classmethod
    def load(cls, file_path):
        """Create a ``Vocab`` from a JSON configuration written by ``save``."""
        with open(file_path) as f:
            restored = tf.keras.preprocessing.text.tokenizer_from_json(f.read())
            vocab = cls()
        vocab.tokenizer = restored
        return vocab


def normalize_number(text, reduce=True):
    """Replace the digits in ``text`` with ``0``.

    Args:
        text (str): input text.
        reduce (bool): when true, each run of consecutive digits becomes a
            single ``0``; otherwise every digit is replaced one by one, so
            the text keeps its length.

    Returns:
        str: the normalized text.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)


def preprocess_dataset(sequences):
    """Return a copy of ``sequences`` with ``normalize_number`` applied to every word."""
    normalized = []
    for words in sequences:
        normalized.append(list(map(normalize_number, words)))
    return normalized


def create_dataset(sequences, vocab):
    """Encode ``sequences`` with ``vocab`` and pad the result at the end.

    Args:
        sequences: sequences of words (or labels) to convert.
        vocab: object whose ``encode`` maps the sequences to lists of ids.

    Returns:
        The value returned by ``pad_sequences`` for the encoded sequences,
        using ``padding='post'``.
    """
    encoded = vocab.encode(sequences)
    # Padding
    return pad_sequences(encoded, padding='post')


# Create inputs for BERT. 
def convert_examples_to_features(x, # x_train
                                 y, # y_train
                                 vocab, # Vocabulary of labels
                                 max_seq_length,
                                 tokenizer):
    """Convert word/label sequences into padded BERT inputs and label ids.

    Each sentence becomes ``[CLS] + sub-word tokens + [SEP]``. The label id of
    a word is attached to its first sub-word token; the remaining sub-word
    tokens, ``[CLS]`` and ``[SEP]`` get the padding id 0.

    Sentences longer than ``max_seq_length`` are cut at the end, before
    ``[SEP]`` is appended, so ``[CLS]`` is kept and tokens and labels are cut
    at the same position.

    Args:
        x: iterable of sentences, each a sequence of words.
        y: iterable of label sequences, parallel to ``x``.
        vocab: label vocabulary; ``vocab.get_index(label)`` returns the id.
        max_seq_length (int): length of every returned row.
        tokenizer: object providing ``cls_token``, ``sep_token``,
            ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.

    Returns:
        tuple: ``([input_ids, attention_mask, token_type_ids], label_ids)``,
        each padded at the end to ``max_seq_length`` by ``pad_sequences``.

    Raises:
        ValueError: if a label is not present in ``vocab``.
    """
    pad_token = 0
    # Room left for [CLS] + sub-word tokens once one slot is kept for [SEP].
    body_limit = max(max_seq_length - 1, 0)
    features = {
        'input_ids': [],
        'attention_mask': [],
        'token_type_ids': [],
        'label_ids': []
    }
    for words, labels in zip(x, y):
        # For each sentence
        tokens = [tokenizer.cls_token] # [CLS]
        label_ids = [pad_token]
        for word, label in zip(words, labels):
            # For each word
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # The word produced no token: adding its label would shift
                # every following label by one position.
                continue
            label_id = vocab.get_index(label)
            if label_id is None:
                raise ValueError('Label {!r} is not in the label vocabulary.'.format(label))
            tokens.extend(word_tokens)
            # Only the first sub-word token carries the word's label.
            label_ids.extend([label_id] + [pad_token] * (len(word_tokens) - 1))

        # Cut tokens and labels together so they stay aligned, then close
        # the sentence with [SEP].
        tokens = tokens[:body_limit]
        label_ids = label_ids[:body_limit]
        tokens += [tokenizer.sep_token] # [SEP]
        label_ids.append(pad_token)
        # e.g. ['[CLS]', 'キ', '##ケ', '##ロ', 'は', ..., '。', '[SEP]']

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # e.g. [2, 185, 28719, 28505, 9, 6, 18936, 13, ... ]

        attention_mask = [1] * len(input_ids)
        token_type_ids = [pad_token] * max_seq_length

        features['input_ids'].append(input_ids)
        features['attention_mask'].append(attention_mask)
        features['token_type_ids'].append(token_type_ids)
        features['label_ids'].append(label_ids)

    # Padding. truncating='post' is a safeguard: rows are already short
    # enough, and the default ('pre') would drop the start of a row instead.
    for name in features:
        features[name] = pad_sequences(features[name],
                                       padding='post',
                                       truncating='post',
                                       maxlen=max_seq_length)

    x = [features['input_ids'], features['attention_mask'], features['token_type_ids']]
    y = features['label_ids']
    return x, y

# models.py

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM
from tensorflow.keras.layers import Bidirectional
from transformers import TFBertForTokenClassification, BertConfig


class UnidirectionalModel:
    """Sequence-labelling network: embedding -> LSTM -> per-step softmax.

    Args:
        input_dim: embedding input size, used when ``embeddings`` is None.
        output_dim: number of units of the final softmax layer.
        emb_dim: embedding output size, used when ``embeddings`` is None.
        hid_dim: number of LSTM units.
        embeddings: optional matrix; when given, its two ``shape`` entries
            set the embedding sizes and it is passed as the initial weights.
    """

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        self.input = Input(shape=(None,), name='input')

        embedding_kwargs = dict(mask_zero=True, name='embedding')
        if embeddings is not None:
            embedding_kwargs.update(input_dim=embeddings.shape[0],
                                    output_dim=embeddings.shape[1],
                                    weights=[embeddings])
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        # return_sequences=True yields one output per time step, which
        # sequence labelling requires.
        self.lstm = LSTM(hid_dim, return_sequences=True, name='lstm')
        # output_dim: label_vocab.size()
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Wire the layers together and return the resulting ``Model``."""
        embedded = self.embedding(self.input)
        hidden = self.lstm(embedded)
        scores = self.fc(hidden)
        return Model(inputs=self.input, outputs=scores)


class BidirectionalModel:
    """Sequence-labelling network: embedding -> bidirectional LSTM -> per-step softmax.

    Args:
        input_dim: embedding input size, used when ``embeddings`` is None.
        output_dim: number of units of the final softmax layer.
        emb_dim: embedding output size, used when ``embeddings`` is None.
        hid_dim: number of units of the wrapped LSTM.
        embeddings: optional matrix; when given, its two ``shape`` entries
            set the embedding sizes and it is passed as the initial weights.
    """

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        self.input = Input(shape=(None,), name='input')

        embedding_kwargs = dict(mask_zero=True, name='embedding')
        if embeddings is not None:
            embedding_kwargs.update(input_dim=embeddings.shape[0],
                                    output_dim=embeddings.shape[1],
                                    weights=[embeddings])
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        # Wrap the LSTM with Bidirectional.
        inner_lstm = LSTM(hid_dim, return_sequences=True, name='lstm')
        self.bilstm = Bidirectional(inner_lstm, name='bilstm')
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Wire the layers together and return the resulting ``Model``."""
        embedded = self.embedding(self.input)
        hidden = self.bilstm(embedded)
        scores = self.fc(hidden)
        return Model(inputs=self.input, outputs=scores)

# For BERT
def build_model(pretrained_model_name_or_path, num_labels):
    """Load a pre-trained BERT token-classification model with a softmax output.

    Args:
        pretrained_model_name_or_path: model name or path handed to both
            ``from_pretrained`` calls.
        num_labels: number of labels, stored in the model configuration.

    Returns:
        The loaded ``TFBertForTokenClassification`` model whose last layer
        has its ``activation`` attribute set to softmax.
    """
    # Read the configuration of the pre-trained model, overriding num_labels.
    bert_config = BertConfig.from_pretrained(pretrained_model_name_or_path,
                                             num_labels=num_labels)
    # BERT for sequence labelling
    token_classifier = TFBertForTokenClassification.from_pretrained(
        pretrained_model_name_or_path, config=bert_config)
    # Set softmax as the activation of the model's existing last layer
    # (no new layer is added).
    last_layer = token_classifier.layers[-1]
    last_layer.activation = tf.keras.activations.softmax

    return token_classifier


def loss_func(num_labels):
    """Create a sparse categorical cross-entropy loss that skips label id 0.

    Args:
        num_labels: size of the last axis of ``y_pred``; used to flatten the
            predictions to shape ``(-1, num_labels)``.

    Returns:
        A function ``loss(y_true, y_pred)`` that returns the unreduced
        cross-entropy values of the positions whose true label id is not 0.
    """
    unreduced_xent = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE)

    def loss(y_true, y_pred):
        # Positions with label id 0 are excluded from the loss.
        keep = tf.reshape(tf.not_equal(y_true, 0), (-1,))
        flat_pred = tf.reshape(y_pred, (-1, num_labels))
        flat_true = tf.reshape(y_true, (-1,))
        # Remove paddings.
        kept_pred = tf.boolean_mask(flat_pred, keep)
        kept_true = tf.boolean_mask(flat_true, keep)
        return unreduced_xent(kept_true, kept_pred)

    return loss

# inference.py

In [0]:
"""
Inference API.
"""
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


class InferenceAPI:
    """Predicts a label sequence for each input word sequence.

    Attributes:
        model: object whose ``predict`` returns per-position class scores.
        words_vocab: vocabulary used to encode words into ids.
        labels_vocab: vocabulary used to decode predicted ids into labels.
    """

    def __init__(self, model, words_vocab, labels_vocab):
        self.model = model
        self.words_vocab = words_vocab
        self.labels_vocab = labels_vocab

    def predict_from_sequences(self, sequences):
        """Return the predicted labels for every sequence in ``sequences``.

        The sequences are encoded, padded at the end, scored by the model and
        decoded; each decoded row is then cut to the length of its input.
        """
        # Lengths of the inputs, consumed lazily when trimming below.
        lengths = map(len, sequences)
        # Convert words to ids.
        encoded = self.words_vocab.encode(sequences)
        padded = pad_sequences(encoded, padding='post')
        # Predict.
        y_pred = self.model.predict(padded)
        print('y_pred after predict:', y_pred[:5])
        y_pred = np.argmax(y_pred, axis=-1)
        print('y_pred after argmax:', y_pred[:5])
        # Convert ids of labels to labels.
        y_pred = self.labels_vocab.decode(y_pred)
        print('y_pred after decode:', y_pred[:5])
        # Padding made every row as long as the longest input; keep only as
        # many labels as the corresponding input sequence has words.
        trimmed = []
        for row_labels, length in zip(y_pred, lengths):
            trimmed.append(row_labels[:length])
        y_pred = trimmed
        print('y_pred after the last line:', y_pred[:5])
        return y_pred

# train.py

In [0]:
# from sklearn.model_selection import train_test_split
# from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
# from tensorflow.keras.models import load_model
# from seqeval.metrics import classification_report
# from google.colab import files 
# import io 


# def main():
#     # Set hyper-parameters.
#     batch_size = 32
#     epochs = 100
#     # model_path = 'models/model_{}.h5'
#     model_path = 'models/bidirectional_model_{}.h5'
#     num_words = 15000 # Max size of vocabulary
#     # num_words = 150000 # Max size of vocabulary

#     # Data loading.
#     # x: sentences, y: labels
#     # x, y = load_dataset('./data/ja.wikipedia.conll')
#     x, y = load_dataset('./test_empty_line_inserted.tsv')    
    
#     # Upload file from local. 
# #     uploaded = files.upload() 
# #     ner_labeled_data = pd.read_csv(io.BytesIO(uploaded['test_empty_line_inserted.tsv'])) 


#     # Pre-processing.
#     x = preprocess_dataset(x) # Normalize numbers. 
#     # Train test split
#     x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
#     # Create vocabularies. 
#     words_vocab = Vocab(num_words=num_words, oov_token='<UNK>').fit(x_train)
#     labels_vocab = Vocab(lower=False).fit(y_train)
    
#     # Convert words to ids. 
#     x_train = create_dataset(x_train, words_vocab)
#     print('train words:', x_train[:5])
#     y_train = create_dataset(y_train, labels_vocab)
#     print('train labels:', y_train[:5])

#     # Build models.
#     models = [
#         # UnidirectionalModel(num_words, labels_vocab.size).build(),
#         BidirectionalModel(num_words, labels_vocab.size).build(),
#     ]

#     for i, model in enumerate(models):
#         model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

#         # Preparing callbacks.
#         callbacks = [
#             EarlyStopping(patience=3),
#             ModelCheckpoint(model_path.format(i), save_best_only=True)
#         ]

#         # Train the model.
#         model.fit(x=x_train,
#                   y=y_train,
#                   batch_size=batch_size,
#                   epochs=epochs,
#                   validation_split=0.1,
#                   callbacks=callbacks,
#                   shuffle=True)

#         # Inference.
#         model = load_model(model_path.format(i))
#         api = InferenceAPI(model, words_vocab, labels_vocab)
#         y_pred = api.predict_from_sequences(x_test)
#         print(classification_report(y_test, y_pred, digits=4))


# if __name__ == '__main__':
#     main()

# train_bert.py



In [0]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer, DistilBertTokenizer

# from models import build_model, loss_func
# from preprocessing import convert_examples_to_features, Vocab, preprocess_dataset
# from utils import load_dataset, evaluate


def main():
    """Fine-tune BERT for token classification, save the model and evaluate it.

    Steps: load the tab-separated dataset, normalize numbers, split it 80/20
    into train and test sets, build the label vocabulary from the training
    labels, convert both splits into BERT features, train with early
    stopping, save the model under ``models/`` and print a classification
    report for the test split.
    """
    import os  # local import: only needed to create the output directory

    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    # epochs = 20 # For debugging
    model_path = 'models/'
    # Pre-trained model trained on the DistilBERT model 
    # distilled from the BERT model bert-base-cased
    # pretrained_model_name_or_path = 'distilbert-base-cased'
    # Pre-trained model trained on cased English text.
    pretrained_model_name_or_path = 'bert-base-cased'
    maxlen = 250

    # save_pretrained() needs an existing directory. Create it before training
    # so that a missing folder cannot make the run fail after training ends.
    os.makedirs(model_path, exist_ok=True)

    # Data loading.
    x, y = load_dataset('./test_empty_line_inserted.tsv')    

    # Tokenizer from BERT
    # tokenizer = DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, do_word_tokenize=False)
    tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, do_word_tokenize=False)

    # Pre-processing.
    x = preprocess_dataset(x) # Normalize numbers.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
    # The label vocabulary is built from the training labels only.
    labels_vocab = Vocab(lower=False).fit(y_train)

    features_train, labels_train = convert_examples_to_features(
        x_train,
        y_train,
        # x_train[:5], # For debugging
        # y_train[:5],
        labels_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer
    )
    
    features_test, labels_test = convert_examples_to_features(
        x_test,
        y_test,
        # x_test[:5], # For debugging
        # y_test[:5], 
        labels_vocab,
        max_seq_length=maxlen,
        tokenizer=tokenizer
    )

    # Build model.
    model = build_model(pretrained_model_name_or_path, labels_vocab.size)
    # # Load trained model. 
    # model = build_model(model_path, labels_vocab.size)
    model.compile(optimizer='sgd', loss=loss_func(labels_vocab.size))

    # Preparing callbacks.
    callbacks = [
        EarlyStopping(patience=3),
    ]

    # Train the model.
    model.fit(x=features_train,
              y=labels_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.1,
              callbacks=callbacks,
              shuffle=True)
    model.save_pretrained(model_path)

    # Evaluation.
    evaluate(model, labels_vocab, features_test, labels_test)

# Run the BERT training and evaluation defined in main() when this cell
# (or the file, as a script) is executed directly.
if __name__ == '__main__':
    main()


label_ids_pred after predict:
 (array([[[3.84216546e-05, 9.91067827e-01, 1.51462213e-03, ...,
         3.36913508e-05, 1.93526575e-04, 8.07656397e-05],
        [7.76241052e-07, 9.99349892e-01, 2.89793446e-04, ...,
         9.14278814e-07, 5.11719009e-06, 1.04136654e-06],
        [8.14651648e-05, 2.78127816e-04, 9.35651302e-01, ...,
         4.98961126e-05, 3.91494192e-04, 8.05361196e-05],
        ...,
        [3.85773092e-06, 9.96182024e-01, 9.98642761e-04, ...,
         8.40593839e-06, 2.24059186e-05, 6.60109345e-06],
        [3.31167007e-06, 9.96991158e-01, 7.23084202e-04, ...,
         6.84519409e-06, 1.96578640e-05, 5.55329143e-06],
        [2.03866671e-06, 9.98490214e-01, 3.33804317e-04, ...,
         3.63085201e-06, 1.14772984e-05, 3.38397103e-06]],

       [[1.16683383e-04, 9.75640655e-01, 2.81129475e-03, ...,
         8.11719001e-05, 5.65593888e-04, 2.46033160e-04],
        [2.44198645e-07, 9.99945521e-01, 3.63537424e-06, ...,
         1.53141258e-07, 9.79434503e-07, 3.52907279

In [0]:
# # Download files.

# from google.colab import files
# files.download('models/tf_model.h5')
# files.download('models/config.json')

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 59920, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

In [0]:
# Mount Google Drive at /content/drive (Colab asks for an authorization code).
from google.colab import drive 
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Stash the trained models to Google Drive.
# Recursively copies ./models/ to the Drive folder below. The relative
# `drive/` path assumes the working directory is /content, where Drive was
# mounted -- TODO confirm.
!cp -r ./models/ drive/'My Drive'/bert/bert/trained_models