# Setup

In [0]:
# Mount my Google Drive. 
from google.colab import drive 
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!mkdir data
!mkdir models

In [0]:
# Copy over the trained models in Google Drive.
!cp -r drive/'My Drive'/cs663/bert/trained_models/model_03 ./

In [0]:
# Copy over the dataset in Google Drive.
!cp -r drive/'My Drive'/cs663/bert/test_empty_line_inserted.tsv ./

In [0]:
%tensorflow_version 2.x

In [0]:
# Type of GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime → "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

Wed Apr 22 05:13:56 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.64.00    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [0]:
# Memory
from psutil import virtual_memory
# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this cell uses to recognise a high-RAM runtime.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime → "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [0]:
!pip install seqeval transformers

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 7.3MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |████████████████████████████████| 3.7MB 22.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 49.0MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.python

# utils.py

In [0]:
"""
Utilities.
"""
from seqeval.metrics import classification_report, accuracy_score
import numpy as np

def load_dataset(filename, encoding='utf-8'):
    """Load words and their tags from a tab-separated, CoNLL-style file.

    Every non-blank line holds one word and one tag separated by a single tab.
    A blank (or whitespace-only) line ends the current sentence. Leading or
    repeated blank lines are ignored, so no empty sentence is ever returned.
    A final sentence that is not followed by a blank line is still returned.

    File format example (<TAB> stands for one tab character):
        ```
        EU<TAB>B-ORG
        rejects<TAB>O
        German<TAB>B-MISC
        call<TAB>O
        .<TAB>O

        Peter<TAB>B-PER
        Blackburn<TAB>I-PER
        ...
        ```

    Args:
        filename (str): path to the file.
        encoding (str): file encoding format.

    Returns:
        tuple(list, list): ``(sentences, labels)``. ``sentences`` is a list of
        word lists and ``labels`` is the parallel list of tag lists.

    Raises:
        ValueError: if a non-blank line does not consist of exactly two
            tab-separated fields.

    Example:
        >>> filename = 'conll2003/en/ner/train.txt'
        >>> sentences, labels = load_dataset(filename)
    """
    sentences, labels = [], []
    words, tags = [], []
    with open(filename, encoding=encoding) as f:
        for line_number, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                # Sentence boundary. Only flush when something was collected,
                # otherwise consecutive blank lines would yield empty sentences.
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    '{}:{}: expected "word<TAB>tag", got {!r}'.format(
                        filename, line_number, line))
            word, tag = fields
            words.append(word)
            tags.append(tag)

    # The file may end without a trailing blank line.
    if words:
        sentences.append(words)
        labels.append(tags)

    return sentences, labels

# For BERT
def evaluate(model, labels_vocab, features, label_ids_true):
    """Predict labels for ``features`` and print seqeval metrics.

    Args:
        model: object whose ``predict(features)`` returns a sequence whose
            first element is an array of per-token class scores with the class
            axis last.
        labels_vocab: label vocabulary providing ``decode(list_of_id_lists)``.
        features: model inputs, passed to ``model.predict`` unchanged.
        label_ids_true: 2-D array of gold label ids; id 0 marks positions that
            are excluded from the evaluation (padding, special tokens and
            word-piece continuations).

    Returns:
        None. The classification report and accuracy are printed.
    """
    # Predict.
    label_ids_pred = model.predict(features)
    print('label_ids_pred after predict:\n', label_ids_pred)
    scores = label_ids_pred[0]
    # Id 0 is the padding id, not a real label. If it wins the argmax at a
    # labelled position, decoding drops it (it has no vocabulary entry) and the
    # predicted sequence becomes shorter than the gold one, silently shifting
    # every later comparison. Restrict the argmax to the real label ids (>= 1).
    label_ids_pred = np.argmax(scores[..., 1:], axis=-1) + 1
    print('label_ids_pred after argmax:\n', label_ids_pred)
    print('label_ids_true:\n', label_ids_true)

    label_ids_true = np.asarray(label_ids_true)
    y_pred, y_true = [], []
    for pred_row, true_row in zip(label_ids_pred, label_ids_true):
        # Only the columns the prediction covers are compared.
        true_row = true_row[:pred_row.shape[0]]
        labelled = true_row != 0
        y_pred.append(pred_row[labelled].tolist())
        y_true.append(true_row[labelled].tolist())

    y_pred = labels_vocab.decode(y_pred)
    print('y_pred:\n', y_pred)
    y_true = labels_vocab.decode(y_true)
    print('y_true:\n', y_true)
    print(classification_report(y_true, y_pred, digits=4))
    print('Accuracy:', accuracy_score(y_true, y_pred))


# preprocessing.py

In [0]:
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences


class Vocab:
    """Thin wrapper around a Keras ``Tokenizer`` that maps tokens to integer ids.

    Sequences are lists of tokens. Internally each sequence is joined with
    tabs and the tokenizer splits on tabs again, with character filtering
    switched off, so tokens are stored exactly as given (apart from optional
    lower-casing).
    """

    def __init__(self, num_words=None, lower=True, oov_token=None):
        """Create an empty vocabulary.

        Args:
            num_words: maximum vocabulary size, forwarded to the tokenizer.
            lower: whether tokens are lower-cased.
            oov_token: token used for out-of-vocabulary words, if any.
        """
        self.tokenizer = tf.keras.preprocessing.text.Tokenizer(
            num_words=num_words, # max size of vocabulary
            oov_token=oov_token,
            filters='',
            lower=lower,
            split='\t'
        )

    def fit(self, sequences):
        """Build the vocabulary from ``sequences`` and return ``self``."""
        texts = self._texts(sequences)
        # Create vocabulary.
        self.tokenizer.fit_on_texts(texts)
        return self

    def encode(self, sequences):
        """Convert lists of tokens to lists of ids (one list per sequence)."""
        texts = self._texts(sequences)
        return self.tokenizer.texts_to_sequences(texts)

    def decode(self, sequences):
        """Convert lists of ids back to lists of tokens.

        An empty id sequence decodes to an empty list (a plain
        ``''.split(' ')`` would give ``['']``, i.e. one bogus empty token).

        NOTE(review): the decoded text is split on single spaces, so a token
        that itself contains a space would come back as several tokens. Per
        the Keras documentation, ids without a vocabulary entry (such as
        padding id 0) are omitted when no ``oov_token`` is set, so an output
        row can be shorter than its input row.
        """
        texts = self.tokenizer.sequences_to_texts(sequences)
        return [text.split(' ') if text else [] for text in texts]

    def _texts(self, sequences):
        """Join every token list with tabs, the separator the tokenizer splits on."""
        return ['\t'.join(words) for words in sequences]

    def get_index(self, word):
        """Return the id of ``word``, or ``None`` when it is not in the vocabulary."""
        return self.tokenizer.word_index.get(word)

    @property
    def size(self):
        """Return vocabulary size: number of known tokens plus one."""
        return len(self.tokenizer.word_index) + 1

    def save(self, file_path):
        """Write the tokenizer configuration as JSON to ``file_path``."""
        # Explicit encoding so the file does not depend on the platform default.
        with open(file_path, 'w', encoding='utf-8') as f:
            config = self.tokenizer.to_json()
            f.write(config)

    @classmethod
    def load(cls, file_path):
        """Restore a vocabulary from a JSON file written by :meth:`save`."""
        with open(file_path, encoding='utf-8') as f:
            tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(f.read())
        vocab = cls()
        vocab.tokenizer = tokenizer
        return vocab


def normalize_number(text, reduce=True):
    """Mask the digits in ``text`` with the character ``0``.

    Args:
        text (str): input string.
        reduce (bool): when true, every run of consecutive digits collapses
            into a single ``0``; otherwise each digit is replaced one-for-one
            so the string keeps its length.

    Returns:
        str: the text with its digits masked.
    """
    digit_pattern = r'\d+' if reduce else r'\d'
    return re.sub(digit_pattern, '0', text)


def preprocess_dataset(sequences):
    """Apply ``normalize_number`` to every word of every sentence.

    Args:
        sequences: iterable of sentences, each an iterable of word strings.

    Returns:
        list: a new list of lists with the normalized words; the input is not
        modified.
    """
    normalized = []
    for words in sequences:
        normalized.append([normalize_number(word) for word in words])
    return normalized


def create_dataset(sequences, vocab):
    """Encode token sequences to ids and pad them to a common length.

    Args:
        sequences: list of token lists.
        vocab: vocabulary whose ``encode`` turns token lists into id lists.

    Returns:
        The result of ``pad_sequences`` with padding appended at the end of
        each sequence.
    """
    encoded = vocab.encode(sequences)
    return pad_sequences(encoded, padding='post')


# Create inputs for BERT. 
def convert_examples_to_features(x, # features
                                 y, # labels
                                 vocab, # Vocabulary of labels
                                 max_seq_length,
                                 tokenizer):
    """Turn word/label sequences into padded BERT inputs and label ids.

    Each sentence becomes ``[CLS] word-pieces... [SEP]``. The label id of a
    word is attached to its first word-piece; ``[CLS]``, ``[SEP]``, the
    remaining word-pieces and padding all get id 0.

    Args:
        x: list of sentences, each a list of word strings.
        y: list of label lists parallel to ``x``.
        vocab: label vocabulary providing ``get_index(label)``.
        max_seq_length: length every output row is padded or truncated to.
        tokenizer: tokenizer providing ``cls_token``, ``sep_token``,
            ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.

    Returns:
        tuple: ``([input_ids, attention_mask, token_type_ids], label_ids)``,
        each an array produced by ``pad_sequences`` with ``max_seq_length``
        columns.

    Raises:
        ValueError: if a label is missing from ``vocab``.
    """
    pad_token = 0
    features = {
        'input_ids': [],
        'attention_mask': [],
        'token_type_ids': [],
        'label_ids': []
    }
    # Room left for [CLS] plus word-pieces once the final slot is kept for [SEP].
    body_limit = max(max_seq_length - 1, 0)

    for words, labels in zip(x, y):
        # For each sentence
        tokens = [tokenizer.cls_token] # [CLS]
        label_ids = [pad_token]
        for word, label in zip(words, labels):
            # For each word
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # The tokenizer produced nothing for this word. Adding its
                # label anyway would shift every following label by one.
                continue
            label_id = vocab.get_index(label)
            if label_id is None:
                raise ValueError(
                    'Label {!r} is not in the label vocabulary.'.format(label))
            tokens.extend(word_tokens)
            # Only the first word-piece carries the label.
            label_ids.extend([label_id] + [pad_token] * (len(word_tokens) - 1))

        # Truncate at the end and keep [CLS]/[SEP]. Leaving truncation to
        # pad_sequences would use its default 'pre' mode, which cuts from the
        # front and drops [CLS].
        tokens = tokens[:body_limit]
        label_ids = label_ids[:body_limit]
        tokens.append(tokenizer.sep_token) # [SEP]
        label_ids.append(pad_token)

        # e.g. ['[CLS]', 'Jim', 'bought', ..., 'A', '##c', '##me', ..., '[SEP]']
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # 1 for every real token; padding added below gets 0.
        attention_mask = [1] * len(input_ids)
        # Single-segment input: all zeros.
        token_type_ids = [pad_token] * max_seq_length

        features['input_ids'].append(input_ids)
        features['attention_mask'].append(attention_mask)
        features['token_type_ids'].append(token_type_ids)
        features['label_ids'].append(label_ids)

    # Padding
    for name in features:
        features[name] = pad_sequences(features[name],
                                       padding='post',
                                       truncating='post',
                                       maxlen=max_seq_length)

    x = [features['input_ids'], features['attention_mask'], features['token_type_ids']]
    y = features['label_ids']
    return x, y

# models.py

In [0]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Embedding, LSTM
from tensorflow.keras.layers import Bidirectional
from transformers import TFBertForTokenClassification, BertConfig


class UnidirectionalModel:
    """Embedding -> LSTM -> softmax Dense model for sequence labeling.

    The layers are created in ``__init__``; ``build`` wires them together and
    returns the Keras ``Model``.
    """

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        """Create the layers.

        Args:
            input_dim: vocabulary size, used when ``embeddings`` is None.
            output_dim: number of units of the final softmax layer.
            emb_dim: embedding size, used when ``embeddings`` is None.
            hid_dim: number of LSTM units.
            embeddings: optional 2-D weight matrix; when given, its shape sets
                the embedding dimensions and it is passed as initial weights.
        """
        self.input = Input(shape=(None,), name='input')

        # Both variants mask id 0 and share the layer name.
        embedding_kwargs = dict(mask_zero=True, name='embedding')
        if embeddings is not None:
            embedding_kwargs.update(input_dim=embeddings.shape[0],
                                    output_dim=embeddings.shape[1],
                                    weights=[embeddings])
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        # return_sequences=True yields one output per time step, as sequence
        # labeling requires.
        self.lstm = LSTM(hid_dim, return_sequences=True, name='lstm')
        # One softmax distribution over the labels per time step.
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect the layers and return the resulting ``Model``."""
        token_vectors = self.embedding(self.input)
        hidden_states = self.lstm(token_vectors)
        label_probs = self.fc(hidden_states)
        return Model(inputs=self.input, outputs=label_probs)


class BidirectionalModel:
    """Embedding -> bidirectional LSTM -> softmax Dense model for sequence labeling.

    The layers are created in ``__init__``; ``build`` wires them together and
    returns the Keras ``Model``.
    """

    def __init__(self, input_dim, output_dim, emb_dim=100, hid_dim=100, embeddings=None):
        """Create the layers.

        Args:
            input_dim: vocabulary size, used when ``embeddings`` is None.
            output_dim: number of units of the final softmax layer.
            emb_dim: embedding size, used when ``embeddings`` is None.
            hid_dim: number of units of the wrapped LSTM.
            embeddings: optional 2-D weight matrix; when given, its shape sets
                the embedding dimensions and it is passed as initial weights.
        """
        self.input = Input(shape=(None,), name='input')

        # Both variants mask id 0 and share the layer name.
        embedding_kwargs = dict(mask_zero=True, name='embedding')
        if embeddings is not None:
            embedding_kwargs.update(input_dim=embeddings.shape[0],
                                    output_dim=embeddings.shape[1],
                                    weights=[embeddings])
        else:
            embedding_kwargs.update(input_dim=input_dim, output_dim=emb_dim)
        self.embedding = Embedding(**embedding_kwargs)

        # The plain LSTM is only kept inside its Bidirectional wrapper.
        inner_lstm = LSTM(hid_dim, return_sequences=True, name='lstm')
        self.bilstm = Bidirectional(inner_lstm, name='bilstm')
        # One softmax distribution over the labels per time step.
        self.fc = Dense(output_dim, activation='softmax')

    def build(self):
        """Connect the layers and return the resulting ``Model``."""
        token_vectors = self.embedding(self.input)
        hidden_states = self.bilstm(token_vectors)
        label_probs = self.fc(hidden_states)
        return Model(inputs=self.input, outputs=label_probs)

# For BERT
def build_model(pretrained_model_name_or_path, num_labels):
    """Load a BERT token-classification model whose last layer applies softmax.

    Args:
        pretrained_model_name_or_path: model name or directory, passed to both
            ``BertConfig.from_pretrained`` and
            ``TFBertForTokenClassification.from_pretrained``.
        num_labels: number of output classes stored in the configuration.

    Returns:
        The loaded model, with the ``activation`` attribute of its last layer
        set to softmax.
    """
    # Read the configuration of the pre-trained model, overriding the label count.
    bert_config = BertConfig.from_pretrained(pretrained_model_name_or_path,
                                             num_labels=num_labels)
    # BERT for sequence labelling.
    model = TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path,
                                                         config=bert_config)
    # No layer is added here: the existing last layer gets a softmax
    # activation, so the model outputs probabilities rather than raw scores.
    output_layer = model.layers[-1]
    output_layer.activation = tf.keras.activations.softmax

    return model


def loss_func(num_labels):
    """Build a sparse categorical cross-entropy loss that ignores label id 0.

    Args:
        num_labels: size of the class axis of the model output, used to
            flatten predictions to ``(-1, num_labels)``.

    Returns:
        A function ``loss(y_true, y_pred)`` that returns the unreduced
        cross-entropy of the positions whose true label id is not 0.
    """
    # Reduction.NONE is requested, so the loss object is not asked to reduce
    # the per-position values itself. from_logits is left at its default.
    per_token_loss = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE
    )

    def loss(y_true, y_pred):
        # Flattened boolean mask of the positions that carry a real label.
        keep = tf.reshape(tf.not_equal(y_true, 0), (-1,))
        flat_pred = tf.reshape(y_pred, (-1, num_labels))
        flat_true = tf.reshape(y_true, (-1,))
        # Remove paddings from both labels and predictions.
        kept_true = tf.boolean_mask(flat_true, keep)
        kept_pred = tf.boolean_mask(flat_pred, keep)
        return per_token_loss(kept_true, kept_pred)

    return loss

# train_bert.py (evaluation of the already-trained model)



In [0]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from transformers import BertTokenizer

# Set hyper-parameters.
# NOTE(review): batch_size, epochs and model_path are assigned here but not read
# again in this cell, which only evaluates an already-trained model.
batch_size = 32
epochs = 100
# epochs = 20 # For debugging
model_path = 'models/'
# Pre-trained model trained on cased English text.
# Here it is only used to load the matching tokenizer.
pretrained_model_from_original = 'bert-base-cased'
# Directory of the fine-tuned model that the setup cells copy from Google Drive.
pretrained_model = 'model_03'
# Length every input sequence is padded to, in word-piece tokens.
maxlen = 250

# Data loading.
x, y = load_dataset('./test_empty_line_inserted.tsv')    

# Tokenizer from BERT
# NOTE(review): do_word_tokenize looks like an option of the Japanese BERT
# tokenizer - confirm that BertTokenizer actually honours it.
tokenizer = BertTokenizer.from_pretrained(pretrained_model_from_original, do_word_tokenize=False)

# Pre-processing.
x = preprocess_dataset(x) # Normalize numbers.
# Fixed random_state keeps the 80/20 split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# NOTE(review): the label vocabulary is fitted on this split's y_train. The ids
# assigned here may differ from the ones the fine-tuned model was trained with -
# confirm, or restore the training-time vocabulary with Vocab.load instead.
labels_vocab = Vocab(lower=False).fit(y_train)

# features_train / labels_train are built but not used by the evaluation below.
features_train, labels_train = convert_examples_to_features(
    x_train,
    y_train,
    # x_train[:5], # For debugging
    # y_train[:5], # For debugging
    labels_vocab,
    max_seq_length=maxlen,
    tokenizer=tokenizer
)

features_test, labels_test = convert_examples_to_features(
    x_test,
    y_test,
    # x_test[:5], # For debugging
    # y_test[:5], # For debugging 
    labels_vocab,
    max_seq_length=maxlen,
    tokenizer=tokenizer
)

# Build model from my trained model. 
# labels_vocab.size is the number of labels plus one.
model = build_model(pretrained_model, labels_vocab.size)
model.compile(optimizer='sgd', loss=loss_func(labels_vocab.size))

# Evaluation.
# Prints the raw predictions, a seqeval classification report and the accuracy
# for the 20% split held out above.
evaluate(model, labels_vocab, features_test, labels_test)

In [0]:
model.summary()

Model: "tf_bert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  13842     
Total params: 108,324,114
Trainable params: 108,324,114
Non-trainable params: 0
_________________________________________________________________


# Apply the model to a new sentence

In [0]:
# Create inputs for BERT. 
def create_inputs(x, # features
                  vocab, # Vocabulary of labels
                  max_seq_length,
                  tokenizer,
                  verbose=True):
    """Turn unlabeled word sequences into padded BERT inputs.

    Each sentence becomes ``[CLS] word-pieces... [SEP]``. A sentence that is
    too long is cut at the end, so ``[CLS]`` and ``[SEP]`` are kept and the
    returned tokens line up one-to-one with the model input positions.

    Args:
        x: list of sentences, each a list of word strings.
        vocab: label vocabulary; unused here, kept so existing calls work.
        max_seq_length: length every output row is padded or truncated to.
        tokenizer: tokenizer providing ``cls_token``, ``sep_token``,
            ``tokenize(word)`` and ``convert_tokens_to_ids(tokens)``.
        verbose: when true (the default) the intermediate values of every
            sentence are printed, as before.

    Returns:
        tuple: ``([input_ids, attention_mask, token_type_ids], tokens)`` where
        ``tokens`` is the token list of the LAST sentence in ``x``, or an
        empty list when ``x`` is empty.
    """
    pad_token = 0
    features = {
        'input_ids': [],
        'attention_mask': [],
        'token_type_ids': [],
    }
    # Defined up front so an empty `x` returns [] instead of raising NameError.
    tokens = []
    # Room left for [CLS] plus word-pieces once the final slot is kept for [SEP].
    body_limit = max(max_seq_length - 1, 0)

    for words in x:
        if verbose:
            print('words:', words) # sentence

        # For each sentence
        tokens = [tokenizer.cls_token] # [CLS]
        if verbose:
            print('tokens:', tokens)

        for word in words:
            # For each word
            tokens.extend(tokenizer.tokenize(word))

        # Truncate at the end and keep [CLS]/[SEP]. Leaving truncation to
        # pad_sequences would use its default 'pre' mode, which cuts from the
        # front and drops [CLS].
        tokens = tokens[:body_limit]
        tokens.append(tokenizer.sep_token) # [SEP]
        if verbose:
            print('tokens before convert_tokens_to_ids:\n', tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        if verbose:
            print('input_ids:\n', input_ids)

        # 1 for every real token; padding added below gets 0.
        attention_mask = [1] * len(input_ids)
        if verbose:
            print('attention_mask:\n', attention_mask)

        # Single-segment input: all zeros.
        token_type_ids = [pad_token] * max_seq_length
        if verbose:
            print('token_type_ids:\n', token_type_ids)

        features['input_ids'].append(input_ids)
        features['attention_mask'].append(attention_mask)
        features['token_type_ids'].append(token_type_ids)

    # Padding
    for name in features:
        features[name] = pad_sequences(features[name],
                                       padding='post',
                                       truncating='post',
                                       maxlen=max_seq_length)

    x = [features['input_ids'], features['attention_mask'], features['token_type_ids']]
    return x, tokens

In [0]:
# test_sentence = """
# Mr. Trump’s tweets began just moments after a Fox News report by Mike Tobin, a 
# reporter for the network, about protests in Minnesota and elsewhere. 
# """

# test_sentence = """
# The problems mainly happen with rapid tests,” said Dr. Giorgio Palù, an Italian microbiologist 
# and former president of the European Society for Virology. “They will never be able to tell 
# the spread of the virus because they do not have the required sensitivity and specificity. 
# """

# test_sentence = """
# This month, the F.D.A. warned that some firms marketing their antibody tests in the United States 
# were falsely claiming that they had formal federal approval, or that they could diagnose Covid-19. 
# """

# Sentence to tag with the fine-tuned model.
test_sentence = """
Jim bought 300 shares of Acme Corp. in 2006. 
"""


# Split on whitespace into words and wrap the result in a list, because
# create_inputs iterates over a list of sentences.
test_sentence = [test_sentence.split()]
test_sentence

[['Jim', 'bought', '300', 'shares', 'of', 'Acme', 'Corp.', 'in', '2006.']]

In [0]:
# Build the model inputs for the test sentence; `tokens` holds its word-piece
# tokens. This overwrites the `x` loaded in the evaluation cell.
# NOTE(review): the evaluation data is passed through preprocess_dataset
# (digits replaced by 0) but this sentence is not - confirm whether the model
# expects normalized numbers.
x, tokens = create_inputs(test_sentence, labels_vocab, maxlen, tokenizer)

words: ['Jim', 'bought', '300', 'shares', 'of', 'Acme', 'Corp.', 'in', '2006.']
tokens: ['[CLS]']
tokens before convert_tokens_to_ids:
 ['[CLS]', 'Jim', 'bought', '300', 'shares', 'of', 'A', '##c', '##me', 'Corp', '.', 'in', '2006', '.', '[SEP]']
input_ids:
 [101, 3104, 3306, 3127, 6117, 1104, 138, 1665, 3263, 13619, 119, 1107, 1386, 119, 102]
attention_mask:
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
token_type_ids:
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
# Predict.
label_ids_pred = model.predict(x)
print('label_ids_pred after predict:\n', label_ids_pred)
# The first element of the prediction output holds the per-token class scores.
# Id 0 is the padding id, not a real label. If it won the argmax, decoding
# would drop it and every later label would shift against `tokens`, so the
# argmax is restricted to the real label ids (>= 1).
label_ids_pred = np.argmax(label_ids_pred[0][..., 1:], axis=-1) + 1
print('label_ids_pred after argmax:\n', label_ids_pred)

# Keep only the positions that correspond to actual tokens; everything after
# them is padding. Slicing cannot run past the end of a row.
y_pred = [row[:len(tokens)].tolist() for row in label_ids_pred]

y_pred = labels_vocab.decode(y_pred)
print('y_pred:\n', y_pred)

label_ids_pred after predict:
 (array([[[3.4810786e-04, 9.2185426e-01, 1.0244295e-02, ...,
         2.6739223e-04, 1.2339789e-03, 6.3977827e-04],
        [1.0570589e-04, 1.7173028e-03, 2.9354591e-03, ...,
         5.5777517e-05, 2.7726963e-04, 9.0894187e-05],
        [2.3801859e-07, 9.9995172e-01, 2.2949707e-06, ...,
         2.5984363e-07, 5.5892161e-07, 3.7659345e-07],
        ...,
        [4.1827859e-04, 4.4166663e-01, 6.7779571e-02, ...,
         6.0617918e-04, 1.0953184e-03, 4.5898106e-04],
        [1.3111862e-04, 8.2915807e-01, 2.0256037e-02, ...,
         2.4486790e-04, 4.2682173e-04, 1.7149434e-04],
        [3.2199372e-04, 5.9775680e-01, 5.9321303e-02, ...,
         5.0845358e-04, 8.5004308e-04, 3.4872879e-04]]], dtype=float32),)
label_ids_pred after argmax:
 [[1 6 1 1 1 1 4 7 7 7 7 1 3 1 1 1 1 1 1 1 1 1 4 4 7 7 7 3 3 1 1 1 1 1 1 1
  1 1 4 7 7 7 7 3 1 1 1 1 1 1 1 1 1 1 4 4 7 7 7 3 3 1 1 1 1 1 1 1 1 1 4 1
  4 7 1 7 1 1 1 1 1 1 1 1 1 1 1 4 4 7 7 7 3 3 1 1 1 1 1 1 1 1 1 4 4 7 7 7


In [0]:
# Example of word-piece tokens, taken from an earlier test sentence (continuation pieces start with '##'):
#  ['[CLS]', 'Mr', '.', 'Trump', '’', 's', 't', '##weet', '##s', 'began', 'just', 'moments', 'after', 'a', 'Fox', 'News', 'report', 'by', 'Mike', 'To', '##bin', ',', 'a', 'reporter', 'for', 'the', 'network', ',', 'about', 'protests', 'in', 'Minnesota', 'and', 'elsewhere', '.', '[SEP]']

# Merge word-piece continuations ("##...") back into whole words. A merged
# word keeps the label predicted for its first piece.
new_tokens = []
new_labels = []
for token, label in zip(tokens, y_pred[0]):
  if not token.startswith("##"):
    # Start of a new word: keep the token and its label.
    new_tokens.append(token)
    new_labels.append(label)
    continue
  # Continuation piece: strip the "##" marker and glue it onto the previous word.
  new_tokens[-1] += token[2:]

In [0]:
# Print one merged word per line, followed by three tabs and its predicted label.
for token, label in zip(new_tokens, new_labels):
    print("{}\t\t\t{}".format(token, label))

[CLS]			O
Jim			B-per
bought			O
300			O
shares			O
of			O
Acme			B-org
Corp			I-org
.			I-org
in			O
2006			B-tim
.			O
[SEP]			O


In [0]:
###

In [0]:
###