## Environment Setup

In [1]:
!pip install tensorflow==1.15
!pip install sentencepiece
!pip show sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 3.4MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.85
Name: sentencepiece
Version: 0.1.85
Summary: SentencePiece python wrapper
Home-page: https://github.com/google/sentencepiece
Author: Taku Kudo
Author-email: taku@google.com
License: Apache
Location: /usr/local/lib/python3.6/dist-packages
Requires: 
Required-by: 


In [2]:
import os
import subprocess
import sys
from pprint import pprint

# Directory created by unzipping the released checkpoint archive.
DIR_MODEL = 'xlnet_cased_L-24_H-1024_A-16'

cmd_download = (
    "wget -q https://storage.googleapis.com/"
    "xlnet/released_models/cased_L-24_H-1024_A-16.zip")
cmd_unzip = "unzip -o cased_L-24_H-1024_A-16.zip"
cmd_repo = 'git clone https://github.com/zihangdai/xlnet.git'

# Only provision when running from the Colab root directory.
if os.getcwd() == '/content':
    if not os.path.isdir(DIR_MODEL):
        for cmd in [cmd_download, cmd_unzip]:
            # check_call raises on a non-zero exit status, so a failed download
            # or unzip stops here instead of surfacing later as a missing file.
            subprocess.check_call(cmd.split())

        for itm in os.scandir(DIR_MODEL):
            print(itm.name, itm.stat().st_size)

    # Clone independently of the model download: a runtime that already has
    # the checkpoint directory but no repository still needs the sources.
    if not os.path.isdir('xlnet'):
        subprocess.check_call(cmd_repo.split())

print(os.getcwd())

xlnet_model.ckpt.data-00000-of-00001 1441203208
xlnet_model.ckpt.meta 1300035
spiece.model 798011
xlnet_model.ckpt.index 13743
xlnet_config.json 178
/content


In [3]:
!mv /content/xlnet_cased_L-24_H-1024_A-16/* /content/xlnet/
os.chdir('xlnet')
!ls -la

total 1409960
drwxr-xr-x 6 root root       4096 Dec 16 19:24 .
drwxr-xr-x 1 root root       4096 Dec 16 19:24 ..
-rw-r--r-- 1 root root       4494 Dec 16 19:24 classifier_utils.py
-rw-r--r-- 1 root root      29868 Dec 16 19:24 data_utils.py
-rw-r--r-- 1 root root      13542 Dec 16 19:24 function_builder.py
drwxr-xr-x 8 root root       4096 Dec 16 19:24 .git
-rw-r--r-- 1 root root       1203 Dec 16 19:24 .gitignore
-rw-r--r-- 1 root root       2358 Dec 16 19:24 gpu_utils.py
-rw-r--r-- 1 root root          0 Dec 16 19:24 __init__.py
-rw-r--r-- 1 root root      11343 Dec 16 19:24 LICENSE
drwxr-xr-x 2 root root       4096 Dec 16 19:24 misc
-rw-r--r-- 1 root root      28460 Dec 16 19:24 modeling.py
-rw-r--r-- 1 root root      14078 Dec 16 19:24 model_utils.py
drwxr-xr-x 2 root root       4096 Dec 16 19:24 notebooks
-rw-r--r-- 1 root root       4546 Dec 16 19:24 prepro_utils.py
-rw-r--r-- 1 root root      18415 Dec 16 19:24 README.md
-rw-r--r-- 1 root root      29961 Dec 16 19:24 run_classif

## Google Cloud Storage

In [4]:
from google.colab import auth

# Project id for storing the data and models
project_id = 'sandbox-project-htcai'

# Authenticate the Colab user (presumably required by the gcloud / gsutil
# commands in the following cells -- confirm).
auth.authenticate_user()

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
!gcloud config set project sandbox-project-htcai

Updated property [core/project].


Copy the pre-trained XLNet model to the Google Cloud Storage bucket.

In [6]:
# The glob matches every checkpoint file in the working directory
# (.data-00000-of-00001, .index and .meta).
!gsutil cp xlnet_model.ckpt* gs://xlnet_hcai/

Copying file://xlnet_model.ckpt.data-00000-of-00001 [Content-Type=application/octet-stream]...
/ [0 files][    0.0 B/  1.3 GiB]                                                ==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://xlnet_model.ckpt.index [Content-Type=application/octet-stream]...
Copying file://xlnet_model.ckpt.meta [Content-Type=application/octe

## Data Preparation

In [10]:
# Upload data
from google.colab import files

DIVS = ['train', 'validation', 'test']
# DIVS = ['train', 'test']

if not all(os.path.isfile('../{}.tsv'.format(div)) for div in DIVS):
    uploaded = files.upload()
    
for div in DIVS:
    file_data = div + '.tsv'
    if os.path.isfile(file_data):
        os.rename(file_data, '../' + file_data)

!ls ../ -la

total 1309728
drwxr-xr-x 1 root root       4096 Dec 16 19:28 .
drwxr-xr-x 1 root root       4096 Dec 16 18:15 ..
-rw-r--r-- 1 root root       2608 Dec 16 19:25 adc.json
-rw-r--r-- 1 root root 1338042341 Jun 19 20:14 cased_L-24_H-1024_A-16.zip
drwxr-xr-x 1 root root       4096 Dec 16 19:25 .config
drwxr-xr-x 1 root root       4096 Dec 12 16:48 sample_data
-rw-r--r-- 1 root root     307606 Dec 16 19:27 test.tsv
-rw-r--r-- 1 root root    2608086 Dec 16 19:28 train.tsv
-rw-r--r-- 1 root root     151834 Dec 16 19:28 validation.tsv
drwxr-xr-x 6 root root       4096 Dec 16 19:28 xlnet
drwxr-x--- 2 root root       4096 Dec 16 19:24 xlnet_cased_L-24_H-1024_A-16


In [12]:
import numpy as np
import pandas as pd
import re

# Placeholders written in place of masked spans (note the padding spaces).
USER_NAME = ' USERNAME '
URL = ' URL '
# Column names applied to the header-less TSV files.
COLUMNS = ['tweet_id', 'user_id', 'text', 'drug', 'label']
# Class labels as strings; the author's alternative naming for them was
# 'definite', 'possible', 'mention'.
LABELS = ['1', '2', '3']


def prepro_text(text):
    """Mask @-mentions and http(s) URLs in ``text``.

    Args:
        text: str, the raw text.

    Returns:
        str: ``text`` with every ``@name`` not preceded by a word character
        replaced by ``USER_NAME`` and every http/https URL replaced by ``URL``.
        Matching is case-insensitive.
    """
    mention_pattern = r'(?<!\w)@[a-z0-9_]+'
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*(),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Mentions first, then URLs -- same order as before.
    masked_mentions = re.sub(mention_pattern, USER_NAME, text, flags=re.I)
    return re.sub(url_pattern, URL, masked_mentions, flags=re.I)

def load_data(file_name):
    """Read a header-less, tab-separated split into a DataFrame.

    Args:
        file_name: str, path of the TSV file. When no file exists at that
            path, the same name is looked up in the parent directory.

    Returns:
        pandas.DataFrame with the columns listed in ``COLUMNS``, all read as
        strings; ``text`` is passed through ``prepro_text`` and ``label`` is
        converted with ``str``.
    """
    path = file_name if os.path.isfile(file_name) else '../' + file_name
    frame = pd.read_csv(
        path,
        delimiter='\t',
        header=None,
        names=COLUMNS,
        encoding='utf-8',
        dtype=str,
    )
    frame['text'] = frame['text'].apply(prepro_text)
    frame['label'] = frame['label'].apply(str)
    return frame

# Load the three splits; load_data masks the text and stringifies the labels.
df_train = load_data('train.tsv')
df_validation = load_data('validation.tsv')
df_test = load_data('test.tsv')
# Preview the first rows of the training split.
df_train.head(10)

Unnamed: 0,tweet_id,user_id,text,drug,label
0,722156172946628608,nomipalony,USERNAME USERNAME USERNAME USERNAME I ...,paracetamol,2
1,768649308061085696,JuliMarieCarmon,Considering overdosing on ibuprofen rn I'm in ...,ibuprofen,2
2,631136822408540161,BLUEYES008,USERNAME I'd need a Valium and a stiff drink...,valium,2
3,727345166550335490,fartdaughterY2K,we just heard a gunshot and my xanax induced r...,xanax,2
4,445227333432008704,laurenrich,"Forget Prozac, Psychobiotics Are the Future of...",prozac,3
5,692948824164306944,Savon_Da_Don,"I have a headache , this Tylenol need to hurry...",tylenol,1
6,672584573000138752,16963242,my friend from puerto rico tried adderall for ...,adderall,3
7,710138849540833280,all4jR_,At doc about to get my steroid shot for my sin...,steroid,1
8,781229534204919808,KelliBruns,I had to go back to the doctor again because I...,steroid,1
9,731872100764504065,4858080873,i have just been in mark poysers head and look...,morphine,3


In [13]:
import sentencepiece as spm

# SentencePiece tokenizer loaded from the spiece.model file in the working
# directory (it came with the downloaded checkpoint archive).
sp_model = spm.SentencePieceProcessor()
sp_model.Load('spiece.model')

True

In [0]:
import unicodedata

def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    """Normalise raw text before SentencePiece tokenisation.

    Args:
        inputs: str, the raw text.
        lower: bool, lower-case the result when True.
        remove_space: bool, when True strip the text and collapse every run
            of whitespace into a single space; when False leave whitespace
            untouched. (This flag used to be ignored and whitespace was
            always collapsed.)
        keep_accents: bool, when False apply NFKD normalisation and drop
            combining characters, which removes accents.

    Returns:
        str: the normalised text.
    """
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    # Map LaTeX-style quote pairs to a plain double quote.
    outputs = outputs.replace("``", '"').replace("''", '"')

    if not keep_accents:
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join(c for c in outputs if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()

    return outputs


In [15]:
txt = 'An input text string.'
SPIECE_UNDERLINE = '▁'


def encode_pieces(text):
    """Tokenise ``text`` with ``sp_model`` and return the piece ids.

    A piece that ends in "<digit>," is re-encoded without the comma (and
    without any SPIECE_UNDERLINE characters); the comma is then appended as
    a piece of its own.

    Args:
        text: str, raw text; it is passed through ``preprocess_text`` first.

    Returns:
        list of int: one id per resulting piece.
    """
    resplit = []
    for piece in sp_model.EncodeAsPieces(preprocess_text(text)):
        if not (len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit()):
            resplit.append(piece)
            continue

        number_pieces = sp_model.EncodeAsPieces(
            piece[:-1].replace(SPIECE_UNDERLINE, ''))
        # Re-encoding can add a leading underline the original piece did not
        # have; strip it so no spurious word boundary is introduced.
        if piece[0] != SPIECE_UNDERLINE and number_pieces[0][0] == SPIECE_UNDERLINE:
            head = number_pieces[0]
            if len(head) == 1:
                number_pieces = number_pieces[1:]
            else:
                number_pieces = [head[1:]] + number_pieces[1:]
        number_pieces.append(piece[-1])
        resplit.extend(number_pieces)

    return [sp_model.PieceToId(p) for p in resplit]


pprint(encode_pieces(txt))

[460, 5272, 1758, 4905, 9]


In [0]:
class InputExample(object):
    """One untokenized example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of an example.

        Args:
            guid: unique id of the example.
            text_a: str, untokenized text of the first sequence; the only
                text needed for single-sequence tasks.
            text_b: optional str, untokenized text of the second sequence;
                only needed for sequence-pair tasks.
            label: optional str, the label; expected for train and dev
                examples, not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
    When running eval/predict on the TPU, we need to pad the number of examples
    to be a multiple of the batch size, because the TPU requires a fixed batch
    size. The alternative is to drop the last batch, which is bad because it means
    the entire output data won't be generated.
    We use this class instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """


class InputFeatures(object):
    """Feature values produced for one example.

    Attributes are stored exactly as passed in: ``input_ids``,
    ``input_mask``, ``segment_ids``, ``label_id`` and ``is_real_example``
    (False marks a padding example).
    """

    def __init__(self,
                 input_ids,
                 input_mask,
                 segment_ids,
                 label_id,
                 is_real_example=True):
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id
        self.is_real_example = is_real_example


In [0]:
# Data preprocessing

import tensorflow as tf

tf.logging.set_verbosity(tf.logging.INFO)

# Reserved symbols; each symbol's id is its position in this list.
_SPECIAL_SYMBOL_ORDER = [
    "<unk>", "<s>", "</s>", "<cls>", "<sep>",
    "<pad>", "<mask>", "<eod>", "<eop>",
]
special_symbols = {
    symbol: index for index, symbol in enumerate(_SPECIAL_SYMBOL_ORDER)
}
CLS_ID = special_symbols["<cls>"]
SEP_ID = special_symbols["<sep>"]

# Segment ids: first sequence, second sequence, <cls>, <sep>, padding.
SEG_ID_A, SEG_ID_B, SEG_ID_CLS, SEG_ID_SEP, SEG_ID_PAD = range(5)

def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
        
def convert_single_example(ex_index, example, label_list, max_seq_length,
                           tokenize_fn):
    """Turn one `InputExample` into one `InputFeatures`.

    The sequence is laid out as ``tokens_a <sep> [tokens_b <sep>] <cls>`` and
    padded on the LEFT up to ``max_seq_length``.

    Args:
        ex_index: int, position of the example; the first five are logged.
        example: `InputExample` or `PaddingInputExample`.
        label_list: list of labels mapped to their list index, or None to
            pass ``example.label`` through unchanged.
        max_seq_length: int, length of every returned feature list.
        tokenize_fn: callable mapping a text string to a list of token ids.

    Returns:
        `InputFeatures`; for a `PaddingInputExample` it is an all-padding
        row flagged with ``is_real_example=False``.

    Raises:
        KeyError: if ``label_list`` is given and does not contain the label.
    """
    if isinstance(example, PaddingInputExample):
        return InputFeatures(
            input_ids=[0] * max_seq_length,
            input_mask=[1] * max_seq_length,
            segment_ids=[0] * max_seq_length,
            label_id=0,
            is_real_example=False)

    if label_list is not None:
        label_map = {label: i for i, label in enumerate(label_list)}

    tokens_a = tokenize_fn(example.text_a)
    tokens_b = tokenize_fn(example.text_b) if example.text_b else None

    if tokens_b:
        # Pair: reserve room for two <sep> and one <cls>; trimmed in place.
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    elif len(tokens_a) > max_seq_length - 2:
        # Single sequence: reserve room for one <sep> and one <cls>.
        tokens_a = tokens_a[:max_seq_length - 2]

    input_ids = list(tokens_a) + [SEP_ID]
    segment_ids = [SEG_ID_A] * len(input_ids)
    if tokens_b:
        input_ids += list(tokens_b) + [SEP_ID]
        segment_ids += [SEG_ID_B] * (len(tokens_b) + 1)
    input_ids.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)

    # Mask convention: 0 marks a real token, 1 marks padding.
    input_mask = [0] * len(input_ids)

    pad_len = max_seq_length - len(input_ids)
    if pad_len > 0:
        input_ids = [0] * pad_len + input_ids
        input_mask = [1] * pad_len + input_mask
        segment_ids = [SEG_ID_PAD] * pad_len + segment_ids

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = example.label if label_list is None else label_map[example.label]

    if ex_index < 5:
        def _joined(values):
            return " ".join([str(x) for x in values])

        tf.logging.info("*** Example ***")
        tf.logging.info("guid: %s" % (example.guid))
        tf.logging.info("input_ids: %s" % _joined(input_ids))
        tf.logging.info("input_mask: %s" % _joined(input_mask))
        tf.logging.info("segment_ids: %s" % _joined(segment_ids))
        tf.logging.info("label: {} (id = {})".format(example.label, label_id))

    return InputFeatures(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids,
        label_id=label_id)

In [0]:
import collections

def example_to_file(examples,
                    label_list,
                    max_seq_length,
                    tokenize_fn,
                    output_file,
                    num_passes=1):
    """Serialise examples into a TFRecord file of `tf.train.Example` protos.

    Args:
        examples: sequence of `InputExample` / `PaddingInputExample`. The
            caller's sequence is never modified.
        label_list: list of labels, or None; with None the label is written
            as a float feature instead of an int feature.
        max_seq_length: int, forwarded to `convert_single_example`.
        tokenize_fn: callable, forwarded to `convert_single_example`.
        output_file: str, path of the TFRecord file to create.
        num_passes: int, when greater than 1 the whole example sequence is
            written that many times in a row.
    """
    print("Create new tfrecord {}.".format(output_file))

    # Feature builders are defined once instead of once per example.
    def create_int_feature(values):
        return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

    def create_float_feature(values):
        return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

    if num_passes > 1:
        # Build a new list: `examples *= n` would grow the caller's list
        # in place.
        examples = list(examples) * num_passes

    # The context manager closes the writer even if a conversion raises.
    with tf.python_io.TFRecordWriter(output_file) as writer:
        for ex_index, example in enumerate(examples):
            feature = convert_single_example(ex_index, example, label_list,
                                             max_seq_length, tokenize_fn)

            features = collections.OrderedDict()
            features["input_ids"] = create_int_feature(feature.input_ids)
            features["input_mask"] = create_float_feature(feature.input_mask)
            features["segment_ids"] = create_int_feature(feature.segment_ids)
            if label_list is not None:
                features["label_ids"] = create_int_feature([feature.label_id])
            else:
                features["label_ids"] = create_float_feature(
                    [float(feature.label_id)])
            features["is_real_example"] = create_int_feature(
                [int(feature.is_real_example)])

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())

In [0]:
def create_examples(df, col_text='text', col_label='label'):
    """Build one single-sequence `InputExample` per DataFrame row.

    Args:
        df: pandas.DataFrame holding the examples.
        col_text: str, column used as ``text_a``.
        col_label: str, column used as ``label``.

    Returns:
        list of `InputExample`, in row order; every guid is 'unused_id' and
        ``text_b`` is always None.
    """
    examples = []
    for _, row in df.iterrows():
        examples.append(
            InputExample(guid='unused_id',
                         text_a=row[col_text],
                         text_b=None,
                         label=row[col_label]))
    return examples

In [20]:
# Output TFRecord files, written to the current working directory.
PATH_TRAIN = 'train.tf_record'
PATH_DEV = 'dev.tf_record'
PATH_TEST = 'test.tf_record'
# Every example is truncated / left-padded to this many token ids.
MAX_SEQ_LEN = 32

# Output path -> (source frame, number of passes over that frame).
# The training examples are written three times, the others once.
dst_to_df = {
    PATH_TRAIN: (df_train, 3),
    PATH_DEV: (df_validation, 1),
    PATH_TEST: (df_test, 1)
}

for dst_path, (df, n_ps) in dst_to_df.items():
    examples = create_examples(df)
    if os.path.isfile(dst_path):
        # Always regenerate: remove any existing file rather than skipping it.
        os.remove(dst_path)
    example_to_file(examples=examples,
                    label_list=LABELS,
                    max_seq_length=MAX_SEQ_LEN,
                    tokenize_fn=encode_pieces,
                    output_file=dst_path,
                    num_passes=n_ps)

!ls -la

Create new tfrecord train.tf_record.
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: unused_id
INFO:tensorflow:input_ids: 0 276 28113 23847 276 28113 23847 276 28113 23847 276 28113 23847 35 345 5815 1138 751 10868 21 17 150 3494 3143 12761 99 94 10519 369 23 4 3
INFO:tensorflow:input_mask: 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
INFO:tensorflow:label: 2 (id = 1)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: unused_id
INFO:tensorflow:input_ids: 17 22134 95 14334 56 31 17 150 3494 3143 12761 17 7591 35 26 98 25 102 178 1593 17 0 17 7967 1315 267 57 343 14625 29 4 3
INFO:tensorflow:input_mask: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2
INFO:tensorflow:label: 2 (id = 1)
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: unused_id
INFO:tensorflow

Copying tf_record files to the Google Cloud Storage bucket

In [0]:
# Show the working directory, then upload every *.tf_record file found in it.
print(os.getcwd())
!gsutil cp *.tf_record gs://xlnet_hcai/

In [0]:
# Input function
SHUFFLE_BUFFER = 1024


def file_based_input_fn_builder(input_file, seq_length, is_training,
                                drop_remainder):
    """Build an `input_fn` closure for TPUEstimator over a TFRecord file.

    Args:
        input_file: str, path of the TFRecord file to read.
        seq_length: int, length of the per-example feature vectors.
        is_training: bool, when True the records are shuffled with a buffer
            of ``SHUFFLE_BUFFER`` and repeated indefinitely.
        drop_remainder: bool, drop the final batch if it is incomplete.

    Returns:
        A function ``input_fn(params, input_context=None)`` returning a
        batched `tf.data.Dataset`; ``params["batch_size"]`` gives the batch
        size.
    """
    name_to_features = {
        "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "input_mask": tf.FixedLenFeature([seq_length], tf.float32),
        "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
        "label_ids": tf.FixedLenFeature([], tf.int64),
        "is_real_example": tf.FixedLenFeature([], tf.int64),
    }

    tf.logging.info("Input tfrecord file {}".format(input_file))

    def _decode_record(record):
        """Parse one serialized example, casting int64 features to int32."""
        parsed = tf.parse_single_example(record, name_to_features)
        # tf.Example stores integers as int64 but the TPU only supports int32.
        return {
            name: tf.cast(tensor, tf.int32) if tensor.dtype == tf.int64 else tensor
            for name, tensor in parsed.items()
        }

    def input_fn(params, input_context=None):
        """Create the dataset for one input pipeline."""
        batch_size = params["batch_size"]
        dataset = tf.data.TFRecordDataset(input_file)

        # Give each input pipeline its own shard of the records.
        if input_context is not None:
            tf.logging.info("Input pipeline id %d out of %d",
                            input_context.input_pipeline_id,
                            input_context.num_replicas_in_sync)
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        # Shuffle and repeat only when training; evaluation keeps file order.
        if is_training:
            dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER).repeat()

        return dataset.apply(
            tf.contrib.data.map_and_batch(
                _decode_record,
                batch_size=batch_size,
                drop_remainder=drop_remainder))

    return input_fn


In [23]:
# Training input function: reads the uploaded training records from the
# bucket, shuffled and repeated, dropping any incomplete final batch.
train_input_fn = file_based_input_fn_builder(
    input_file='gs://xlnet_hcai/train.tf_record',
    seq_length=MAX_SEQ_LEN,
    is_training=True,
    drop_remainder=True)

INFO:tensorflow:Input tfrecord file gs://xlnet_hcai/train.tf_record


## Model

In [0]:
import shutil

# Bucket location for this task's model artifacts (presumably the
# estimator's model_dir -- confirm against the cell that uses it).
MODEL_DIR = 'gs://xlnet_hcai/med_intake'

In [0]:
# Imported here because no earlier cell in view imports `json`, although the
# class below needs it.
import json


class XLNetConfig(object):
    """XLNetConfig contains hyperparameters that are specific to a model checkpoint;
    i.e., these hyperparameters should be the same between
    pretraining and finetuning.

    The following hyperparameters are defined:
        n_layer: int, the number of layers.
        d_model: int, the hidden size.
        n_head: int, the number of attention heads.
        d_head: int, the dimension size of each attention head.
        d_inner: int, the hidden size in feed-forward layers.
        ff_activation: str, "relu" or "gelu".
        untie_r: bool, whether to untie the biases in attention.
        n_token: int, the vocab size.
    """

    def __init__(self, json_path):
        """Construct an XLNetConfig from the JSON file at ``json_path``."""

        self.keys = ["n_layer", "d_model", "n_head", "d_head", "d_inner",
                     "ff_activation", "untie_r", "n_token"]

        self.init_from_json(json_path)

    def init_from_json(self, json_path):
        """Set one attribute per entry of ``self.keys`` from the JSON file.

        Raises:
            KeyError: if the file lacks one of the expected keys.
        """
        with tf.gfile.Open(json_path) as f:
            json_data = json.load(f)
        for key in self.keys:
            setattr(self, key, json_data[key])

    def to_json(self, json_path):
        """Save XLNetConfig to a json file.

        The parent directory is created when missing, and the file is always
        written. Previously it was only written when the directory had to be
        created, so saving into an existing directory did nothing.
        """
        json_data = {key: getattr(self, key) for key in self.keys}

        json_dir = os.path.dirname(json_path)
        # A bare file name has an empty dirname: nothing to create then.
        if json_dir and not tf.gfile.Exists(json_dir):
            tf.gfile.MakeDirs(json_dir)
        with tf.gfile.Open(json_path, "w") as f:
            json.dump(json_data, f, indent=4, sort_keys=True)


In [0]:
class RunConfig(object):
    """Hyperparameters that may differ between pretraining and finetuning.

    They can change from run to run, which is why they are kept apart from
    XLNetConfig.
    """

    def __init__(self, is_training, use_tpu, use_bfloat16, dropout, dropatt,
                 init="normal", init_range=0.1, init_std=0.02, mem_len=None,
                 reuse_len=None, bi_data=False, clamp_len=-1, same_length=False):
        """Store the run-level settings as attributes of the same name.

        Args:
          is_training: bool, whether in training mode.
          use_tpu: bool, whether TPUs are used.
          use_bfloat16: bool, use bfloat16 instead of float32.
          dropout: float, dropout rate.
          dropatt: float, dropout rate on attention probabilities.
          init: str, initialization scheme, "normal" or "uniform".
          init_range: float, bound of the uniform distribution
            [-init_range, init_range]; only effective when init="uniform".
          init_std: float, stddev of the zero-mean normal distribution;
            only effective when init="normal".
          mem_len: int, the number of tokens to cache.
          reuse_len: int, the number of tokens in the current batch to be
            cached and reused in the future.
          bi_data: bool, whether to use a bidirectional input pipeline;
            usually True during pretraining and False during finetuning.
          clamp_len: int, clamp all relative distances larger than
            clamp_len; -1 means no clamping.
          same_length: bool, whether to use the same attention length for
            each token.
        """
        # Mode and hardware.
        self.is_training, self.use_tpu = is_training, use_tpu
        self.use_bfloat16 = use_bfloat16
        # Regularisation.
        self.dropout, self.dropatt = dropout, dropatt
        # Parameter initialisation.
        self.init, self.init_range, self.init_std = init, init_range, init_std
        # Memory and attention behaviour.
        self.mem_len, self.reuse_len = mem_len, reuse_len
        self.bi_data, self.clamp_len = bi_data, clamp_len
        self.same_length = same_length

def create_run_config(is_training, is_finetune):
    """Build the `RunConfig` used by this notebook.

    Args:
        is_training: bool, forwarded to `RunConfig`.
        is_finetune: bool, when False the memory / bidirectional settings
            (mem_len=384, reuse_len=256, bi_data=True) are added on top of
            the common ones.

    Returns:
        `RunConfig` with use_tpu=True, use_bfloat16=False and dropout and
        dropatt both 0.1.
    """
    settings = {
        'is_training': is_training,
        'use_tpu': True,
        'use_bfloat16': False,
        'dropout': 0.1,
        'dropatt': 0.1,
        'init': 'normal',
        'init_range': 0.1,
        'init_std': 0.02,
        'clamp_len': -1,
    }
    if is_finetune:
        return RunConfig(**settings)

    settings.update(
        mem_len=384,
        reuse_len=256,
        bi_data=True,
        clamp_len=-1,
        same_length=False)
    return RunConfig(**settings)

In [27]:
# Show the working directory and its contents (the generated *.tf_record
# files should be listed here).
print(os.getcwd())
!ls -la

/content/xlnet
total 1430124
drwxr-xr-x 6 root root       4096 Dec 16 19:30 .
drwxr-xr-x 1 root root       4096 Dec 16 19:28 ..
-rw-r--r-- 1 root root       4494 Dec 16 19:24 classifier_utils.py
-rw-r--r-- 1 root root      29868 Dec 16 19:24 data_utils.py
-rw-r--r-- 1 root root     381652 Dec 16 19:30 dev.tf_record
-rw-r--r-- 1 root root      13542 Dec 16 19:24 function_builder.py
drwxr-xr-x 8 root root       4096 Dec 16 19:24 .git
-rw-r--r-- 1 root root       1203 Dec 16 19:24 .gitignore
-rw-r--r-- 1 root root       2358 Dec 16 19:24 gpu_utils.py
-rw-r--r-- 1 root root          0 Dec 16 19:24 __init__.py
-rw-r--r-- 1 root root      11343 Dec 16 19:24 LICENSE
drwxr-xr-x 2 root root       4096 Dec 16 19:24 misc
-rw-r--r-- 1 root root      28460 Dec 16 19:24 modeling.py
-rw-r--r-- 1 root root      14078 Dec 16 19:24 model_utils.py
drwxr-xr-x 2 root root       4096 Dec 16 19:24 notebooks
-rw-r--r-- 1 root root       4546 Dec 16 19:24 prepro_utils.py
-rw-r--r-- 1 root root      18415 Dec 1

In [0]:
import modeling
from xlnet import XLNetConfig, XLNetModel

# Model hyperparameter file, read from the working directory.
PATH_CONFIG = 'xlnet_config.json'
# Used to name the variable scope of the classification head.
TASK_NAME = 'med_intake'


def get_classification_loss(features, n_class, is_training):
    """Build the XLNet classifier and its loss for a batch of features.

    Args:
        features: dict of batched tensors with keys "input_ids",
            "segment_ids", "input_mask" and "label_ids".
        n_class: int, number of target classes.
        is_training: bool, forwarded to `create_run_config`.

    Returns:
        (total_loss, per_example_loss, logits), where total_loss is the mean
        of per_example_loss.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    # Swap the first two axes before handing the tensors to XLNetModel.
    input_ids_t = tf.transpose(features["input_ids"], [1, 0])
    segment_ids_t = tf.transpose(features["segment_ids"], [1, 0])
    input_mask_t = tf.transpose(features["input_mask"], [1, 0])
    labels = tf.reshape(features["label_ids"], [batch_size])

    model = XLNetModel(
        xlnet_config=XLNetConfig(json_path=PATH_CONFIG),
        run_config=create_run_config(is_training, True),
        input_ids=input_ids_t,
        seg_ids=segment_ids_t,
        input_mask=input_mask_t)

    pooled = model.get_pooled_out('last', True)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        per_example_loss, logits = modeling.classification_loss(
            hidden=pooled,
            labels=labels,
            n_class=n_class,
            initializer=model.get_initializer(),
            scope="classification_{}".format(TASK_NAME),
            return_logits=True)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits



In [0]:
def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
    """Match graph variables to checkpoint variables by name.

    Args:
        tvars: iterable of variables from the current graph.
        init_checkpoint: str, checkpoint path given to
            `tf.train.list_variables`.

    Returns:
        (assignment_map, initialized_variable_names): an OrderedDict from
        checkpoint variable name to the graph variable of that name (only
        names present in both), and a dict whose keys are those names with
        and without the ":0" suffix.
    """
    name_to_variable = collections.OrderedDict()
    for var in tvars:
        # Drop the ":<n>" output suffix so names match the checkpoint's.
        suffix_match = re.match("^(.*):\\d+$", var.name)
        key = var.name if suffix_match is None else suffix_match.group(1)
        name_to_variable[key] = var

    assignment_map = collections.OrderedDict()
    initialized_variable_names = {}
    for ckpt_entry in tf.train.list_variables(init_checkpoint):
        name = ckpt_entry[0]
        if name in name_to_variable:
            assignment_map[name] = name_to_variable[name]
            initialized_variable_names[name] = 1
            initialized_variable_names[name + ":0"] = 1

    return (assignment_map, initialized_variable_names)


def init_from_checkpoint(global_vars=False):
    """Prepare initialisation of graph variables from the pretrained checkpoint.

    Args:
        global_vars: bool, match against all global variables when True,
            otherwise only against the trainable ones.

    Returns:
        A zero-argument scaffold function that calls
        `tf.train.init_from_checkpoint` and returns a `tf.train.Scaffold`.
    """
    init_checkpoint = 'gs://xlnet_hcai/xlnet_model.ckpt'
    if global_vars:
        tvars = tf.global_variables()
    else:
        tvars = tf.trainable_variables()

    tf.logging.info(
        "Initialize from the ckpt {}".format(init_checkpoint))

    assignment_map, initialized_variable_names = (
        get_assignment_map_from_checkpoint(tvars, init_checkpoint))

    def tpu_scaffold():
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        return tf.train.Scaffold()

    # Log every variable, flagging those restored from the checkpoint.
    tf.logging.info("**** Global Variables ****")
    for var in tvars:
        from_ckpt = var.name in initialized_variable_names
        init_string = ", *INIT_FROM_CKPT*" if from_ckpt else ""
        tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                        init_string)

    return tpu_scaffold



In [0]:
# Optimisation settings.
WARMUP_STEPS = 500      # steps of linear learning-rate warm-up
LEARNING_RATE = 2e-5    # peak learning rate
DECAY_METHOD = 'poly'   # 'poly' or 'cos'
TRAIN_STEPS = 4000      # step at which the decay schedule ends
MIN_LR_RATIO = 0.0      # final learning rate as a fraction of the peak
ADAM_EPSILON = 1e-8
CLIP = 1.0              # global-norm gradient clipping threshold


def get_train_op(total_loss, grads_and_vars=None):
    """Create the training op: Adam with warm-up, decay and gradient clipping.

    Args:
        total_loss: scalar loss tensor; only used to compute gradients when
            ``grads_and_vars`` is None.
        grads_and_vars: optional iterable of (gradient, variable) pairs.

    Returns:
        (train_op, learning_rate, gnorm), where gnorm is the global gradient
        norm before clipping.

    Raises:
        ValueError: if DECAY_METHOD is neither "poly" nor "cos".
    """
    global_step = tf.train.get_or_create_global_step()

    # Linear ramp from 0 to LEARNING_RATE during the warm-up steps.
    if WARMUP_STEPS > 0:
        warmup_lr = (tf.cast(global_step, tf.float32)
                     / tf.cast(WARMUP_STEPS, tf.float32)
                     * LEARNING_RATE)
    else:
        warmup_lr = 0.0

    if DECAY_METHOD not in ("poly", "cos"):
        raise ValueError(DECAY_METHOD)

    # Decay schedule, counted from the end of the warm-up.
    schedule_args = dict(
        global_step=global_step - WARMUP_STEPS,
        decay_steps=TRAIN_STEPS - WARMUP_STEPS)
    if DECAY_METHOD == "poly":
        decay_lr = tf.train.polynomial_decay(
            LEARNING_RATE,
            end_learning_rate=LEARNING_RATE * MIN_LR_RATIO,
            **schedule_args)
    else:
        decay_lr = tf.train.cosine_decay(
            LEARNING_RATE,
            alpha=MIN_LR_RATIO,
            **schedule_args)

    learning_rate = tf.where(global_step < WARMUP_STEPS,
                             warmup_lr, decay_lr)

    # Adam wrapped for cross-shard gradient aggregation on the TPU.
    optimizer = tf.contrib.tpu.CrossShardOptimizer(
        tf.train.AdamOptimizer(
            learning_rate=learning_rate,
            epsilon=ADAM_EPSILON))

    if grads_and_vars is None:
        grads_and_vars = optimizer.compute_gradients(total_loss)
    gradients, variables = zip(*grads_and_vars)
    clipped, gnorm = tf.clip_by_global_norm(gradients, CLIP)

    train_op = optimizer.apply_gradients(
        zip(clipped, variables), global_step=global_step)

    return train_op, learning_rate, gnorm

In [0]:
def get_model_fn(n_class):
    """Build a TPUEstimator ``model_fn`` for an ``n_class``-way classifier.

    Args:
        n_class: number of target classes, forwarded to
            ``get_classification_loss``.

    Returns:
        A ``model_fn(features, labels, mode, params)`` callable that returns a
        ``tf.contrib.tpu.TPUEstimatorSpec`` for TRAIN, EVAL or PREDICT mode.
    """
    def model_fn(features, labels, mode, params):
        """Build the graph and the TPUEstimatorSpec for the given ``mode``.

        Args:
            features: dict of input tensors. ``"label_ids"`` and
                ``"is_real_example"`` are read here in EVAL/PREDICT mode; the
                whole dict is passed to ``get_classification_loss``.
            labels: not used by this function (labels come from ``features``).
            mode: a ``tf.estimator.ModeKeys`` value.
            params: not used by this function.

        Returns:
            The ``TPUEstimatorSpec`` for ``mode``.
        """
        #### Training or Evaluation
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        total_loss, per_example_loss, logits = get_classification_loss(
            features, n_class, is_training)

        #### Check model parameters
        num_params = sum(np.prod(v.shape) for v in tf.trainable_variables())
        tf.logging.info('#params: {}'.format(num_params))

        #### load pretrained models
        scaffold_fn = init_from_checkpoint()

        #### Evaluation mode
        if mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(per_example_loss, label_ids, logits, is_real_example):
                """Accuracy and mean loss, both weighted by ``is_real_example``."""
                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
                accuracy = tf.metrics.accuracy(
                    labels=label_ids,
                    predictions=predictions,
                    weights=is_real_example)
                loss = tf.metrics.mean(
                    values=per_example_loss, weights=is_real_example)
                return {
                    'eval_accuracy': accuracy,
                    'eval_loss': loss}

            is_real_example = tf.cast(
                features["is_real_example"], dtype=tf.float32)

            #### Constructing evaluation TPUEstimatorSpec.
            label_ids = tf.reshape(features['label_ids'], [-1])
            metric_args = [per_example_loss, label_ids, logits, is_real_example]

            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=(metric_fn, metric_args),
                scaffold_fn=scaffold_fn)

        #### Prediction mode
        if mode == tf.estimator.ModeKeys.PREDICT:
            label_ids = tf.reshape(features["label_ids"], [-1])

            predictions = {
                "logits": logits,
                "labels": label_ids,
                "is_real": features["is_real_example"]
            }

            return tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)

        #### Configuring the optimizer
        # Only the train op is needed: no host_call is attached, so the
        # learning-rate and gradient-norm tensors are not consumed here.
        train_op, _, _ = get_train_op(total_loss)

        #### Constructing training TPUEstimatorSpec.
        return tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, loss=total_loss, train_op=train_op, host_call=None,
            scaffold_fn=scaffold_fn)

    return model_fn

## TPU Config

In [32]:
# The TPU worker's address is read from the COLAB_TPU_ADDR environment
# variable. When it is set, open a session against the worker and list the
# devices it reports; otherwise only print an error.
if 'COLAB_TPU_ADDR' in os.environ:
    tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
    print('TPU address is', tpu_address)

    # The session is needed only for the device query, so it is closed
    # as soon as the `with` block exits.
    with tf.Session(tpu_address) as session:
        devices = session.list_devices()

    print('TPU devices:')
    pprint(devices)
else:
    print('ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!')

TPU address is grpc://10.126.33.210:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 14342951192820388017),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 16711432350381237889),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 12456503432055999800),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 8209435398226862732),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 5104131553521866037),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 11410043235275841143),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 15228935842972482799),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 797708036941162615),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 36362352051

In [0]:
# def add_op(x, y):
#     return x + y
  
# x = tf.placeholder(tf.float32, [10,])
# y = tf.placeholder(tf.float32, [10,])
# tpu_ops = tf.contrib.tpu.rewrite(add_op, [x, y])
  
# session = tf.Session(tpu_address)
# try:
#     print('Initializing...')
#     session.run(tf.contrib.tpu.initialize_system())
#     print('Running ops')
#     print(session.run(tpu_ops, {x: np.arange(10), y: np.arange(10)}))
# finally:
#     # For now, TPU sessions must be shutdown separately from
#     # closing the session.
#     session.run(tf.contrib.tpu.shutdown_system())
#     session.close()

In [33]:
# Resolve the TPU master endpoint from the address found during TPU discovery.
tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver(tpu_address)
master = tpu_cluster.get_master()

# Built for reference; currently not passed to RunConfig below.
session_config = tf.ConfigProto(allow_soft_placement=True)

# No distribution strategy is used.
strategy = None

# 8 shards, with 100 training iterations per TPU loop.
tpu_config = tf.contrib.tpu.TPUConfig(
    num_shards=8,
    iterations_per_loop=100)

# Checkpoints go to MODEL_DIR every 1000 steps.
run_config = tf.contrib.tpu.RunConfig(
    master=master,
    model_dir=MODEL_DIR,
    tpu_config=tpu_config,
    save_checkpoints_steps=1000,
    train_distribute=strategy)

# One output class per entry in LABELS.
model_fn = get_model_fn(len(LABELS))
print(master)

grpc://10.126.33.210:8470


In [34]:
# All three modes use the same batch size.
TRAIN_BATCH_SIZE = PREDICT_BATCH_SIZE = EVAL_BATCH_SIZE = 32

# TPU-backed estimator combining the model function and the run configuration.
estimator = tf.contrib.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=True,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE)

INFO:tensorflow:Using config: {'_model_dir': 'gs://xlnet_hcai/med_intake', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f660b116438>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.126.33.210:8470', '_evaluation_master': 'grpc://10.126.33.210:8470', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterati

In [35]:
# Fine-tune on the training input pipeline. `max_steps` is a cap on the
# global step, not a number of additional steps to run.
estimator.train(
    input_fn=train_input_fn,
    max_steps=TRAIN_STEPS)

INFO:tensorflow:Querying Tensorflow master (grpc://10.126.33.210:8470) for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 14342951192820388017)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 12456503432055999800)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 8209435398226862732)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 5104131553521866037)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 11410043235275841143)
INFO:tensorflow:*** Available Device: _Dev

<tensorflow_estimator.python.estimator.tpu.tpu_estimator.TPUEstimator at 0x7f660b116240>

In [36]:
# Inference-time input pipeline over the test TFRecord file; the final
# partial batch is kept (drop_remainder=False).
pred_input_fn = file_based_input_fn_builder(
    input_file='gs://xlnet_hcai/test.tf_record',
    seq_length=MAX_SEQ_LEN,
    is_training=False,
    drop_remainder=False)

# Ask for one result per example rather than per batch.
pred = estimator.predict(input_fn=pred_input_fn, yield_single_examples=True)


INFO:tensorflow:Input tfrecord file gs://xlnet_hcai/test.tf_record


In [37]:
# Collect every per-example prediction from `pred` into a list.
pred_y = [*pred]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:memory input None
INFO:tensorflow:Use float type <dtype: 'float32'>
INFO:tensorflow:#params: 361320451
INFO:tensorflow:Initialize from the ckpt gs://xlnet_hcai/xlnet_model.ckpt
INFO:tensorflow:**** Global Variables ****
INFO:tensorflow:  name = model/transformer/r_w_bias:0, shape = (24, 16, 64), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/r_r_bias:0, shape = (24, 16, 64), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/word_embedding/lookup_table:0, shape = (32000, 1024), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/r_s_bias:0, shape = (24, 16, 64), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/seg_embed:0, shape = (24, 2, 16, 64), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/layer_0/rel_attn/q/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
INFO:tensorflow:  name = model/transformer/layer_0/rel_attn/k/kernel:0, shape = (1024, 16, 64), *INIT_FROM_CKPT*
INFO:tenso

In [38]:
from sklearn.metrics import accuracy_score, confusion_matrix,\
    precision_recall_fscore_support

# Turn each prediction's arg-max logit index into its label, computed once
# and reused for both the `pred` column and the accuracy score.
assert len(pred_y) == len(df_test), (
    'got {} predictions for {} test rows'.format(len(pred_y), len(df_test)))
pred_labels = [LABELS[np.argmax(prd['logits'])] for prd in pred_y]
df_test = df_test.assign(pred=pred_labels)

# Per-class precision / recall / F-score / support, in LABELS order.
# Kept in `scores` for the summary table printed in a later cell.
scores = precision_recall_fscore_support(
    df_test['label'],
    df_test['pred'],
    labels=LABELS)

# Confusion matrix with rows and columns in LABELS order.
pprint(confusion_matrix(
    df_test['label'],
    df_test['pred'],
    labels=LABELS))

# Overall accuracy.
print(accuracy_score(df_test['label'], pred_labels))

array([[411,  82,  12],
       [ 80, 498,  83],
       [ 28, 126, 958]])
0.8195785776997366


In [39]:
from tabulate import tabulate

# One row per metric, one column per label. `names` gives the row titles,
# paired in order with the arrays in `scores`.
names = ['precision', 'recall', 'f1', 'counts']
header = ['Scores', *LABELS]
table = [[metric_name, *per_label] for metric_name, per_label in zip(names, scores)]
print(tabulate(table, headers=header))

Scores              1           2            3
---------  ----------  ----------  -----------
precision    0.791908    0.705382     0.909782
recall       0.813861    0.753404     0.861511
f1           0.802734    0.728603     0.884988
counts     505         661         1112


In [0]:
# df_test.to_csv('test_pred.tsv', sep='\t', index=False)

In [0]:
# !gsutil cp test_pred.tsv gs://xlnet_hcai/

Copying file://test_pred.tsv [Content-Type=text/tab-separated-values]...
/ [1 files][297.7 KiB/297.7 KiB]                                                
Operation completed over 1 objects/297.7 KiB.                                    
