In [None]:
%%bash
pip install tensorflow-transform
pip install apache-beam[gcp]

In [None]:
import os
if not os.path.exists('trainer'):
    os.mkdir('trainer')

In [21]:
%%writefile trainer/model.py
#!/usr/bin/env python

from __future__ import print_function, division, absolute_import # python 2 compatibility
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import pandas as pd
import tensorflow as tf
import tensorflow.contrib.learn as tflearn
import tensorflow.contrib.metrics as metrics
from tensorflow_transform.saved import input_fn_maker, saved_transform_io
from tensorflow_transform.tf_metadata import metadata_io
from tensorflow_transform.beam.tft_beam_io import beam_metadata_io
import tensorflow_model_analysis as tfma
import tensorflow_hub as hub
import apache_beam as beam
import shutil
import os
import config
import variables
from sepcnn_model import cnn_model_fn
print(tf.__version__)
tf.logging.set_verbosity(tf.logging.INFO)

Overwriting trainer/model.py


# Cloud Setup
This section is required only if running on cloud (ML Engine)

In [None]:
os.environ['PROJECT'] = config.PROJECT
os.environ['BUCKET'] = config.BUCKET
os.environ['REGION'] = config.REGION
os.environ['TFVERSION'] = config.TFVERSION

In [None]:
%bash
gcloud config set project $PROJECT
gcloud config set compute/region $REGION

# Set up Model as a Package

## Pre-requisites
Data is assumed to have been prepared in the `TFRecords` format with GZIP compression. This gets us the best performance and scalability compared to csv files. The conversion of `csv` to `TFRecords` should be done in the previous notebook, `02-tf_transform.ipynb`

## Package Setup
We need to set up our model as a package for training and serving.

- `model.py` provides the code for data inputs and the model itself
- `setup.py` provides metadata about the package
- `task.py` sets up the package to be used from the command line, with arguments that specify hyperparameters to the model as well as GCP resources 

In [22]:
%%writefile trainer/model.py --append


def build_estimator(model_dir, model_type, embedding_type, learning_rate,
                    hidden_units, dropout,
                    l1_regularization_strength, l2_regularization_strength,
                    blocks, filters, kernel_size, pool_size):

    if embedding_type == 'nnlm':
        module_url = 'https://tfhub.dev/google/nnlm-en-dim128/1'
        embedding_size = 128
    elif embedding_type == 'universal-sentence-encoder':
        module_url = 'https://tfhub.dev/google/universal-sentence-encoder/2'
        embedding_size = 512
    elif embedding_type == 'elmo':
        module_url = 'https://tfhub.dev/google/elmo/2'
        embedding_size = 1024
    elif embedding_type == 'wikiwords250':
        module_url = 'https://tfhub.dev/google/Wiki-words-250/1'
        embedding_size = 250
    else:
        raise InputError('Embedding type must be one of "nnlm", "universal-sentence-encoder", "elmo"')
    
    if model_type in ('linear', 'dnn-linear-combined'):
        bow_indices = tf.feature_column.categorical_column_with_identity('bow_indices', num_buckets=config.MAX_TOKENS+1)
        weighted_bow = tf.feature_column.weighted_categorical_column(bow_indices, 'bow_weight')
    if model_type in ('dnn', 'dnn-linear-comnbined'):
        embedding = hub.text_embedding_column(config.TOKENIZE_COL, module_url, trainable=False)
    if model_type in ('cnn', 'rnn'):
        pass
    
    if model_type == 'linear':
        feature_columns = [weighted_bow]
        
        estimator = tf.estimator.LinearClassifier(
            feature_columns=feature_columns,
            n_classes=variables.N_CLASSES,
            label_vocabulary=variables.LABEL_VOCABULARY,
            model_dir=model_dir,
            optimizer=tf.train.FtrlOptimizer(
                learning_rate=learning_rate,
                l1_regularization_strength=l1_regularization_strength,
                l2_regularization_strength=l2_regularization_strength
            )
        )
    elif model_type == 'dnn':
        feature_columns = [embedding]
        
        estimator = tf.estimator.DNNClassifier(
            feature_columns=feature_columns,
            hidden_units=hidden_units,
            n_classes=variables.N_CLASSES,
            label_vocabulary=variables.LABEL_VOCABULARY,
            model_dir=model_dir,
            optimizer=tf.train.AdamOptimizer(
                learning_rate=learning_rate,
            ),
            dropout=dropout
        )
    elif model_type == 'dnn-linear-combined':
        dnn_features = [embedding]
        linear_features = [weighted_bow]
        
        estimator = tf.estimator.DNNLinearCombinedClassifier(
            linear_feature_columns=linear_features,
            linear_optimizer=tf.train.FtrlOptimizer(
                learning_rate=learning_rate,
                l1_regularization_strength=l1_regularization_strength,
                l2_regularization_strength=l2_regularization_strength
            ),
            dnn_feature_columns=dnn_features,
            dnn_optimizer=tf.train.AdamOptimizer(
                learning_rate=learning_rate,
            ),
            dnn_dropout=dropout,
            dnn_hidden_units=hidden_units,
            n_classes=variables.N_CLASSES,
            label_vocabulary=variables.LABEL_VOCABULARY,
            model_dir=model_dir,
            batch_norm=True
        )
    elif model_type == 'sepcnn':
        params = {
            'learning_rate': learning_rate,
            'blocks': blocks,
            'filters': filters,
            'kernel_size': kernel_size,
            'dropout_rate': dropout,
            'pool_size': pool_size,
            'num_classes': variables.N_CLASSES,
            'module_url': module_url,
            'is_embedding_trainable': False,
            'embedding_size': embedding_size
        }
        
        estimator = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir=model_dir, params=params)
    elif model_type == 'rnn':
        embed = hub.Module(module_url, trainable=False)
        def seq_embed(batch_of_text):
            words = tf.string_split(tf.squeeze(batch_of_text))
            batch_size = words.dense_shape[0]
            dense_words = tf.sparse_to_dense(
                sparse_indices=words.indices,
                sparse_values=words.values,
                default_value='',
                output_shape=(batch_size, config.MAX_SEQ_LEN)
            )
            embeddings = tf.map_fn(lambda token: embed(token), dense_words, dtype=tf.float32)
            
            return embeddings
            
        text_input = tf.keras.layers.Input(shape=(config.MAX_SEQ_LEN, embedding_size), name=config.TOKENIZE_COL, dtype=tf.float32)
        processed = tf.keras.layers.Lambda(seq_embed)(text_input)
        processed = tf.keras.layers.Lambda(seq_embed)(processed)
        for unit in hidden_units:
            processed = tf.keras.layers.LSTM(unit)(processed)
        processed = tf.keras.layers.Dense(128, activation='relu')(processed)
        processed = tf.keras.layers.Dropout(dropout)(processed)
        output = tf.keras.layers.Dense(variables.N_CLASSES, activation='sigmoid', name='probabilities')(processed)
        
        model = tf.keras.Model(inputs=text_input, outputs=output)
        
        model.compile(
            loss='categorical_crossentropy',
            optimizer='adam',
            metrics=['accuracy']
        )
        
        estimator = tf.keras.estimator.model_to_estimator(model)
        
    else:
        raise InputErorr('Model type must be one of "linear" or "dnn"')
        
    if len(config.PASSTHROUGH_COLS) > 0:
        estimator = tf.contrib.estimator.forward_features(estimator, config.PASSTHROUGH_COLS)
        
    def get_model_fn_with_removed_outputs(estimator):
        def _model_fn(features, labels, mode):
            config = estimator.config
            model_fn_ops = estimator._model_fn(features=features, labels=labels, mode=mode, config=config)
            model_fn_ops.predictions['probability'] = tf.reduce_max(model_fn_ops.predictions['probabilities'], axis=-1)
            for key in ('logits', 'logistic', 'probabilities', 'class_ids'):
                try:
                    model_fn_ops.predictions.pop(key)
                except KeyError:
                    pass
            return model_fn_ops
        return _model_fn
    
    if model_type != 'sepcnn':
        estimator = tf.estimator.Estimator(model_fn=get_model_fn_with_removed_outputs(estimator))
    
    return estimator
        
# Serving input function
def make_serving_input_fn_for_base64_json(args):
    raw_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'rawdata_metadata'))
    transform_savedmodel_dir = (
        os.path.join(args['metadata_path'], 'transform_fn'))
    return input_fn_maker.build_parsing_transforming_serving_input_receiver_fn(
        raw_metadata,
        transform_savedmodel_dir,
        exclude_raw_keys=[config.LABEL_COL]
    )

def make_serving_input_fn(args):
    transform_savedmodel_dir = (
        os.path.join(args['metadata_path'], 'transform_fn'))
    
    def _input_fn():
        feature_placeholders = {
            column_name: tf.placeholder(tf.string, [None]) for column_name in config.STRING_COLS
        }
        feature_placeholders.update({
            column_name: tf.placeholder(tf.float32, [None]) for column_name in config.NUMERIC_COLS
        })
        feature_placeholders.pop(config.LABEL_COL)
        
        _, features = saved_transform_io.partially_apply_saved_transform(
            transform_savedmodel_dir,
            feature_placeholders
        )
        
        # so that outputs are consistently in lists
        if len(config.PASSTHROUGH_COLS) > 0:
            for col in config.PASSTHROUGH_COLS:
                features[col] = tf.expand_dims(tf.identity(feature_placeholders[col]), axis=1)
        
        return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
    
    return _input_fn


def make_eval_input_fn(args):
    transform_savedmodel_dir = (
        os.path.join(args['metadata_path'], 'transform_fn'))
    
    def _input_fn():
        metadata = beam_metadata_io.metadata_io.read_metadata('data/tft/metadata/rawdata_metadata/')
        raw_feature_spec = metadata.schema.as_feature_spec()

        serialized_tf_example = tf.placeholder(dtype=tf.string, shape=[None], name='input_example_tensor')

        features = tf.parse_example(serialized_tf_example, raw_feature_spec)
        
        _, transformed_features = saved_transform_io.partially_apply_saved_transform(
            transform_savedmodel_dir,
            features
        )
        
        receiver_tensors = {'examples': serialized_tf_example}
        
        return tfma.export.EvalInputReceiver(
            features=transformed_features,
            receiver_tensors=receiver_tensors,
            labels=transformed_features[config.LABEL_COL]
        )
    
    return _input_fn

# training, eval and test input function
def read_dataset(args, mode):
    batch_size = args['train_batch_size']
    if mode == tf.estimator.ModeKeys.TRAIN:
        input_paths = args['train_data_paths']
    elif mode == tf.estimator.ModeKeys.EVAL:
        input_paths = args['eval_data_paths']
    else:
        input_paths = args['test_data_paths']
    
    transformed_metadata = metadata_io.read_metadata(
        os.path.join(args['metadata_path'], 'transformed_metadata'))
    
    return input_fn_maker.build_training_input_fn(
        metadata=transformed_metadata,
        file_pattern = (input_paths[0] if len(input_paths) == 1 else input_paths),
        training_batch_size=batch_size,
        label_keys=[config.LABEL_COL + '_ one_hot' if args['model_type'] in ('rnn', 'cnn') else config.LABEL_COL],
        reader=gzip_reader_fn,
        randomize_input=(mode == tf.estimator.ModeKeys.TRAIN),
        num_epochs=(None if mode == tf.estimator.ModeKeys.TRAIN else 1)
    )


# create tf.estimator train and evaluate function
def train_and_evaluate(args):
    # figure out train steps based on no. of epochs, no. of rows in dataset and batch size
    tfrecord_options = tf.python_io.TFRecordOptions(compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    nrows = sum(
        sum(1 for _ in tf.python_io.tf_record_iterator(f, options=tfrecord_options)) 
        for f in tf.gfile.Glob(args['train_data_paths'])
    )
    num_epochs = args['num_epochs']
    batch_size = args['train_batch_size']
    if batch_size > nrows:
        batch_size = nrows
    max_steps = num_epochs * nrows / batch_size
    
    # modify according to build_estimator function
    estimator = build_estimator(
        model_dir=args['model_dir'],
        model_type=args['model_type'],
        embedding_type=args['embedding_type'],
        learning_rate=args['learning_rate'],
        hidden_units=args['hidden_units'].split(' '),
        dropout=args['dropout'],
        l1_regularization_strength=args['l1_regularization_strength'],
        l2_regularization_strength=args['l2_regularization_strength'],
        blocks=args['blocks'],
        filters=args['filters'],
        kernel_size=args['kernel_size'],
        pool_size=args['pool_size']
    )
    
    train_spec = tf.estimator.TrainSpec(
        input_fn=read_dataset(args, tf.estimator.ModeKeys.TRAIN),
        max_steps=max_steps
    )
    
    exporter = tf.estimator.LatestExporter('exporter', make_serving_input_fn(args))
    
    eval_spec = tf.estimator.EvalSpec(
        input_fn=read_dataset(args, tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter
    )
    
    
    if args['debug'] == 'True':
        
        train_spec = tf.estimator.TrainSpec(
            input_fn=read_dataset(args, tf.estimator.ModeKeys.TRAIN),
            max_steps=5
        )
        
        eval_spec = tf.estimator.EvalSpec(
            input_fn=read_dataset(args, tf.estimator.ModeKeys.EVAL),
            steps=1,
            exporters=exporter
        )
        
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        
        for result in estimator.predict(input_fn=read_dataset(args, mode=tf.estimator.ModeKeys.EVAL)):
            print(result)
    else:
        
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    
        tfma.export.export_eval_savedmodel(
            estimator=estimator,
            export_dir_base='model_trained/eval/tfma/',
            eval_input_receiver_fn=make_eval_input_fn(args)
        )
    
    # export results
#     if not os.path.exists('data/output'):
#         os.mkdir('data/output')
#     eval_preds = pd.DataFrame(list(estimator.predict(input_fn=read_dataset(args, tf.estimator.ModeKeys.EVAL))))
#     probabilities = list(list(arr) for arr in eval_preds['probabilities']) # pandas is weird with how it stores arrays
#     with tf.Session() as sess:
#         eval_preds['probability'] = sess.run(tf.reduce_max(probabilities, reduction_indices=[1]))
#     eval_preds['pred_' + config.LABEL_COL] = eval_preds['classes'].map(lambda x: x[0]) # predictions come in a list per row
#     eval_preds = eval_preds[['pred_' + config.LABEL_COL, 'probability']]
#     raw_eval_df = pd.concat([
#         pd.read_csv(f, sep=config.DELIM, names=config.RENAMED_COLS)
#         for f in tf.gfile.Glob('data/split/eval*.csv')], 
#         axis=0, ignore_index=True)
#     cols = list(raw_eval_df.columns)
#     cols.remove(config.LABEL_COL)
#     raw_eval_df = raw_eval_df[cols + [config.LABEL_COL]]
#     for col in ['pred_' + config.LABEL_COL, 'probability']:
#         raw_eval_df[col] = eval_preds[col]
#     raw_eval_df['wrong'] = (raw_eval_df['pred_' + config.LABEL_COL] != raw_eval_df[config.LABEL_COL]).astype(int)
#     raw_eval_df.to_excel('data/output/eval_with_preds.xlsx', index=False)
    
    
def gzip_reader_fn():
    return tf.TFRecordReader(options=tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP))

Appending to trainer/model.py


In [23]:
%%writefile trainer/setup.py

from setuptools import find_packages
from setuptools import setup

REQUIRED_PACKAGES = [
]

setup(
    name='{name_of_model}',
    version='0.1',
    author = '{name of author}',
    author_email = '{email@example.com}',
    install_requires=REQUIRED_PACKAGES,
    packages=find_packages(),
    include_package_data=True,
    description='{Some description}',
    requires=[]
)

Overwriting trainer/setup.py


In [24]:
%%writefile trainer/task.py
import traceback
import argparse
import json
import os

import model

import tensorflow as tf

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Input Arguments
    parser.add_argument(
        '--train_data_paths',
        help = 'GCS or local path to training data',
        required = True
    )
    parser.add_argument(
        '--train_batch_size',
        help = 'Batch size for training steps',
        type = int,
        default = 512
    )
    parser.add_argument(
        '--eval_batch_size',
        help = 'Batch size for evaluation steps',
        type = int,
        default = 512
    )
    parser.add_argument(
        '--num_epochs',
        help = 'Epochs to run the training job for',
        type = int,
        default = 50
    )
    parser.add_argument(
        '--eval_steps',
        help = 'Number of steps to run evalution for at each checkpoint',
        default = 10,
        type = int
    )
    parser.add_argument(
        '--eval_data_paths',
        help = 'GCS or local path to evaluation data',
        required = True
    )
    # TensorFlow Transform args
    parser.add_argument(
        '--metadata_path',
        help = 'GCS or local path to transformed metadata if using TFT',
        default = '../../data/tft/metadata'
    )
    # Training arguments
    parser.add_argument(
        '--model_dir',
        help = 'GCS location to write checkpoints and export models',
        required = True
    )
    parser.add_argument(
        '--job-dir',
        help = 'this model ignores this field, but it is required by gcloud',
        default = 'junk'
    )
    # Eval arguments
    parser.add_argument(
        '--eval_delay_secs',
        help = 'How long to wait before running first evaluation',
        default = 10,
        type = int
    )
    parser.add_argument(
        '--min_eval_frequency',
        help = 'Minimum number of training steps between evaluations',
        default = 1,
        type = int
    )
    # Model Specific arguments
    parser.add_argument(
        '--model_type',
        help='Type of ML model, either "linear" or "dnn"',
        default='linear',
        type=str
    )
    parser.add_argument(
        '--embedding_type',
        help='Embedding to use, one of "nnlm", "universal-sentence-encoder", "elmo"',
        default='nnlm',
        type=str
    )
    parser.add_argument(
        '--learning_rate',
        help='Learning rate',
        default=0.01,
        type=float
    )
    parser.add_argument(
        '--hidden_units',
        help='Hidden units of the DNN or (units of multi-layer) RNN model, separated by space e.g. "128 64"',
        default='128 64',
        type=str
    )
    parser.add_argument(
        '--dropout',
        help='Dropout rate (between 0 and 1)',
        default=0.0,
        type=float
    )
    parser.add_argument(
        '--l1_regularization_strength',
        help='L1 regularisation strength; controls how sparse the linear model will be',
        default=0.01,
        type=float
    )
    parser.add_argument(
        '--l2_regularization_strength',
        help='L2 regularisation strength; controls the magnitude of the weights in the linear model',
        default=0.01,
        type=float
    )
    parser.add_argument(
        '--blocks',
        help='No. of blocks in the sepCNN model. Good numbers to try are 1, 2, 4',
        default=2,
        type=int
    )
    parser.add_argument(
        '--filters',
        help='No. of conv filters within each layer in the sepCNN model. Good numbers to try are 8, 16, 32, 64, 128.',
        default=32,
        type=int
    )
    parser.add_argument(
        '--kernel_size',
        help='Kernel size of each conv filter in the sepCNN model. Good numbers to try are 3 and 5',
        default=3,
        type=int
    )
    parser.add_argument(
        '--pool_size',
        help='Pool size of each pooling layer in the sepCNN model',
        default=3,
        type=int
    )
    parser.add_argument(
        '--debug',
        help='use this while testing out if the model works',
        default='False',
        type=str
    )

    args = parser.parse_args()
    arguments = args.__dict__

    # Unused args provided by service
    arguments.pop('job_dir', None)
    arguments.pop('job-dir', None)

    output_dir = arguments['model_dir']

    # Append trial_id to path if we are doing hptuning
    # This code can be removed if you are not using hyperparameter tuning
    output_dir = os.path.join(
        output_dir,
        json.loads(
            os.environ.get('TF_CONFIG', '{}')
        ).get('task', {}).get('trial', '')
    )

    # Run the training job
    try:
        model.train_and_evaluate(arguments)
    except:
        traceback.print_exc()

Overwriting trainer/task.py


In [25]:
%%writefile trainer/__init__.py
#

Overwriting trainer/__init__.py


# Train Model

In [32]:
%%bash
export PYTHONPATH=${PYTHONPATH}:$PWD
rm -rf model_trained
python -m trainer.task \
    --train_data_paths='./data/tft/train*' \
    --eval_data_paths='./data/tft/eval*' \
    --model_dir='./model_trained' \
    --num_epochs=50 \
    --train_batch_size=256 \
    --eval_batch_size=256 \
    --metadata_path='./data/tft/metadata' \
    \
    --model_type='sepcnn' \
    --embedding_type='wikiwords250' \
    --learning_rate=0.01 \
    --hidden_units='128 64' \
    --dropout=0.4 \
    --l1_regularization_strength=0.01 \
    --l2_regularization_strength=0.01 \
    --blocks=2 \
    --filters=32 \
    --kernel_size=3 \
    --pool_size=3 \
    --debug='False'

Process is terminated.


# Serve Model
Better to run this in an actual terminal rather than here, so you can continue running other stuff.

1. Replace 'model_trained' with whatever OUTPUT_DIR you have specified
1. Replace 'exporter' with whatever you specified in `tf.estimator.LatestExporter`

In [None]:
%%bash
tensorflow_model_server \
    --rest_api_port=9000 \
    --model_base_path=${PWD}/model_trained/export/exporter/

# Predictions
The REST API can be called using the following signature: `http:{URI}:{PORT}/v1/models/{MODEL_NAME}[/versions/{VERSION}]:{VERB}`

where

- MODEL_NAME is "default" if no model name is specified when exporting the model
- Specifying the version is optional
- VERB is one of 'classify', 'regress', 'predict'. For serving, you should be using 'predict'
- signature_name should be 'predict' when serving

In [13]:
%%writefile debug.json
{
    "signature_name": "serving_default",
    "instances": [
        {
            "text": "FreeMsg Hey there darling its been 3 weeks now and no word back! Id like some fun you up for it still? Tb ok! XxX std chgs to send, 1.50 to rcv"
        },
        {
            "text": "The quick brown fox jumps over a lazy dog."
        }
    ]
}

Overwriting debug.json


In [31]:
%%bash
curl v -H "Content-Type: application/json" -X POST \
    http://localhost:9000/v1/models/default:predict \
    -d @debug.json

{
    "predictions": [
        {
            "prediction": ["ham"],
            "probabilities": [5.07454e-17]
        },
        {
            "prediction": ["ham"],
            "probabilities": [5.37494e-14]
        }
    ]
}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0curl: (6) Could not resolve host: v
100   558  100   227  100   331   1923   2805 --:--:-- --:--:-- --:--:--  4728
