In [0]:
%%bash
# Uninstall google-cloud-dataflow (if present), then force-reinstall
# tensorflow_transform and Apache Beam with its GCP extras.
pip uninstall -y google-cloud-dataflow
# Quote the extras spec: unquoted square brackets are a shell glob pattern.
pip install --upgrade --force-reinstall tensorflow_transform 'apache-beam[gcp]'

In [0]:
import tensorflow as tf
import tensorflow_transform as tft
import tensorflow_hub as hub
from tensorflow_transform.beam import impl as beam_impl
import shutil
import os
import pandas as pd
import datetime
import apache_beam as beam

# Notebook display settings: show at most 10 rows per frame and render floats
# with one decimal place, then report the TensorFlow version in use.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)
print(tf.__version__)

In [0]:
# Cloud settings used by the on_cloud=True code paths.
# NOTE(review): '{PROJECT}' and '{BUCKET}' look like template placeholders --
# presumably they must be replaced with real values before a cloud run.
PROJECT = '{PROJECT}'
BUCKET = '{BUCKET}'
REGION = 'asia-east1'

# Cloud Setup
This section is only required when running on Google Cloud (the `on_cloud=True` paths); skip it for a local run.

In [0]:
# Expose the cloud settings as environment variables so that shell cells
# can read them as $BUCKET, $PROJECT and $REGION.
os.environ.update({
    'BUCKET': BUCKET,
    'PROJECT': PROJECT,
    'REGION': REGION,
})

In [0]:
%%bash
# Point gcloud at the project and compute region taken from the PROJECT and
# REGION environment variables. The expansions are quoted so that a value
# containing whitespace or glob characters is passed as a single argument.
gcloud config set project "$PROJECT"
gcloud config set compute/region "$REGION"

# Split Dataset
This example uses an 80-10-10 split for train, eval, and test; change the fractions below if necessary.

In [0]:
# Download the spam CSV and keep only its 'v1' and 'v2' columns.
# NOTE(review): presumably v1 is the ham/spam label and v2 the message text --
# confirm against the data.
SPAM_CSV_URL = 'https://dl.dropboxusercontent.com/s/y7lm7aton223abm/spam.csv'
df = pd.read_csv(SPAM_CSV_URL).loc[:, ['v1', 'v2']]
df

In [0]:
# Seeded, reproducible 80/10/10 split: sample 80% of the rows for training,
# then cut the remaining rows in half for test and eval.
RANDOM_SEED = 42
train = df.sample(frac=0.8, random_state=RANDOM_SEED)
holdout = df.drop(train.index)  # the 20% of rows not used for training
test = holdout.sample(frac=0.5, random_state=RANDOM_SEED)
eval = holdout.drop(test.index)

In [0]:
def export_datasets(on_cloud=False):
  """Write the train / eval / test DataFrames to CSV files.

  Reads the module-level ``train``, ``eval`` and ``test`` frames and writes
  them as ``train.csv``, ``eval.csv`` and ``test.csv``. The target directory is
  ``gs://<BUCKET>/spam-classification/data/split`` when ``on_cloud`` is true,
  otherwise the local ``data/split`` directory.

  Args:
    on_cloud: If true, write under the GCS bucket named by ``BUCKET``;
      otherwise write to the local filesystem.
  """
  if on_cloud:
    data_dir = 'gs://{bucket}/spam-classification/data/split'.format(bucket=BUCKET)
  else:
    data_dir = 'data/split'
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the local output directory exists before writing into it.
    if not os.path.isdir(data_dir):
      os.makedirs(data_dir)

  for name, frame in (('train', train), ('eval', eval), ('test', test)):
    frame.to_csv(os.path.join(data_dir, name + '.csv'))

# Create Dataset using tf.transform

In [0]:
%%writefile requirements.txt
# Packages listed for the 'requirements_file' pipeline option used by preprocess().
tensorflow
tensorflow-transform
tensorflow-hub
apache-beam

In [0]:
def get_dataset_path(phase, on_cloud=False):
  """Return the glob pattern matching the CSV files of one dataset split.

  Args:
    phase: 'train' or 'eval'; any other value selects the test split.
    on_cloud: If true, the pattern points into
      ``gs://<BUCKET>/spam-classification/data/split``; otherwise into the
      local ``data/split`` directory.

  Returns:
    A file pattern such as ``data/split/train*.csv``.
  """
  if on_cloud:
    data_dir = 'gs://{bucket}/spam-classification/data/split'.format(bucket=BUCKET)
  else:
    data_dir = 'data/split'

  if phase == 'train':
    pattern = 'train*.csv'
  elif phase == 'eval':
    pattern = 'eval*.csv'
  else:
    # Anything that is neither 'train' nor 'eval' falls through to test.
    pattern = 'test*.csv'

  return os.path.join(data_dir, pattern)
    

def is_valid(inputs):
  """Return True if a raw record has a known label and non-empty text.

  Args:
    inputs: a mapping with a 'text' entry (the message) and a 'spam' entry
      (the label, expected to be 'ham' or 'spam').

  Returns:
    True when the label is 'ham' or 'spam' and the text is non-empty. False
    otherwise, including when a key is missing, ``inputs`` is not
    subscriptable by key, or the text has no length.
  """
  try:
    text = inputs['text']
    spam = inputs['spam']
    # The label lives in 'spam'; 'text' only has to be non-empty.
    return (spam in ('ham', 'spam')) and len(text) > 0
  except (KeyError, TypeError):
    # Missing field, non-mapping record, or text without a length (e.g. None).
    return False


def preprocess_tft(inputs):
  """tf.Transform preprocessing function: embed the text, pass the label through.

  Args:
    inputs: dict of raw feature tensors with a 'text' entry and a 'spam' entry.

  Returns:
    A dict with two entries:
      'text_embedding': output of the TF Hub NNLM module applied to
        ``inputs['text']``.
      'spam': the label tensor, unchanged.
  """
  module_url = "https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1"
  embed = hub.Module(module_url)
  return {
      # Feed the text column itself to the module, not the whole feature dict.
      # NOTE(review): assumes inputs['text'] is a 1-D batch of strings and the
      # module returns one embedding row per string -- confirm against the
      # module documentation.
      'text_embedding': embed(inputs['text']),
      'spam': tf.identity(inputs['spam']),
  }


def _parse_csv_line(line):
  """Parse one line of a split CSV into zero or one raw feature dict.

  The last two comma-separated fields are used as the label and the message
  text, so a line parses the same with or without a leading index column.
  NOTE(review): presumably the CSV columns are (index, v1=label, v2=text), as
  written by DataFrame.to_csv on the split frames -- confirm against the data.

  Args:
    line: one text line as read by ``beam.io.ReadFromText``.

  Yields:
    ``{'spam': <label>, 'text': <message>}`` for a parseable line with at
    least two fields; nothing otherwise.
  """
  import csv

  try:
    fields = next(csv.reader([line]))
  except (csv.Error, StopIteration):
    return
  if len(fields) < 2:
    return
  yield {'spam': fields[-2], 'text': fields[-1]}


def preprocess(on_cloud=False):
  """Run the tf.Transform preprocessing pipeline over the train and eval CSVs.

  The pipeline reads the split CSVs located by ``get_dataset_path``, parses
  each line into a ``{'spam', 'text'}`` record, drops records rejected by
  ``is_valid``, analyzes and transforms the training data with
  ``preprocess_tft``, and applies the resulting transform to the eval data.

  Written under the output directory:
    metadata/rawdata_metadata: the raw data metadata.
    train*, eval*: the transformed examples as TFRecord files with a '.gz' suffix.
    metadata: the transform function.

  Any existing output directory is removed first.

  Args:
    on_cloud: If true, run on Dataflow and read/write under
      ``gs://<BUCKET>/spam-classification``; otherwise run with the
      DirectRunner and write to ``./preproc_tft``.
  """
  import datetime
  import os
  from apache_beam.io import tfrecordio
  from tensorflow_transform.coders import example_proto_coder
  from tensorflow_transform.tf_metadata import dataset_metadata
  from tensorflow_transform.tf_metadata import dataset_schema
  from tensorflow_transform.beam import tft_beam_io
  from tensorflow_transform.beam.tft_beam_io import transform_fn_io
  
  job_name = 'preprocess-spam-dataset' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')
  
  if on_cloud:
    print('Launching Dataflow job {} ... hang on'.format(job_name))
    OUTPUT_DIR = 'gs://{bucket}/spam-classification/preproc_tft/'.format(bucket=BUCKET)
    import subprocess
    # Start from a clean output location; the return code is ignored, so a
    # missing directory is not an error.
    subprocess.call('gsutil rm -r {}'.format(OUTPUT_DIR).split())
  else:
    import shutil
    print('Launching local job ... hang on')
    OUTPUT_DIR = './preproc_tft'
    shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
    
  options = {
    'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
    'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
    'job_name': job_name,
    'project': PROJECT,
    'max_num_workers': 24,
    'teardown_policy': 'TEARDOWN_ALWAYS',
    'no_save_main_session': True,
    'requirements_file': 'requirements.txt'
  }
  opts = beam.pipeline.PipelineOptions(flags=[], **options)
  
  if on_cloud:
    RUNNER = 'DataflowRunner'
  else:
    RUNNER = 'DirectRunner'
    
  # Set up metadata: both raw columns are scalar strings. Extend
  # raw_data_schema here if columns of other types are added.
  raw_data_schema = {
      colname: dataset_schema.ColumnSchema(
          tf.string, 
          [], 
          dataset_schema.FixedColumnRepresentation()
      ) 
      for colname in 'spam,text'.split(',')
  }
  raw_data_metadata = dataset_metadata.DatasetMetadata(dataset_schema.Schema(raw_data_schema))

  def read_split(pipeline, phase):
    """Read one split's CSV files into valid {'spam', 'text'} records."""
    return (
        pipeline
        # ReadFromText is already a PTransform, so it is applied directly
        # rather than wrapped in beam.io.Read. The CSV header row is skipped.
        | '{}_read'.format(phase) >> beam.io.ReadFromText(
            get_dataset_path(phase=phase, on_cloud=on_cloud),
            skip_header_lines=1)
        # Turn each raw text line into the dict shape the schema describes.
        | '{}_parse'.format(phase) >> beam.FlatMap(_parse_csv_line)
        | '{}_filter'.format(phase) >> beam.Filter(is_valid))
  
  # run Beam
  with beam.Pipeline(RUNNER, options=opts) as p:
    with beam_impl.Context(temp_dir=os.path.join(OUTPUT_DIR, 'tmp')):
      # save the raw data metadata
      _ = (
          raw_data_metadata
          | 'write_input_metadata' >> tft_beam_io.WriteMetadata(
              os.path.join(OUTPUT_DIR, 'metadata/rawdata_metadata'),
              pipeline=p))
      
      # analyze and transform training
      raw_train_data = read_split(p, 'train')
      raw_train_dataset = (raw_train_data, raw_data_metadata)
      transformed_train_dataset, transform_fn = (
          raw_train_dataset | beam_impl.AnalyzeAndTransformDataset(preprocessing_fn=preprocess_tft))
      transformed_train_data, transformed_metadata = transformed_train_dataset
      
      # write transformed training data
      _ = (
          transformed_train_data
          | 'write_train_data' >> tfrecordio.WriteToTFRecord(
              os.path.join(OUTPUT_DIR, 'train'),
              file_name_suffix='.gz',
              coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))
      
      # transform eval data with the transform function fitted on train
      raw_eval_data = read_split(p, 'eval')
      raw_eval_dataset = (raw_eval_data, raw_data_metadata)
      transformed_eval_dataset = (
          (raw_eval_dataset, transform_fn) | beam_impl.TransformDataset())
      transformed_eval_data, _ = transformed_eval_dataset
      
      # write eval data
      _ = (
          transformed_eval_data
          | 'write_eval_data' >> tfrecordio.WriteToTFRecord(
              os.path.join(OUTPUT_DIR, 'eval'),
              file_name_suffix='.gz',
              coder=example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)))
      
      # write transform function for serving
      _ = (
          transform_fn
          | 'write_transform_fn' >> transform_fn_io.WriteTransformFn(os.path.join(OUTPUT_DIR, 'metadata'))
      )
      
# Run the preprocessing pipeline locally (DirectRunner, output under ./preproc_tft).
# NOTE(review): nothing visible above calls export_datasets(), so the
# data/split/*.csv inputs presumably have to exist before this runs -- confirm.
preprocess(on_cloud=False)