In [None]:
import tensorflow as tf
import datetime
import os
# Show the installed TensorFlow version. The call form of print works on
# both Python 2 and Python 3, unlike the Python 2-only print statement.
print(tf.__version__)

## Set gcloud commands' variables

In [None]:
# Google Cloud settings used throughout this notebook.
PROJECT = 'ksalama-gcp-playground'
BUCKET = 'ksalama-gcs-cloudml'
REGION = 'europe-west1'

# Timestamp taken when this cell runs, formatted as yymmddHHMMSS.
CURRENT_DATE = datetime.datetime.now().strftime('%y%m%d%H%M%S')

# Export the settings as environment variables so that %%bash cells can
# read them as $PROJECT, $BUCKET and $REGION.
os.environ.update({
    'PROJECT': PROJECT,
    'BUCKET': BUCKET,
    'REGION': REGION,
})

## Download the ml-package from GCS

In [None]:
%%bash
# Local directory name of the trainer package (same name as in the bucket).
PACKAGE=nyc-taxifare-trainer

# Remove any earlier local copy, then fetch the package from
# gs://$BUCKET/ml-packages/ into the current directory.
rm -rf ${PACKAGE}
gsutil -m cp -r gs://${BUCKET}/ml-packages/${PACKAGE} .

# List the contents of the downloaded trainer directory.
ls ${PACKAGE}/trainer

## Run ml-package locally using gcloud ml-engine command

In [None]:
# %%bash
# # NOTE(review): every line of this cell is commented out, so running it does nothing.
# # It holds a `gcloud ml-engine local train` invocation for the downloaded package.
# # NOTE(review): the `rm -rf` path below has no `../` prefix while `--job-dir` does,
# # so the two may point at different directories -- confirm before re-enabling.
# # NOTE(review): the flags here (--train-files, --eval-files, --num-epochs) are spelled
# # differently from those in the cloud job cells (--train_data_paths, --num_epochs);
# # verify against trainer.task which spelling it accepts.
# rm -rf trained_models/dnn_combined_regression_model_gcloud

# gcloud ml-engine local train \
#    --module-name=trainer.task \
#    --package-path=nyc-taxifare-trainer/trainer \
#    -- \
#    --train-files=../data/train-data.csv \
#    --eval-files=../data/test-data.csv  \
#    --num-epochs=2 \
#    --job-dir=../trained_models/dnn_combined_regression_model_gcloud

## Run ml-package on Google Cloud ML Engine (big data + GPUs)

In [None]:
%%bash

# Submit the trainer package as a Cloud ML Engine training job on the
# BASIC_GPU scale tier. Reads $BUCKET and $REGION from the environment.

OUTDIR="gs://${BUCKET}/ml-models/taxifare/dnn-combined-regression-big-gpu"
JOBNAME="train_taxifare_model_$(date -u +%y%m%d%H%M%S)"

echo "$OUTDIR" "$REGION" "$JOBNAME"

# Remove the output of any previous run written to the same location.
gsutil -m rm -rf "$OUTDIR"

# Optional settings are kept OUTSIDE the command below on purpose: a '#'
# inside a backslash-continued command comments out the rest of that line,
# including its trailing backslash, which cuts the command short and drops
# every argument that follows.
#
# To pass a config file, add this line before the bare `--`:
#    --config=nyc-taxifare-trainer/hyperparams.yaml \
# To train on the big dataset, use these two data-path arguments instead:
#    --train_data_paths="gs://${BUCKET}/data/nyc-taxifare/big/train*" \
#    --eval_data_paths="gs://${BUCKET}/data/nyc-taxifare/big/valid*"  \

# Everything after the bare `--` is forwarded to trainer.task.
gcloud ml-engine jobs submit training "$JOBNAME" \
   --region="$REGION" \
   --module-name=trainer.task \
   --package-path=nyc-taxifare-trainer/trainer \
   --job-dir="$OUTDIR" \
   --staging-bucket=gs://stagging-ksalama-gcs-cloudml \
   --scale-tier=BASIC_GPU \
   --runtime-version=1.2 \
   -- \
   --train_data_paths="gs://${BUCKET}/data/nyc-taxifare/train-data.csv" \
   --eval_data_paths="gs://${BUCKET}/data/nyc-taxifare/valid-data.csv" \
   --output_dir="$OUTDIR" \
   --num_epochs=1000 --train_batch_size=10000 --nbuckets=16 --hidden_units="64 64 64 8"
   

## Run ml-package on Google Cloud ML Engine (small data + standard cluster)

In [None]:
%%bash

# Submit the trainer package as a Cloud ML Engine training job on the
# STANDARD_1 scale tier. Reads $BUCKET and $REGION from the environment.

MODEL_DIR="gs://${BUCKET}/ml-models/taxifare/dnn-combined-regression-small"
JOB_ID="train_taxifare_model_$(date -u +%y%m%d%H%M%S)"

echo ${MODEL_DIR} ${REGION} ${JOB_ID}

# Remove the output of any previous run written to the same location.
gsutil -m rm -rf ${MODEL_DIR}

# Everything after the bare `--` is forwarded to trainer.task.
gcloud ml-engine jobs submit training ${JOB_ID} \
   --region=${REGION} \
   --module-name=trainer.task \
   --package-path=nyc-taxifare-trainer/trainer \
   --job-dir=${MODEL_DIR} \
   --staging-bucket=gs://stagging-ksalama-gcs-cloudml \
   --scale-tier=STANDARD_1 \
   --runtime-version=1.2 \
   -- \
   --train_data_paths="gs://${BUCKET}/data/nyc-taxifare/train-data.csv" \
   --eval_data_paths="gs://${BUCKET}/data/nyc-taxifare/valid-data.csv" \
   --output_dir=${MODEL_DIR} \
   --num_epochs=10

## Results comparison

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# RMSE per modelling approach. The values are hard-coded here, not computed
# in this notebook.
df = pd.DataFrame({
    'Method': ['Baseline', 'Linear Reg', 'DNN', ' Comb DNN + Feature Eng',
               '+ Hyperparam Tuning', '+ Big Data'],
    'RMSE': [8.89, 11.15, 14.94, 7.9, 5.42, 3.01],
})

# Height of the horizontal reference line; it equals the
# ' Comb DNN + Feature Eng' entry above.
# NOTE(review): confirm this is the intended comparison level.
REFERENCE_RMSE = 7.9

fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(data=df, x='Method', y='RMSE', ax=ax)
ax.set_ylabel('RMSE (dollars)')
ax.set_xlabel('Method')

# axhline spans the full width of the axes whatever the x-limits are, so no
# artificially wide x-range has to be sampled to draw the line.
ax.axhline(REFERENCE_RMSE, color='b');