# **Checkpoint Design Pattern**

### ***Loading Libraries***

In [2]:
!pip install tensorflow-io

Collecting tensorflow-io
  Downloading tensorflow_io-0.37.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (49.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-io
Successfully installed tensorflow-io-0.37.0


In [3]:
# Operating Systems
import os
import shutil

# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# SciPy
import scipy
from scipy import stats

# Data Visualization
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

# BigQuery
from google.cloud import bigquery
from google.colab import auth

# Scikit-Learn
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer

# Extreme Gradient Boosting
import xgboost as xgb

# TensorFlow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
from tensorflow.keras import Model
import tensorflow_datasets as tfds
from tensorflow_hub import KerasLayer
from tensorflow import feature_column as fc
from tensorflow.python.framework import dtypes
from tensorflow.keras.preprocessing import text
from tensorflow.keras.utils import to_categorical
from tensorflow_io.bigquery import BigQueryClient
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import callbacks, layers, models, utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Embedding, Input, Flatten, Conv2D, MaxPooling2D

In [5]:
def features_and_labels(features):
    """Split one record into model inputs and a binary toll label.

    Args:
        features: mapping of column name to value that contains a
            'tolls_amount' entry. The entry is removed from the mapping
            in place.

    Returns:
        A ``(features, label)`` tuple where ``features`` is the same mapping
        without 'tolls_amount' and ``label`` is an int64 tensor that is 1
        where the toll amount is strictly positive and 0 otherwise.
    """
    tolls = features.pop('tolls_amount')
    has_toll = tf.cast(tolls > 0, dtypes.int64, name='threshold')
    return features, has_toll

def read_dataset(client, row_restriction, batch_size=2048, infinite=True):
    """Build a batched dataset of (features, label) pairs read from BigQuery.

    Rows come from the public table
    ``bigquery-public-data.new_york.tlc_green_trips_2015`` and are read
    through ``client.read_session`` using two requested streams.

    Args:
        client: object exposing ``read_session(...)``; the notebook passes a
            ``BigQueryClient`` instance.
        row_restriction: SQL-style filter string. A longitude sanity filter
            (``pickup_longitude > -80 AND dropoff_longitude < -70``) is
            appended to it with ``AND``.
        batch_size: number of examples per batch; the shuffle buffer holds
            ten batches' worth of examples.
        infinite: when true, the batched dataset is repeated indefinitely.

    Returns:
        The dataset after prefetch -> map(features_and_labels) -> shuffle ->
        batch, and additionally ``repeat()`` when ``infinite`` is true.
    """
    parent_project = 'ai-analytics-solutions'
    table_project, table_dataset, table_name = (
        'bigquery-public-data.new_york.tlc_green_trips_2015'.split('.'))
    column_names = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', 'tolls_amount']
    # Every selected column is read as float64.
    column_types = [dtypes.float64] * len(column_names)

    # Positional order passed to read_session is: parent, project, table, dataset.
    session = client.read_session(
        "projects/" + parent_project,
        table_project, table_name, table_dataset,
        column_names, column_types,
        requested_streams=2,
        row_restriction=row_restriction + ' AND pickup_longitude > -80 AND dropoff_longitude < -70')

    rows = session.parallel_read_rows()
    rows = rows.prefetch(1)
    rows = rows.map(features_and_labels)
    rows = rows.shuffle(batch_size*10)
    rows = rows.batch(batch_size)
    return rows.repeat() if infinite else rows

# tensorflow_io BigQuery client; passed to every read_dataset(...) call in the
# cells that follow.
# NOTE(review): no authentication step is visible in this notebook even though
# google.colab.auth is imported -- presumably credentials must already be
# configured before reads succeed; confirm.
client = BigQueryClient()

In [6]:
# Peek at a few tiny batches (batch size 2) to sanity-check the pipeline.
temp_df = read_dataset(client, "pickup_datetime BETWEEN '2015-01-01' AND '2015-03-31'", 2)

# read_dataset defaults to infinite=True, i.e. the dataset repeats forever, so
# iterating it directly would never terminate. Bound the preview explicitly.
NUM_PREVIEW_BATCHES = 3
for row in temp_df.take(NUM_PREVIEW_BATCHES):
    print(row)

In [7]:
# Examples per batch, shared by the training and evaluation pipelines.
BATCH_SIZE = 2048

# Training split: pickups from 2015-01-01 through 2015-03-31. Uses
# read_dataset's default infinite=True, so the dataset repeats indefinitely.
train_df = read_dataset(
    client,
    row_restriction="pickup_datetime BETWEEN '2015-01-01' AND '2015-03-31'",
    batch_size=BATCH_SIZE,
)

# Evaluation split: pickups from 2015-04-01 through 2015-04-30, single pass.
eval_df = read_dataset(
    client,
    row_restriction="pickup_datetime BETWEEN '2015-04-01' AND '2015-04-30'",
    batch_size=BATCH_SIZE,
    infinite=False,
)

### ***Model Building***

In [8]:
# Metrics reported during training and validation, built from
# (metric class, display name) pairs in a fixed order.
metrics = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in [
        (tf.keras.metrics.BinaryAccuracy, 'accuracy'),
        (tf.keras.metrics.Precision, 'precision'),
        (tf.keras.metrics.Recall, 'recall'),
        (tf.keras.metrics.AUC, 'roc_auc'),
    ]
]

# One scalar float64 Keras input per coordinate column, keyed by column name.
coordinate_columns = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
inputs = {
    colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float64')
    for colname in coordinate_columns
}
# A matching numeric feature column for every input key.
input_fc = list(map(tf.feature_column.numeric_column, inputs))

# Shallow copy so the original `inputs` mapping is left untouched.
transformed = dict(inputs)
features_layer = tf.keras.layers.DenseFeatures(input_fc, name='features')
input_layer = features_layer(transformed)

# Two 16-unit ReLU layers with 25% dropout in between, followed by a single
# sigmoid unit that produces the binary prediction.
d1 = tf.keras.layers.Dense(16, activation='relu', name='d1')(input_layer)
d2 = tf.keras.layers.Dropout(0.25, name='d2')(d1)
d3 = tf.keras.layers.Dense(16, activation='relu', name='d3')(d2)
output = tf.keras.layers.Dense(
    1,
    activation='sigmoid',
    name='d4',
    # Constant() is used with its default value here.
    bias_initializer=tf.keras.initializers.Constant(),
)(d3)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=metrics,
)
# Last expression of the cell: renders the model graph left-to-right.
tf.keras.utils.plot_model(model, rankdir='LR')

In [9]:
# Per-class loss weights: the positive class (label 1, i.e. tolls_amount > 0)
# is weighted far more heavily than the negative class.
class_weight = {0: 0.5, 1: 25.0}

# Start from an empty output directory; a missing directory is not an error.
# (shutil is already imported in the notebook's import cell.)
OUTDIR='trained'
shutil.rmtree(OUTDIR, ignore_errors=True)

# Training budget: STOP_POINT passes over NUM_TRAINING_EXAMPLES examples,
# divided into NUM_CHECKPOINTS "epochs" so one checkpoint is written per slice.
NUM_TRAINING_EXAMPLES = 1000 * 1000 * 5
STOP_POINT = 3.5
TOTAL_TRAINING_EXAMPLES = int(STOP_POINT * NUM_TRAINING_EXAMPLES)
NUM_CHECKPOINTS = 10
steps_per_epoch = (TOTAL_TRAINING_EXAMPLES //
                   (BATCH_SIZE*NUM_CHECKPOINTS))

# Checkpoint callback that saves the full model (not just the weights).
checkpoint_path = '{}/checkpoints/taxi'.format(OUTDIR)
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_weights_only=False,
                                                 verbose=1)

# train_df repeats indefinitely, so steps_per_epoch defines the epoch length.
# The checkpoint callback must be passed to fit(); without it no checkpoint is
# ever written.
history = model.fit(train_df, validation_data=eval_df,
                    epochs=NUM_CHECKPOINTS,
                    steps_per_epoch=steps_per_epoch,
                    class_weight=class_weight,
                    callbacks=[cp_callback])