<a href="https://colab.research.google.com/github/imcinstitute/ML-labs/blob/main/LabVIII_Advanced_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project setup

In [None]:
pip install tensorflow-gpu==2.0.0-rc0


import numpy as np
import pandas as pd
import tensorflow as tf
import time


tf.__version__  
  


# Staging data

In [None]:
%%bigquery flights_df --verbose

-- Flights from 2009 that left late (departure_delay > 0), one row per flight,
-- with the model features and the binary target column `delayed`.
WITH delayed_departures AS (
  SELECT
    departure_delay,
    -- Distance between the departure and arrival coordinates, divided by 1000
    -- and rounded (BigQuery's ST_DISTANCE reports metres, so this is kilometres).
    ROUND(ST_DISTANCE(ST_GEOGPOINT(departure_lon, departure_lat), ST_GEOGPOINT(arrival_lon, arrival_lat)) / 1000) AS distance,
    airline,
    arrival_airport,
    departure_airport,
    -- `date` is parsed from its 'YYYY-MM-DD' text form into a DATE value.
    PARSE_DATE("%Y-%m-%d", date) AS departure_date,
    arrival_delay
  FROM
    `bigquery-samples.airline_ontime_data.flights`
  WHERE date BETWEEN '2009-01-01' AND '2009-12-31'
    AND departure_delay > 0
)

SELECT
  -- Numeric features
  departure_delay,
  distance,

  -- Categorical features
  airline,
  departure_airport,
  arrival_airport,

  -- Calendar features, cast to strings so they can be used as vocabulary entries
  CAST(EXTRACT(DAYOFWEEK FROM departure_date) AS STRING) AS departure_weekday,
  CAST(EXTRACT(MONTH FROM departure_date) AS STRING) AS departure_month,

  -- Target column: 1 when the flight arrived at least 15 minutes late, else 0
  IF(arrival_delay >= 15, 1, 0) AS delayed
FROM delayed_departures
%%bigquery high_traffic_airports --verbose

-- Departure airports with more than 10,000 flights in 2009, sorted by code.
-- The count filter lives in HAVING and the ORDER BY is applied to the final
-- result: an ORDER BY inside a subquery does not guarantee the order of the
-- rows returned by the outer query.
SELECT
  departure_airport AS airport_code,
  COUNT(*) AS flights
FROM
  `bigquery-samples.airline_ontime_data.flights`
WHERE date >= '2009-01-01'
  AND date <= '2009-12-31'
GROUP BY departure_airport
HAVING COUNT(*) > 10000
ORDER BY airport_code
%%bigquery airline_codes --verbose

-- Every airline code that appears in the 2009 flights, in alphabetical order.
SELECT DISTINCT
  airline
FROM
  `bigquery-samples.airline_ontime_data.flights`
WHERE date BETWEEN '2009-01-01' AND '2009-12-31'
ORDER BY airline


# Explore data

In [None]:
# Dimensions of the staged flights table as (rows, columns).
flights_df.shape

# Five randomly drawn rows; no random_state is passed, so the rows differ between runs.
flights_df.sample(n = 5)

# Data type of every column in the frame.
flights_df.dtypes


# Data Processing

In [None]:
# Training/testing split

# Draw a reproducible 80 % random sample of the rows for training; every row
# whose index label was not drawn becomes part of the test set.
train_df = flights_df.sample(frac=0.8,random_state=123)
test_df = flights_df.drop(train_df.index)


# Check label distribution
# Scale to a percentage *before* rounding: rounding first and multiplying by 100
# afterwards re-introduces floating-point noise (e.g. 28.299999999999997).
print(round(flights_df.delayed.mean() * 100, 1), '% delay in total dataset')

#Build a tf.data.Dataset

def dataframe_to_dataset(dataframe, labels = 'delayed', shuffle=True, batch_size=32):
    """Turn a pandas DataFrame into a batched ``tf.data.Dataset``.

    Args:
        dataframe: Source frame; it is copied, so the caller's frame is not modified.
        labels: Name of the column that is split off as the label.
        shuffle: When true, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ``(features, label)`` batches, where ``features`` is
        a dict keyed by the remaining column names.
    """
    features = dataframe.copy()
    target = features.pop(labels)
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), target))
    if shuffle:
        # Removing a column does not change the row count, so this equals
        # the number of rows in the original frame.
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)

batch_size = 256

# Make float64 the default float type for Keras.
tf.keras.backend.set_floatx('float64')

# Only the training data is shuffled; the test data keeps its row order.
train_ds = dataframe_to_dataset(train_df, shuffle=True, batch_size=batch_size)
test_ds = dataframe_to_dataset(test_df, shuffle=False, batch_size=batch_size)

# Bucket boundaries and vocabularies used by the tf.feature_column definitions

departure_delay_bins = [2, 3, 6, 9, 13, 19, 28, 44, 76]
distance_bins = [600, 1200]
airports_voc = high_traffic_airports['airport_code']
airlines_voc = airline_codes['airline']
# Weekday and month values arrive as strings (the query casts them), so the
# vocabularies are the strings '1'..'7' and '1'..'12'.
weekdays_voc = [str(day) for day in range(1, 8)]
months_voc = [str(month) for month in range(1, 13)]


# Build the feature columns that describe the model inputs

# Numeric inputs, bucketized at the boundaries defined above
distance = tf.feature_column.numeric_column("distance")
distance_buckets = tf.feature_column.bucketized_column(
    distance, boundaries=distance_bins)

departure_delay = tf.feature_column.numeric_column("departure_delay")
departure_delay_buckets = tf.feature_column.bucketized_column(
    departure_delay, boundaries=departure_delay_bins)

# Categorical inputs, each wrapped in an indicator column
arrival_airports = tf.feature_column.categorical_column_with_vocabulary_list(
    'arrival_airport', airports_voc)
arrival_airports_dummy = tf.feature_column.indicator_column(arrival_airports)

departure_airports = tf.feature_column.categorical_column_with_vocabulary_list(
    'departure_airport', airports_voc)
departure_airports_dummy = tf.feature_column.indicator_column(departure_airports)

airlines = tf.feature_column.categorical_column_with_vocabulary_list(
    'airline', airlines_voc)
airlines_dummy = tf.feature_column.indicator_column(airlines)

weekdays = tf.feature_column.categorical_column_with_vocabulary_list(
    'departure_weekday', weekdays_voc)
weekdays_dummy = tf.feature_column.indicator_column(weekdays)

months = tf.feature_column.categorical_column_with_vocabulary_list(
    'departure_month', months_voc)
months_dummy = tf.feature_column.indicator_column(months)

# Collect every column in one list: bucketized columns first, then the
# categorical indicators.
feature_columns = [
    distance_buckets,
    departure_delay_buckets,
    arrival_airports_dummy,
    departure_airports_dummy,
    airlines_dummy,
    weekdays_dummy,
    months_dummy,
]


# Defining the non-distributed model

In [None]:
# Feature layer: applies the feature columns to the raw input dict
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# Non-distributed model: the feature layer followed by a single sigmoid unit
# with L2 weight regularisation
model_normal = tf.keras.models.Sequential()
model_normal.add(feature_layer)
model_normal.add(
    tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l2(0.0001),
    )
)

model_normal.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)


# Defining the distributed model

In [None]:
# Create the MirroredStrategy instance
distribute = tf.distribute.MirroredStrategy()

# Build the distributed model
with distribute.scope():
    # Give the distributed model its own feature layer, created inside the
    # strategy scope. Reusing the layer object that also belongs to
    # `model_normal` would share that layer's state between the two models and
    # would leave any variables it owns outside the strategy's control.
    feature_layer_distributed = tf.keras.layers.DenseFeatures(feature_columns)

    model_distributed = tf.keras.models.Sequential([
        feature_layer_distributed,
        tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.0001))
        ])

    model_distributed.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy']
                 )


# Training the model: normal training

In [None]:
# Time the non-distributed run. time.perf_counter() is a monotonic,
# high-resolution clock meant for measuring elapsed time; time.time() follows
# the system clock and can jump if that clock is adjusted during training.
start_time = time.perf_counter()
history = model_normal.fit(train_ds, 
                    epochs = 5,
                    callbacks = [tf.keras.callbacks.TensorBoard("logs/normal_training")])
# Elapsed time is reported in seconds.
print("Normal training took: {}".format(time.perf_counter() - start_time))



# Training the model: distributed training

In [None]:
# Time the distributed run. time.perf_counter() is a monotonic,
# high-resolution clock meant for measuring elapsed time; time.time() follows
# the system clock and can jump if that clock is adjusted during training.
start_time = time.perf_counter()
# NOTE: this rebinds `history`, replacing the object returned by the earlier
# non-distributed fit call.
history = model_distributed.fit(train_ds,
                    epochs = 5,
                    callbacks = [tf.keras.callbacks.TensorBoard("logs/distributed_training")])
# Elapsed time is reported in seconds.
print("Distributed training took: {}".format(time.perf_counter() - start_time))
