# Sample Model to Predict Taxi Cab Trip Time

##### This notebook will contain the model script to generate predictions for trip time to be used with the Kubeflow pipelines established. Any model can replace this one to illustrate the functionality of the MLOPS environment.

In [1]:
##load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from io import BytesIO
from google.cloud import storage
import tensorflow as tf

In [2]:
import numpy as np
import json

In [8]:
storage_client = storage.Client()
bk = storage_client.get_bucket('taxi-mlops-demo-kubeflowpipelines-default')
blob = bk.get_blob('mlops_demo/model_metrics.json') 
fileData = json.loads(blob.download_as_string())   
new_mae = fileData['curr_mae']

In [9]:
int(1e6)

1000000

In [11]:
file1 = open("output.txt","a")

In [12]:
file1.write("Hello \n")

7

In [13]:
file1.close()

In [14]:
file1 = open("output.txt","r+")

In [15]:
file1.read()

'Hello \n'

In [7]:
##connect to training dataset
storage_client = storage.Client()
bucket = storage_client.get_bucket('taxi-mlops-demo-kubeflowpipelines-default')
    # Get the file we want
blob = bucket.get_blob('mlops_demo/eval/model_metrics.json')
fileData = json.loads(blob.download_as_string())

#with open("gs://taxi-mlops-demo-kubeflowpipelines-default/mlops_demo/eval/model_metrics.json") as f:
   #   curr_mae = json.load(f)['curr_mae']

AttributeError: 'NoneType' object has no attribute 'download_as_string'

In [45]:
fileData['new_mae']

413708

In [3]:
##connect to training dataset
storage_client = storage.Client()
bucket = storage_client.get_bucket('mlops_model_input')
file = '/data_input/Taxi_Trips_3mo.csv'
blob = bucket.blob(file)
path = "gs://mlops_model_input" + file
df = pd.read_csv(path)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
df.dtypes

Trip ID                        object
Taxi ID                        object
Trip Start Timestamp           object
Trip End Timestamp             object
Trip Seconds                   object
Trip Miles                    float64
Pickup Census Tract           float64
Dropoff Census Tract          float64
Pickup Community Area         float64
Dropoff Community Area        float64
Fare                           object
Tips                          float64
Tolls                         float64
Extras                         object
Trip Total                     object
Payment Type                   object
Company                        object
Pickup Centroid Latitude      float64
Pickup Centroid Longitude     float64
Pickup Centroid Location       object
Dropoff Centroid Latitude     float64
Dropoff Centroid Longitude    float64
Dropoff Centroid  Location     object
dtype: object

In [6]:
df.head()

Unnamed: 0,Trip ID,Taxi ID,Trip Start Timestamp,Trip End Timestamp,Trip Seconds,Trip Miles,Pickup Census Tract,Dropoff Census Tract,Pickup Community Area,Dropoff Community Area,...,Extras,Trip Total,Payment Type,Company,Pickup Centroid Latitude,Pickup Centroid Longitude,Pickup Centroid Location,Dropoff Centroid Latitude,Dropoff Centroid Longitude,Dropoff Centroid Location
0,3e1ac82501797e2e6cd020d2eb6a47774add503f,51e37146cd6467734e50a00ffe2ebbb77ee35e5f207817...,01/01/2021 12:15:00 AM,01/01/2021 12:30:00 AM,1140,6.4,,,28.0,6.0,...,0.0,25.35,Credit Card,Medallion Leasin,41.874005,-87.663518,POINT (-87.6635175498 41.874005383),41.944227,-87.655998,POINT (-87.6559981815 41.9442266014)
1,3adf277e43154ac0cd099c7bb7137f75dcfded95,d24314a66ebc6319a50cc335d6896612b845ca15f702ee...,01/01/2021 12:15:00 AM,01/01/2021 12:15:00 AM,420,1.3,,,8.0,8.0,...,0.0,7.0,Cash,Top Cab Affiliation,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)
2,352e025da59c35567204a5025d0a7ebbb41e7e93,67d5ca2736337fa9f349db0f29887fe290c0cd84263055...,01/01/2021 12:15:00 AM,01/01/2021 12:30:00 AM,660,4.7,,,7.0,3.0,...,0.0,14.25,Cash,Medallion Leasin,41.922686,-87.649489,POINT (-87.6494887289 41.9226862843),41.965812,-87.655879,POINT (-87.6558787862 41.96581197)
3,3276bc172c61ea58ce803d2f34611f6c074c14f1,35057a271731c5b976bda25efe85aa0c1901d0a5fc9ba2...,01/01/2021 12:15:00 AM,01/01/2021 12:15:00 AM,0,0.0,,,8.0,8.0,...,0.0,25.03,Credit Card,Chicago Independents,41.899602,-87.633308,POINT (-87.6333080367 41.899602111),41.899602,-87.633308,POINT (-87.6333080367 41.899602111)
4,46e7d47c11aa83bc3488d7df84f901a92b961461,19e804a8eab9224b352e6a384007418b519864a3e7c2d9...,01/01/2021 12:15:00 AM,01/01/2021 12:15:00 AM,199,1.1,,,14.0,13.0,...,0.0,5.75,Cash,Medallion Leasin,41.968069,-87.721559,POINT (-87.7215590627 41.968069),41.983636,-87.723583,POINT (-87.7235831853 41.9836363072)


In [4]:
df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp'])

In [5]:
df['start_hr'] = df['Trip Start Timestamp'].dt.hour

In [6]:
df['start_dow'] = df['Trip Start Timestamp'].dt.day_name()

#### Our Model will use and try to predict the total number of Trip Seconds and we will set that to be the dependent variable

In [7]:
x_cols = ["Pickup Centroid Latitude","Pickup Centroid Longitude", "Dropoff Centroid Longitude", 
          "Dropoff Centroid Latitude",'start_hr',
    'start_dow','Trip Miles']

In [8]:
df = df.dropna(subset=x_cols+['Trip Seconds'])

In [9]:
df['Trip Seconds'] = df['Trip Seconds'].str.replace(',','').astype(int)

In [10]:
df['Trip Seconds'] = df['Trip Seconds'].astype(int)

In [11]:
df = df.loc[df['Trip Seconds'] < 5000,:]

In [12]:
df = df[df['Trip Seconds'] > 300]

### Subset to main columns & get train_df

In [13]:
train_df = df.loc[:,x_cols+['Trip Seconds']]

In [14]:
categorical_columns = ['start_dow']
numeric_columns = ["Pickup Centroid Latitude","Pickup Centroid Longitude", 
                   "Dropoff Centroid Longitude", 
          "Dropoff Centroid Latitude",'start_hr','Trip Miles','Trip Seconds']

In [15]:
def preprocess(dataframe, cat_columns,numeric_columns):
    """Converts categorical features to numeric. Removes unused columns.

    Args:
      dataframe: Pandas dataframe with raw data

    Returns:
      Dataframe with preprocessed data
    """

    
    
    #1h-encode
    for col in cat_columns:
        dataframe = dataframe.join(pd.get_dummies(dataframe[col]))
        dataframe = dataframe.drop(columns=[col])
    
    # Convert integer valued (numeric) columns to floating point
    dataframe = dataframe.astype('float32')
    
        
    return dataframe

In [16]:
random_seed = 42
tf.random.set_seed(random_seed)
np.random.seed(random_seed)

In [17]:
prepped_train_df = preprocess(train_df,categorical_columns,numeric_columns)

In [19]:
_LABEL_COLUMN = 'Trip Seconds'

In [20]:
# Split train and test data with labels.
# The pop() method will extract (copy) and remove the label column from the dataframe
train_x, train_y = prepped_train_df, prepped_train_df.pop(_LABEL_COLUMN)

# Reshape label columns for use with tf.data.Dataset
train_y = np.asarray(train_y).astype('float32').reshape((-1, 1))

In [21]:
def standardize(dataframe):
  """Scales numerical columns using their means and standard deviation to get
  z-scores: the mean of each numerical column becomes 0, and the standard
  deviation becomes 1. This can help the model converge during training.

  Args:
    dataframe: Pandas dataframe

  Returns:
    Input dataframe with the numerical columns scaled to z-scores
  """
  dtypes = list(zip(dataframe.dtypes.index, map(str, dataframe.dtypes)))
  # Normalize numeric columns.
  for column, dtype in dtypes:
      if dtype == 'float32':
          dataframe[column] -= dataframe[column].mean()
          dataframe[column] /= dataframe[column].std()
  return dataframe


# Join train_x and eval_x to normalize on overall means and standard
# deviations. Then separate them again.
train_x = standardize(train_x)

In [22]:
train_x.head(3)

Unnamed: 0,Pickup Centroid Latitude,Pickup Centroid Longitude,Dropoff Centroid Longitude,Dropoff Centroid Latitude,start_hr,Trip Miles,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
0,-1.160408,2.806463,4.199026,-0.203595,-2.898304,0.130545,2.265493,-0.429313,-0.374679,-0.323708,-0.422757,-0.423209,-0.435342
1,-0.819246,3.166057,4.601823,-0.807461,-2.898304,-0.754448,2.265493,-0.429313,-0.374679,-0.323708,-0.422757,-0.423209,-0.435342
2,-0.51154,2.973456,4.201193,0.088531,-2.898304,-0.164452,2.265493,-0.429313,-0.374679,-0.323708,-0.422757,-0.423209,-0.435342


In [23]:
train_x.head(1).values

array([[-1.1604081 ,  2.8064632 ,  4.199026  , -0.20359468, -2.8983045 ,
         0.13054544,  2.2654932 , -0.4293134 , -0.37467915, -0.32370767,
        -0.42275685, -0.42320913, -0.43534166]], dtype=float32)

In [25]:
model = tf.keras.models.load_model('mlops_demo/')

In [28]:
model.evaluate(train_x,train_y)



488964.4375

In [None]:
keras_model.summary()

In [None]:
mae = round(keras_model.evaluate(train_x,train_y),0)

In [None]:
mae = int(mae)

In [None]:
mae

In [None]:
import json
with open('my_model/model_metrics.json', 'w') as json_file:
    mae_dict = {'curr_mae':mae}
    json.dump(mae_dict, json_file)

In [None]:
CATEGORICAL_COLUMNS+NUMERIC_COLUMNS

In [None]:
xt_ordered = x_train[CATEGORICAL_COLUMNS+NUMERIC_COLUMNS]

In [None]:
train_y[:10]

In [None]:
xt_ordered.head(1).T.to_dict()[0]

In [None]:
import json

with open('prediction_input.json', 'w') as json_file:
  for row in train_x.head(10).values.tolist():
    json.dump(row, json_file)
    json_file.write('\n')

In [None]:
import json

jdict = {}
#with open('prediction_input.json', 'w') as json_file:
inst_ls = []
ct = 1
#inst_ls.append(xt_ordered.head(1).T.to_dict()[0])
#with open('prediction_input.json', 'w') as json_file:
    #for row in xt_ordered.head(1).values.tolist():
    #   json.dump(row, json_file)
    #    json_file.write('\n')
    
         #inst_ls.append(row)

#inst_ls.append(xt_ordered.head(1).T.to_dict()[0])#,"key":ct})    
inst_ls = [-0.1721081 ,  0.06131365, -0.01878417,  0.84623253, -2.8942175 ,
         0.27650514,  2.262631  , -0.4293633 , -0.37099934, -0.31818122,
        -0.42654246, -0.4253335 , -0.43786868]
    #ct+=1
    #json.dump(row, json_file)
    #json_file.write('\n')
    
jdict = {'instances': inst_ls}

with open('prediction_input.json', 'w') as json_file:
  json.dump(jdict, json_file)

In [None]:
jdict

In [None]:
xt_ordered.head(1).T.to_dict()[0]

In [None]:
xt_ordered.head(1).values

In [None]:
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
print(feature_spec)
serving_input_receiver_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

In [None]:
tf.saved_model.load("gs://taxi-mlops-demo-kubeflowpipelines-default/mlops-demo/model/1620678644/saved_model.pb")

In [None]:
xt_ordered.head(1).values.tolist()[0]

In [None]:
jdict

In [None]:
xt_ordered.head(1).T.to_dict()[0]

In [None]:
jdict

In [None]:
## Train the model
linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)
linear_est.train(train_input_fn)
#result = linear_est.evaluate(eval_input_fn)
##remove null values, the error is related to that.

In [None]:
inputFn = tf.estimator.export.build_parsing_serving_input_receiver_fn(tf.feature_column.make_parse_example_spec(feature_columns))

modelPath = linear_est.export_saved_model('md', inputFn)

In [None]:
importedModel = tf.saved_model.load('md/1620831681')

In [None]:
importedModel.__dict__.keys()

In [None]:
importedModel.variables