Import Required Libraries

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import dayofweek, year, month, hour
from pyspark.ml import Pipeline
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Create Spark session

In [2]:
spark = SparkSession.builder \
  .appName('Chicagotaxi') \
  .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.15.1-beta') \
  .getOrCreate()

Create a spark dataframe with subset of data from Bigquery.  Removing the filter condition from the below option will give full dataset from the database

In [3]:
df_master = spark.read.format('bigquery') \
            .option("credentialsFile", '/Users/karthikeyangurusamy/Documents/GCP/key.json') \
            .option('parentproject', 'zeta-treat-276509') \
            .option('project', 'zeta-treat-276509') \
            .option('table', 'bigquery-public-data:chicago_taxi_trips.taxi_trips') \
            .option("filter",
                    "EXTRACT(MONTH from trip_start_timestamp) = 3 and "
                    "EXTRACT(DAYOFWEEK from trip_start_timestamp) = 3 and "
                    "EXTRACT(YEAR from trip_start_timestamp) = 2019") \
            .load()

For Model purpose, lets choose only the below fields
1. trip_start_timestamp
2. pickup_latitude, pickup_longitude
3. dropoff_latitude, dropoff_longitude
4. compare
5. fare - This field will be our label to predict

In [4]:
df = df_master[['trip_start_timestamp','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','company','fare']]
df.schema

StructType(List(StructField(trip_start_timestamp,TimestampType,true),StructField(pickup_latitude,DoubleType,true),StructField(pickup_longitude,DoubleType,true),StructField(dropoff_latitude,DoubleType,true),StructField(dropoff_longitude,DoubleType,true),StructField(company,StringType,true),StructField(fare,DoubleType,true)))

In [5]:
df.summary()

DataFrame[summary: string, pickup_latitude: string, pickup_longitude: string, dropoff_latitude: string, dropoff_longitude: string, company: string, fare: string]

Drop the rows that have blank values

In [6]:
df = df.dropna()

Remove the rows that have fare less than $2.70, which is the minium taxi fare in chicago

In [7]:
df = df.filter(df.fare >= 2.70)

Convert the given timestamp to CST

In [8]:
df = df.withColumn('trip_start_timestamp_dt',F.to_timestamp(F.unix_timestamp('trip_start_timestamp', 'yyy-MM-dd HH:mm:ss Z').cast('timestamp')))

In [9]:
df = df.withColumn('trip_start_timestamp_cst', F.from_utc_timestamp('trip_start_timestamp_dt', 'CST'))

Get Day of the week, trip year, trip momnth, trip hour

In [10]:
df = df.withColumn('Trip_Day_Of_Week', dayofweek(df.trip_start_timestamp))

In [11]:
df = df.withColumn('Trip_Year', year(df.trip_start_timestamp))

In [12]:
df = df.withColumn('Trip_Month', month(df.trip_start_timestamp))

In [13]:
df = df.withColumn('Trip_Hour', hour(df.trip_start_timestamp_cst))

In [14]:
df.show(5)

+--------------------+---------------+----------------+----------------+-----------------+----------------+----+-----------------------+------------------------+----------------+---------+----------+---------+
|trip_start_timestamp|pickup_latitude|pickup_longitude|dropoff_latitude|dropoff_longitude|         company|fare|trip_start_timestamp_dt|trip_start_timestamp_cst|Trip_Day_Of_Week|Trip_Year|Trip_Month|Trip_Hour|
+--------------------+---------------+----------------+----------------+-----------------+----------------+----+-----------------------+------------------------+----------------+---------+----------+---------+
| 2019-03-18 22:15:00|      41.968069|   -87.721559063|    41.983636307|    -87.723583185|       Flash Cab| 7.0|    2019-03-18 22:15:00|     2019-03-18 17:15:00|               2|     2019|         3|       17|
| 2019-03-05 05:30:00|    41.96581197|   -87.655878786|     41.96581197|    -87.655878786|       Flash Cab|4.25|    2019-03-05 05:30:00|     2019-03-04 23:30:00

Lets create a numeric index column corresponding to the 'Company' column

In [15]:
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol='company', outputCol='Comp_Index')

In [16]:
updated_data = stringIndexer.fit(df).transform(df)
updated_data.show(5)

+--------------------+---------------+----------------+----------------+-----------------+---------+----+-----------------------+------------------------+----------------+---------+----------+---------+----------+
|trip_start_timestamp|pickup_latitude|pickup_longitude|dropoff_latitude|dropoff_longitude|  company|fare|trip_start_timestamp_dt|trip_start_timestamp_cst|Trip_Day_Of_Week|Trip_Year|Trip_Month|Trip_Hour|Comp_Index|
+--------------------+---------------+----------------+----------------+-----------------+---------+----+-----------------------+------------------------+----------------+---------+----------+---------+----------+
| 2019-03-05 02:00:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Flash Cab|3.75|    2019-03-05 02:00:00|     2019-03-04 20:00:00|               3|     2019|         3|       20|       1.0|
| 2019-03-04 19:00:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Flash Cab|7.75|    2019-03-04 19:00:00|     2019-03-04

Convert all the numerical data columns to a numpy array

In [17]:
import numpy as np
data_array =  np.array(updated_data.select('pickup_latitude','pickup_longitude','dropoff_latitude',
                                        'dropoff_longitude','Trip_Day_Of_Week','Trip_Year',
                                        'Trip_Month','Trip_Hour','fare').collect())

Apply Minimum Maximum Scalar to scale the numerical features and label

In [18]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
data_array = scaler.fit_transform(data_array)
data_array[0]

array([8.81100295e-01, 5.07141618e-01, 9.09029009e-01, 5.01797002e-01,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 7.39130435e-01,
       4.72320885e-04])

Lets collect the categorical index column to a separate numpy array

In [20]:
cat_data = np.array(updated_data.select('Comp_Index').collect())

In [21]:
data_array.shape

(191210, 9)

In [22]:
cat_data.shape

(191210, 1)

Apply one hot encoding to categorical data

In [23]:
from tensorflow.keras.utils import to_categorical
# one hot encode
encoded = to_categorical(cat_data)
print(encoded)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


Concatenate the scaled numerical data column and one hot encoded column to get to the final numpy array

In [24]:
final_data = np.concatenate((data_array[:,:-1],encoded,data_array[:,-1].reshape(191210,1)),axis=1)

In [26]:
final_data.shape

(191210, 58)

Split the data into train and test sets

In [27]:
np.random.shuffle(final_data)
test, training = final_data[:10000,:], final_data[10000:,:]

In [28]:
print(training.shape)
print(test.shape)

(181210, 58)
(10000, 58)


In [29]:
X_train = training[:,:-1]
y_train = training[:,-1]
X_test = test[:,:-1]
y_test = test[:,-1]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(181210, 57) (181210,) (10000, 57) (10000,)


Create and compile a Keras model to apply

In [30]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(57,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),
])

In [31]:
model.compile(optimizer='adam',
              loss='MeanSquaredError',
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

Run the training data set through the model

In [32]:
hist = model.fit(X_train, y_train,
          batch_size=32, epochs=5,
          validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
