Import Required Libraries

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import dayofweek, year, month, hour
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import RegressionMetrics

Create Spark session

In [2]:
spark = SparkSession.builder \
  .appName('Chicagotaxi') \
  .config('spark.jars.packages', 'com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:0.15.1-beta,com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc2') \
  .config("spark.jars.repositories", "https://mmlspark.azureedge.net/maven") \
  .getOrCreate()

Import libraries from LBGM

In [3]:
from mmlspark.featurize import AssembleFeatures
from mmlspark.lightgbm import LightGBMRegressor

Create a spark dataframe with subset of data from Bigquery.  Removing the filter condition from the below option will give full dataset from the database

In [4]:
df_master = spark.read.format('bigquery') \
            .option("credentialsFile", 'key.json') \
            .option('parentproject', 'zeta-treat-276509') \
            .option('project', 'zeta-treat-276509') \
            .option('table', 'bigquery-public-data:chicago_taxi_trips.taxi_trips') \
            .option("filter",
                    "EXTRACT(MONTH from trip_start_timestamp) = 3 and "
                    "EXTRACT(DAYOFWEEK from trip_start_timestamp) = 3 and "
                    "EXTRACT(YEAR from trip_start_timestamp) = 2019") \
            .load()

For Model purpose, lets choose only the below fields
1. trip_start_timestamp
2. pickup_latitude, pickup_longitude
3. dropoff_latitude, dropoff_longitude
4. compare
5. fare - This field will be our label to predict

In [5]:
df = df_master[['trip_start_timestamp','pickup_latitude','pickup_longitude','dropoff_latitude','dropoff_longitude','company','fare']]
df.schema

StructType(List(StructField(trip_start_timestamp,TimestampType,true),StructField(pickup_latitude,DoubleType,true),StructField(pickup_longitude,DoubleType,true),StructField(dropoff_latitude,DoubleType,true),StructField(dropoff_longitude,DoubleType,true),StructField(company,StringType,true),StructField(fare,DoubleType,true)))

In [6]:
df.summary()

DataFrame[summary: string, pickup_latitude: string, pickup_longitude: string, dropoff_latitude: string, dropoff_longitude: string, company: string, fare: string]

Drop the rows that have blank values

In [7]:
df = df.dropna()

Remove the rows that have fare less than $2.70, which is the minium taxi fare in chicago

In [8]:
df = df.filter(df.fare >= 2.70)

Convert the given timestamp to CST

In [9]:
df = df.withColumn('trip_start_timestamp_dt',F.to_timestamp(F.unix_timestamp('trip_start_timestamp', 'yyy-MM-dd HH:mm:ss Z').cast('timestamp')))

In [10]:
df = df.withColumn('trip_start_timestamp_cst', F.from_utc_timestamp('trip_start_timestamp_dt', 'CST'))

Get Day of the week, trip year, trip momnth, trip hour

In [11]:
df = df.withColumn('Trip_Day_Of_Week', dayofweek(df.trip_start_timestamp))

In [12]:
df = df.withColumn('Trip_Year', year(df.trip_start_timestamp))

In [13]:
df = df.withColumn('Trip_Month', month(df.trip_start_timestamp))

In [14]:
df = df.withColumn('Trip_Hour', hour(df.trip_start_timestamp_cst))

In [15]:
df.show(5)

+--------------------+---------------+----------------+----------------+-----------------+--------------------+----+-----------------------+------------------------+----------------+---------+----------+---------+
|trip_start_timestamp|pickup_latitude|pickup_longitude|dropoff_latitude|dropoff_longitude|             company|fare|trip_start_timestamp_dt|trip_start_timestamp_cst|Trip_Day_Of_Week|Trip_Year|Trip_Month|Trip_Hour|
+--------------------+---------------+----------------+----------------+-----------------+--------------------+----+-----------------------+------------------------+----------------+---------+----------+---------+
| 2019-03-12 13:30:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Star North Manage...|6.25|    2019-03-12 13:30:00|     2019-03-12 08:30:00|               3|     2019|         3|        8|
| 2019-03-11 20:45:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Star North Manage...|6.25|    2019-03-11 20:45:00|    

Split the data as test and train set

In [16]:
(train, test) = df.randomSplit([.90, 0.10], seed=42)

In [17]:
print((df.count(), len(df.columns)))

(191210, 13)


In [18]:
df.show(5)

+--------------------+---------------+----------------+----------------+-----------------+---------+----+-----------------------+------------------------+----------------+---------+----------+---------+
|trip_start_timestamp|pickup_latitude|pickup_longitude|dropoff_latitude|dropoff_longitude|  company|fare|trip_start_timestamp_dt|trip_start_timestamp_cst|Trip_Day_Of_Week|Trip_Year|Trip_Month|Trip_Hour|
+--------------------+---------------+----------------+----------------+-----------------+---------+----+-----------------------+------------------------+----------------+---------+----------+---------+
| 2019-03-05 02:00:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Flash Cab|3.75|    2019-03-05 02:00:00|     2019-03-04 20:00:00|               3|     2019|         3|       20|
| 2019-03-04 19:00:00|      41.968069|   -87.721559063|       41.968069|    -87.721559063|Flash Cab|7.75|    2019-03-04 19:00:00|     2019-03-04 13:00:00|               2|     2019|       

Create an assembler and lgbm model

In [19]:
assembler = AssembleFeatures(columnsToFeaturize=['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
                                      'Trip_Day_Of_Week', 'Trip_Year', 'Trip_Month', 'Trip_Hour','company'],
                            numberOfFeatures=9)
lgbm = LightGBMRegressor(learningRate=0.001,
                           numIterations=50,
                           featuresCol='features',
                           labelCol='fare')
STAGES = [assembler, lgbm]

Train the model on train data set

In [20]:
train_pip = Pipeline(stages=STAGES)
model = train_pip.fit(train)

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:59424)
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/py4j/java_gateway

Py4JError: An error occurred while calling o109.fit

Predict the fare for test set

In [None]:
results = model.transform(test)

In [None]:
results.show(5)

Calculate Root Mean Square Error of the precited results

In [None]:
valuesandpreds = results.rdd.map(lambda p: (float(p.prediction), p.fare))
metric = RegressionMetrics(valuesandpreds)
print('RMSE for Light GBM is ', metric.rootMeanSquaredError)