# Trip Prediction with Machine Learning in Spark

We are interested in seeing how weather and temperature impact daily trips. 

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType
from pyspark.sql.functions import *
from datetime import datetime

import sys
import operator
import re, string
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+

DATA_PATH='/user/chowkec/capitalbikeshare/data/'
OUTPUT_PATH='/user/chowkec/capitalbikeshare/output/'

spark = SparkSession.builder.appName('machineLearning').getOrCreate()
#assert spark.version >= '2.4' # make sure we have Spark 2.4+
spark.sparkContext.setLogLevel('WARN')

## read in tripdata and weather data

In [2]:
# create tripdate schema types
tripdata_schema = types.StructType([
    types.StructField('duration', types.IntegerType()),
    types.StructField('start_date', types.TimestampType()),
    types.StructField('end_date', types.TimestampType()),
    types.StructField('start_station_number', types.StringType()),
    types.StructField('start_station', types.StringType()),
    types.StructField('end_station_number', types.StringType()),
    types.StructField('end_station', types.StringType()),
    types.StructField('bike_number', types.StringType()),
    types.StructField('member_type', types.StringType()),  
])

In [3]:
# using spark to read in the original csv file
trip_data = spark.read.csv(DATA_PATH+'trip', header=True, schema=tripdata_schema)

In [4]:
# read in weather data 
weather = spark.read.csv(DATA_PATH+'weather', header=True)

In [5]:
weather.printSchema

<bound method DataFrame.printSchema of DataFrame[STATION: string, NAME: string, DATE: string, AWND: string, FMTM: string, PGTM: string, PRCP: string, SNOW: string, SNWD: string, TAVG: string, TMAX: string, TMIN: string, WDF2: string, WDF5: string, WESD: string, WSF2: string, WSF5: string, WT01: string, WT02: string, WT03: string, WT04: string, WT05: string, WT06: string, WT07: string, WT08: string, WT09: string, WT11: string, WT13: string, WT14: string, WT15: string, WT16: string, WT17: string, WT18: string, WT21: string, WT22: string]>

In [7]:
# tripdate includes "duration", "start_date", "end_date", "start_station_number", "start_station", 
# "end_station_number", "end_station", "bike_number", and "member_type"
trip_data.show(5)

+--------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|duration|         start_date|           end_date|start_station_number|       start_station|end_station_number|         end_station|bike_number|member_type|
+--------+-------------------+-------------------+--------------------+--------------------+------------------+--------------------+-----------+-----------+
|    2762|2017-07-01 00:01:09|2017-07-01 00:47:11|               31289|Henry Bacon Dr & ...|             31289|Henry Bacon Dr & ...|     W21474|     Casual|
|    2763|2017-07-01 00:01:24|2017-07-01 00:47:27|               31289|Henry Bacon Dr & ...|             31289|Henry Bacon Dr & ...|     W22042|     Casual|
|     690|2017-07-01 00:01:45|2017-07-01 00:13:16|               31122| 16th & Irving St NW|             31299|Connecticut Ave &...|     W01182|     Member|
|     134|2017-07-01 00:01:46|2017-07-01 00:04:00|        

## Data Transform

In [21]:
weather.createOrReplaceTempView('weather_table')

# select DATE, PRCP (precipitation), SNOW, TMAX (max temperature), TMIN (min temperature)
weather_df = spark.sql("SELECT DATE, PRCP, SNOW AS SNOW_AMOUNT, TMAX, TMIN FROM weather_table")
weather_df.createOrReplaceTempView('weather_table')

# create columns "rain", "snow", "both" with binary number indicators according to the numbers
weather_df = weather_df.withColumn("rain", when(weather_df["PRCP"] != 0.00, 1).otherwise(0))
weather_df = weather_df.withColumn("snow", when(weather_df["SNOW_AMOUNT"] != 0.0, 1).otherwise(0))
weather_df = weather_df.withColumn("both", when((weather_df["SNOW_AMOUNT"] != 0.0) & (weather_df["PRCP"] != 0.00), 1).otherwise(0))

# create column "weather"
weather_df = weather_df.withColumn("weather",
      expr("case when both = 1 then 'both' " +
                       "when rain = '1' and snow = '0' then 'rain' " +
                       "when snow = '1' and rain = '0' then 'snow' " +
                       "else 'sunny' end"))

weather_df.show()

+----------+----+-----------+----+----+----+----+----+-------+
|      DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|rain|snow|both|weather|
+----------+----+-----------+----+----+----+----+----+-------+
|2010-01-01|0.00|        0.0|  44|  29|   0|   0|   0|  sunny|
|2010-01-02|0.00|        0.0|  30|  20|   0|   0|   0|  sunny|
|2010-01-03|0.00|        0.0|  28|  16|   0|   0|   0|  sunny|
|2010-01-04|0.00|        0.0|  34|  23|   0|   0|   0|  sunny|
|2010-01-05|0.00|        0.0|  37|  27|   0|   0|   0|  sunny|
|2010-01-06|0.00|        0.0|  39|  29|   0|   0|   0|  sunny|
|2010-01-07|0.00|        0.0|  39|  29|   0|   0|   0|  sunny|
|2010-01-08|0.03|        1.0|  32|  23|   1|   1|   1|   both|
|2010-01-09|0.00|        0.0|  36|  23|   0|   0|   0|  sunny|
|2010-01-10|0.00|        0.0|  32|  19|   0|   0|   0|  sunny|
|2010-01-11|0.00|        0.0|  36|  20|   0|   0|   0|  sunny|
|2010-01-12|0.00|        0.0|  35|  29|   0|   0|   0|  sunny|
|2010-01-13|0.00|        0.0|  44|  27|   0|   0|   0| 

In [22]:
weather_df.printSchema

<bound method DataFrame.printSchema of DataFrame[DATE: string, PRCP: string, SNOW_AMOUNT: string, TMAX: string, TMIN: string, rain: int, snow: int, both: int, weather: string]>

In [23]:
tripdata.createOrReplaceTempView('tripdata_table')

In [24]:
#####################################################################################
# Number of daily trip
daily_df = spark.sql("""SELECT CAST(start_date AS DATE) """ + \
                       """AS DATE FROM tripdata_table""")

# daily_df = daily_df.withColumn("trips", daily_df.groupBy('date').count())

daily_df = daily_df.groupBy('DATE').count()
# daily_df.createOrReplaceTempView('daily_table')
# weekday_df = spark.sql("""SELECT start_date, end_date,""" + \
#                        """ WEEKDAY(only_date) As date_description FROM date_table""")
# weekday_df.createOrReplaceTempView('weekday_table')
# weekday_df = spark.sql("""SELECT date_description, COUNT(*) AS number_of_trip FROM """ + \
#                        """weekday_table GROUP BY date_description ORDER BY date_description""")
daily_df.show(5)

+----------+-----+
|      DATE|count|
+----------+-----+
|2017-08-11|12389|
|2017-09-11|13401|
|2014-11-12| 9539|
|2016-03-01| 8588|
|2012-04-17| 6619|
+----------+-----+
only showing top 5 rows



In [25]:
# join weather_df and daily_df 

joined_df = weather_df.join(daily_df, 'DATE')
# left.join(right, "name")

joined_df = joined_df.orderBy('DATE')
# to_date(joined_df.DATE, "YYYY-MM-DD").alias("DATE")
# joined_df.printSchema
joined_df.show(5)

+----------+----+-----------+----+----+----+----+----+-------+-----+
|      DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|rain|snow|both|weather|count|
+----------+----+-----------+----+----+----+----+----+-------+-----+
|2010-09-20|0.00|        0.0|  81|  64|   0|   0|   0|  sunny|  212|
|2010-09-21|0.00|        0.0|  80|  56|   0|   0|   0|  sunny|  324|
|2010-09-22|0.00|        0.0|  95|  67|   0|   0|   0|  sunny|  377|
|2010-09-23|0.00|        0.0|  93|  71|   0|   0|   0|  sunny|  373|
|2010-09-24|0.00|        0.0|  99|  72|   0|   0|   0|  sunny|  362|
+----------+----+-----------+----+----+----+----+----+-------+-----+
only showing top 5 rows



In [26]:
joined_df.printSchema

<bound method DataFrame.printSchema of DataFrame[DATE: string, PRCP: string, SNOW_AMOUNT: string, TMAX: string, TMIN: string, rain: int, snow: int, both: int, weather: string, count: bigint]>

In [27]:
joined_df.coalesce(1).write.mode("overwrite").csv(OUTPUT_PATH+"daily_weather", header=True)

In [28]:
# create daily trip data schema 

dailydata_schema = types.StructType([
    types.StructField('DATE', types.TimestampType()),
    types.StructField('PRCP', types.FloatType()),
    types.StructField('SNOW_AMOUNT', types.FloatType()),
    types.StructField('TMAX', types.IntegerType()),
    types.StructField('TMIN', types.IntegerType()),
    types.StructField('rain', types.IntegerType()),
    types.StructField('snow', types.IntegerType()),
    types.StructField('both', types.IntegerType()),
    types.StructField('weather', types.StringType()),
    types.StructField('count', types.IntegerType()),  
])


daily_data = spark.read.csv(OUTPUT_PATH+'daily_weather', header=True, schema=dailydata_schema)

In [29]:
daily_data.printSchema

<bound method DataFrame.printSchema of DataFrame[DATE: timestamp, PRCP: float, SNOW_AMOUNT: float, TMAX: int, TMIN: int, rain: int, snow: int, both: int, weather: string, count: int]>

In [30]:
daily_data.show(5)

+-------------------+----+-----------+----+----+----+----+----+-------+-----+
|               DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|rain|snow|both|weather|count|
+-------------------+----+-----------+----+----+----+----+----+-------+-----+
|2010-09-20 00:00:00| 0.0|        0.0|  81|  64|   0|   0|   0|  sunny|  212|
|2010-09-21 00:00:00| 0.0|        0.0|  80|  56|   0|   0|   0|  sunny|  324|
|2010-09-22 00:00:00| 0.0|        0.0|  95|  67|   0|   0|   0|  sunny|  377|
|2010-09-23 00:00:00| 0.0|        0.0|  93|  71|   0|   0|   0|  sunny|  373|
|2010-09-24 00:00:00| 0.0|        0.0|  99|  72|   0|   0|   0|  sunny|  362|
+-------------------+----+-----------+----+----+----+----+----+-------+-----+
only showing top 5 rows



In [31]:
# create day of week columns 

from pyspark.sql.functions import date_format
df_dof = daily_data.select('DATE', date_format('DATE', 'u').alias('dow_number'), date_format('DATE', 'E').alias('dow_string'))
df_dof.show(5)

+-------------------+----------+----------+
|               DATE|dow_number|dow_string|
+-------------------+----------+----------+
|2010-09-20 00:00:00|         1|       Mon|
|2010-09-21 00:00:00|         2|       Tue|
|2010-09-22 00:00:00|         3|       Wed|
|2010-09-23 00:00:00|         4|       Thu|
|2010-09-24 00:00:00|         5|       Fri|
+-------------------+----------+----------+
only showing top 5 rows



In [32]:
daily_data = daily_data.join(df_dof, 'DATE')
# daily_data = daily_data.withColumn(dow_number, daily_data["dow_number"].cast(IntegerType()))
daily_data.show()

+-------------------+----+-----------+----+----+----+----+----+-------+-----+----------+----------+
|               DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|rain|snow|both|weather|count|dow_number|dow_string|
+-------------------+----+-----------+----+----+----+----+----+-------+-----+----------+----------+
|2010-09-20 00:00:00| 0.0|        0.0|  81|  64|   0|   0|   0|  sunny|  212|         1|       Mon|
|2010-09-21 00:00:00| 0.0|        0.0|  80|  56|   0|   0|   0|  sunny|  324|         2|       Tue|
|2010-09-22 00:00:00| 0.0|        0.0|  95|  67|   0|   0|   0|  sunny|  377|         3|       Wed|
|2010-09-23 00:00:00| 0.0|        0.0|  93|  71|   0|   0|   0|  sunny|  373|         4|       Thu|
|2010-09-24 00:00:00| 0.0|        0.0|  99|  72|   0|   0|   0|  sunny|  362|         5|       Fri|
|2010-09-25 00:00:00| 0.0|        0.0|  93|  74|   0|   0|   0|  sunny|  539|         6|       Sat|
|2010-09-26 00:00:00| 0.1|        0.0|  76|  63|   1|   0|   0|   rain|  537|         7|       Sun|


In [33]:
daily_data.printSchema

<bound method DataFrame.printSchema of DataFrame[DATE: timestamp, PRCP: float, SNOW_AMOUNT: float, TMAX: int, TMIN: int, rain: int, snow: int, both: int, weather: string, count: int, dow_number: string, dow_string: string]>

## Machine Learning with process data

In [34]:
# select useful columns 

ml = daily_data.select("DATE", 
              "PRCP", 
              "SNOW_AMOUNT", 
              "TMAX", 
              "TMIN", 
              "weather", 
              "dow_number", 
              "dow_string",
              "count")
ml.printSchema()

root
 |-- DATE: timestamp (nullable = true)
 |-- PRCP: float (nullable = true)
 |-- SNOW_AMOUNT: float (nullable = true)
 |-- TMAX: integer (nullable = true)
 |-- TMIN: integer (nullable = true)
 |-- weather: string (nullable = true)
 |-- dow_number: string (nullable = true)
 |-- dow_string: string (nullable = true)
 |-- count: integer (nullable = true)



In [35]:
ml.show(10)

+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|               DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_number|dow_string|count|
+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|2010-09-20 00:00:00| 0.0|        0.0|  81|  64|  sunny|         1|       Mon|  212|
|2010-09-21 00:00:00| 0.0|        0.0|  80|  56|  sunny|         2|       Tue|  324|
|2010-09-22 00:00:00| 0.0|        0.0|  95|  67|  sunny|         3|       Wed|  377|
|2010-09-23 00:00:00| 0.0|        0.0|  93|  71|  sunny|         4|       Thu|  373|
|2010-09-24 00:00:00| 0.0|        0.0|  99|  72|  sunny|         5|       Fri|  362|
|2010-09-25 00:00:00| 0.0|        0.0|  93|  74|  sunny|         6|       Sat|  539|
|2010-09-26 00:00:00| 0.1|        0.0|  76|  63|   rain|         7|       Sun|  537|
|2010-09-27 00:00:00| 0.2|        0.0|  78|  64|   rain|         1|       Mon|  286|
|2010-09-28 00:00:00| 0.0|        0.0|  84|  65|  sunny|         

## Linear Regression with only numerical data

In [36]:
from pyspark.ml.regression import LinearRegression

In [37]:
from pyspark.ml.feature import VectorAssembler

vectorAssembler = VectorAssembler(inputCols = ['PRCP', 'SNOW_AMOUNT', 'TMAX', 'TMIN'], outputCol = 'features')
vector_df = vectorAssembler.transform(ml)
vector_df = vector_df.select(['features', 'count'])
vector_df.show(3)

+-------------------+-----+
|           features|count|
+-------------------+-----+
|[0.0,0.0,81.0,64.0]|  212|
|[0.0,0.0,80.0,56.0]|  324|
|[0.0,0.0,95.0,67.0]|  377|
+-------------------+-----+
only showing top 3 rows



In [38]:
# create train and test data
train_data, test_data = vector_df.randomSplit([.8,.2],seed=1234)

In [39]:
# import `LinearRegression`
from pyspark.ml.regression import LinearRegression

# Initialize `lr`
lr = LinearRegression(featuresCol = 'features', labelCol='count', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_data)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-2312.2633608085325,-116.81584741125842,124.53870323717341,9.081777957576016]
Intercept: -1001.8063933212435


In [40]:
# train model and see training error
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 2908.028212
r2: 0.435713


In [41]:
# train data summary
train_data.describe().show()

+-------+-----------------+
|summary|            count|
+-------+-----------------+
|  count|             2684|
|   mean|7700.555141579732|
| stddev|3871.947212850098|
|    min|               21|
|    max|            18346|
+-------+-----------------+



In [42]:
# predict on test data
lr_predictions = lr_model.transform(test_data)
lr_predictions.select("prediction","count","features").show(5)

from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----+-------------------+
|        prediction|count|           features|
+------------------+-----+-------------------+
|1446.1649698032356| 1426| [0.0,0.0,19.0,9.0]|
|1667.9970424048543| 2282| [0.0,0.0,21.0,6.0]|
|1713.4059321927343| 1489|[0.0,0.0,21.0,11.0]|
| 2345.181226336177| 2952|[0.0,0.0,26.0,12.0]|
| 2870.581373157599| 3035|[0.0,0.0,30.0,15.0]|
+------------------+-----+-------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.432235


## Linear Regression with categorical data

In [43]:
ml.show(3)

+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|               DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_number|dow_string|count|
+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|2010-09-20 00:00:00| 0.0|        0.0|  81|  64|  sunny|         1|       Mon|  212|
|2010-09-21 00:00:00| 0.0|        0.0|  80|  56|  sunny|         2|       Tue|  324|
|2010-09-22 00:00:00| 0.0|        0.0|  95|  67|  sunny|         3|       Wed|  377|
+-------------------+----+-----------+----+----+-------+----------+----------+-----+
only showing top 3 rows



In [44]:
train_df, test_df = ml.randomSplit([.8,.2],seed=1234)

In [45]:
# encode catogorical data with OneHotEncoder
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer, OneHotEncoderEstimator
from pyspark.ml import Pipeline


categorical_variables = ['weather', 'dow_string']
indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

pipeline = Pipeline(stages=indexers + [encoder, assembler])

train_df = pipeline.fit(train_df).transform(train_df)
test_df = pipeline.fit(test_df).transform(test_df)

In [46]:
# select out continuous_variables 
continuous_variables = ['PRCP', 'SNOW_AMOUNT', 'TMAX', 'TMIN']

# combine categorical-features with continuous-features
assembler = VectorAssembler(
    inputCols=['categorical-features', *continuous_variables],
    outputCol='features'
)

# transform train and test data with assembler
train_df = assembler.transform(train_df)
test_df = assembler.transform(test_df)

In [47]:
# initialize `lr`
lr = LinearRegression(featuresCol = 'features', labelCol='count', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [731.6846174072824,-531.936286065667,-548.6379838607589,197.55078332668162,670.5148664220861,768.1255661088442,667.0658613703243,697.667226200207,462.7270717130937,-1667.634547035611,45.08461271246068,97.39768837537152,41.14999281149254]
Intercept: -1753.4372440733168


In [48]:
# train data statistics
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 2871.962186
r2: 0.450877


In [49]:
# test data statistics
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction","count","features").show(5)

lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+-----+--------------------+
|        prediction|count|            features|
+------------------+-----+--------------------+
| 9501.059671674582|  212|(13,[0,11,12],[1....|
| 9741.527902177593|  324|(13,[0,6,11,12],[...|
|11658.592153786347|  377|(13,[0,4,11,12],[...|
| 5951.718164592574|  505|(13,[1,3,9,11,12]...|
|   8426.9153895427| 1374|(13,[0,5,11,12],[...|
+------------------+-----+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.461891


## Decision Tree

In [147]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [148]:
ml.show(5)

+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|               DATE|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_number|dow_string|count|
+-------------------+----+-----------+----+----+-------+----------+----------+-----+
|2010-09-20 00:00:00| 0.0|        0.0|  81|  64|  sunny|         1|       Mon|  212|
|2010-09-21 00:00:00| 0.0|        0.0|  80|  56|  sunny|         2|       Tue|  324|
|2010-09-22 00:00:00| 0.0|        0.0|  95|  67|  sunny|         3|       Wed|  377|
|2010-09-23 00:00:00| 0.0|        0.0|  93|  71|  sunny|         4|       Thu|  373|
|2010-09-24 00:00:00| 0.0|        0.0|  99|  72|  sunny|         5|       Fri|  362|
+-------------------+----+-----------+----+----+-------+----------+----------+-----+
only showing top 5 rows



In [93]:
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])
trainingData.show(3)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[98,99,100,1...|
|  0.0|(692,[100,101,102...|
|  0.0|(692,[122,123,148...|
+-----+--------------------+
only showing top 3 rows



In [98]:
# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")
# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

predictions.show(5)

+-----+--------------------+--------------------+----------+
|label|            features|     indexedFeatures|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|(692,[95,96,97,12...|(692,[95,96,97,12...|       0.0|
|  0.0|(692,[121,122,123...|(692,[121,122,123...|       0.0|
|  0.0|(692,[122,123,124...|(692,[122,123,124...|       0.0|
|  0.0|(692,[123,124,125...|(692,[123,124,125...|       0.0|
|  0.0|(692,[124,125,126...|(692,[124,125,126...|       0.0|
+-----+--------------------+--------------------+----------+
only showing top 5 rows



In [99]:
# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)

+----------+-----+--------------------+
|prediction|label|            features|
+----------+-----+--------------------+
|       0.0|  0.0|(692,[95,96,97,12...|
|       0.0|  0.0|(692,[121,122,123...|
|       0.0|  0.0|(692,[122,123,124...|
|       0.0|  0.0|(692,[123,124,125...|
|       0.0|  0.0|(692,[124,125,126...|
+----------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.164399
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_ad3228f029a6) of depth 1 with 3 nodes


In [149]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

In [150]:
# select useful columns 
ml = ml.select("PRCP", 
              "SNOW_AMOUNT", 
              "TMAX", 
              "TMIN", 
              "weather", 
              "dow_string",
              "count")
ml.printSchema()
ml.show(3)

root
 |-- PRCP: float (nullable = true)
 |-- SNOW_AMOUNT: float (nullable = true)
 |-- TMAX: integer (nullable = true)
 |-- TMIN: integer (nullable = true)
 |-- weather: string (nullable = true)
 |-- dow_string: string (nullable = true)
 |-- count: integer (nullable = true)

+----+-----------+----+----+-------+----------+-----+
|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_string|count|
+----+-----------+----+----+-------+----------+-----+
| 0.0|        0.0|  81|  64|  sunny|       Mon|  212|
| 0.0|        0.0|  80|  56|  sunny|       Tue|  324|
| 0.0|        0.0|  95|  67|  sunny|       Wed|  377|
+----+-----------+----+----+-------+----------+-----+
only showing top 3 rows



In [151]:
# encode catogorical data with OneHotEncoder
from pyspark.ml.feature import StringIndexer, VectorAssembler, SQLTransformer, OneHotEncoderEstimator
from pyspark.ml import Pipeline


categorical_variables = ['weather', 'dow_string']
indexers = [StringIndexer(inputCol=column, outputCol=column+"-index") for column in categorical_variables]

encoder = OneHotEncoderEstimator(
    inputCols=[indexer.getOutputCol() for indexer in indexers],
    outputCols=["{0}-encoded".format(indexer.getOutputCol()) for indexer in indexers]
)

assembler = VectorAssembler(
    inputCols=encoder.getOutputCols(),
    outputCol="categorical-features"
)

pipeline = Pipeline(stages=indexers + [encoder, assembler])

ml = pipeline.fit(ml).transform(ml)
ml.show(3)

+----+-----------+----+----+-------+----------+-----+-------------+----------------+---------------------+------------------------+--------------------+
|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_string|count|weather-index|dow_string-index|weather-index-encoded|dow_string-index-encoded|categorical-features|
+----+-----------+----+----+-------+----------+-----+-------------+----------------+---------------------+------------------------+--------------------+
| 0.0|        0.0|  81|  64|  sunny|       Mon|  212|          0.0|             3.0|        (3,[0],[1.0])|           (6,[3],[1.0])| (9,[0,6],[1.0,1.0])|
| 0.0|        0.0|  80|  56|  sunny|       Tue|  324|          0.0|             2.0|        (3,[0],[1.0])|           (6,[2],[1.0])| (9,[0,5],[1.0,1.0])|
| 0.0|        0.0|  95|  67|  sunny|       Wed|  377|          0.0|             1.0|        (3,[0],[1.0])|           (6,[1],[1.0])| (9,[0,4],[1.0,1.0])|
+----+-----------+----+----+-------+----------+-----+-------------+---------------

In [152]:
# select out continuous_variables 
continuous_variables = ['PRCP', 'SNOW_AMOUNT', 'TMAX', 'TMIN']

# combine categorical-features with continuous-features
assembler = VectorAssembler(
    inputCols=['categorical-features',*continuous_variables],
    outputCol='features'
)

# transform train and test data with assembler
ml = assembler.transform(ml)
ml.show(3)

+----+-----------+----+----+-------+----------+-----+-------------+----------------+---------------------+------------------------+--------------------+--------------------+
|PRCP|SNOW_AMOUNT|TMAX|TMIN|weather|dow_string|count|weather-index|dow_string-index|weather-index-encoded|dow_string-index-encoded|categorical-features|            features|
+----+-----------+----+----+-------+----------+-----+-------------+----------------+---------------------+------------------------+--------------------+--------------------+
| 0.0|        0.0|  81|  64|  sunny|       Mon|  212|          0.0|             3.0|        (3,[0],[1.0])|           (6,[3],[1.0])| (9,[0,6],[1.0,1.0])|(13,[0,6,11,12],[...|
| 0.0|        0.0|  80|  56|  sunny|       Tue|  324|          0.0|             2.0|        (3,[0],[1.0])|           (6,[2],[1.0])| (9,[0,5],[1.0,1.0])|(13,[0,5,11,12],[...|
| 0.0|        0.0|  95|  67|  sunny|       Wed|  377|          0.0|             1.0|        (3,[0],[1.0])|           (6,[1],[1.0])

In [153]:
ml.select("features").show(5, False)


+------------------------------------+
|features                            |
+------------------------------------+
|(13,[0,6,11,12],[1.0,1.0,81.0,64.0])|
|(13,[0,5,11,12],[1.0,1.0,80.0,56.0])|
|(13,[0,4,11,12],[1.0,1.0,95.0,67.0])|
|(13,[0,3,11,12],[1.0,1.0,93.0,71.0])|
|(13,[0,7,11,12],[1.0,1.0,99.0,72.0])|
+------------------------------------+
only showing top 5 rows



In [158]:
train_df, test_df = ml.randomSplit([.8,.2],seed=1234)

In [159]:
# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="features",labelCol='count')

# Train model.  This also runs the indexer.dt
dt_model = dt.fit(train_df)

# Make predictions.
predictions = dt_model.transform(test_df)

# Select example rows to display.
predictions.select("prediction", "count").show(5)

+------------------+-----+
|        prediction|count|
+------------------+-----+
|2972.8137931034485| 1426|
|2972.8137931034485| 2282|
|2972.8137931034485| 1489|
|2972.8137931034485| 2952|
|2972.8137931034485| 4010|
+------------------+-----+
only showing top 5 rows



In [160]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# summary only
print(dt_model)

Root Mean Squared Error (RMSE) on test data = 2834.34
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_4c2f97ec28ff) of depth 5 with 63 nodes


In [163]:
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.452394


## Random forest regression

In [164]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

In [167]:
# Train a DecisionTree model.
rf = RandomForestRegressor(featuresCol="features",labelCol='count')

# Train model.  This also runs the indexer.dt
rf_model = rf.fit(train_df)

# Make predictions.
predictions = rf_model.transform(test_df)

# Select example rows to display.
predictions.select("prediction", "count").show(5)

+------------------+-----+
|        prediction|count|
+------------------+-----+
| 4279.499625231992| 1426|
| 3750.725116437977| 2282|
| 3723.934951932032| 1489|
| 3837.527992554989| 2952|
|3822.8632461716797| 4010|
+------------------+-----+
only showing top 5 rows



In [168]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# summary only
print(rf_model)

Root Mean Squared Error (RMSE) on test data = 2779.86
RandomForestRegressionModel (uid=RandomForestRegressor_bdce62fab1be) with 20 trees


In [169]:
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.473244


## Gradient-boosted tree regression

In [170]:
from pyspark.ml.regression import GBTRegressor

In [171]:
# Train a DecisionTree model.
gbt = GBTRegressor(featuresCol="features",labelCol='count')

# Train model.  This also runs the indexer.dt
gbt_model = gbt.fit(train_df)

# Make predictions.
predictions = gbt_model.transform(test_df)

# Select example rows to display.
predictions.select("prediction", "count").show(5)

+------------------+-----+
|        prediction|count|
+------------------+-----+
| 3572.324396937591| 1426|
| 2956.649978628918| 2282|
|2554.4041454393387| 1489|
|2949.8051331274987| 2952|
| 3098.102998961212| 4010|
+------------------+-----+
only showing top 5 rows



In [172]:
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="count", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# summary only
print(gbt_model)

Root Mean Squared Error (RMSE) on test data = 2892.59
GBTRegressionModel (uid=GBTRegressor_3e57796708b8) with 20 trees


In [173]:
evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="count",metricName="r2")
print("R Squared (R2) on test data = %g" % evaluator.evaluate(predictions))

R Squared (R2) on test data = 0.429655
