In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
import preprocessing as prep
import extraction as ext
import yaml
import importlib
from pyspark.ml.feature import Normalizer 

In [2]:
importlib.reload(ext)
importlib.reload(prep)

with open("config.yml", "r") as ymlfile:
    cfg = yaml.safe_load(ymlfile)

spark = SparkSession.builder.appName("NYC_Taxi").getOrCreate()

spark.conf.set("spark.sql.execution.arrow.enabled","true")        
spark.conf.set("spark.local.dir", "c:/tmp_spark")      

print(f"Get Taxi Data")
psdf_taxi = ext.get_taxi_data(spark, cfg)

print(f"Get zones")
df_zones = ext.get_zones(cfg)
print(f"Clean zones")
df_zones = prep.clean_zone_data(df_zones)
psdf_zones=spark.createDataFrame(df_zones)

print(f"Add features taxi")
psdf_taxi = prep.add_features_taxi_data(psdf_taxi, psdf_zones)

print(f"Clean Taxi Data")
psdf_taxi = prep.clean_taxi_data(psdf_taxi, cfg)

print(f"SPDF Taxi size: {psdf_taxi.count()}")


Get Taxi Data
Already downloaded file: yellow_tripdata_2017-01.parquet
Already downloaded file: yellow_tripdata_2017-03.parquet
Already downloaded file: yellow_tripdata_2017-06.parquet
Already downloaded file: yellow_tripdata_2017-09.parquet
Already downloaded file: yellow_tripdata_2017-11.parquet
Already downloaded file: yellow_tripdata_2017-12.parquet
Get zones
Already downloaded file: taxi+_zone_lookup.csv
Clean zones
Add features taxi
Clean Taxi Data
SPDF Taxi size: 385696


In [3]:
psdf_taxi.count()

385696

In [4]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline

category_columns = ["VendorID", 
    # "RateCodeID", # adding this feature causes some surprising errors
    "store_and_fwd_flag", "payment_type", "pu_month", 
    "day_of_week", "hour",
    "PU_Borough", "PU_Zone", "PU_service_zone", 
    "DO_Borough", "DO_Zone", "DO_service_zone"]  

indexer = StringIndexer(inputCols=category_columns, 
    outputCols=[column+"_index" for column in category_columns])    

psdf_taxi_vec = indexer.fit(psdf_taxi).transform(psdf_taxi)

ohe = OneHotEncoder(inputCols=[column+"_index" for column in category_columns], 
    outputCols=[column+"_OHEVector" for column in category_columns])

psdf_taxi_vec = ohe.fit(psdf_taxi_vec).transform(psdf_taxi_vec)


In [5]:

# nummeric columns to normalize
numeric_columns_norm = ['trip_distance', 
    'fare_amount', 
    'extra', 'mta_tax', 'tolls_amount',     
    'total_amount','duration_in_min']

# nummeric columns that we don't want to normalize
numeric_columns = ['passenger_count']


vectorAssembler = VectorAssembler(
    inputCols = numeric_columns_norm,
    outputCol = 'numeric_features')

v_df = vectorAssembler.transform(psdf_taxi_vec)

normalizer = Normalizer(inputCol="numeric_features", 
    outputCol="numeric_features_norm")

v_df = normalizer.transform(v_df)    

category_columns_vec = [column+"_OHEVector" for column in category_columns]


mergeAssembler = VectorAssembler(
    inputCols = ["numeric_features_norm"] + numeric_columns + category_columns_vec,
    outputCol = 'features')

v_df = mergeAssembler.transform(v_df)
v_df = v_df.select(['features', 'tip_amount'])
v_df.show(5)


+--------------------+----------+
|            features|tip_amount|
+--------------------+----------+
|(540,[0,1,3,5,6,7...|      1.96|
|(540,[0,1,3,5,6,7...|      2.06|
|(540,[0,1,2,3,5,6...|      2.06|
|(540,[0,1,2,3,5,6...|      3.86|
|(540,[0,1,3,5,6,7...|       0.0|
+--------------------+----------+
only showing top 5 rows



In [6]:
splits = v_df.randomSplit([0.7, 0.3])
train_df = splits[0]
test_df = splits[1]

In [7]:
lr = LinearRegression(featuresCol = 'features', 
    labelCol='tip_amount', 
    maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(train_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))


Coefficients: [0.0,-2.7029737234864695,0.0,-76.6537015053019,12.742659422544563,7.940613828161231,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.6349673096372203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.1850006949898113,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,

In [8]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("MSE: %f" % trainingSummary.meanSquaredError)
print("r2: %f" % trainingSummary.r2)
print("MAPE: %f" % trainingSummary.meanAbsoluteError)


RMSE: 1.692485
MSE: 2.864505
r2: 0.581737
MAPE: 0.873080


In [9]:
result = lr_model.evaluate(test_df)
print("RMSE: %f" % result.rootMeanSquaredError)
print("MSE: %f" % result.meanSquaredError)
print("r2: %f" % result.r2)
print("MAPE: %f" % result.meanAbsoluteError)

RMSE: 1.708766
MSE: 2.919883
r2: 0.576983
MAPE: 0.877865


In [10]:
predictions = lr_model.transform(test_df)
predictions.select("prediction","tip_amount","features").show(5)

+-----------------+----------+--------------------+
|       prediction|tip_amount|            features|
+-----------------+----------+--------------------+
|7.809769630059293|     12.57|(540,[0,1,2,3,4,5...|
|5.886537428429102|      5.87|(540,[0,1,2,3,4,5...|
| 7.43514567424946|      15.1|(540,[0,1,2,3,4,5...|
|8.229743954628603|       7.0|(540,[0,1,2,3,4,5...|
| 7.45153817606167|      7.57|(540,[0,1,2,3,4,5...|
+-----------------+----------+--------------------+
only showing top 5 rows



In [11]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator

dt = DecisionTreeRegressor(featuresCol ='features', 
    labelCol = 'tip_amount')
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="tip_amount", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

dt_evaluator = RegressionEvaluator(predictionCol="prediction", \
    labelCol="tip_amount",metricName="r2")
print("R Squared (R2) on test data = %g" % dt_evaluator.evaluate(dt_predictions))

print("Feature Importances")
dt_model.featureImportances

Root Mean Squared Error (RMSE) on test data = 1.3073
R Squared (R2) on test data = 0.752406
Feature Importances
(540,[0,1,2,3,4,5],[0.003131700654025644,0.031116799600539723,0.0007409679584589283,0.8219099475021654,0.004795826012293466,0.1383047582725167])


In [12]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'tip_amount', maxIter=10)
gbt_model = gbt.fit(train_df)

gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'tip_amount', 'features').show(5)

+------------------+----------+--------------------+
|        prediction|tip_amount|            features|
+------------------+----------+--------------------+
|11.758446210889097|     12.57|(540,[0,1,2,3,4,5...|
| 6.049268188277188|      5.87|(540,[0,1,2,3,4,5...|
| 9.508636887769553|      15.1|(540,[0,1,2,3,4,5...|
| 9.298939161349207|       7.0|(540,[0,1,2,3,4,5...|
|7.3786709514494335|      7.57|(540,[0,1,2,3,4,5...|
+------------------+----------+--------------------+
only showing top 5 rows



In [14]:
gbt_evaluator = RegressionEvaluator(
    labelCol="tip_amount", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

dt_evaluator = RegressionEvaluator(predictionCol="prediction", \
    labelCol="tip_amount",metricName="r2")
print("R Squared (R2) on test data = %g" % dt_evaluator.evaluate(gbt_predictions))


Root Mean Squared Error (RMSE) on test data = 0.980225
R Squared (R2) on test data = 0.860798
