In [6]:
!pip install pyspark



In [7]:
# Importing libraries

from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [8]:
# Show the kernel's current working directory.
pwd

'/content'

In [5]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read from the notebook's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Get preprocessed data

# Absolute glob anchored at the Drive mount point (/content/drive), so the
# load does not depend on the kernel's current working directory.
data_glob = "/content/drive/MyDrive/NYC Home/Preprocessing/NYC Taxi Duration Preprocessed/*.csv"

# Single-threaded local Spark session (reused if one already exists).
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()

# header=True takes column names from the first row of each file;
# inferSchema=True lets Spark derive the column types from the data.
df = spark.read.csv(data_glob, header=True, inferSchema=True)
df.printSchema()

root
 |-- vendor_id: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- store_and_fwd_flag: integer (nullable = true)
 |-- trip_duration: integer (nullable = true)
 |-- distance: double (nullable = true)
 |-- week_day: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- quarter_of_year: integer (nullable = true)
 |-- hour: integer (nullable = true)



In [10]:
# Preview the first 20 rows of the loaded data (long cell values truncated).
df.show(n=20, truncate=True)

+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|
+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+
|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|
|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0|
|        2|              1|       -73.97903|       40.76394|        -74.00533|       40.710087|     

# Preparing the DataFrame for ML models

In [11]:
# Convert the week_day string column into a numeric index column.
#
# This cell rebinds `df` to the indexed frame, so on a second run `df` would
# already contain week_day_index and StringIndexer would refuse to fit because
# its output column exists. Dropping the column first (a no-op when it is
# absent) makes the cell safe to re-run.
df = df.drop("week_day_index")

label_index = StringIndexer(inputCol="week_day", outputCol="week_day_index")
df = label_index.fit(df).transform(df)
df.show()

+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|vendor_id|passenger_count|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|store_and_fwd_flag|trip_duration| distance|week_day|year|month|quarter_of_year|hour|week_day_index|
+---------+---------------+----------------+---------------+-----------------+----------------+------------------+-------------+---------+--------+----+-----+---------------+----+--------------+
|        2|              1|      -73.982155|      40.767937|        -73.96463|       40.765602|                 0|          455|2.4444735|     Mon|2016|    3|              1|  17|           6.0|
|        1|              1|      -73.980415|      40.738564|        -73.99948|        40.73115|                 0|          663|2.6599078|     Sun|2016|    6|              2|   0|           4.0|
|        2|              

In [12]:
# Feature columns: every int or double column of df, minus the label.
numeric_types = ("int", "double")
input_col = [name for name, dtype in df.dtypes if dtype in numeric_types]
input_col.remove("trip_duration")

# Pack the feature columns into a single vector column, "input_features",
# and preview that column.
input_features = VectorAssembler(outputCol="input_features", inputCols=input_col)
input_column = input_features.transform(df)
input_column.select("input_features").show()

+--------------------+
|      input_features|
+--------------------+
|[2.0,1.0,-73.9821...|
|[1.0,1.0,-73.9804...|
|[2.0,1.0,-73.9790...|
|[1.0,4.0,-73.9690...|
|[2.0,1.0,-73.9692...|
|[1.0,1.0,-73.9994...|
|[2.0,1.0,-73.9826...|
|[2.0,4.0,-73.9915...|
|[2.0,2.0,-73.9629...|
|[2.0,1.0,-73.9921...|
|[1.0,1.0,-74.0039...|
|[1.0,1.0,-73.9803...|
|[2.0,1.0,-73.9795...|
|[1.0,1.0,-73.9935...|
|[2.0,1.0,-73.9552...|
|[2.0,1.0,-73.9565...|
|[1.0,1.0,-73.9837...|
|[2.0,1.0,-73.9942...|
|[1.0,1.0,-73.9821...|
|[1.0,1.0,-73.9709...|
+--------------------+
only showing top 20 rows



In [13]:
# Keep only what the models consume: the feature vector and the label.
data_frame = input_column.select(["input_features", "trip_duration"])
data_frame.show(n=5)

+--------------------+-------------+
|      input_features|trip_duration|
+--------------------+-------------+
|[2.0,1.0,-73.9821...|          455|
|[1.0,1.0,-73.9804...|          663|
|[2.0,1.0,-73.9790...|         2124|
|[1.0,4.0,-73.9690...|          341|
|[2.0,1.0,-73.9692...|         1551|
+--------------------+-------------+
only showing top 5 rows



## Linear Regression model

In [14]:
# Split training and testing dataset for the models (70% / 30%).
# The seed fixes the random row assignment so the split, and every metric
# reported from it, can be reproduced on a re-run over the same input.
train, test = data_frame.randomSplit([0.7, 0.3], seed=42)

# linear regression model.
linear_regression = LinearRegression(labelCol = "trip_duration", featuresCol = "input_features")

# Create ParamGrid for Cross Validation.
# 5 * 5 * 5 * 2 = 250 parameter combinations; with 5 folds that is 1250 fits
# during the search.
linear_regression_param_grid = ParamGridBuilder() \
                                .addGrid(linear_regression.regParam, [0.01, 0.1, 0.5, 1.0, 2.0]) \
                                .addGrid(linear_regression.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0]) \
                                .addGrid(linear_regression.maxIter, [1, 5, 10, 20, 50]) \
                                .addGrid(linear_regression.fitIntercept, [True, False]) \
                                .build()

# evaluator with the root mean square error metric.
evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "trip_duration", metricName = "rmse")

# cross validator with total 5 folds.
# The seed fixes how rows are assigned to folds, so the selected parameters
# do not change from run to run because of fold sampling.
cross_validator = CrossValidator(estimator = linear_regression,
                      estimatorParamMaps = linear_regression_param_grid,
                      evaluator = evaluator,
                      numFolds = 5,
                      seed = 42)

In [15]:
# Run the cross-validated grid search for linear regression on the training
# split; `model` is the fitted CrossValidator result.
model = cross_validator.fit(train)


In [24]:
# Print the parameters of the best model found by cross validation.
# The values are read through the model's public param getters rather than
# the private py4j handle (`_java_obj`).
best_lr = model.bestModel
print("Best Regression Param: ", best_lr.getRegParam())
print("Best Elastic Net Param: ", best_lr.getElasticNetParam())
print("Best maxIter Param: ", best_lr.getMaxIter())
print("Best fitIntercept Param: ", best_lr.getFitIntercept())
print("Intercept: ", best_lr.intercept)

# Training-set R^2 (printed as a percentage) and coefficients of the linear
# equation. Note: R^2 is the coefficient of determination, not a
# classification accuracy, despite the label text.
print("Coefficient of Determination(Accuracy on Training data): ", str(round(best_lr.summary.r2*100, 2)), "%")
print("Coefficients: ", best_lr.coefficients)

Best Regression Param:  0.01
Best Elastic Net Param:  0.25
Best maxIter Param:  20
Best fitIntercept Param:  True
Intercept:  -17386.62658418851
Coefficient of Determination(Accuracy on Training data):  50.34 %
Coefficients:  [-1.9264345127185578,1.9369671976740022,-352.4777140418789,-289.3493350122499,-511.0744302118784,-839.6113171773275,66.53215336052293,82.31212610070523,0.0,19.23759714912255,23.41821222454084,5.471000257189258,-12.222808446700924]


In [17]:
# Score the held-out test split with the cross-validated linear model and
# preview the first five predictions.
prediction = model.transform(test)
prediction.show(n=5)

+--------------------+-------------+------------------+
|      input_features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,0.0,-73.8152...|         2251|2019.6528348686988|
|[1.0,1.0,-74.1793...|         2975| 5153.004129916691|
|[1.0,1.0,-74.0180...|         1155| 974.0161585466776|
|[1.0,1.0,-74.0179...|         1054|1251.1273160203054|
|[1.0,1.0,-74.0177...|         1411|1054.7550369819073|
+--------------------+-------------+------------------+
only showing top 5 rows



In [18]:
# R^2 evaluator for the test predictions (same columns as the RMSE evaluator).
test_pred_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="trip_duration",
    predictionCol="prediction",
)

# Compute both test metrics first, then report them.
test_rmse = evaluator.evaluate(prediction)
test_r2_pct = round(test_pred_evaluator.evaluate(prediction) * 100, 4)

print("RMSE: ", test_rmse)
print("Coefficient of Determination(Accuracy on Test Data): ", str(test_r2_pct), "%")


RMSE:  502.1706449975905
Coefficient of Determination(Accuracy on Test Data):  49.8045 %


## Gradient Boosted Trees Regression

In [19]:
# Gradient Boosted Trees regression instance.
# An explicit seed keeps the estimator's random behaviour the same on re-runs.
gbtr = GBTRegressor(featuresCol="input_features", labelCol="trip_duration", seed=42)

# Create ParamGrid for Cross Validation of gradient boosted trees regression.
# 2 * 3 * 2 = 12 parameter combinations.
gradient_boosted_trees_param_grid = ParamGridBuilder() \
                                .addGrid(gbtr.maxDepth, [5, 10]) \
                                .addGrid(gbtr.maxIter, [5, 10, 20]) \
                                .addGrid(gbtr.maxBins, [32, 64]) \
                                .build()

# evaluator with the root mean square error metric for gbtr.
gbt_evaluator = RegressionEvaluator(labelCol="trip_duration", predictionCol="prediction", metricName="rmse")

# Cross validator for gbtr with 5 total folds.
# The seed fixes how rows are assigned to folds so the search is repeatable.
cross_validator_gbtr = CrossValidator(estimator = gbtr,
                      estimatorParamMaps = gradient_boosted_trees_param_grid,
                      evaluator = gbt_evaluator,
                      numFolds = 5,
                      seed = 42)

In [20]:
# Run the cross-validated grid search for the gradient boosted trees
# regressor on the training split.
model_gbtr = cross_validator_gbtr.fit(train)

In [21]:
# Print the parameters of the best gradient boosted trees model.
# maxDepth, maxBins and maxIter were selected by the grid search; impurity and
# lossType were not part of the grid, so the values shown are whatever the
# estimator used without tuning.
# The values are read through the model's public param getters rather than
# the private py4j handle (`_java_obj`).
best_gbt = model_gbtr.bestModel
print("Best maxDepth Param: ", best_gbt.getMaxDepth())
print("Best maxBins Param: ", best_gbt.getMaxBins())
print("Best maxIter Param: ", best_gbt.getMaxIter())
print("Best impurity Param: ", best_gbt.getImpurity())
print("Best lossType Param: ", best_gbt.getLossType())

Best maxDepth Param:  10
Best maxBins Param:  32
Best maxIter Param:  20
Best impurity Param:  variance
Best lossType Param:  squared


In [22]:
# Score the held-out test split with the cross-validated GBT model and
# preview the first five predictions.
prediction_gbtr = model_gbtr.transform(test)
prediction_gbtr.show(n=5)

+--------------------+-------------+------------------+
|      input_features|trip_duration|        prediction|
+--------------------+-------------+------------------+
|[1.0,0.0,-73.8152...|         2251|2716.2209309835234|
|[1.0,1.0,-74.1793...|         2975|1998.6258238868547|
|[1.0,1.0,-74.0180...|         1155|1197.3937630659245|
|[1.0,1.0,-74.0179...|         1054|1188.8199341756604|
|[1.0,1.0,-74.0177...|         1411| 915.6259518474641|
+--------------------+-------------+------------------+
only showing top 5 rows



In [23]:
# R^2 evaluator for the gradient boosted trees test predictions.
test_pred_evaluator_gbtr = RegressionEvaluator(
    metricName="r2",
    labelCol="trip_duration",
    predictionCol="prediction",
)

# Compute both test metrics for the GBT model first, then report them.
gbtr_test_rmse = gbt_evaluator.evaluate(prediction_gbtr)
gbtr_test_r2_pct = round(test_pred_evaluator_gbtr.evaluate(prediction_gbtr) * 100, 4)

print("RMSE: ", gbtr_test_rmse)
print("Coefficient of Determination(Accuracy on Test Data): ", str(gbtr_test_r2_pct), "%")


RMSE:  361.0449206667591
Coefficient of Determination(Accuracy on Test Data):  74.0531 %
