# Project Model

In [1]:
# load modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import * #col, radians, asin, sin, sqrt, cos

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler, IndexToString, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator
from pyspark.ml.regression import GBTRegressor, RandomForestRegressor, LinearRegression

# Necessary for distance calculations
from math import atan2, pi

# Other
import matplotlib.pyplot as plt
import sklearn.metrics

import pandas as pd
import os

In [2]:
# param init
infile = '/project/ds5559/Summer2021_TeamBike/master_dataset.parquet/'

spark = SparkSession \
    .builder \
    .config("spark.driver.memory", "15g") \
    .appName("Bike") \
    .getOrCreate()

### Read in file

In [3]:
df = spark.read.parquet(infile, inferSchema=True, header = True)

In [4]:
# add age column
df = df.withColumn("age", (df.year - df.birthyear))

In [5]:
# radius of earth in miles
R = 3963.0

#Convert start/end latitude and longitude from degrees to Radians
df = df.withColumn("startRadLong", radians(df.startStationLongitude))\
       .withColumn("endRadLong", radians(df.endStationLongitude))\
       .withColumn("startRadLat", radians(df.startStationLatitude))\
       .withColumn("endRadLat", radians(df.endStationLatitude))\

df = df.withColumn("diffRadLong", (df.startRadLong-df.endRadLong))\
       .withColumn("diffRadLat", (df.startRadLat-df.endRadLat))\

df = df.withColumn("crowDist", asin(sqrt(sin(df.diffRadLat/2)**2 + cos(df.startRadLat) * cos(df.endRadLat) * sin(df.diffRadLong/2)**2)) * 2 * R)

df = df.drop("startRadLong","endRadLong","startRadLat","endRadLat","diffRadLong","diffRadLat")

## Random Forest

### Identify Categorical vs Continuous Features

In [6]:
categoricalCols = ["Borough","hour","usertype","gender","day","month","time_bin","year","precip","zipcodes"]

In [7]:
continuousCols = ["temp","pressure","humidity","wind_speed","rain_3h","snow_3h","clouds_all","avg_median_sales_price","age"]

### Index and One-Hot Encode Categorical Features

In [8]:
indexers = [
    StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c), handleInvalid='skip')
    for c in categoricalCols
]

In [9]:
encoders = [
    OneHotEncoder(dropLast=False,inputCol=indexer.getOutputCol(),
                 outputCol="{0}_encoded".format(indexer.getOutputCol()))
    for indexer in indexers
]

### Assemble Categorical and Continuous Features

In [10]:
assembler = VectorAssembler(inputCols=continuousCols + [encoder.getOutputCol() for encoder in encoders], outputCol="features")

In [11]:
pipeline = Pipeline(stages = indexers + encoders + [assembler])

In [12]:
model = pipeline.fit(df)

In [13]:
transformed = model.transform(df)

### "slimDF" shows our transformed dense vector down to simply 'features' and 'label'(crowDist or straight line distance "as the crow flies")

In [14]:
slimDF = transformed.select('features','crowDist')

In [15]:
slimDF.show(2,False)

+------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|features                                                                                                                                        |crowDist          |
+------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
|(181,[0,1,2,3,6,7,8,9,31,37,40,43,51,65,68,70,89],[75.704,1020.0,73.0,1.54,1.0,1122214.7127969791,32.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|1.614091713256487 |
|(181,[0,1,2,3,6,7,8,9,31,37,40,43,51,65,68,70,74],[75.704,1020.0,73.0,1.54,1.0,1122214.7127969791,48.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|0.6582641036301969|
+------------------------------------------------------------------------------------------------------------------------------------------------+------------------+
only

In [16]:
## Split training and testing data set
(train_data, test_data) = slimDF.randomSplit([0.7,0.3], seed = 314)

In [17]:
rfReg = RandomForestRegressor(featuresCol='features', labelCol='crowDist')

In [18]:
## Fit training data
rfReg_model = rfReg.fit(train_data)

In [19]:
predictionRF = rfReg_model.transform(test_data)

In [20]:
## Show features, the "crows distance", and our prediction on the test data
predictionRF.select("features","crowDist","prediction").show(1)

+--------------------+------------------+-----------------+
|            features|          crowDist|       prediction|
+--------------------+------------------+-----------------+
|(181,[0,1,2,3,4,6...|0.8039584205799567|1.069557733824869|
+--------------------+------------------+-----------------+
only showing top 1 row



In [21]:
evaluatorRMSE = RegressionEvaluator(labelCol="crowDist", predictionCol="prediction", metricName="rmse")
evaluatorR2 = RegressionEvaluator(labelCol="crowDist", predictionCol="prediction",metricName="r2")

In [22]:
## print RMSE for Random Forest
rmseRF = evaluatorRMSE.evaluate(predictionRF)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmseRF)

Root Mean Squared Error (RMSE) on test data = 0.955498


In [23]:
## print R^2 for Random Forest
r2RF = evaluatorR2.evaluate(predictionRF)
print("R Squared (R2) on test data = %g" % r2RF) 

R Squared (R2) on test data = 0.0260675


## GBT

In [24]:
gbtReg = GBTRegressor(featuresCol='features', labelCol='crowDist')

In [25]:
gbtReg_model = gbtReg.fit(train_data)

In [26]:
predictionGBT = gbtReg_model.transform(test_data)

In [27]:
predictionGBT.select("features","crowDist","prediction").show(1)

+--------------------+------------------+------------------+
|            features|          crowDist|        prediction|
+--------------------+------------------+------------------+
|(181,[0,1,2,3,4,6...|0.8039584205799567|0.9338852211023668|
+--------------------+------------------+------------------+
only showing top 1 row



In [28]:
## print RMSE for GBT
rmseGBT = evaluatorRMSE.evaluate(predictionGBT)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmseGBT)

Root Mean Squared Error (RMSE) on test data = 0.946022


In [29]:
## print R^2 for GBT
r2GBT = evaluatorR2.evaluate(predictionGBT)
print("R Squared (R2) on test data = %g" % r2GBT) 

R Squared (R2) on test data = 0.0452891


## Linear Regression

In [30]:
crowLR = LinearRegression(featuresCol='features', labelCol='crowDist', maxIter=3)

In [31]:
lrReg_model = crowLR.fit(train_data)

In [32]:
predictionLR = lrReg_model.transform(test_data)

In [33]:
predictionLR.select("features","crowDist","prediction").show(1)

+--------------------+------------------+------------------+
|            features|          crowDist|        prediction|
+--------------------+------------------+------------------+
|(181,[0,1,2,3,4,6...|0.8039584205799567|1.0113322386631556|
+--------------------+------------------+------------------+
only showing top 1 row



In [34]:
## print RMSE for GBT
rmseLR = evaluatorRMSE.evaluate(predictionLR)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmseLR)

Root Mean Squared Error (RMSE) on test data = 0.94507


In [35]:
## print R^2 for GBT
r2LR = evaluatorR2.evaluate(predictionLR)
print("R Squared (R2) on test data = %g" % r2LR) 

R Squared (R2) on test data = 0.0472101


## Feature Analysis

In [36]:
featureLabel = [
    transformed.select(feat).distinct().collect()
    for feat in categoricalCols
]

In [37]:
featureLabel = [item for sublist in featureLabel for item in sublist]

In [38]:
featureLabel =  continuousCols + featureLabel

In [39]:
rfReg_list = list(zip(featureLabel, rfReg_model.featureImportances))
gbtReg_list = list(zip(featureLabel, gbtReg_model.featureImportances))
lrReg_list = list(zip(featureLabel, lrReg_model.coefficients))

In [40]:
rfReg_list

[('temp', 0.08625056110092849),
 ('pressure', 3.332790513426185e-05),
 ('humidity', 0.00023197823472587584),
 ('wind_speed', 9.484997077068865e-07),
 ('rain_3h', 0.0),
 ('snow_3h', 0.0),
 ('clouds_all', 1.0665623749348274e-05),
 ('avg_median_sales_price', 0.010324952967236489),
 ('age', 0.053238897658504945),
 (Row(Borough='Queens'), 0.0),
 (Row(Borough='Brooklyn'), 0.0005724219557696502),
 (Row(Borough='Manhattan'), 0.0),
 (Row(Borough='Bronx'), 0.005149948513978049),
 (Row(hour=12), 0.0008332951991044897),
 (Row(hour=22), 0.0009858514435814683),
 (Row(hour=1), 0.0),
 (Row(hour=13), 0.0),
 (Row(hour=6), 0.007744039405524708),
 (Row(hour=16), 0.0),
 (Row(hour=3), 4.0690079048011683e-07),
 (Row(hour=20), 8.437245944365151e-05),
 (Row(hour=5), 0.0),
 (Row(hour=19), 1.408884894057632e-06),
 (Row(hour=15), 1.3386132407409302e-05),
 (Row(hour=9), 8.976586399434265e-07),
 (Row(hour=17), 4.65893342605987e-05),
 (Row(hour=4), 2.7800830762326683e-05),
 (Row(hour=8), 5.780508691027128e-05),
 (Ro

In [49]:
gbtReg_list

[('temp', 0.051353663274186716),
 ('pressure', 0.0003840669884271461),
 ('humidity', 0.00020395571188506305),
 ('wind_speed', 5.510455562484136e-05),
 ('rain_3h', 0.0),
 ('snow_3h', 0.0),
 ('clouds_all', 0.007405543916199066),
 ('avg_median_sales_price', 0.01075237677503553),
 ('age', 0.08303696022290782),
 (Row(Borough='Queens'), 0.03271479491517374),
 (Row(Borough='Brooklyn'), 0.00724205426377252),
 (Row(Borough='Manhattan'), 0.0),
 (Row(Borough='Bronx'), 0.0010994489636055548),
 (Row(hour=12), 0.010312851169777877),
 (Row(hour=22), 0.003379740864844144),
 (Row(hour=1), 0.0025690575420281685),
 (Row(hour=13), 0.0),
 (Row(hour=6), 0.037338697426926276),
 (Row(hour=16), 0.0),
 (Row(hour=3), 1.1064504512151233e-05),
 (Row(hour=20), 0.0),
 (Row(hour=5), 0.0),
 (Row(hour=19), 0.00032316621220324874),
 (Row(hour=15), 0.003366309255817055),
 (Row(hour=9), 0.00021375294018144086),
 (Row(hour=17), 5.868757042117252e-05),
 (Row(hour=4), 0.011539479016939686),
 (Row(hour=8), 0.00033467025012510

In [50]:
lrReg_list

[('temp', 0.0016090679138498582),
 ('pressure', -5.468527999792932e-06),
 ('humidity', -0.00011916375370350847),
 ('wind_speed', -0.004012144412906427),
 ('rain_3h', -0.0018333888685843447),
 ('snow_3h', 0.0008633072364899642),
 ('clouds_all', -0.00020495199521010027),
 ('avg_median_sales_price', -1.73588182960175e-08),
 ('age', -0.002987798577820864),
 (Row(Borough='Queens'), 0.016329943588659336),
 (Row(Borough='Brooklyn'), -0.0057519258814300155),
 (Row(Borough='Manhattan'), -0.04263517900544734),
 (Row(Borough='Bronx'), -0.13467935131147177),
 (Row(hour=12), 0.05141318370637242),
 (Row(hour=22), 0.029563863587867955),
 (Row(hour=1), 0.017466545959349784),
 (Row(hour=13), -0.003572487653340081),
 (Row(hour=6), 0.09891850958984678),
 (Row(hour=16), -0.014711249497489389),
 (Row(hour=3), -0.03833157519439554),
 (Row(hour=20), -0.030838901805836236),
 (Row(hour=5), -0.04035521763357223),
 (Row(hour=19), 0.009806715711175732),
 (Row(hour=15), -0.053786655388074436),
 (Row(hour=9), -0.03