In [20]:
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Encoders

import java.io.File
import scala.collection.mutable.ListBuffer
import java.lang.Math._

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.{LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit, CrossValidator}
import org.apache.spark.ml.feature.{VectorAssembler, MinMaxScaler, StandardScaler}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.tree.{RandomForest, GradientBoostedTrees}
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
//import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
import org.apache.spark.ml.{Pipeline, PipelineModel}

In [2]:
// Create (or reuse) the notebook's SparkSession with the `local[*]` master,
// then bring its implicits into scope (required later by `.as[Record]`).
val spark = SparkSession
  .builder()
  .appName("SparkPreProcessing")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@1d76ae37


org.apache.spark.sql.SparkSession@1d76ae37

In [3]:
// Location of the CSV file that is loaded into `df` further down.
val path: String = "/media/sf_Documents/codes/id2221_project/data/samples_2019.csv"

path = /media/sf_Documents/codes/id2221_project/data/samples_2019.csv


/media/sf_Documents/codes/id2221_project/data/samples_2019.csv

In [4]:
/** One row of the input CSV; also the source of the read schema (via `Encoders.product`).
  *
  * @param latitude           latitude column
  * @param longitude          longitude column
  * @param year               year component of the sample timestamp
  * @param month              month component of the sample timestamp
  * @param day                day component of the sample timestamp
  * @param hour               hour component of the sample timestamp
  * @param min                minute component of the sample timestamp (presumably; confirm against the data)
  * @param nb_bikes_available value later renamed to `label` and used as the regression target
  */
case class Record(
  latitude: Double,
  longitude: Double,
  year: Int,
  month: Int,
  day: Int,
  hour: Int,
  min: Int,
  nb_bikes_available: Int
)

defined class Record


In [5]:
// Derive the Spark SQL schema from the `Record` case class instead of spelling it out by hand.
val schema: StructType = Encoders.product[Record].schema

schema = StructType(StructField(latitude,DoubleType,false), StructField(longitude,DoubleType,false), StructField(year,IntegerType,false), StructField(month,IntegerType,false), StructField(day,IntegerType,false), StructField(hour,IntegerType,false), StructField(min,IntegerType,false), StructField(nb_bikes_available,IntegerType,false))


StructType(StructField(latitude,DoubleType,false), StructField(longitude,DoubleType,false), StructField(year,IntegerType,false), StructField(month,IntegerType,false), StructField(day,IntegerType,false), StructField(hour,IntegerType,false), StructField(min,IntegerType,false), StructField(nb_bikes_available,IntegerType,false))

In [6]:
// Load the CSV at `path` with the explicit `Record` schema (first line is a header)
// and expose it as a typed Dataset.
val df: Dataset[Record] = spark.read
  .schema(schema)
  .option("header", "true")
  .csv(path)
  .as[Record]

df = [latitude: double, longitude: double ... 6 more fields]


[latitude: double, longitude: double ... 6 more fields]

In [7]:
// Rename the target column to `label`, the column name the regressors and evaluators below are
// configured with. Declared as `val`: nothing in the notebook reassigns it.
val finalDF: DataFrame = df.withColumnRenamed("nb_bikes_available", "label")

finalDF = [latitude: double, longitude: double ... 6 more fields]


[latitude: double, longitude: double ... 6 more fields]

In [8]:
// Sanity check: print the column names and types after the rename to `label`.
finalDF.printSchema()

root
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- min: integer (nullable = true)
 |-- label: integer (nullable = true)


In [9]:
// Random split with a fixed seed so the partition is reproducible.
// Weights are positional: the first part (0.20) is the test set, the second (0.80) the training set.
val Array(testingSet, trainingSet) =
  finalDF.randomSplit(Array(0.20, 0.80), seed = 12345L)

testingSet = [latitude: double, longitude: double ... 6 more fields]
trainingSet = [latitude: double, longitude: double ... 6 more fields]


[latitude: double, longitude: double ... 6 more fields]

In [10]:
// Pack the seven raw input columns into a single vector column named `features`.
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(Array("latitude", "longitude", "year", "month", "day", "hour", "min"))

assembler = vecAssembler_7b7bff10a4df


vecAssembler_7b7bff10a4df

In [11]:
// Standardise the assembled `features` vector (scaler left at its default settings)
// and write the result to `scaledFeatures`, which every regressor below reads.
val scaler = new StandardScaler()
  .setOutputCol("scaledFeatures")
  .setInputCol("features")

scaler = stdScal_3a0b734eeb0a


stdScal_3a0b734eeb0a

## Linear Regression

In [15]:
// Linear regression estimator: predicts `label` from the scaled feature vector.
val model = new LinearRegression()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("label")

model = linReg_6c199617a2cd


linReg_6c199617a2cd

In [16]:
// Chain the stages in execution order: assemble features -> scale -> regress.
val pipeline = new Pipeline().setStages(Array(assembler, scaler, model))

pipeline = pipeline_5d24b4785446


pipeline_5d24b4785446

In [17]:
// Fit every pipeline stage on the training split; the result is the fitted linear-regression pipeline.
val MLmodel: PipelineModel = pipeline.fit(trainingSet)

MLmodel = pipeline_5d24b4785446


pipeline_5d24b4785446

In [18]:
// Run the fitted linear-regression pipeline over the held-out test split.
val prediction: DataFrame = MLmodel.transform(testingSet)

prediction = [latitude: double, longitude: double ... 9 more fields]


[latitude: double, longitude: double ... 9 more fields]

In [19]:
// Eyeball the first ten predictions next to their true labels.
prediction
  .select(col("prediction"), col("label"))
  .show(numRows = 10)

+------------------+-----+
|        prediction|label|
+------------------+-----+
| 95.90124003253004|   20|
| 99.02144729881547|   38|
| 103.4509911803616|   41|
| 94.36212870779855|   19|
| 95.53435266739689|   17|
|  97.4386096316739|   22|
|  99.3111438271153|   16|
|100.26681461645057|   13|
|100.13433803436055|   13|
|101.83717244793661|   22|
+------------------+-----+
only showing top 10 rows


In [20]:
// Score the linear-regression predictions on the test split with the "rmse" metric.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setPredictionCol("prediction")
  .setLabelCol("label")
val rmse: Double = evaluator.evaluate(prediction)
println(s"RMSE = $rmse")

RMSE = 135.16015495741954


evaluator = regEval_e736b30e0049
rmse = 135.16015495741954


135.16015495741954

## Decision Tree

In [34]:
// Decision-tree regressor on the scaled features: depth capped at 10, at most 25 bins per feature.
val model = new DecisionTreeRegressor()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("label")
  .setMaxBins(25)
  .setMaxDepth(10)

model = dtr_64cebba47bfb


dtr_64cebba47bfb

In [35]:
// Same preprocessing as before (assemble -> scale), now feeding the decision-tree regressor.
val pipeline = new Pipeline().setStages(Array(assembler, scaler, model))

pipeline = pipeline_14fc301b29b4


pipeline_14fc301b29b4

In [36]:
// Fit the decision-tree pipeline on the training split.
val DTmodel: PipelineModel = pipeline.fit(trainingSet)

DTmodel = pipeline_14fc301b29b4


pipeline_14fc301b29b4

In [41]:
// Apply the fitted decision-tree pipeline to the held-out test split.
val prediction: DataFrame = DTmodel.transform(testingSet)

prediction = [latitude: double, longitude: double ... 9 more fields]


[latitude: double, longitude: double ... 9 more fields]

In [38]:
// Compare the first ten decision-tree predictions with the true labels.
prediction
  .select(col("prediction"), col("label"))
  .show(numRows = 10)

+------------------+-----+
|        prediction|label|
+------------------+-----+
| 19.17808219178082|   20|
| 26.78053435114504|   38|
|32.481540930979136|   41|
| 19.17808219178082|   19|
| 19.17808219178082|   17|
| 26.78053435114504|   22|
| 26.78053435114504|   16|
|32.481540930979136|   13|
|32.481540930979136|   13|
|32.481540930979136|   22|
+------------------+-----+
only showing top 10 rows



In [43]:
// Score the decision-tree predictions on the test split with the "rmse" metric.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setPredictionCol("prediction")
  .setLabelCol("label")
val rmse: Double = evaluator.evaluate(prediction)
println(s"RMSE = $rmse")

RMSE = 82.10004877789012


evaluator = regEval_7ca6aa89cb9e
rmse = 82.10004877789012


lastException: Throwable = null


82.10004877789012

In [44]:
// Persist the fitted decision-tree pipeline to disk.
// `overwrite()` lets this cell be re-run: a plain `write.save` throws when the
// target directory already exists from an earlier run.
DTmodel.write.overwrite().save("/home/hamid/Documents/DTmodel")

## Random Forest

In [12]:
// Random-forest regressor on the scaled features:
// 20 trees (15 was tried earlier), depth capped at 10, at most 25 bins per feature.
val model = new RandomForestRegressor()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("label")
  .setMaxBins(25)
  .setMaxDepth(10)
  .setNumTrees(20)

model = rfr_d096c0a4ea65


rfr_d096c0a4ea65

In [13]:
// Same preprocessing as before (assemble -> scale), now feeding the random-forest regressor.
val pipeline = new Pipeline().setStages(Array(assembler, scaler, model))

pipeline = pipeline_38c84d66a300


pipeline_38c84d66a300

In [14]:
// Fit the random-forest pipeline on the training split.
val RFmodel: PipelineModel = pipeline.fit(trainingSet)

RFmodel = pipeline_38c84d66a300


pipeline_38c84d66a300

In [15]:
// Apply the fitted random-forest pipeline to the held-out test split.
val prediction: DataFrame = RFmodel.transform(testingSet)

prediction = [latitude: double, longitude: double ... 9 more fields]


[latitude: double, longitude: double ... 9 more fields]

In [16]:
// Compare the first ten random-forest predictions with the true labels.
prediction
  .select(col("prediction"), col("label"))
  .show(numRows = 10)

+------------------+-----+
|        prediction|label|
+------------------+-----+
| 32.76295904008222|   20|
| 65.24489581815922|   38|
| 74.05442906779263|   41|
|30.672805761534608|   19|
| 33.83484966003607|   17|
| 50.81481771142353|   22|
| 62.70892917069615|   16|
| 65.40614850799629|   13|
| 66.43925569947581|   13|
| 78.00747689021348|   22|
+------------------+-----+
only showing top 10 rows



In [17]:
// Score the random-forest predictions on the test split with the "rmse" metric.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setPredictionCol("prediction")
  .setLabelCol("label")
val rmse: Double = evaluator.evaluate(prediction)
println(s"RMSE = $rmse")

RMSE = 92.2659275606024


evaluator = regEval_b5b133157bcc
rmse = 92.2659275606024


92.2659275606024

## Gradient Boosted Tree

In [None]:
// Boosting configuration for the RDD-based (`mllib`) gradient-boosted-trees trainer:
// start from the library defaults for regression, then set 15 boosting iterations
// and a maximum depth of 10 for each tree.
val boostingStrategy = BoostingStrategy.defaultParams("Regression")
boostingStrategy.setNumIterations(15)
boostingStrategy.treeStrategy.setMaxDepth(10)

In [None]:
// Train a gradient-boosted tree ensemble with the RDD-based `mllib` API.
// `GradientBoostedTrees.train` takes an RDD[LabeledPoint], and no `trainingData` value is
// defined in the visible notebook, so it is derived here from `trainingSet`: the label and
// the seven feature columns are cast to double and packed into one LabeledPoint per row.
// NOTE(review): `getDouble` throws on null cells — assumes the training split has no nulls.
val trainingData = trainingSet
  .select(
    (col("label") +:
      Array("latitude", "longitude", "year", "month", "day", "hour", "min").map(name => col(name)))
      .map(_.cast(DoubleType)): _*
  )
  .rdd
  .map { row =>
    // Position 0 is the label; positions 1..n are the features, in the order selected above.
    val features = Array.tabulate(row.length - 1)(i => row.getDouble(i + 1))
    org.apache.spark.mllib.regression.LabeledPoint(
      row.getDouble(0),
      org.apache.spark.mllib.linalg.Vectors.dense(features)
    )
  }
val model = GradientBoostedTrees.train(trainingData, boostingStrategy)

In [25]:
// Gradient-boosted-trees regressor (DataFrame-based `ml` API) on the scaled features:
// 15 boosting iterations, depth capped at 10, at most 25 bins per feature.
val model = new GBTRegressor()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("label")
  .setMaxIter(15)
  .setMaxBins(25)
  .setMaxDepth(10)

model = gbtr_7a07513f3c8b


gbtr_7a07513f3c8b

In [26]:
// Same preprocessing as before (assemble -> scale), now feeding the gradient-boosted-trees regressor.
val pipeline = new Pipeline().setStages(Array(assembler, scaler, model))

pipeline = pipeline_b5ca08761b96


pipeline_b5ca08761b96

In [27]:
// Fit the gradient-boosted-trees pipeline on the training split.
val GBTmodel: PipelineModel = pipeline.fit(trainingSet)

GBTmodel = pipeline_b5ca08761b96


pipeline_b5ca08761b96

In [45]:
// Apply the fitted gradient-boosted-trees pipeline to the held-out test split.
val prediction: DataFrame = GBTmodel.transform(testingSet)

prediction = [latitude: double, longitude: double ... 9 more fields]


[latitude: double, longitude: double ... 9 more fields]

In [29]:
// Compare the first ten gradient-boosted-trees predictions with the true labels.
prediction
  .select(col("prediction"), col("label"))
  .show(numRows = 10)

+------------------+-----+
|        prediction|label|
+------------------+-----+
|28.348866569752655|   20|
| 34.48318381068084|   38|
|36.861730425610425|   41|
|14.622622948167724|   19|
|28.941682457182726|   17|
|25.853737551770656|   22|
|28.624234590466262|   16|
| 30.69735682971335|   13|
| 30.69735682971335|   13|
|22.042565315140973|   22|
+------------------+-----+
only showing top 10 rows



In [46]:
// Score the gradient-boosted-trees predictions on the test split with the "rmse" metric.
val evaluator = new RegressionEvaluator()
  .setMetricName("rmse")
  .setPredictionCol("prediction")
  .setLabelCol("label")
val rmse: Double = evaluator.evaluate(prediction)
println(s"RMSE = $rmse")

RMSE = 59.79216397171131


evaluator = regEval_e937147add7f
rmse = 59.79216397171131


59.79216397171131

In [47]:
// Persist the fitted gradient-boosted-trees pipeline to disk.
// `overwrite()` lets this cell be re-run: a plain `write.save` throws when the
// target directory already exists from an earlier run.
GBTmodel.write.overwrite().save("/home/hamid/Documents/GBTmodel")