# Models

In [1]:
import org.apache.spark.SparkContext
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
import org.apache.spark.sql.Encoders
import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer}

In [2]:
val spark = SparkSession.builder.master("local[*]").appName("SparkML").getOrCreate()
import spark.implicits._

spark = org.apache.spark.sql.SparkSession@5634bf81


org.apache.spark.sql.SparkSession@5634bf81

In [3]:
val path = "/home/florent/id2221_project/data/outputs/dataset/2019-09.csv"

path = /home/florent/id2221_project/data/outputs/dataset/2019-09.csv


/home/florent/id2221_project/data/outputs/dataset/2019-09.csv

In [4]:
// Load and parse the data file.

var dataRDD = sc.textFile(path)
val header = dataRDD.first()
dataRDD = dataRDD.filter(row => row != header)

dataRDD = MapPartitionsRDD[2] at filter at <console>:49
header = x_coo,y_coo,z_coo,year,month,day,hour,min,nb_bikes_available
dataRDD = MapPartitionsRDD[2] at filter at <console>:49


MapPartitionsRDD[2] at filter at <console>:49

In [5]:
val parsedData = dataRDD.map { line =>
    val parts = line.split(",").map(_.toDouble)
    LabeledPoint(parts.last, Vectors.dense(parts.init))
}

parsedData = MapPartitionsRDD[3] at map at <console>:43


MapPartitionsRDD[3] at map at <console>:43

In [6]:
parsedData.take(2)

Array((31.0,[0.4909949444970359,-0.4513649771070291,0.7451131604793488,2019.0,9.0,1.0,0.0,0.0]), (30.0,[0.4909949444970359,-0.4513649771070291,0.7451131604793488,2019.0,9.0,1.0,0.0,41.0]))

## Decision Tree

Reference: https://spark.apache.org/docs/1.0.2/mllib-decision-tree.html

In [7]:
val maxDepth = 5
val model = DecisionTree.train(parsedData,
                               Regression,
                               Variance,
                               maxDepth)

maxDepth = 5
model = DecisionTreeModel regressor of depth 5 with 63 nodes


DecisionTreeModel regressor of depth 5 with 63 nodes

In [8]:
val valuesAndPreds = parsedData.map { point =>
    val prediction = model.predict(point.features)
    (point.label, prediction)
}

valuesAndPreds = MapPartitionsRDD[28] at map at <console>:45


MapPartitionsRDD[28] at map at <console>:45

In [9]:
val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.mean()
println("training Mean Squared Error = " + MSE)

training Mean Squared Error = 28328.85010637164


MSE = 28328.85010637164


28328.85010637164

In [10]:
model.save(sc, "./decision_tree.model")

## Working with a dataset

In [91]:
case class Record(x_coo: Double,y_coo: Double, z_coo: Double, year: Int, month: Int, day: Int, hour: Int, min: Int, nb_bikes_available: Int)
val schema = Encoders.product[Record].schema

defined class Record
schema = StructType(StructField(x_coo,DoubleType,false), StructField(y_coo,DoubleType,false), StructField(z_coo,DoubleType,false), StructField(year,IntegerType,false), StructField(month,IntegerType,false), StructField(day,IntegerType,false), StructField(hour,IntegerType,false), StructField(min,IntegerType,false), StructField(nb_bikes_available,IntegerType,false))


StructType(StructField(x_coo,DoubleType,false), StructField(y_coo,DoubleType,false), StructField(z_coo,DoubleType,false), StructField(year,IntegerType,false), StructField(month,IntegerType,false), StructField(day,IntegerType,false), StructField(hour,IntegerType,false), StructField(min,IntegerType,false), StructField(nb_bikes_available,IntegerType,false))

In [92]:
val data = spark.read
                .option("header", "true")
                .schema(schema)
                .csv(path)
                .as[Record]

data = [x_coo: double, y_coo: double ... 7 more fields]


[x_coo: double, y_coo: double ... 7 more fields]

In [93]:
data.show(10)

+------------------+-------------------+------------------+----+-----+---+----+---+------------------+
|             x_coo|              y_coo|             z_coo|year|month|day|hour|min|nb_bikes_available|
+------------------+-------------------+------------------+----+-----+---+----+---+------------------+
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   0|  0|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   0| 41|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   0| 59|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   1|  5|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   1| 15|                31|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1|   2| 13|                30|
|0.4909949444970359|-0.4513649771070291|0.7451131604793488|2019|    9|  1

In [96]:
val cols = Array("x_coo", "y_coo", "z_coo", "year", "month", "day", "hour", "min")

// VectorAssembler to add feature column
// input columns - cols
// feature column - features
val assembler = new VectorAssembler().setInputCols(cols)
                                     .setOutputCol("features")

val featureDf = assembler.transform(data)

root
 |-- x_coo: double (nullable = true)
 |-- y_coo: double (nullable = true)
 |-- z_coo: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- min: integer (nullable = true)
 |-- nb_bikes_available: integer (nullable = true)
 |-- features: vector (nullable = true)



cols = Array(x_coo, y_coo, z_coo, year, month, day, hour, min)
assembler = vecAssembler_55260010e253
featureDf = [x_coo: double, y_coo: double ... 8 more fields]


[x_coo: double, y_coo: double ... 8 more fields]

In [97]:
featureDf.printSchema()

root
 |-- x_coo: double (nullable = true)
 |-- y_coo: double (nullable = true)
 |-- z_coo: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- min: integer (nullable = true)
 |-- nb_bikes_available: integer (nullable = true)
 |-- features: vector (nullable = true)



In [101]:
val indexer = new StringIndexer().setInputCol("nb_bikes_available")
                                 .setOutputCol("label")

val labelDf = indexer.fit(featureDf).transform(featureDf)

indexer = strIdx_58df42cb4a8b
labelDf = [x_coo: double, y_coo: double ... 9 more fields]


[x_coo: double, y_coo: double ... 9 more fields]

In [102]:
labelDf.printSchema()

root
 |-- x_coo: double (nullable = true)
 |-- y_coo: double (nullable = true)
 |-- z_coo: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- hour: integer (nullable = true)
 |-- min: integer (nullable = true)
 |-- nb_bikes_available: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [104]:
val seed = 5043
val Array(trainingData, testData) = labelDf.randomSplit(Array(0.7, 0.3), seed)

seed = 5043
trainingData = [x_coo: double, y_coo: double ... 9 more fields]
testData = [x_coo: double, y_coo: double ... 9 more fields]


[x_coo: double, y_coo: double ... 9 more fields]

## Random forest regression

In [118]:
val rf = new RandomForestRegressor().setLabelCol("label")
                                    .setFeaturesCol("features")

rf = rfr_74796955c5ef


rfr_74796955c5ef

In [119]:
val rfModel = rf.fit(trainingData)

rfModel = RandomForestRegressionModel (uid=rfr_74796955c5ef) with 20 trees


RandomForestRegressionModel (uid=rfr_74796955c5ef) with 20 trees

In [120]:
val predictions = rfModel.transform(testData)

// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

+-----------------+-----+--------------------+
|       prediction|label|            features|
+-----------------+-----+--------------------+
|100.2909265923264|  3.0|[-0.1145248311126...|
|100.2909265923264| 10.0|[-0.1145248311126...|
|100.2909265923264| 11.0|[-0.1145248311126...|
|100.2909265923264|  9.0|[-0.1145248311126...|
|100.2909265923264| 36.0|[-0.1145248311126...|
+-----------------+-----+--------------------+
only showing top 5 rows



predictions = [x_coo: double, y_coo: double ... 10 more fields]


[x_coo: double, y_coo: double ... 10 more fields]

### Modelling (hamid)

In [None]:
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.{LinearRegression, DecisionTreeRegressor, RandomForestRegressor, GBTRegressor}
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit, CrossValidator}
import org.apache.spark.ml.feature.{VectorAssembler, MinMaxScaler, StandardScaler}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.tree.RandomForest
//import ml.dmlc.xgboost4j.scala.spark.{XGBoostRegressionModel, XGBoostRegressor}
import org.apache.spark.ml.{Pipeline, PipelineModel}

In [None]:
case class Record(latitude: Double,longitude: Double, year: Int, month: Int, day: Int, hour: Int, min: Int, nb_bikes_available: Int)
val schema = Encoders.product[Record].schema

In [None]:
val data = spark.read
                .option("header", "true")
                .schema(schema)
                .csv(path)
                .as[Record]

In [None]:
val seed = 5043
val Array(trainingData, testData) = data.randomSplit(Array(0.7, 0.3), seed)

In [None]:
val cols = Array("latitude", "longitude", "year", "month", "day", "hour", "min")

// VectorAssembler to add feature column
// input columns - cols
// feature column - features
val assembler = new VectorAssembler().setInputCols(cols)
                                     .setOutputCol("features")

//val featureDf = assembler.transform(data)

In [None]:
val indexer = new StringIndexer().setInputCol("nb_bikes_available")
                                 .setOutputCol("label")

//val labelDf = indexer.fit(featureDf).transform(featureDf)

In [None]:
val scaler = new StandardScaler()
                 .setInputCol("features")
                 .setOutputCol("scaledFeatures")

In [None]:
val model = new RandomForestRegressor()
                .setLabelCol("label")
                .setFeaturesCol("scaledFeatures")
                .setNumTrees(25)
                .setMaxDepth(10)
                .setMaxBins(25)

In [None]:
val pipeline = new Pipeline()
                .setStages(Array(assembler, indexer, scaler, model))

In [None]:
val model = pipeline.fit(trainingSet)

In [None]:
val RFPrediction = model.transform(testingSet)

In [None]:
RFPrediction.select("prediction", "label").show(10)

In [None]:
//.setNumTrees(25)
//.setMaxDepth(10)
//.setMaxBins(25)
val evaluator = new RegressionEvaluator()
                    .setLabelCol("label")
                    .setPredictionCol("prediction")
                    .setMetricName("rmse")
val rmse = evaluator.evaluate(RFPrediction)
println(s"RMSE = $rmse")