In [24]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.evaluation.RegressionEvaluator

In [25]:
val rawData = sc.textFile("/home/jovyan/data/bike-sharing/hour.csv")
val header = rawData.first()
val records = rawData.filter(line => line != header)

rawData = /home/jovyan/data/bike-sharing/hour.csv MapPartitionsRDD[138] at textFile at <console>:37
header = instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
records = MapPartitionsRDD[139] at filter at <console>:39


MapPartitionsRDD[139] at filter at <console>:39

In [26]:
records.first()

1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0,3,13,16

In [27]:
records.cache()

MapPartitionsRDD[139] at filter at <console>:39

The other 3 are casual and registered and we are not going to bother about them.

In [28]:
case class BikeRentals(season: Int, yr: Int, mnth: Int, hr: Int, holiday: Int, weekday: Int, workingday: Int, weathersit: Int, temp: Double, atemp: Double, hum: Double, windspeed: Double, cnt: Double)
def parseString(recs: String): BikeRentals = {
    val fields = recs.split(",")
    assert(fields.size==17)
    BikeRentals(fields(2).toInt, fields(3).toInt, fields(4).toInt, fields(5).toInt, fields(6).toInt, fields(7).toInt, fields(8).toInt, fields(9).toInt, fields(10).toDouble, fields(11).toDouble, fields(12).toDouble, fields(13).toDouble, fields(16).toDouble)
}

defined class BikeRentals


parseString: (recs: String)BikeRentals


In [29]:
val data = records.map(parseString).toDF()
data.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0|16.0|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|40.0|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|32.0|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 3 rows



data = [season: int, yr: int ... 11 more fields]


[season: int, yr: int ... 11 more fields]

In [30]:
val featureCols = Array("season", "yr", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed", "cnt")
val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")
val dataDF = assembler.transform(data)

featureCols = Array(season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, cnt)
assembler = vecAssembler_55a954c27d85
dataDF = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

We need to convert the output column to label so that the algorithms can find it.

In [31]:
val dataDF1 = dataDF.withColumnRenamed("cnt", "label")

dataDF1 = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

In [32]:
dataDF1.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0| 16.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 40.0|[1.0,0.0,1.0,1.0,...|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 32.0|[1.0,0.0,1.0,2.0,...|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
only showing top 3 rows



In [33]:
val Array(train, test) = dataDF1.randomSplit(Array(0.8, 0.2))
println(train.count, rawData.count)

(13932,17380)


train = [season: int, yr: int ... 12 more fields]
test = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

In [34]:
train.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|     1|  0|   1|  0|      0|      0|         0|         1|0.04|0.0758|0.57|   0.1045| 22.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  0|      0|      0|         0|         1| 0.1|0.0758|0.42|   0.3881| 25.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  0|      0|      0|         0|         1|0.16|0.1818| 0.8|   0.1045| 33.0|[1.0,0.0,1.0,0.0,...|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
only showing top 3 rows



### Model1: Linear Regression

In [35]:
val lr = new LinearRegression()
    .setMaxIter(1000)
    .setRegParam(0.3)
    .setElasticNetParam(0.8)

lr = linReg_76a5922fec64


linReg_76a5922fec64

In [36]:
//Fit the model
val lrModel = lr.fit(train)

lrModel = linReg_76a5922fec64


linReg_76a5922fec64

In [37]:
// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.998351256617081] Intercept: 0.313062072849809


In [38]:
// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

numIterations: 26
objectiveHistory: [0.5000000000000001,0.3752202767508399,0.06719464156273607,0.0368133584982284,0.0028023018670834854,0.0016922038706507396,0.0015301719505879028,0.0014951150787543388,0.0014902135601998312,0.0014900211660046289,0.0014899488408232511,0.001489920168332969,0.001489908801740442,0.0014899043481679372,0.0014899025457090981,0.001489901518164803,0.001489901368016495,0.0014899013420431675,0.001489901336995153,0.0014899013356168456,0.0014899013353729096,0.0014899013353296452,0.0014899013353219795,0.0014899013353205978,0.0014899013353205355,0.0014899013353202662]
+--------------------+
|           residuals|
+--------------------+
| -0.2777683672734348|
| -0.2727990890489984|
| -0.2595476804505026|
|-0.24960912400162982|
|-0.28605049764749424|
|-0.30261475839561935|
| -0.3059276105452433|
| -0.3059276105452433|
|-0.30261475839561935|
|  -0.294332628021559|
| -0.3059276105452433|
| -0.2993019062459954|
| -0.3042711844704318|
|-0.30261475839561935|
|-0.28605049764

trainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@4ff0e30a


org.apache.spark.ml.regression.LinearRegressionTrainingSummary@4ff0e30a

In [39]:
val predictions = lrModel.transform(test)
predictions.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+------------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|        prediction|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+------------------+
|     1|  0|   1|  0|      0|      4|         1|         1|0.26|0.2273|0.56|   0.3881| 13.0|[1.0,0.0,1.0,0.0,...|13.291628408871862|
|     1|  0|   1|  1|      0|      0|         0|         1|0.14|0.2121| 0.8|      0.0| 29.0|[1.0,0.0,1.0,1.0,...| 29.26524851474516|
|     1|  0|   1|  1|      0|      1|         1|         1|0.12|0.1212| 0.5|   0.2836|  1.0|[1.0,0.0,1.0,1.0,...|1.3114133294668902|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+------------------+
only showing top 3 rows



predictions = [season: int, yr: int ... 13 more fields]


[season: int, yr: int ... 13 more fields]

In [40]:
val eval = new RegressionEvaluator().setMetricName("rmse").setLabelCol("label").setPredictionCol("prediction")
val rmse = eval.evaluate(predictions)

eval = regEval_c951e793e831
rmse = 0.3012528180461223


0.3012528180461223

### Model 2: generalised linear regression.

In [41]:
import org.apache.spark.ml.regression.GeneralizedLinearRegression

In [42]:
val glr = new GeneralizedLinearRegression()
  .setFamily("gaussian")
  .setLink("identity")
  .setMaxIter(1000)
  .setRegParam(0.3)

glr = glm_d1f488f566af


glm_d1f488f566af

In [43]:
// Fit the model
val model = glr.fit(train)

model = glm_d1f488f566af


glm_d1f488f566af

In [44]:
// Print the coefficients and intercept for generalized linear regression model
println(s"Coefficients: ${model.coefficients}")
println(s"Intercept: ${model.intercept}")

// Summarize the model over the training set and print out some metrics
val summary = model.summary
println(s"Coefficient Standard Errors: ${summary.coefficientStandardErrors.mkString(",")}")
println(s"T Values: ${summary.tValues.mkString(",")}")
println(s"P Values: ${summary.pValues.mkString(",")}")
println(s"Dispersion: ${summary.dispersion}")
println(s"Null Deviance: ${summary.nullDeviance}")
println(s"Residual Degree Of Freedom Null: ${summary.residualDegreeOfFreedomNull}")
println(s"Deviance: ${summary.deviance}")
println(s"Residual Degree Of Freedom: ${summary.residualDegreeOfFreedom}")
println(s"AIC: ${summary.aic}")
println("Deviance Residuals: ")
summary.residuals().show()

Coefficients: [0.05262068875784558,0.2198147709089548,4.425734492548236E-4,0.020644938546103483,-0.060618511266099594,0.004433603748682125,0.006195329438959874,-0.006006023167481202,0.21577612439792626,0.6352659000588462,-0.5483618245415635,0.12114961466945055,0.9972889672299394]
Intercept: -0.06576744454785755
Coefficient Standard Errors: 0.005504146242104719,0.006781156246926457,0.0017092239905718318,5.278979267271342E-4,0.020103664459593024,0.0016232187757158967,0.0071896091458654035,0.005763975296692908,0.10739646020147199,0.12079798972315632,0.021248940369325953,0.029052850175335754,2.2926730671182233E-5,0.02112595467106544
T Values: 9.560190889427398,32.41552958001599,0.258932387853249,39.1078227453897,-3.0152966086326507,2.73136549121467,0.8617060139524665,-1.0419932179319313,2.0091549013173973,5.258911191442362,-25.806549174242814,4.169973477242512,43498.96117039846,-3.1131111266623157
P Values: 0.0,0.0,0.7956912327897652,0.0,0.002571904245709522,0.006315185398997336,0.38886414

summary = 


  windspeed   0.1211    0.0291     4.1700  0.000...


Coefficients:
    Feature Estimate Std Error    T Value P Value
(Intercept)  -0.0658    0.0211    -3.1131  0.0019
     season   0.0526    0.0055     9.5602  0.0000
         yr   0.2198    0.0068    32.4155  0.0000
       mnth   0.0004    0.0017     0.2589  0.7957
         hr   0.0206    0.0005    39.1078  0.0000
    holiday  -0.0606    0.0201    -3.0153  0.0026
    weekday   0.0044    0.0016     2.7314  0.0063
 workingday   0.0062    0.0072     0.8617  0.3889
 weathersit  -0.0060    0.0058    -1.0420  0.2974
       temp   0.2158    0.1074     2.0092  0.0445
      atemp   0.6353    0.1208     5.2589  0.0000
        hum  -0.5484    0.0212   -25.8065  0.0000
  windspeed   0.1211    0.0291     4.1700  0.0000
        cnt   0.9973    0.0000 43498.9612  0.0000

(Dispersion parameter for gaussian family taken to be 0.1469)
Null deviance: 457252312.3725 on 13935 degrees of freedom
 Residual deviance: 2047.0442 on 13935 degrees of freedom
AIC: 12847.2636

In [45]:
val predictions = model.transform(test)
predictions.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+-----------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|       prediction|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+-----------------+
|     1|  0|   1|  0|      0|      0|         0|         1|0.26| 0.303|0.56|      0.0| 39.0|(13,[0,2,7,8,9,10...|38.82009478500496|
|     1|  0|   1|  0|      0|      1|         1|         1|0.06|0.0606|0.41|    0.194|  7.0|[1.0,0.0,1.0,0.0,...| 6.82575397556851|
|     1|  0|   1|  0|      0|      1|         1|         1|0.12|0.1212| 0.5|   0.2836|  5.0|[1.0,0.0,1.0,0.0,...|4.842511065701145|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+-----------------+
only showing top 3 rows



predictions = [season: int, yr: int ... 13 more fields]


[season: int, yr: int ... 13 more fields]

In [46]:
val eval = new RegressionEvaluator().setMetricName("rmse").setLabelCol("label").setPredictionCol("prediction")
val rmse = eval.evaluate(predictions)

eval = regEval_a670a50edf73
rmse = 0.3786598762802824


0.3786598762802824

### Decision tree regression

In [47]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.regression.DecisionTreeRegressor

In [50]:
// Automatically identify categorical features, and index them.
// Here, we treat features with > 4 distinct values as continuous.
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(dataDF1)

// Train a DecisionTree model.
val dt = new DecisionTreeRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")

// Chain indexer and tree in a Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(featureIndexer, dt))

// Train model. This also runs the indexer.
val model = pipeline.fit(train)

// Make predictions.
val predictions = model.transform(test)

// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

// Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)

val treeModel = model.stages(1).asInstanceOf[DecisionTreeRegressionModel]
println("Learned regression tree model:\n" + treeModel.toDebugString)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|35.831381733021075| 39.0|(13,[0,2,7,8,9,10...|
| 4.497063142437592|  7.0|[1.0,0.0,1.0,0.0,...|
| 4.497063142437592|  5.0|[1.0,0.0,1.0,0.0,...|
| 12.62093023255814| 12.0|[1.0,0.0,1.0,0.0,...|
| 4.497063142437592|  5.0|[1.0,0.0,1.0,0.0,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 15.233787588849488
Learned regression tree model:
DecisionTreeRegressionModel (uid=dtr_cc3e9578dd9a) of depth 5 with 63 nodes
  If (feature 12 <= 259.5)
   If (feature 12 <= 102.5)
    If (feature 12 <= 40.5)
     If (feature 12 <= 17.5)
      If (feature 12 <= 8.5)
       Predict: 4.497063142437592
      Else (feature 12 > 8.5)
       Predict: 12.62093023255814
     Else (feature 12 > 17.5)
      If (feature 12 <= 31.5)
       Predict: 24.29988465974625
      Else (feature 12 > 31.5)
       

featureIndexer = vecIdx_4c151d72f6b5
dt = dtr_cc3e9578dd9a
pipeline = pipeline_5d70f042bcac
model = pipeline_5d70f042bcac
predictions = [season: int, yr: int ... 14 more fields]
evaluator = regEval_84849d5386c4
rmse = 15.233787588849488
treeModel = DecisionTreeRegressionModel (uid=dtr_cc3e9578dd9a) of depth 5 with 63 nodes


DecisionTreeRegressionModel (uid=dtr_cc3e9578dd9a) of depth 5 with 63 nodes

### Random forest regression

In [51]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(dataDF1)

// Train a RandomForest model.
val rf = new RandomForestRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")

// Chain indexer and forest in a Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(featureIndexer, rf))

// Train model. This also runs the indexer.
val model = pipeline.fit(train)

// Make predictions.
val predictions = model.transform(test)

// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

// Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)

val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
println("Learned regression forest model:\n" + rfModel.toDebugString)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|30.205306987378343| 39.0|(13,[0,2,7,8,9,10...|
| 27.05189758183248|  7.0|[1.0,0.0,1.0,0.0,...|
|22.462892752778828|  5.0|[1.0,0.0,1.0,0.0,...|
|21.096706702842148| 12.0|[1.0,0.0,1.0,0.0,...|
|20.689009715084815|  5.0|[1.0,0.0,1.0,0.0,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 34.07138081037501
Learned regression forest model:
RandomForestRegressionModel (uid=rfr_27251a44ebe7) with 20 trees
  Tree 0 (weight 1.0):
    If (feature 12 <= 278.5)
     If (feature 12 <= 113.5)
      If (feature 12 <= 51.5)
       If (feature 12 <= 23.5)
        If (feature 6 in {1.0})
         Predict: 8.654848800834202
        Else (feature 6 not in {1.0})
         Predict: 11.522875816993464
       Else (feature 12 > 23.5)
        If (feature 10 <= 0.415)
         Predict: 39.640776699

featureIndexer = vecIdx_6af78a5991af
rf = rfr_27251a44ebe7
pipeline = pipeline_1bc3a138b7d7
model = pipeline_1bc3a138b7d7
predictions = [season: int, yr: int ... 14 more fields]
evaluator = regEval_fd96db406961
rmse = 34.07138081037501


rfModel: org.apache.spark.ml.regression.RandomForestRegressionModel ...


34.07138081037501

### Gradient-boosted tree regression

In [53]:
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}

// Automatically identify categorical features, and index them.
// Set maxCategories so features with > 4 distinct values are treated as continuous.
val featureIndexer = new VectorIndexer()
  .setInputCol("features")
  .setOutputCol("indexedFeatures")
  .setMaxCategories(4)
  .fit(dataDF1)

// Split the data into training and test sets (30% held out for testing).
val Array(trainingData, testData) = dataDF1.randomSplit(Array(0.7, 0.3))

// Train a GBT model.
val gbt = new GBTRegressor()
  .setLabelCol("label")
  .setFeaturesCol("indexedFeatures")
  .setMaxIter(10)

// Chain indexer and GBT in a Pipeline.
val pipeline = new Pipeline()
  .setStages(Array(featureIndexer, gbt))

// Train model. This also runs the indexer.
val model = pipeline.fit(trainingData)

// Make predictions.
val predictions = model.transform(testData)

// Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

// Select (prediction, true label) and compute test error.
val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")
val rmse = evaluator.evaluate(predictions)
println("Root Mean Squared Error (RMSE) on test data = " + rmse)

val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
println("Learned regression GBT model:\n" + gbtModel.toDebugString)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 24.21518454819239| 22.0|[1.0,0.0,1.0,0.0,...|
|  4.14947920710293|  5.0|[1.0,0.0,1.0,0.0,...|
|  4.14947920710293|  7.0|[1.0,0.0,1.0,0.0,...|
|3.0571537907650703|  3.0|[1.0,0.0,1.0,0.0,...|
| 11.89254762432865| 13.0|[1.0,0.0,1.0,0.0,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 13.370106514281224
Learned regression GBT model:
GBTRegressionModel (uid=gbtr_70903e1d529b) with 10 trees
  Tree 0 (weight 1.0):
    If (feature 12 <= 258.5)
     If (feature 12 <= 102.5)
      If (feature 12 <= 41.5)
       If (feature 12 <= 17.5)
        If (feature 12 <= 7.5)
         Predict: 4.134831460674158
        Else (feature 12 > 7.5)
         Predict: 11.877899877899878
       Else (feature 12 > 17.5)
        If (feature 12 <= 32.5)
         Predict: 24.824516129032258
        Els

featureIndexer = vecIdx_2f0ae606ec22
trainingData = [season: int, yr: int ... 12 more fields]
testData = [season: int, yr: int ... 12 more fields]
gbt = gbtr_70903e1d529b
pipeline = pipeline_ef1de030c003
model = pipeline_ef1de030c003
predictions = [season: int, yr: int ... 14 mo...


lastException: Throwable = null


[season: int, yr: int ... 14 more fields]

### Survival regression

In [56]:
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression

val quantileProbabilities = Array(0.3, 0.6)
val aft = new AFTSurvivalRegression()
  .setQuantileProbabilities(quantileProbabilities)
  .setQuantilesCol("quantiles")

val model = aft.fit(train)

// Print the coefficients, intercept and scale parameter for AFT survival regression
println(s"Coefficients: ${model.coefficients}")
println(s"Intercept: ${model.intercept}")
println(s"Scale: ${model.scale}")
model.transform(train).show(false)

Name: java.lang.IllegalArgumentException
Message: Field "censor" does not exist.
Available fields: season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, label, features
StackTrace: Available fields: season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, label, features
  at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:267)
  at org.apache.spark.sql.types.StructType$$anonfun$apply$1.apply(StructType.scala:267)
  at scala.collection.MapLike$class.getOrElse(MapLike.scala:128)
  at scala.collection.AbstractMap.getOrElse(Map.scala:59)
  at org.apache.spark.sql.types.StructType.apply(StructType.scala:266)
  at org.apache.spark.ml.util.SchemaUtils$.checkNumericType(SchemaUtils.scala:71)
  at org.apache.spark.ml.regression.AFTSurvivalRegressionParams$class.validateAndTransformSchema(AFTSurvivalRegression.scala:109)
  at org.apache.spark.ml.regression.AFTSurvivalRegression.validateAndTr

### Isotonic Regression

In [58]:
import org.apache.spark.ml.regression.IsotonicRegression

// Trains an isotonic regression model.
val ir = new IsotonicRegression()
val model = ir.fit(train)

println(s"Boundaries in increasing order: ${model.boundaries}\n")
println(s"Predictions associated with the boundaries: ${model.predictions}\n")

// Makes predictions.
model.transform(test).show()

Boundaries in increasing order: [1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0

ir = isoReg_5d9e62af97cf
model = isoReg_5d9e62af97cf


isoReg_5d9e62af97cf