In [1]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.feature.VectorAssembler

In [2]:
val rawData = sc.textFile("/home/jovyan/data/bike-sharing/hour.csv")
val header = rawData.first()
val records = rawData.filter(line => line != header)

rawData = /home/jovyan/data/bike-sharing/hour.csv MapPartitionsRDD[1] at textFile at <console>:29
header = instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
records = MapPartitionsRDD[2] at filter at <console>:31


MapPartitionsRDD[2] at filter at <console>:31

In [3]:
records.first()

1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0,3,13,16

In [4]:
records.cache()

MapPartitionsRDD[2] at filter at <console>:31

The other 3 are casual and registered and we are not going to bother about them.

In [5]:
case class BikeRentals(season: Int, yr: Int, mnth: Int, hr: Int, holiday: Int, weekday: Int, workingday: Int, weathersit: Int, temp: Double, atemp: Double, hum: Double, windspeed: Double, cnt: Double)
def parseString(recs: String): BikeRentals = {
    val fields = recs.split(",")
    assert(fields.size==17)
    BikeRentals(fields(2).toInt, fields(3).toInt, fields(4).toInt, fields(5).toInt, fields(6).toInt, fields(7).toInt, fields(8).toInt, fields(9).toInt, fields(10).toDouble, fields(11).toDouble, fields(12).toDouble, fields(13).toDouble, fields(16).toDouble)
}

defined class BikeRentals


parseString: (recs: String)BikeRentals


In [6]:
val data = records.map(parseString).toDF()
data.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0|16.0|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|40.0|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0|32.0|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 3 rows



data = [season: int, yr: int ... 11 more fields]


[season: int, yr: int ... 11 more fields]

In [7]:
val featureCols = Array("season", "yr", "mnth", "hr", "holiday", "weekday", "workingday", "weathersit", "temp", "atemp", "hum", "windspeed", "cnt")
val assembler = new VectorAssembler().setInputCols(featureCols).setOutputCol("features")
val dataDF = assembler.transform(data)

featureCols = Array(season, yr, mnth, hr, holiday, weekday, workingday, weathersit, temp, atemp, hum, windspeed, cnt)
assembler = vecAssembler_1d4ed3217b69
dataDF = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

We need to convert the output column to label so that the algorithms can find it.

In [8]:
val dataDF1 = dataDF.withColumnRenamed("cnt", "label")

dataDF1 = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

In [9]:
dataDF1.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|     1|  0|   1|  0|      0|      6|         0|         1|0.24|0.2879|0.81|      0.0| 16.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  1|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 40.0|[1.0,0.0,1.0,1.0,...|
|     1|  0|   1|  2|      0|      6|         0|         1|0.22|0.2727| 0.8|      0.0| 32.0|[1.0,0.0,1.0,2.0,...|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
only showing top 3 rows



In [10]:
val Array(train, test) = dataDF1.randomSplit(Array(0.8, 0.2))
println(train.count, rawData.count)

(13906,17380)


train = [season: int, yr: int ... 12 more fields]
test = [season: int, yr: int ... 12 more fields]


[season: int, yr: int ... 12 more fields]

In [11]:
train.show(3)

+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|season| yr|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed|label|            features|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
|     1|  0|   1|  0|      0|      0|         0|         1|0.04|0.0758|0.57|   0.1045| 22.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  0|      0|      0|         0|         1|0.16|0.1818| 0.8|   0.1045| 33.0|[1.0,0.0,1.0,0.0,...|
|     1|  0|   1|  0|      0|      0|         0|         1|0.26| 0.303|0.56|      0.0| 39.0|(13,[0,2,7,8,9,10...|
+------+---+----+---+-------+-------+----------+----------+----+------+----+---------+-----+--------------------+
only showing top 3 rows



In [12]:
val lr = new LinearRegression()
    .setMaxIter(10)
    .setRegParam(0.3)
    .setElasticNetParam(0.8)

lr = linReg_8fe778811336


linReg_8fe778811336

In [13]:
//Fit the model
val lrModel = lr.fit(train)

lrModel = linReg_8fe778811336


linReg_8fe778811336

In [14]:
// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9984925404171867] Intercept: 0.28553578317316514


In [15]:
// Summarize the model over the training set and print out some metrics
val trainingSummary = lrModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

numIterations: 11
objectiveHistory: [0.5000000000000002,0.37539056214362565,0.06723242887105649,0.036563371481683815,0.0027664543901562127,0.0016725370526272047,0.0015334935430298493,0.0014972371326115084,0.0014933634383373934,0.0014933159661761567,0.0014932970240573904]
+--------------------+
|           residuals|
+--------------------+
|-0.25237167235127345|
| -0.2357896169403304|
| -0.2267448594434498|
|-0.25990897026533943|
| -0.2749835660934714|
|-0.27799848525909887|
|-0.27799848525909887|
| -0.2749835660934714|
| -0.2674462681794054|
| -0.2719686469278457|
| -0.2749835660934714|
|-0.25990897026533943|
|-0.28101340442472544|
| -0.2749835660934714|
|-0.26895372776221826|
| -0.2659388085965926|
|-0.26443134901377796|
|-0.25990897026533943|
| -0.2719686469278457|
|-0.25387913193408806|
+--------------------+
only showing top 20 rows

RMSE: 0.2723114411941159
r2: 0.9999977275656062


trainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@42134ef5


org.apache.spark.ml.regression.LinearRegressionTrainingSummary@42134ef5