In [1]:
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.feature.StandardScaler 


Creating SparkContext as 'sc'
Creating SqlContext as 'sqlContext'
Creating HiveContext as 'hiveContext'
import org.apache.spark.mllib.feature.StandardScaler

In [2]:
// Read the CSV file as an RDD of text lines.
val input = sc.textFile("wasb:///autos.csv")

// Turn each comma-separated line into a LabeledPoint:
// column 0 becomes the label, columns 1 through 7 become the dense feature vector.
// The result is persisted because several later steps read it.
val featurevector = input.map { row =>
  val fields = row.split(',')
  val features = Vectors.dense(fields.slice(1, 8).map(_.toDouble))
  LabeledPoint(fields(0).toDouble, features)
}.persist()


featurevector: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[2] at map at <console>:28

In [3]:
// Fit a StandardScaler on the feature vectors and rescale every point, keeping its label.
// NOTE(review): the no-arg StandardScaler scales to unit standard deviation but does not
// centre the data (withMean = false, withStd = true) -- confirm this is intended.
// NOTE(review): the scaler is fitted on the full data set, before the train/test split,
// so statistics from the held-out rows influence the scaling -- consider fitting on the
// training split only.
val scaler = new StandardScaler().fit(featurevector.map(_.features))

// transform accepts the feature Vector directly; the features are already dense, so
// copying them through toArray / Vectors.dense was redundant work.
val scaledData = featurevector.map(point => LabeledPoint(point.label, scaler.transform(point.features)))


scaledData: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[5] at map at <console>:31

In [6]:
// Randomly split the scaled data with weights 0.7 / 0.3; the fixed seed keeps the split repeatable.
val allData = scaledData.randomSplit(Array(0.7, 0.3), seed = 11L)
// First split is used for training, second for testing.
val Array(training, test) = allData



training: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[6] at randomSplit at <console>:33
test: org.apache.spark.rdd.RDD[org.apache.spark.mllib.regression.LabeledPoint] = MapPartitionsRDD[7] at randomSplit at <console>:33

In [7]:
// SGD hyper-parameters.
val numIterations = 100
val stepSize = 0.001

// Linear regression trained with stochastic gradient descent, with an intercept term enabled.
val algorithm = new LinearRegressionWithSGD()
algorithm.setIntercept(true)
// The optimizer setters return the optimizer itself, so they can be chained.
algorithm.optimizer.setNumIterations(numIterations).setStepSize(stepSize)


res6: algorithm.optimizer.type = org.apache.spark.mllib.optimization.GradientDescent@33d02597

In [8]:
// Train the configured regression algorithm on the training split.
val model: LinearRegressionModel = algorithm.run(training)

model: org.apache.spark.mllib.regression.LinearRegressionModel = org.apache.spark.mllib.regression.LinearRegressionModel: intercept = 1.0453147863792198, numFeatures = 7

In [9]:
// Pair each test point's actual label with the model's prediction for its features.
val valuesAndPreds = test.map(point => (point.label, model.predict(point.features)))
// Pull the first 20 (label, prediction) pairs to the driver for a quick look.
valuesAndPreds.take(20)


res7: Array[(Double, Double)] = Array((18.0,21.73829610544691), (16.0,21.847308715274426), (15.0,21.5213273965266), (15.0,21.648102228681044), (14.0,21.432607843528572), (15.0,21.58980829309481), (24.0,22.086207696159754), (25.0,22.40982931254288), (26.0,21.874441339602015), (19.0,22.073750795258192), (14.0,22.140757973615873), (14.0,22.266296901876675), (12.0,22.179968212493833), (19.0,22.33109945991472), (23.0,22.091506861093198), (30.0,22.85532175199785), (23.0,23.551281176582325), (13.0,22.44705026736736), (17.0,22.34540231057206), (13.0,22.616024568736798))

In [10]:
// Mean squared error over the (label, prediction) pairs in valuesAndPreds.
val MSE = valuesAndPreds.map { case (actual, predicted) => math.pow(actual - predicted, 2) }.mean()
// valuesAndPreds is built from the test split, so this is the test error, not the training error.
println(s"test Mean Squared Error = $MSE")


training Mean Squared Error = 50.91870946643104