**Example from the Spark MLlib documentation**

In [3]:
import org.apache.spark.ml.regression.LinearRegression

// Read the LIBSVM-formatted sample data into a DataFrame
val training = spark.read.format("libsvm").load("./datasets/sample_linear_regression_data.txt")

// Configure the estimator: elastic-net mixing 0.8, regularization 0.3, at most 10 iterations
val lr = new LinearRegression()
  .setElasticNetParam(0.8)
  .setRegParam(0.3)
  .setMaxIter(10)

// Train on the data loaded above
val lrModel = lr.fit(training)

// Report the learned coefficients and intercept
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

// Training-set diagnostics: optimizer trace, residuals, then the error metrics
val trainingSummary = lrModel.summary
Seq(
  s"numIterations: ${trainingSummary.totalIterations}",
  s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]"
).foreach(println)
trainingSummary.residuals.show()
Seq(
  s"RMSE: ${trainingSummary.rootMeanSquaredError}",
  s"r2: ${trainingSummary.r2}"
).foreach(println)

Coefficients: [0.0,0.32292516677405936,-0.3438548034562218,1.9156017023458414,0.05288058680386263,0.765962720459771,0.0,-0.15105392669186682,-0.21587930360904642,0.22025369188813426] Intercept: 0.1598936844239736
numIterations: 7
objectiveHistory: [0.49999999999999994,0.4967620357443381,0.4936361664340463,0.4936351537897608,0.4936351214177871,0.49363512062528014,0.4936351206216114]
+--------------------+
|           residuals|
+--------------------+
|  -9.889232683103197|
|  0.5533794340053554|
|  -5.204019455758823|
| -20.566686715507508|
|    -9.4497405180564|
|  -6.909112502719486|
|  -10.00431602969873|
|   2.062397807050484|
|  3.1117508432954772|
| -15.893608229419382|
|  -5.036284254673026|
|   6.483215876994333|
|  12.429497299109002|
|  -20.32003219007654|
| -2.0049838218725005|
| -17.867901734183793|
|   7.646455887420495|
| -2.2653482182417406|
|-0.10308920436195645|
|  -1.380034070385301|
+--------------------+
only showing top 20 rows

RMSE: 10.189077167598475
r2: 0.022861

import org.apache.spark.ml.regression.LinearRegression
training: org.apache.spark.sql.DataFrame = [label: double, features: vector]
lr: org.apache.spark.ml.regression.LinearRegression = linReg_53223c369fb7
lrModel: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_53223c369fb7, numFeatures=10
trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@11f92399


**Real dataset**

In [5]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.log4j._

// Set the "org" logger to ERROR level before the session is obtained,
// so lower-severity messages from that logger hierarchy are not printed.
Logger.getLogger("org").setLevel(Level.ERROR)
// Obtain a SparkSession through the builder's getOrCreate().
val spark = SparkSession.builder().getOrCreate()

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.log4j._
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@ac7ccfb


In [13]:
// Read the housing CSV: the first row supplies column names and column types are inferred
val data = spark.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load("./datasets/Clean_USA_Housing.csv")

// Preview the loaded rows
data.show()

+------------------+------------------+------------------------+---------------------------+------------------+------------------+
|   Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|   Area Population|             Price|
+------------------+------------------+------------------------+---------------------------+------------------+------------------+
| 79545.45857431678| 5.682861321615587|       7.009188142792237|                       4.09|23086.800502686456|1059033.5578701235|
| 79248.64245482568|6.0028998082752425|       6.730821019094919|                       3.09| 40173.07217364482|  1505890.91484695|
|61287.067178656784| 5.865889840310001|       8.512727430375099|                       5.13| 36882.15939970458|1058987.9878760849|
| 63345.24004622798|7.1882360945186425|       5.586728664827653|                       3.26| 34310.24283090706|1260616.8066294468|
| 59982.19722570803| 5.040554523106283|       7.839387785120487|                   

data: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 4 more fields]


In [14]:
// Print each column's name, type and nullability for the loaded DataFrame.
data.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



In [10]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


In [15]:
// List the column names of the loaded DataFrame as an Array[String].
data.columns

res12: Array[String] = Array(Avg Area Income, Avg Area House Age, Avg Area Number of Rooms, Avg Area Number of Bedrooms, Area Population, Price)


In [18]:
// Project the target first, renamed to "label", followed by the five predictor columns unchanged
val df = data.select(
  data("Price").as("label"),
  data("Avg Area Income"),
  data("Avg Area House Age"),
  data("Avg Area Number of Rooms"),
  data("Avg Area Number of Bedrooms"),
  data("Area Population")
)
// Confirm the resulting column names and types
df.printSchema()

root
 |-- label: double (nullable = true)
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)



df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 4 more fields]


In [19]:
// Combine the five numeric predictor columns into a single vector column named "features"
val assembler = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(Array(
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    "Area Population"
  ))

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_d570fc176a9d, handleInvalid=error, numInputCols=5


In [20]:
// Append the assembled "features" vector to df, then keep only the label and features columns
val output = assembler.transform(df).select("label", "features")

output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [21]:
// Display the first rows of the label/features DataFrame.
output.show()

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|[79545.4585743167...|
|  1505890.91484695|[79248.6424548256...|
|1058987.9878760849|[61287.0671786567...|
|1260616.8066294468|[63345.2400462279...|
| 630943.4893385402|[59982.1972257080...|
|1068138.0743935304|[80175.7541594853...|
|1502055.8173744078|[64698.4634278877...|
|1573936.5644777215|[78394.3392775308...|
| 798869.5328331633|[59927.6608133496...|
|1545154.8126419624|[81885.9271840956...|
| 1707045.722158058|[80527.4720829228...|
| 663732.3968963273|[50593.6954970428...|
|1042814.0978200928|[39033.8092369823...|
|1291331.5184858206|[73163.6634410467...|
|1402818.2101658515|[69391.3801843616...|
|1306674.6599511993|[73091.8667458232...|
|1556786.6001947748|[79706.9630576574...|
| 528485.2467305964|[61929.0770180892...|
|1019425.9367578316|[63508.1942994299...|
|1030591.4292116085|[62085.2764034048...|
+------------------+--------------------+

In [22]:
// Linear regression estimator with no parameters overridden.
val lr = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_b971fa2230dd


In [23]:
// Fit the estimator on the assembled label/features DataFrame.
val lrModel = lr.fit(output)

lrModel: org.apache.spark.ml.regression.LinearRegressionModel = LinearRegressionModel: uid=linReg_b971fa2230dd, numFeatures=5


In [24]:
// Training summary attached to the fitted model; the metrics in the cells below are read from it.
val trainingSummary = lrModel.summary

trainingSummary: org.apache.spark.ml.regression.LinearRegressionTrainingSummary = org.apache.spark.ml.regression.LinearRegressionTrainingSummary@3dbfb4ac


In [26]:
// Show the per-row residuals; in the outputs captured in this notebook they equal label - prediction.
trainingSummary.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| -164813.4848834239|
| 10953.223229611525|
|-194028.75820535724|
| 139392.73897870956|
| -214445.2769567219|
|  -701.081331347581|
|-168103.79466084158|
|  3974.217703604372|
|  32978.44903599529|
|  76897.30405031843|
| -66939.03875315329|
| 34017.775693529635|
|  91027.54522960202|
|  -14496.7229596104|
|  96010.27173951385|
|  67115.24134960445|
| 34045.648309222655|
|  40333.55834060686|
|-117424.29344359436|
|-170059.04701524798|
+-------------------+
only showing top 20 rows



In [27]:
// Show the training rows (label, features) alongside the model's prediction column.
trainingSummary.predictions.show()

+------------------+--------------------+------------------+
|             label|            features|        prediction|
+------------------+--------------------+------------------+
|1059033.5578701235|[79545.4585743167...|1223847.0427535474|
|  1505890.91484695|[79248.6424548256...|1494937.6916173385|
|1058987.9878760849|[61287.0671786567...| 1253016.746081442|
|1260616.8066294468|[63345.2400462279...|1121224.0676507372|
| 630943.4893385402|[59982.1972257080...| 845388.7662952621|
|1068138.0743935304|[80175.7541594853...| 1068839.155724878|
|1502055.8173744078|[64698.4634278877...|1670159.6120352494|
|1573936.5644777215|[78394.3392775308...| 1569962.346774117|
| 798869.5328331633|[59927.6608133496...|  765891.083797168|
|1545154.8126419624|[81885.9271840956...| 1468257.508591644|
| 1707045.722158058|[80527.4720829228...|1773984.7609112114|
| 663732.3968963273|[50593.6954970428...| 629714.6212027976|
|1042814.0978200928|[39033.8092369823...| 951786.5525904908|
|1291331.5184858206|[731

In [29]:
// Coefficient of determination (R^2) reported by the training summary.
trainingSummary.r2

res21: Double = 0.9180238195089546


In [30]:
// Root mean squared error reported by the training summary.
trainingSummary.rootMeanSquaredError

res22: Double = 101092.70158252306


In [31]:
// Mean squared error reported by the training summary.
trainingSummary.meanSquaredError

res23: Double = 1.0219734313253061E10
