In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.log4j._
// Set the log4j logger named "org" to ERROR so that lower-severity
// messages logged under it no longer clutter the notebook output.
Logger.getLogger("org").setLevel(Level.ERROR)

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.log4j._


In [4]:
// Reuse the active SparkSession if there is one, otherwise create it.
val spark = SparkSession.builder().getOrCreate()

// Read the housing CSV: the first row provides the column names and the
// column types are inferred from the values instead of defaulting to string.
val data = spark.read
  .format("csv")
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .load("./datasets/Clean_USA_Housing.csv")

// Print the leading rows as a quick sanity check of the load.
data.show()

+------------------+------------------+------------------------+---------------------------+------------------+------------------+
|   Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|   Area Population|             Price|
+------------------+------------------+------------------------+---------------------------+------------------+------------------+
| 79545.45857431678| 5.682861321615587|       7.009188142792237|                       4.09|23086.800502686456|1059033.5578701235|
| 79248.64245482568|6.0028998082752425|       6.730821019094919|                       3.09| 40173.07217364482|  1505890.91484695|
|61287.067178656784| 5.865889840310001|       8.512727430375099|                       5.13| 36882.15939970458|1058987.9878760849|
| 63345.24004622798|7.1882360945186425|       5.586728664827653|                       3.26| 34310.24283090706|1260616.8066294468|
| 59982.19722570803| 5.040554523106283|       7.839387785120487|                   

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@4f5e87f0
data: org.apache.spark.sql.DataFrame = [Avg Area Income: double, Avg Area House Age: double ... 4 more fields]


In [5]:
// Print the column names and the types that were inferred while reading the CSV.
data.printSchema()

root
 |-- Avg Area Income: double (nullable = true)
 |-- Avg Area House Age: double (nullable = true)
 |-- Avg Area Number of Rooms: double (nullable = true)
 |-- Avg Area Number of Bedrooms: double (nullable = true)
 |-- Area Population: double (nullable = true)
 |-- Price: double (nullable = true)



In [6]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors


In [7]:
// Modelling frame: "Price" is exposed under the name "label", followed by the
// five predictor columns in the order listed below.
val df = {
  val predictorNames = Seq(
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    "Area Population"
  )
  val labelColumn = data("Price").as("label")
  data.select(labelColumn +: predictorNames.map(name => data(name)): _*)
}
df.show()

+------------------+------------------+------------------+------------------------+---------------------------+------------------+
|             label|   Avg Area Income|Avg Area House Age|Avg Area Number of Rooms|Avg Area Number of Bedrooms|   Area Population|
+------------------+------------------+------------------+------------------------+---------------------------+------------------+
|1059033.5578701235| 79545.45857431678| 5.682861321615587|       7.009188142792237|                       4.09|23086.800502686456|
|  1505890.91484695| 79248.64245482568|6.0028998082752425|       6.730821019094919|                       3.09| 40173.07217364482|
|1058987.9878760849|61287.067178656784| 5.865889840310001|       8.512727430375099|                       5.13| 36882.15939970458|
|1260616.8066294468| 63345.24004622798|7.1882360945186425|       5.586728664827653|                       3.26| 34310.24283090706|
| 630943.4893385402| 59982.19722570803| 5.040554523106283|       7.839387785120487|

df: org.apache.spark.sql.DataFrame = [label: double, Avg Area Income: double ... 4 more fields]


In [8]:
// Transformer that packs the five numeric predictor columns into a single
// vector column named "features".
val assembler = {
  val inputColumns = Array(
    "Avg Area Income",
    "Avg Area House Age",
    "Avg Area Number of Rooms",
    "Avg Area Number of Bedrooms",
    "Area Population"
  )
  new VectorAssembler()
    .setInputCols(inputColumns)
    .setOutputCol("features")
}

assembler: org.apache.spark.ml.feature.VectorAssembler = VectorAssembler: uid=vecAssembler_cbc9240e11ab, handleInvalid=error, numInputCols=5


In [9]:
// Add the assembled "features" vector to df, then keep only the label and
// that vector column.
val output = assembler
  .transform(df)
  .select("label", "features")

output: org.apache.spark.sql.DataFrame = [label: double, features: vector]


In [10]:
// Print the schema of the assembled frame (expected: a double label and a vector of features).
output.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)



In [11]:
// Preview the leading label / features rows of the assembled frame.
output.show()

+------------------+--------------------+
|             label|            features|
+------------------+--------------------+
|1059033.5578701235|[79545.4585743167...|
|  1505890.91484695|[79248.6424548256...|
|1058987.9878760849|[61287.0671786567...|
|1260616.8066294468|[63345.2400462279...|
| 630943.4893385402|[59982.1972257080...|
|1068138.0743935304|[80175.7541594853...|
|1502055.8173744078|[64698.4634278877...|
|1573936.5644777215|[78394.3392775308...|
| 798869.5328331633|[59927.6608133496...|
|1545154.8126419624|[81885.9271840956...|
| 1707045.722158058|[80527.4720829228...|
| 663732.3968963273|[50593.6954970428...|
|1042814.0978200928|[39033.8092369823...|
|1291331.5184858206|[73163.6634410467...|
|1402818.2101658515|[69391.3801843616...|
|1306674.6599511993|[73091.8667458232...|
|1556786.6001947748|[79706.9630576574...|
| 528485.2467305964|[61929.0770180892...|
|1019425.9367578316|[63508.1942994299...|
|1030591.4292116085|[62085.2764034048...|
+------------------+--------------

In [12]:
// Split the assembled rows into a training and a test set using weights
// 0.7 / 0.3; the fixed seed keeps the split repeatable across runs.
val Array(train, test) = {
  val splitWeights = Array(0.7, 0.3)
  output
    .select("label", "features")
    .randomSplit(splitWeights, seed = 42L)
}

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [label: double, features: vector]


In [13]:
// Linear regression estimator with no parameters set explicitly here;
// its regParam is varied by the parameter grid built in a later cell.
val lr = new LinearRegression()

lr: org.apache.spark.ml.regression.LinearRegression = linReg_6560522557fe


In [28]:
// Candidate values for lr's regularisation parameter: one very large
// value (1e6) and one small value (0.01), in that order.
val paramGrid = {
  val regParamCandidates = Array(1e6, 0.01)
  new ParamGridBuilder()
    .addGrid(lr.regParam, regParamCandidates)
    .build()
}

paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_6560522557fe-regParam: 1000000.0
}, {
	linReg_6560522557fe-regParam: 0.01
})


In [29]:
// Model selection over paramGrid with a single train/validation split:
// lr is fitted once per candidate and scored with the "r2" metric.
// A train ratio of 0.8 is used for the internal split.
val trainValSplit = {
  val r2Evaluator = new RegressionEvaluator().setMetricName("r2")
  new TrainValidationSplit()
    .setEstimator(lr)
    .setEstimatorParamMaps(paramGrid)
    .setEvaluator(r2Evaluator)
    .setTrainRatio(0.8)
}

trainValSplit: org.apache.spark.ml.tuning.TrainValidationSplit = tvs_ceafa1e7c9c2


In [30]:
// Run the train/validation search on the training set; the resulting
// TrainValidationSplitModel carries the selected best model.
val model = trainValSplit.fit(train)

model: org.apache.spark.ml.tuning.TrainValidationSplitModel = TrainValidationSplitModel: uid=tvs_ceafa1e7c9c2, bestModel=LinearRegressionModel: uid=linReg_6560522557fe, numFeatures=5, trainRatio=0.8


In [31]:
// Score the test set with the fitted model and show features, true label
// and prediction side by side.
model.transform(test).select("features", "label", "prediction").show()

+--------------------+------------------+------------------+
|            features|             label|        prediction|
+--------------------+------------------+------------------+
|[60167.6726073388...| 88591.77016003926|  164837.750884654|
|[48904.9832693168...|201898.08657249613| 386157.1511005121|
|[62173.5800990082...| 231189.8209898588|298874.52416372765|
|[56654.9623901952...|239319.93417545114| 488446.5003530653|
|[49851.1347839676...|283208.13218687923| 349391.9587676944|
|[51144.8509024324...|  287307.583688923|250639.01594452746|
|[55048.3320238013...| 288708.9121479006| 393087.9159483239|
|[45914.0112190894...| 300464.0986827323| 312129.7971716868|
|[17796.6311895433...|302355.83597895555| 98181.21872312808|
|[52376.6115211505...|308199.89116387354|  435343.333801609|
|[59801.4910294355...| 311111.2005867723|459908.61842230894|
|[54994.9182899755...|321058.96071220515|379447.42889893707|
|[43952.3362139292...| 324981.9929926853| 483679.7246268727|
|[52511.6543462467...| 3

In [32]:
// Validation scores recorded during fitting (the evaluator was configured
// with the "r2" metric); the array has one value per parameter-grid entry.
model.validationMetrics

res13: Array[Double] = Array(0.41298923086881134, 0.9130778312923207)
