In [1]:
# Importing required libraries
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [2]:
# Start (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName('linear_regression_project')
    .getOrCreate()
)

In [3]:
# Read the Boston Housing CSV into a Spark DataFrame.
# header=True uses the first row as column names; inferSchema=True lets Spark
# infer column types from the data instead of reading everything as strings.
data = spark.read.csv(
    'boston_house_prices.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded rows (20 rows with truncated cell values are show()'s defaults).
data.show(n=20, truncate=True)

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM|  AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

In [5]:
# Prepare the data: pack the predictor columns into one vector column.
# MEDV is not listed here because it is used as the label column by the model.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='features',
)
# transform() returns a new DataFrame with the extra 'features' column appended.
df = vectorAssembler.transform(data)

In [6]:
# Split the data into training (~70%) and test (~30%) datasets.
# The weights are sampling probabilities, so the resulting sizes are approximate.
# A fixed seed keeps the split -- and therefore the metrics and predictions
# computed from it -- reproducible across kernel restarts; without it Spark
# draws a fresh random seed on every run.
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

In [7]:
# Configure the linear regression estimator (nothing is trained yet):
# it reads inputs from the assembled 'features' vector and targets 'MEDV'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='MEDV',
)

In [8]:
# Fit the estimator on the training split; the result is the trained model.
lrModel = lr.fit(dataset=train_data)

In [9]:
# Evaluate the trained model on the held-out test split and report error metrics.
result = lrModel.evaluate(test_data)
mse = result.meanSquaredError
rmse = result.rootMeanSquaredError
print("Mean Squared Error: {}".format(mse))
print("Root Mean Squared Error: {}".format(rmse))

Mean Squared Error: 19.28536841986939
Root Mean Squared Error: 4.3915109495331315


In [10]:
# Score the test split: transform() appends a 'prediction' column next to the
# original columns, so predicted and actual MEDV can be compared row by row.
predictions = lrModel.transform(dataset=test_data)
predictions.show(n=20, truncate=True)

+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|    DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|            features|        prediction|
+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|0.00906|90.0| 2.97|   0|   0.4|7.088|20.8| 7.3073|  1|285|   15.3|394.72| 7.85|32.2|[0.00906,90.0,2.9...|31.547242709067252|
|0.01096|55.0| 2.25|   0| 0.389|6.453|31.9| 7.3073|  1|300|   15.3|394.72| 8.23|22.0|[0.01096,55.0,2.2...|27.528267151106164|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8| 6.2196|  1|265|   15.6| 376.7| 4.38|29.1|[0.01439,60.0,2.9...|31.872133091345844|
|0.01538|90.0| 3.75|   0| 0.394|7.454|34.2| 6.3361|  3|244|   15.9|386.34| 3.11|44.0|[0.01538,90.0,3.7...|  37.4985110773585|
|0.01709|90.0| 2.02|   0|  0.41|6.728|36.1|12.1265|  5|187|   17.0|384.46|  4.5|30.1|[0.01709,90.0,2.0...|23.944328624