In [None]:
#L.Clark
#Python interface to Spark is pyspark
#JDK and JRE are dependencies, must be same version
#pip install pyspark
#OR can install with homebrew, a linux package manager, $brew install apache-spark
#very important that Spark be packaged with your juptyer directory
#Setup instructions:
#https://opensource.com/article/18/11/pyspark-jupyter-notebook

In [18]:
import pandas as pd
import numpy as np

In [2]:
#https://towardsdatascience.com/apache-spark-mllib-tutorial-ec6f1cb336a9

In [19]:
#Every spark application requries a session, so we create one
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [20]:
#load the data
data = spark.read.csv('./boston_housing.csv', header=True, inferSchema=True)

In [21]:
#display the data
data.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5605|  5|311|   15.2| 395.6|12.43|22.9|
|0.14455|12.5| 7.87|   0|0.524|6.172| 96.1|5.9505|  5|311|   15.2| 396.9|19.15|27.1|
|0.21124|12.5| 7.87|   0|0.524|5.631|100.0|6.0821|  5|311|   15.2

In [22]:
type(data)

pyspark.sql.dataframe.DataFrame

In [23]:
#set up the features
feature_columns = data.columns[:-1] # omit the final column, which is the output, or dependent variable
from pyspark.ml.feature import VectorAssembler

#here, we pass the features to the vector assemler method
assembler = VectorAssembler(inputCols=feature_columns,outputCol="features")

In [24]:
data_2 = assembler.transform(data)

In [25]:
data_2.show()

+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|   crim|  zn|indus|chas|  nox|   rm|  age|   dis|rad|tax|ptratio|     b|lstat|medv|            features|
+-------+----+-----+----+-----+-----+-----+------+---+---+-------+------+-----+----+--------------------+
|0.00632|18.0| 2.31|   0|0.538|6.575| 65.2|  4.09|  1|296|   15.3| 396.9| 4.98|24.0|[0.00632,18.0,2.3...|
|0.02731| 0.0| 7.07|   0|0.469|6.421| 78.9|4.9671|  2|242|   17.8| 396.9| 9.14|21.6|[0.02731,0.0,7.07...|
|0.02729| 0.0| 7.07|   0|0.469|7.185| 61.1|4.9671|  2|242|   17.8|392.83| 4.03|34.7|[0.02729,0.0,7.07...|
|0.03237| 0.0| 2.18|   0|0.458|6.998| 45.8|6.0622|  3|222|   18.7|394.63| 2.94|33.4|[0.03237,0.0,2.18...|
|0.06905| 0.0| 2.18|   0|0.458|7.147| 54.2|6.0622|  3|222|   18.7| 396.9| 5.33|36.2|[0.06905,0.0,2.18...|
|0.02985| 0.0| 2.18|   0|0.458| 6.43| 58.7|6.0622|  3|222|   18.7|394.12| 5.21|28.7|[0.02985,0.0,2.18...|
|0.08829|12.5| 7.87|   0|0.524|6.012| 66.6|5.5

In [26]:
#test/train split of the data to the 70/30 convention
train, test = data_2.randomSplit([0.7, 0.3])

In [27]:
#create a linear regression model
from pyspark.ml.regression import LinearRegression
algo = LinearRegression(featuresCol="features", labelCol="medv")

In [28]:
model = algo.fit(train) #action happens here

In [29]:
#evaluate the model
evaluation_summary = model.evaluate(test)

In [30]:
evaluation_summary.meanAbsoluteError
# Output: 3.39
evaluation_summary.rootMeanSquaredError
# Output: 5.16
evaluation_summary.r2
# Output: 0.58

0.7593850536815562

In [31]:
#prediction
predictions = model.transform(test)

In [32]:
predictions

DataFrame[crim: double, zn: double, indus: double, chas: int, nox: double, rm: double, age: double, dis: double, rad: int, tax: int, ptratio: double, b: double, lstat: double, medv: double, features: vector, prediction: double]

In [34]:
type(predictions)

pyspark.sql.dataframe.DataFrame

In [35]:
predictions.show()

+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|   crim|  zn|indus|chas|   nox|   rm| age|    dis|rad|tax|ptratio|     b|lstat|medv|            features|        prediction|
+-------+----+-----+----+------+-----+----+-------+---+---+-------+------+-----+----+--------------------+------------------+
|0.00906|90.0| 2.97|   0|   0.4|7.088|20.8| 7.3073|  1|285|   15.3|394.72| 7.85|32.2|[0.00906,90.0,2.9...| 30.96906942008454|
|0.01381|80.0| 0.46|   0| 0.422|7.875|32.0| 5.6484|  4|255|   14.4|394.23| 2.97|50.0|[0.01381,80.0,0.4...| 40.56948670426118|
|0.01501|90.0| 1.21|   1| 0.401|7.923|24.8|  5.885|  1|198|   13.6|395.52| 3.16|50.0|[0.01501,90.0,1.2...|44.715443615973626|
|0.01538|90.0| 3.75|   0| 0.394|7.454|34.2| 6.3361|  3|244|   15.9|386.34| 3.11|44.0|[0.01538,90.0,3.7...|37.328689370967325|
|0.02055|85.0| 0.74|   0|  0.41|6.383|35.7| 9.1876|  2|313|   17.3| 396.9| 5.77|24.7|[0.02055,85.0,0.7...| 25.26986636

In [33]:
predictions.select(predictions.columns[13:]).show()

+----+--------------------+------------------+
|medv|            features|        prediction|
+----+--------------------+------------------+
|32.2|[0.00906,90.0,2.9...| 30.96906942008454|
|50.0|[0.01381,80.0,0.4...| 40.56948670426118|
|50.0|[0.01501,90.0,1.2...|44.715443615973626|
|44.0|[0.01538,90.0,3.7...|37.328689370967325|
|24.7|[0.02055,85.0,0.7...| 25.26986636503433|
|23.9|[0.02543,55.0,3.7...|27.943881282996898|
|34.7|[0.02729,0.0,7.07...| 30.74662354090006|
|25.0|[0.02875,28.0,15....|29.044033091980147|
|20.6|[0.03306,0.0,5.19...|22.637132381553332|
|34.9|[0.03359,75.0,2.9...| 34.13726296338415|
|24.1|[0.03445,82.5,2.0...|29.600383973938683|
|19.4|[0.03466,35.0,6.0...| 23.35215630171394|
|35.4|[0.03705,20.0,3.3...|   34.727928391016|
|23.2|[0.03871,52.5,5.3...| 27.48391384719946|
|33.3|[0.04011,80.0,1.5...|36.077923838743004|
|28.0|[0.04113,25.0,4.8...|  28.4251007476294|
|20.6|[0.04294,28.0,15....|27.193028905040517|
|18.2|[0.04301,80.0,1.9...| 14.74665079970839|
|24.8|[0.0441