In [0]:
# Install the headless OpenJDK 8 package (the JDK that JAVA_HOME is pointed at
# further down). -qq plus the redirect to /dev/null keep the cell output quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
!wget -q http://mirrors.viethosting.com/apache/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
# Unpack the downloaded Spark tarball; no target directory is given, so it is
# extracted relative to the current working directory.
!tar xf spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
!pip install -q findspark

In [0]:
import os

# Tell the tooling where the JDK and the unpacked Spark distribution live.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
    }
)

In [0]:
import findspark

# Must run before importing pyspark: the import below relies on findspark
# having located the Spark installation first.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using all available cores ("local[*]"); an existing
# session is reused if there is one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# Multiple linear regression on the Boston Housing data (13 input features).
from google.colab import files

# Open the Colab file picker so the CSV can be uploaded into the runtime.
# The return value is bound to a name instead of being left as the cell's
# last expression, so the uploaded content is not echoed into the cell output.
uploaded = files.upload()

In [0]:
ls 

In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Read the uploaded CSV: the first row supplies the column names and the
# column types are inferred from the data.
dataset = spark.read.csv(
    'BostonHousing.csv',
    header=True,
    inferSchema=True,
)

In [0]:
# Show the column names and the types inferred while reading the CSV.
dataset.printSchema()

In [0]:
# Columns used as model inputs; every column except the target 'medv'.
feature_cols = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]

# Pack the individual feature columns into one vector column, 'Attributes'.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='Attributes')
output = assembler.transform(dataset)

# Keep only the assembled features and the target column.
finalized_data = output.select("Attributes", "medv")
finalized_data.show()

In [0]:
# Split the data into training (~80%) and test (~20%) sets.
# Without a seed the split is re-drawn on every run, so the fitted
# coefficients and the reported metrics change each time the notebook is
# executed; a fixed seed keeps the split repeatable.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [0]:
# Estimator configuration: inputs come from 'Attributes', the label is 'medv'.
lr = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training set; `regressor` is the fitted model from here on.
regressor = lr.fit(train_data)

# Evaluate the fitted model on the held-out test set. The result is a
# summary object whose `predictions` DataFrame holds the per-row predictions.
pred = regressor.evaluate(test_data)

# Display actual vs. predicted values for the test rows.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.01301,35.0,1.5...|32.7| 30.06652817812244|
|[0.01432,100.0,1....|31.6|  34.0887225200804|
|[0.01501,80.0,2.0...|24.5| 28.26478550413279|
|[0.01538,90.0,3.7...|44.0| 37.85807016143466|
|[0.01778,95.0,1.4...|32.9|31.109687160524288|
|[0.02543,55.0,3.7...|23.9| 28.09323730668173|
|[0.02729,0.0,7.07...|34.7|29.883017073935072|
|[0.03502,80.0,4.9...|28.5|33.945021846057216|
|[0.03548,80.0,3.6...|20.9| 22.59048209800852|
|[0.03584,80.0,3.3...|23.5|31.113880488800994|
|[0.03705,20.0,3.3...|35.4|  34.1123949861194|
|[0.04011,80.0,1.5...|33.3| 36.39957625736466|
|[0.04527,0.0,11.9...|20.6|22.232253714400414|
|[0.04544,0.0,3.24...|19.8|21.331678263106923|
|[0.05023,35.0,6.0...|17.1|20.352858557647174|
|[0.0536,21.0,5.64...|25.0|27.577480120100024|
|[0.05561,70.0,2.2...|29.0| 32.16320919244194|
|[0.05602,0.0,2.46...|50.0| 34.95037125846132|
|[0.05735,0.0

In [0]:
# Fitted coefficients and the intercept of the regression model.
coeff, intr = regressor.coefficients, regressor.intercept

print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.1519, 0.0602, 0.0097, 2.4341, -16.4977, 3.2871, 0.0035, -1.4588, 0.3404, -0.013, -0.9576, 0.0096, -0.5293])
The Intercept of the model is : 38.830118


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` rather than `eval` so that Python's builtin eval() is not
# shadowed for the rest of the notebook session.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error (the metric the evaluator was constructed with)
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)

# The remaining metrics reuse the same evaluator and override metricName
# through the param map passed to evaluate().

# Mean Square Error
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 4.851
MSE: 23.531
MAE: 3.363
r2: 0.781
