<a href="https://colab.research.google.com/github/hagohel/Bigdata-Analytics-Google-Colab/blob/master/Bigdata_Analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install the headless OpenJDK 8 package. -qq plus the stdout redirect keeps
# apt's progress output out of the notebook (anything on stderr still shows).
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
!wget -q http://mirrors.viethosting.com/apache/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
# Unpack the Spark tarball into the current working directory
# (produces the spark-2.4.4-bin-hadoop2.7/ directory that SPARK_HOME points at).
!tar xf spark-2.4.4-bin-hadoop2.7.tgz

In [0]:
!pip install -q findspark

In [0]:
import os

# Environment variables pointing at the JDK install and at the unpacked
# Spark distribution, set in a single update call.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
    }
)

In [0]:
import findspark

# Keep this call ahead of the pyspark import below: findspark.init() is what
# makes the Spark distribution's Python package importable in this kernel.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise start a local
# session ("local[*]" = run on this machine using all available cores).
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
#Simple Linear Regression Model
from pathlib import Path

from google.colab import files

# Only open the upload dialog when the dataset is not already in the working
# directory, so "Run all" is not blocked by the file picker on every run.
# The result is bound to a name because files.upload() returns the uploaded
# file contents; leaving the call as the cell's last expression would echo
# those raw contents into the cell output.
if not Path("BostonHousing.csv").exists():
    uploaded = files.upload()

In [9]:
ls 

BostonHousing.csv  spark-2.4.4-bin-hadoop2.7/
sample_data/       spark-2.4.4-bin-hadoop2.7.tgz


In [0]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# Read the CSV, taking column names from the first row and letting Spark
# infer each column's data type.
dataset = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('BostonHousing.csv')
)

In [14]:
# Print the column names and inferred types to check the CSV was parsed as expected.
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [15]:
# Pack every predictor column into a single vector column named 'Attributes';
# 'medv' is left out because it is the value the model will predict.
feature_columns = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='Attributes')
output = assembler.transform(dataset)

# Keep only the assembled feature vector (input) and the target column (output).
finalized_data = output.select("Attributes", "medv")

finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [0]:
#Split training and testing data (80% / 20%)
# A fixed seed makes the random split repeatable, so the fitted model and the
# reported metrics do not change from one "Restart & Run All" to the next.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=42)

In [17]:
# Configure the estimator: inputs come from 'Attributes', the target is 'medv'.
lr_estimator = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training split; `regressor` holds the fitted model from here on.
regressor = lr_estimator.fit(train_data)

# Score the held-out test split. evaluate() returns a summary object whose
# `predictions` DataFrame has a `prediction` column next to the input columns.
pred = regressor.evaluate(test_data)

# Show the first rows of actual (medv) vs. predicted values.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00632,18.0,2.3...|24.0|30.129541358290275|
|[0.01301,35.0,1.5...|32.7|30.530481375903598|
|[0.0136,75.0,4.0,...|18.9|15.591468512954592|
|[0.02177,82.5,2.0...|42.3|  37.2171243211321|
|[0.02498,0.0,1.89...|16.5|22.465754181446194|
|[0.02763,75.0,2.9...|30.8|31.268242645819182|
|[0.0315,95.0,1.47...|34.9|30.158436625238792|
|[0.03359,75.0,2.9...|34.9| 34.07676088086826|
|[0.03502,80.0,4.9...|28.5| 33.21543402370832|
|[0.03551,25.0,4.8...|22.9|25.108354158397105|
|[0.03932,0.0,3.41...|22.0|27.542530087811492|
|[0.04337,21.0,5.6...|20.5| 24.10049185299711|
|[0.04379,80.0,3.3...|19.4| 25.70770316853508|
|[0.04417,70.0,2.2...|24.8| 30.84555800375422|
|[0.05023,35.0,6.0...|17.1|20.427166210151093|
|[0.05188,0.0,4.49...|22.5| 22.49323673248017|
|[0.05372,0.0,13.9...|27.1|27.617595592824173|
|[0.0566,0.0,3.41,...|23.6| 30.68311738233474|
|[0.05789,12.

In [18]:
# Fitted parameters of the linear model: the coefficient vector (one weight
# per entry of the 'Attributes' feature vector) and the intercept term.
coeff, intr = regressor.coefficients, regressor.intercept

# The values are wrapped in 1-tuples so that %-formatting always treats each
# one as a single argument, whatever its type.
print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.1207, 0.0432, 0.0223, 1.3796, -18.2475, 3.8339, -0.0028, -1.4709, 0.2705, -0.0116, -0.9918, 0.0086, -0.4906])
The Intercept of the model is : 37.459434


In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

# Named `evaluator` (not `eval`) so the Python builtin eval() is not shadowed
# for the rest of the kernel session.
# The evaluator compares the 'medv' label with the 'prediction' column;
# "rmse" is its default metric and is overridden per call below.
evaluator = RegressionEvaluator(labelCol="medv", predictionCol="prediction", metricName="rmse")

# Root Mean Square Error
rmse = evaluator.evaluate(pred.predictions)
print("RMSE: %.3f" % rmse)

# Mean Square Error
mse = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mse"})
print("MSE: %.3f" % mse)

# Mean Absolute Error
mae = evaluator.evaluate(pred.predictions, {evaluator.metricName: "mae"})
print("MAE: %.3f" % mae)

# r2 - coefficient of determination
r2 = evaluator.evaluate(pred.predictions, {evaluator.metricName: "r2"})
print("r2: %.3f" % r2)

RMSE: 5.064
MSE: 25.644
MAE: 3.486
r2: 0.721


In [0]:
import matplotlib.pyplot as plt

# rmse is a single number, so plt.plot(rmse) would draw one point with no
# marker, i.e. a figure that looks empty. Show the test-set error metrics as
# labelled bars instead. Only RMSE and MAE are plotted together because both
# are expressed in the units of the target (medv); MSE is in squared units.
fig, ax = plt.subplots()
ax.bar(["RMSE", "MAE"], [rmse, mae])
ax.set(
    title="Linear regression error on the test split",
    xlabel="Metric",
    ylabel="Error (units of medv)",
);