In [1]:
import findspark
from pyspark.sql import SparkSession 
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.regression import LinearRegression 
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Create (or reuse) the Spark session used for the MPG regression cells.
spark = (
    SparkSession.builder
    .appName("Regression using SparkML")
    .getOrCreate()
)

In [3]:
# Read mpg.csv: the first row supplies the column names and column types
# are inferred from the data instead of defaulting to strings.
mpg_data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("mpg.csv")
)

In [4]:
# Print the inferred column names and types to check that the predictors were read as numeric.
mpg_data.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [5]:
# Peek at the first five rows of the raw MPG data.
mpg_data.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [10]:
# Combine the six numeric predictor columns into one vector column called
# 'features', which is the input the regression below is configured to read.
assembler = VectorAssembler(
    inputCols=[
        'Cylinders',
        'Engine Disp',
        'Horsepower',
        'Weight',
        'Accelerate',
        'Year',
    ],
    outputCol='features',
)
mpg_transformed_data = assembler.transform(mpg_data)

In [11]:
# Preview the model inputs: the assembled feature vector next to the MPG label.
(
    mpg_transformed_data
    .select("features", "MPG")
    .show()
)

+--------------------+----+
|            features| MPG|
+--------------------+----+
|[8.0,390.0,190.0,...|15.0|
|[6.0,199.0,90.0,2...|21.0|
|[6.0,199.0,97.0,2...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,455.0,225.0,...|14.0|
|[8.0,350.0,165.0,...|15.0|
|[8.0,307.0,130.0,...|18.0|
|[8.0,454.0,220.0,...|14.0|
|[8.0,400.0,150.0,...|15.0|
|[8.0,307.0,200.0,...|10.0|
|[8.0,383.0,170.0,...|15.0|
|[8.0,318.0,210.0,...|11.0|
|[8.0,360.0,215.0,...|10.0|
|[8.0,429.0,198.0,...|15.0|
|[6.0,200.0,85.0,2...|21.0|
|[8.0,302.0,140.0,...|17.0|
|[8.0,304.0,193.0,...| 9.0|
|[8.0,340.0,160.0,...|14.0|
|[6.0,198.0,95.0,2...|22.0|
|[8.0,440.0,215.0,...|14.0|
+--------------------+----+
only showing top 20 rows



In [12]:
# 70/30 train/test split; a fixed seed is supplied so the split can be repeated.
training_data, testing_data = mpg_transformed_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

In [14]:
# Configure a linear regression that predicts MPG from the 'features' vector
# and fit it on the training split.
lr = LinearRegression(
    labelCol="MPG",
    featuresCol="features",
)
model = lr.fit(training_data)

In [15]:
# Apply the fitted model to the held-out split; the result is what the
# evaluators below read their 'prediction' column from.
predictions = model.transform(dataset=testing_data)

In [16]:
# Coefficient of determination (R^2) of the MPG predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="MPG",
    predictionCol="prediction",
)
r2 = evaluator.evaluate(predictions)
print("R Squared = ", r2)

R Squared =  0.8046190375720325


In [17]:
# Mean absolute error of the MPG predictions on the test split.
evaluator = RegressionEvaluator(
    metricName='mae',
    labelCol="MPG",
    predictionCol="prediction",
)
mae = evaluator.evaluate(predictions)
print("MAE = ", mae)

MAE =  2.8423911791950123


In [19]:
# Root mean squared error of the MPG predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="MPG",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("RMSE = ", rmse)

RMSE =  3.453104969079217


In [20]:
# Shut down the session used for the MPG model; the diamond analysis that follows creates its own session.
spark.stop()

In [21]:
# Create (or reuse) a Spark session for the diamond price model.
spark1 = (
    SparkSession.builder
    .appName("Diamond Price Prediction")
    .getOrCreate()
)

In [22]:
# Read diamonds.csv: the first row supplies the column names and column
# types are inferred from the data.
diamond_data = spark1.read.csv(
    path="diamonds.csv",
    inferSchema=True,
    header=True,
)

In [23]:
# Peek at the first five rows of the raw diamond data.
diamond_data.show(n=5)

+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  s|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  1| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
|  2| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
|  3| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
|  4| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
|  5| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows



In [24]:
# Combine carat, depth and table into one 'features' vector column; these
# three columns are the only predictors passed to the price model.
assembler = VectorAssembler(
    inputCols=[
        'carat',
        'depth',
        'table',
    ],
    outputCol='features',
)
diamond_transformed_data = assembler.transform(diamond_data)

In [25]:
# Preview the model inputs: the assembled feature vector next to the price label.
(
    diamond_transformed_data
    .select("features", 'price')
    .show()
)

+----------------+-----+
|        features|price|
+----------------+-----+
|[0.23,61.5,55.0]|  326|
|[0.21,59.8,61.0]|  326|
|[0.23,56.9,65.0]|  327|
|[0.29,62.4,58.0]|  334|
|[0.31,63.3,58.0]|  335|
|[0.24,62.8,57.0]|  336|
|[0.24,62.3,57.0]|  336|
|[0.26,61.9,55.0]|  337|
|[0.22,65.1,61.0]|  337|
|[0.23,59.4,61.0]|  338|
| [0.3,64.0,55.0]|  339|
|[0.23,62.8,56.0]|  340|
|[0.22,60.4,61.0]|  342|
|[0.31,62.2,54.0]|  344|
| [0.2,60.2,62.0]|  345|
|[0.32,60.9,58.0]|  345|
| [0.3,62.0,54.0]|  348|
| [0.3,63.4,54.0]|  351|
| [0.3,63.8,56.0]|  351|
| [0.3,62.7,59.0]|  351|
+----------------+-----+
only showing top 20 rows



In [26]:
# 70/30 train/test split of the diamond data; a fixed seed is supplied so
# the split can be repeated.
training_data, testing_data = diamond_transformed_data.randomSplit(
    weights=[0.70, 0.30],
    seed=42,
)

In [28]:
# Configure a linear regression that predicts price from the 'features'
# vector and fit it on the training split.
lr = LinearRegression(
    labelCol="price",
    featuresCol="features",
)
model = lr.fit(training_data)

In [29]:
# Apply the fitted price model to the held-out split.
predictions = model.transform(dataset=testing_data)

In [30]:
# Coefficient of determination (R^2) of the price predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="price",
    predictionCol="prediction",
)
r2 = evaluator.evaluate(predictions)
print("R2 Score: ", r2)

R2 Score:  0.8535478116135986


In [31]:
# Inspect the first five training rows, including the assembled 'features' column.
training_data.show(n=5)

+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+
|  s|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|        features|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+
|  1| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|[0.23,61.5,55.0]|
|  2| 0.21|  Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|[0.21,59.8,61.0]|
|  4| 0.29|  Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|[0.29,62.4,58.0]|
|  5| 0.31|     Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|[0.31,63.3,58.0]|
|  6| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|  336|3.94|3.96|2.48|[0.24,62.8,57.0]|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+
only showing top 5 rows



In [32]:
# Inspect the first five scored test rows to compare 'prediction' with the actual price.
predictions.show(n=5)

+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
|  s|carat|      cut|color|clarity|depth|table|price|   x|   y|   z|        features|         prediction|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
|  3| 0.23|     Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|[0.23,56.9,65.0]| -624.0176432223307|
|  7| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|  336|3.95|3.98|2.47|[0.24,62.3,57.0]| -482.7112646898513|
|  9| 0.22|     Fair|    E|    VS2| 65.1| 61.0|  337|3.87|3.78|2.49|[0.22,65.1,61.0]|-1463.5189715981141|
| 10| 0.23|Very Good|    H|    VS1| 59.4| 61.0|  338| 4.0|4.05|2.39|[0.23,59.4,61.0]| -563.8070729522533|
| 14| 0.31|    Ideal|    J|    SI2| 62.2| 54.0|  344|4.35|4.37|2.71|[0.31,62.2,54.0]|  396.5773133230159|
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
only showing top 5 rows



In [33]:
# Mean absolute error of the price predictions on the test split.
evaluator = RegressionEvaluator(
    metricName="mae",
    labelCol="price",
    predictionCol="prediction",
)
mae = evaluator.evaluate(predictions)
print("mae = ", mae)

mae =  995.444205014046


In [34]:
# Root mean squared error of the price predictions on the test split.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='price',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)
print("rmse = ", rmse)

rmse =  1525.1184531740482
