In [2]:
import findspark
findspark.init() 

from pyspark.ml.regression import  LinearRegression
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.sql import SparkSession 
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator

In [3]:
# Start (or reuse) the Spark session used by the MPG regression example.
spark = (
    SparkSession.builder
    .appName('ML Pipeline Example')
    .getOrCreate()
)

In [4]:
# Load the MPG dataset: the first row supplies column names and Spark infers
# each column's type from the data.
mpg_data = spark.read.csv(
    path="mpg.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Print the schema (column names, types, nullability) that Spark inferred for the CSV.
mpg_data.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [6]:
# Preview the first five rows of the MPG data.
mpg_data.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [7]:
# Stage 1 - pack the three numeric predictors into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=['Weight', 'Horsepower', 'Engine Disp'],
    outputCol='features',
)
# Stage 2 - rescale the assembled vector with StandardScaler
# (by default: unit standard deviation, no mean-centering).
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')
# Stage 3 - linear regression that predicts 'MPG' from the scaled vector.
lr = LinearRegression(labelCol='MPG', featuresCol='scaledFeatures')

In [8]:
# Chain the stages; they are applied in the order listed.
pipeline = Pipeline(
    stages=[
        vectorAssembler,  # input columns -> 'features'
        scaler,           # 'features' -> 'scaledFeatures'
        lr,               # 'scaledFeatures' -> regression on 'MPG'
    ]
)

In [9]:
# 70/30 train/test split of the MPG rows, seeded with 42.
training_data, testing_data = mpg_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [10]:
# Fit the whole pipeline (assembler, scaler, regression) on the training split only.
model = pipeline.fit(training_data)

In [11]:
# Score the held-out split with the fitted pipeline.
predictions = model.transform(testing_data)

# Root-mean-squared error between the 'MPG' label and the 'prediction' column.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='MPG',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) = ", rmse)

Root Mean Squared Error (RMSE) =  3.8756646183839334


In [12]:
# Stop the session used by the MPG example; the exercise below builds its own session.
spark.stop()

# Exercise: classify iris species with an ML pipeline

In [13]:
# Start (or reuse) the Spark session used by the iris classification exercise.
spark = (
    SparkSession.builder
    .appName("ML Pipeline Exercise")
    .getOrCreate()
)

In [14]:
# Load the iris dataset: the first row supplies column names and Spark infers
# each column's type from the data.
iris_data = spark.read.csv(
    path='iris.csv',
    header=True,
    inferSchema=True,
)

In [15]:
# Print the schema (column names, types, nullability) that Spark inferred for the CSV.
iris_data.printSchema()

root
 |-- SepalLengthCm: double (nullable = true)
 |-- SepalWidthCm: double (nullable = true)
 |-- PetalLengthCm: double (nullable = true)
 |-- PetalWidthCm: double (nullable = true)
 |-- Species: string (nullable = true)



In [16]:
# Preview the first five rows of the iris data.
iris_data.show(n=5)

+-------------+------------+-------------+------------+-----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+-------------+------------+-------------+------------+-----------+
|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [17]:
# Encode the string 'Species' column as a numeric 'label' column for the classifier.
indexer = StringIndexer(
    inputCol='Species',
    outputCol='label',
)

In [19]:
# Pack the four measurements into a single 'features' vector column.
vectorAssembler = VectorAssembler(
    inputCols=[
        'SepalLengthCm',
        'SepalWidthCm',
        'PetalLengthCm',
        'PetalWidthCm',
    ],
    outputCol='features',
)
# Rescale the assembled vector with StandardScaler.
scaler = StandardScaler(outputCol='scaledFeatures', inputCol='features')
# Logistic regression on the scaled vector, trained against the 'label' column.
classifier = LogisticRegression(labelCol='label', featuresCol='scaledFeatures')

In [20]:
# Chain the stages; they are applied in the order listed.
pipeline = Pipeline(
    stages=[
        indexer,          # 'Species' -> 'label'
        vectorAssembler,  # measurement columns -> 'features'
        scaler,           # 'features' -> 'scaledFeatures'
        classifier,       # 'scaledFeatures' + 'label' -> classification
    ]
)

In [22]:
# 70/30 train/test split of the iris rows, seeded with 42.
training_data, testing_data = iris_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [23]:
# Fit the whole pipeline (indexer, assembler, scaler, classifier) on the training split only.
model = pipeline.fit(training_data)

In [24]:
# Run the fitted pipeline over the held-out split to produce predictions.
predictions = model.transform(testing_data)

In [25]:
# Preview the first five scored rows, including the columns added by each pipeline stage.
predictions.show(n=5)

+-------------+------------+-------------+------------+-----------+-----+-----------------+--------------------+--------------------+--------------------+----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|label|         features|      scaledFeatures|       rawPrediction|         probability|prediction|
+-------------+------------+-------------+------------+-----------+-----+-----------------+--------------------+--------------------+--------------------+----------+
|          4.4|         3.0|          1.3|         0.2|Iris-setosa|  2.0|[4.4,3.0,1.3,0.2]|[5.27150982460498...|[-55.310180102378...|[6.62464940171990...|       2.0|
|          4.6|         3.2|          1.4|         0.2|Iris-setosa|  2.0|[4.6,3.2,1.4,0.2]|[5.51112390754157...|[-58.172271151226...|[2.46968055421705...|       2.0|
|          4.6|         3.6|          1.0|         0.2|Iris-setosa|  2.0|[4.6,3.6,1.0,0.2]|[5.51112390754157...|[-70.656693937905...|[4.71731962522210...|       2.0|
|   

In [26]:
# Accuracy of 'prediction' against the indexed 'label' on the held-out split.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='label',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = ", accuracy)

Accuracy =  0.9782608695652174


In [27]:
# Stop the Spark session used by the exercise.
spark.stop()