In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Initialise findspark before the pyspark imports in the following cell.
# NOTE(review): presumably this is what makes the local Spark installation
# importable in this environment — confirm before removing.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel

In [4]:
# Create (or reuse) the Spark session used to load mpg.csv and fit the regression.
spark = (
    SparkSession.builder
    .appName("ML Pipeline")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/04 10:09:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the MPG data: the first CSV line holds column names, and column types
# are inferred from the values.
mpg = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('mpg.csv')
)
# Preview the first five rows.
mpg.show(n=5)

                                                                                

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [6]:
# Record the row count of the freshly loaded data, before the duplicate and
# null removal done in the following cells.
count_mpg1 = mpg.count()
print(count_mpg1)

392


In [7]:
# Remove duplicate rows, then print how many rows remain.
mpg = mpg.dropDuplicates()
print(mpg.count())

[Stage 6:>                                                          (0 + 1) / 1]

392


                                                                                

In [8]:
# Remove rows with missing values, then print how many rows remain.
mpg = mpg.dropna()
print(mpg.count())

392


In [9]:
# Predictor columns for the MPG regression.
mpg_feature_cols = ['Cylinders', 'Engine Disp', 'Horsepower', 'Weight']

# Stage 1: pack the predictor columns into a single vector column.
assembler = VectorAssembler(
    inputCols=mpg_feature_cols,
    outputCol='features',
)
# Stage 2: scale the assembled vector.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
)
# Stage 3: linear regression of MPG on the scaled features.
lr = LinearRegression(
    featuresCol='scaled_features',
    labelCol='MPG',
)

In [10]:
# Chain the three stages so they are fitted and applied together, in order.
pipeline = Pipeline(
    stages=[
        assembler,  # raw columns -> features
        scaler,     # features -> scaled_features
        lr,         # scaled_features -> prediction of MPG
    ]
)

In [11]:
# 70/30 train/test split of the cleaned MPG data, with a fixed seed.
train_data, test_data = mpg.randomSplit(weights=[0.7, 0.3], seed=42)

In [12]:
# Fit every pipeline stage (assembler, scaler, linear regression) on the
# training split only.
pipeline_model = pipeline.fit(train_data)

23/12/04 10:25:08 WARN Instrumentation: [57513cab] regParam is zero, which might cause numerical instability and overfitting.
23/12/04 10:25:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/04 10:25:08 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [13]:
# Run the fitted pipeline on the held-out test split; the evaluators in the
# following cells read the resulting 'prediction' column.
predictions = pipeline_model.transform(test_data)

In [14]:
# Root-mean-squared error of the predicted vs. actual MPG on the test split.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='MPG',
    predictionCol='prediction',
)
rmse = evaluator.evaluate(predictions)
print(rmse)

3.862531076317451


In [15]:
# R-squared of the predicted vs. actual MPG on the test split.
evaluator = RegressionEvaluator(
    metricName='r2',
    labelCol='MPG',
    predictionCol='prediction',
)
r2 = evaluator.evaluate(predictions)
print(r2)

0.7555406476203342


In [16]:
# Stop the Spark session now that the MPG regression metrics are computed;
# the next cell builds a session again for the iris data.
spark.stop()

In [17]:
# Create (or reuse) the Spark session used for the iris classification.
spark = (
    SparkSession.builder
    .appName('ML Pipeline')
    .getOrCreate()
)

In [18]:
# Read the iris data: the first CSV line holds column names, and column types
# are inferred from the values.
iris = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('iris.csv')
)
# Preview the first five rows.
iris.show(n=5)

+-------------+------------+-------------+------------+-----------+
|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+-------------+------------+-------------+------------+-----------+
|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
|          4.6|         3.1|          1.5|         0.2|Iris-setosa|
|          5.0|         3.6|          1.4|         0.2|Iris-setosa|
+-------------+------------+-------------+------------+-----------+
only showing top 5 rows



In [20]:
# Display every column name except the last one (Species, the label);
# these four measurements are the inputs passed to the VectorAssembler.
iris.columns[:-1]

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']

In [21]:
# Name the label column once so the indexer and the feature list stay in sync.
label_col = 'Species'
# Select features by name rather than by position: slicing off "the last
# column" silently feeds the wrong columns to the model if the CSV's column
# order ever changes.
feature_cols = [col for col in iris.columns if col != label_col]

# Stage 1: encode the string species label as a numeric index.
indexer = StringIndexer(inputCol=label_col, outputCol='species_index')
# Stage 2: pack the measurement columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# Stage 3: scale the assembled vector.
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
# Stage 4: logistic regression on the scaled features, predicting the indexed label.
classifier = LogisticRegression(featuresCol='scaled_features', labelCol='species_index')

In [22]:
# 70/30 train/test split of the iris data, with a fixed seed.
train_data, test_data = iris.randomSplit(weights=[0.7, 0.3], seed=42)

In [23]:
# Chain the four stages so they are fitted and applied together, in order.
pipeline = Pipeline(
    stages=[
        indexer,     # Species -> species_index
        assembler,   # measurement columns -> features
        scaler,      # features -> scaled_features
        classifier,  # scaled_features -> prediction
    ]
)
# Fit all stages on the training split only.
pipeline_model = pipeline.fit(train_data)

                                                                                

In [24]:
# Run the fitted pipeline on the held-out test split.
predictions = pipeline_model.transform(test_data)

In [25]:
# Show the true label index next to the predicted one for a visual check.
(
    predictions
    .select('species_index', 'prediction')
    .show()
)

+-------------+----------+
|species_index|prediction|
+-------------+----------+
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          1.0|       1.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          1.0|       1.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
|          2.0|       2.0|
+-------------+----------+
only showing top 20 rows



In [26]:
# Accuracy of the predicted class index on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='species_index',
    predictionCol='prediction',
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy = ", accuracy)

Accuracy =  0.9782608695652174


In [27]:
# Weighted precision of the predicted class index on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedPrecision',
    labelCol='species_index',
    predictionCol='prediction',
)
precision = evaluator.evaluate(predictions)
print("Precision = ", precision)

Precision =  0.9804347826086957


In [28]:
# Weighted recall of the predicted class index on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName='weightedRecall',
    labelCol='species_index',
    predictionCol='prediction',
)
Recall = evaluator.evaluate(predictions)
print("Recall = ", Recall)

Recall =  0.9782608695652174


In [29]:
# F1 score of the predicted class index on the test split.
evaluator = MulticlassClassificationEvaluator(
    metricName='f1',
    labelCol='species_index',
    predictionCol='prediction',
)
F1_score = evaluator.evaluate(predictions)
print("F1_score = ", F1_score)

F1_score =  0.9784581393513769


In [30]:
# Stop the Spark session once the iris classification metrics have been computed.
spark.stop()