In [1]:
import findspark
# Keep findspark.init() ahead of the pyspark imports below (findspark's documented usage).
findspark.init()

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession



In [2]:
# Get (or create) the Spark session that every later cell uses.
session_builder = SparkSession.builder.appName('Model Persistence')
spark = session_builder.getOrCreate()

In [3]:
# Read the MPG dataset: the first row supplies the column names and Spark
# infers each column's type from the data.
mpg_data = spark.read.csv(
    'mpg.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the inferred column names and types to check how the CSV was parsed.
mpg_data.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [5]:
# Preview the first five rows of the raw data.
mpg_data.show(n=5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [6]:
# Numeric predictor columns packed into the single vector column that the
# regression reads; MPG (the label) and Origin (a string) are left out.
feature_columns = [
    'Cylinders',
    'Engine Disp',
    'Horsepower',
    'Weight',
    'Accelerate',
    'Year',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')
mpg_transformed_data = assembler.transform(mpg_data)

In [7]:
# Preview the assembled feature vectors next to the MPG label.
feature_label_preview = mpg_transformed_data.select('features', 'MPG')
feature_label_preview.show(n=5)

+--------------------+----+
|            features| MPG|
+--------------------+----+
|[8.0,390.0,190.0,...|15.0|
|[6.0,199.0,90.0,2...|21.0|
|[6.0,199.0,97.0,2...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,455.0,225.0,...|14.0|
+--------------------+----+
only showing top 5 rows



In [8]:
# 70/30 train/test split with a fixed seed.
training_data, testing_data = mpg_transformed_data.randomSplit(
    weights=[0.7, 0.3],
    seed=42,
)

In [9]:
# Linear regression on the assembled `features` vector with MPG as the label,
# wrapped in a single-stage Pipeline and fitted on the training split.
lr = LinearRegression(featuresCol='features', labelCol='MPG')
pipeline = Pipeline(stages=[lr])
model = pipeline.fit(training_data)

In [10]:
# Persist the fitted pipeline, overwriting anything already saved at this path.
model_writer = model.write().overwrite()
model_writer.save('./model_storage/')

In [11]:
# Load the fitted pipeline back from the directory it was saved to.
# PipelineModel comes from the notebook's top import cell, so all of the
# notebook's dependencies are declared in one place.
loaded_model = PipelineModel.load('./model_storage/')

In [12]:
# Score the held-out test split with the model that was reloaded from disk.
predictions = loaded_model.transform(testing_data)

In [13]:
# Score the same test split with the original in-memory model, under its own
# name. Assigning this to `predictions` would overwrite the reloaded model's
# output, so the results shown afterwards would never exercise the persisted
# model.
in_memory_predictions = model.transform(testing_data)

In [14]:
# Show the first five predicted MPG values.
prediction_column = predictions.select('prediction')
prediction_column.show(n=5)

+------------------+
|        prediction|
+------------------+
| 6.683344024048662|
| 8.344953219723493|
|10.043420590827143|
| 5.252194346982389|
|21.473697417345097|
+------------------+
only showing top 5 rows



In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()

# Exercise