In [1]:
import findspark 
findspark.init()

from pyspark.ml.regression import LinearRegression 
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession 
from pyspark.ml import Pipeline 

from pyspark.ml.evaluation import RegressionEvaluator



In [2]:
# Start (or reuse) the Spark session used by the MPG walkthrough.
spark = (
    SparkSession.builder
    .appName('Model Persistence')
    .getOrCreate()
)

In [3]:
# Load the MPG dataset; the first row holds column names and the column types are inferred.
mpg_data = spark.read.csv(
    path = 'mpg.csv',
    inferSchema = True,
    header = True,
)

In [4]:
# Show the inferred column names and types before choosing feature columns.
mpg_data.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [5]:
# Peek at the first few rows of the raw data.
mpg_data.show(n = 5)

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [6]:
# Pack the six numeric predictor columns into a single 'features' vector column.
# 'MPG' is the label and the string column 'Origin' is left out.
assembler = VectorAssembler(
    outputCol = 'features',
    inputCols = [
        'Cylinders',
        'Engine Disp',
        'Horsepower',
        'Weight',
        'Accelerate',
        'Year',
    ],
)
mpg_transformed_data = assembler.transform(mpg_data)

In [7]:
# Confirm the assembled feature vector sits next to the label column.
(
    mpg_transformed_data
    .select('features', "MPG")
    .show(n = 5)
)

+--------------------+----+
|            features| MPG|
+--------------------+----+
|[8.0,390.0,190.0,...|15.0|
|[6.0,199.0,90.0,2...|21.0|
|[6.0,199.0,97.0,2...|18.0|
|[8.0,304.0,150.0,...|16.0|
|[8.0,455.0,225.0,...|14.0|
+--------------------+----+
only showing top 5 rows



In [8]:
# 70/30 train/test split; the fixed seed keeps the split repeatable between runs.
training_data, testing_data = mpg_transformed_data.randomSplit(
    weights = [0.7, 0.3],
    seed = 42,
)

In [9]:
# Fit a linear regression for MPG, wrapped in a one-stage Pipeline so that the
# fitted result is a PipelineModel.
# NOTE(review): the VectorAssembler runs outside this pipeline, so the fitted
# model expects an already-assembled 'features' column as input.
lr = LinearRegression(
    featuresCol = 'features',
    labelCol = 'MPG',
)
pipeline = Pipeline(
    stages = [lr],
)
model = pipeline.fit(training_data)

In [10]:
# Persist the fitted pipeline to disk, replacing any copy left by an earlier run.
(
    model.write()
    .overwrite()
    .save('./model_storage/')
)

In [11]:
from pyspark.ml.pipeline import PipelineModel

# Read the persisted pipeline back from the directory it was saved to.
# `PipelineModel.load(path)` is shorthand for this `read().load(path)` form.
loaded_model = PipelineModel.read().load('./model_storage/')

In [12]:
# Score the held-out rows with the model that was reloaded from disk.
predictions = loaded_model.transform(dataset = testing_data)


In [13]:
# Score the same rows with the original in-memory model, under its own name.
# Previously this reassigned `predictions`, discarding the reloaded model's
# output, so the result displayed afterwards no longer demonstrated that the
# persisted model works.
in_memory_predictions = model.transform(testing_data)

# Persistence sanity check: the reloaded regression stage must carry exactly
# the parameters of the model that was saved.
assert loaded_model.stages[-1].coefficients == model.stages[-1].coefficients, \
    "loaded model coefficients differ from the fitted model"
assert loaded_model.stages[-1].intercept == model.stages[-1].intercept, \
    "loaded model intercept differs from the fitted model"

In [14]:
# Display the first few predicted MPG values.
(
    predictions
    .select('prediction')
    .show(n = 5)
)

+------------------+
|        prediction|
+------------------+
| 6.683344024048662|
| 8.344953219723493|
|10.043420590827143|
| 5.252194346982389|
|21.473697417345097|
+------------------+
only showing top 5 rows



In [15]:
# Shut down this Spark session; the exercise section creates its own.
spark.stop()

# Exercise

In [16]:
# Fresh session for the exercise; the walkthrough's session was stopped above.
spark = (
    SparkSession.builder
    .appName("Model Persistence Exercise")
    .getOrCreate()
)

In [17]:
# Load the diamonds dataset with a header row and inferred column types.
diamond_data = spark.read.csv(
    path = "diamonds.csv",
    inferSchema = True,
    header = True,
)

In [18]:
# Peek at the first few rows of the raw diamonds data.
diamond_data.show(n = 5)

+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  s|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|  1| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
|  2| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
|  3| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
|  4| 0.29|Premium|    I|    VS2| 62.4| 58.0|  334| 4.2|4.23|2.63|
|  5| 0.31|   Good|    J|    SI2| 63.3| 58.0|  335|4.34|4.35|2.75|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 5 rows



In [19]:
# Build the 'features' vector from three numeric columns; 'price' is the label.
assembler = VectorAssembler(
    outputCol = 'features',
    inputCols = [
        'carat',
        'depth',
        'table',
    ],
)
diamond_transformed_data = assembler.transform(diamond_data)

In [20]:
# Show full-width rows so the assembled feature vector is not truncated.
diamond_transformed_data.show(
    n = 5,
    truncate = False,
)

+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
|s  |carat|cut    |color|clarity|depth|table|price|x   |y   |z   |features        |
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
|1  |0.23 |Ideal  |E    |SI2    |61.5 |55.0 |326  |3.95|3.98|2.43|[0.23,61.5,55.0]|
|2  |0.21 |Premium|E    |SI1    |59.8 |61.0 |326  |3.89|3.84|2.31|[0.21,59.8,61.0]|
|3  |0.23 |Good   |E    |VS1    |56.9 |65.0 |327  |4.05|4.07|2.31|[0.23,56.9,65.0]|
|4  |0.29 |Premium|I    |VS2    |62.4 |58.0 |334  |4.2 |4.23|2.63|[0.29,62.4,58.0]|
|5  |0.31 |Good   |J    |SI2    |63.3 |58.0 |335  |4.34|4.35|2.75|[0.31,63.3,58.0]|
+---+-----+-------+-----+-------+-----+-----+-----+----+----+----+----------------+
only showing top 5 rows



In [21]:
# 70/30 train/test split of the diamonds data, seeded so the split is repeatable.
training_data, testing_data = diamond_transformed_data.randomSplit(
    weights = [0.7, 0.3],
    seed = 42,
)

In [24]:
# Fit a linear regression for 'price' inside a one-stage Pipeline so that the
# fitted result is a PipelineModel.
lr = LinearRegression(
    featuresCol = 'features',
    labelCol = 'price',
)
pipeline = Pipeline(
    stages = [lr],
)
model = pipeline.fit(training_data)

In [25]:
# Persist the fitted diamond pipeline. `.overwrite()` replaces any copy left by
# a previous run; without it, saving to an existing path raises an error and the
# notebook cannot be re-run from the top.
model.write().overwrite().save('diamond_model')

In [26]:
from pyspark.ml.pipeline import PipelineModel

# Read the persisted diamond pipeline back from disk.
# `PipelineModel.load(path)` is shorthand for this `read().load(path)` form.
loaded_model = PipelineModel.read().load("diamond_model")

In [27]:
# Score the held-out diamonds with the model that was reloaded from disk.
predictions = loaded_model.transform(dataset = testing_data)

In [28]:
# Inspect the predictions next to the actual prices.
# NOTE(review): several of the first rows in the saved output have negative
# predicted prices; presumably the three-feature linear fit is too crude for
# small stones. Confirm before relying on this model.
predictions.show(
    n = 5,
    truncate=False,
)

+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
|s  |carat|cut      |color|clarity|depth|table|price|x   |y   |z   |features        |prediction         |
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
|3  |0.23 |Good     |E    |VS1    |56.9 |65.0 |327  |4.05|4.07|2.31|[0.23,56.9,65.0]|-624.0176432223307 |
|7  |0.24 |Very Good|I    |VVS1   |62.3 |57.0 |336  |3.95|3.98|2.47|[0.24,62.3,57.0]|-482.7112646898513 |
|9  |0.22 |Fair     |E    |VS2    |65.1 |61.0 |337  |3.87|3.78|2.49|[0.22,65.1,61.0]|-1463.5189715981141|
|10 |0.23 |Very Good|H    |VS1    |59.4 |61.0 |338  |4.0 |4.05|2.39|[0.23,59.4,61.0]|-563.8070729522533 |
|14 |0.31 |Ideal    |J    |SI2    |62.2 |54.0 |344  |4.35|4.37|2.71|[0.31,62.2,54.0]|396.5773133230159  |
+---+-----+---------+-----+-------+-----+-----+-----+----+----+----+----------------+-------------------+
only showing top 5 rows



In [29]:
# Shut down the exercise's Spark session.
spark.stop()