In [1]:
# Silence all Python-level warnings for the rest of the session.
# Note: Spark's own "WARN" log lines are not affected and still show up in cell outputs.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Runs before the pyspark imports in the next cell.
# findspark.init() locates the local Spark installation and makes `pyspark` importable
# when it is not already on the Python path.
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel

In [4]:
# Create (or reuse) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName('Model Persistence')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/04 14:40:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Read the MPG dataset: the first row supplies the column names and
# column types are inferred from the data.
mpg = spark.read.csv(
    'mpg.csv',
    header=True,
    inferSchema=True,
)
# Peek at the first five rows.
mpg.show(n=5)

                                                                                

+----+---------+-----------+----------+------+----------+----+--------+
| MPG|Cylinders|Engine Disp|Horsepower|Weight|Accelerate|Year|  Origin|
+----+---------+-----------+----------+------+----------+----+--------+
|15.0|        8|      390.0|       190|  3850|       8.5|  70|American|
|21.0|        6|      199.0|        90|  2648|      15.0|  70|American|
|18.0|        6|      199.0|        97|  2774|      15.5|  70|American|
|16.0|        8|      304.0|       150|  3433|      12.0|  70|American|
|14.0|        8|      455.0|       225|  3086|      10.0|  70|American|
+----+---------+-----------+----------+------+----------+----+--------+
only showing top 5 rows



In [6]:
# Print the schema to check the column types that inferSchema produced.
mpg.printSchema()

root
 |-- MPG: double (nullable = true)
 |-- Cylinders: integer (nullable = true)
 |-- Engine Disp: double (nullable = true)
 |-- Horsepower: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Accelerate: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Origin: string (nullable = true)



In [7]:
# Preview the candidate feature columns: every column except the first (MPG) and the last (Origin).
mpg.columns[1:-1]

['Cylinders', 'Engine Disp', 'Horsepower', 'Weight', 'Accelerate', 'Year']

In [8]:
# Build the regression pipeline: assemble features -> standardise -> linear regression.

# Select features by name instead of by position (previously `mpg.columns[1:-1]`).
# The label is referenced by name below, so the feature list should not depend on
# the CSV's column order: a reordered file would otherwise silently put the label,
# or the string-typed `Origin` column, into the feature vector.
label_col = 'MPG'
non_feature_cols = {label_col, 'Origin'}
feature_cols = [name for name in mpg.columns if name not in non_feature_cols]

# Pack the individual feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

# Centre and scale each feature; the statistics are learned when the pipeline is fitted.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaled_features',
    withMean=True,
    withStd=True,
)

# Linear regression on the scaled features, predicting MPG.
lr = LinearRegression(
    featuresCol='scaled_features',
    labelCol=label_col,
    predictionCol='prediction',
)

pipeline = Pipeline(stages=[assembler, scaler, lr])

In [9]:
# 70/30 train/test split with a fixed seed.
train_data, test_data = mpg.randomSplit(weights=[0.7, 0.3], seed=42)

In [10]:
# Fit all pipeline stages (assembler, scaler, linear regression) on the training split only.
pipeline_model = pipeline.fit(train_data)

23/12/04 14:40:40 WARN Instrumentation: [96e5a420] regParam is zero, which might cause numerical instability and overfitting.
23/12/04 14:40:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/12/04 14:40:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [11]:
# Score the held-out test split with the fitted pipeline.
predictions = pipeline_model.transform(test_data)

In [12]:
# Compare the actual MPG with the model's prediction for the first ten test rows.
predictions.select('MPG', 'prediction').show(n=10)

+----+------------------+
| MPG|        prediction|
+----+------------------+
|10.0| 6.683344024049767|
|11.0| 8.344953219723626|
|12.0|10.043420590827163|
|12.0| 5.252194346981181|
|13.0|21.473697417345377|
|13.0|17.421344951368425|
|13.0|11.168080223812227|
|13.0|14.154650234586116|
|13.0| 9.853448998811599|
|13.0|10.994457356404661|
+----+------------------+
only showing top 10 rows



In [13]:
# Root-mean-squared error of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='MPG',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(dataset=predictions)
print(rmse)

3.453104969079314


In [14]:
# Mean absolute error of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='MPG',
    predictionCol='prediction',
    metricName='mae',
)
mae = evaluator.evaluate(dataset=predictions)
print(mae)

2.8423911791950864


In [15]:
# Mean squared error of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='MPG',
    predictionCol='prediction',
    metricName='mse',
)
mse = evaluator.evaluate(dataset=predictions)
print(mse)

11.923933927480249


In [16]:
# R-squared (coefficient of determination) of the predictions on the test split.
evaluator = RegressionEvaluator(
    labelCol='MPG',
    predictionCol='prediction',
    metricName='r2',
)
r2 = evaluator.evaluate(dataset=predictions)
print(r2)

0.8046190375720215


In [17]:
# Make sure the model output directory exists.
# Done in pure Python with exist_ok=True so the cell is idempotent: re-running the
# notebook no longer fails with "File exists", and it does not depend on a shell `mkdir`.
from pathlib import Path

Path('model_storage').mkdir(parents=True, exist_ok=True)

mkdir: model_storage: File exists


In [18]:
# Persist the fitted pipeline to disk, overwriting anything already saved at that path.
(
    pipeline_model
    .write()
    .overwrite()
    .save("./model_storage/")
)

                                                                                

In [19]:
# Reload the fitted pipeline from the directory it was saved to.
loaded_model = PipelineModel.load('./model_storage/')

                                                                                

In [20]:
# Re-score the test split with the reloaded model.
# NOTE(review): this rebinds `predictions`, replacing the in-memory model's output from the earlier cell.
predictions = loaded_model.transform(test_data)

In [21]:
# Show actual vs. predicted MPG from the reloaded model (default 20 rows, values not truncated).
# The first ten rows match the in-memory model's output shown earlier.
predictions.select('MPG','prediction').show(truncate=False)

+----+------------------+
|MPG |prediction        |
+----+------------------+
|10.0|6.683344024049767 |
|11.0|8.344953219723626 |
|12.0|10.043420590827163|
|12.0|5.252194346981181 |
|13.0|21.473697417345377|
|13.0|17.421344951368425|
|13.0|11.168080223812227|
|13.0|14.154650234586116|
|13.0|9.853448998811599 |
|13.0|10.994457356404661|
|13.0|13.0440872179524  |
|13.0|10.85071261940328 |
|13.0|7.0212674929366266|
|13.0|4.102584238936959 |
|13.0|8.389478890011015 |
|14.0|10.241735354704755|
|14.0|15.991415050247074|
|14.0|12.234738750650209|
|14.0|10.678338313921108|
|14.0|10.883442729945425|
+----+------------------+
only showing top 20 rows



In [22]:
# Stop the Spark session now that the notebook is finished with it.
spark.stop()