In [38]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression

## Set seed
seed = 42

In [39]:
## Create Spark Session
spark = SparkSession.builder.appName('lrExample').getOrCreate()

In [41]:
## Load Data - Note CSV is in libsvm format so must account for that
df = spark.read.format('libsvm').load('gs://spark-training-data/datasets/sample_linear_regression_data.txt')
df.show(5)

21/11/11 01:43:15 WARN org.apache.spark.ml.source.libsvm.LibSVMFileFormat: 'numFeatures' option not specified, determining the number of features by going though the input. If you know the number in advance, please specify it via 'numFeatures' option to avoid the extra scan.


+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+
only showing top 5 rows



In [42]:
## Split into train / test data
train_data, test_data = df.randomSplit([0.7, 0.3], seed=seed)

In [43]:
## Show train & test data
train_data.describe().show()
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                375|
|   mean|  0.724113021004657|
| stddev|  9.982212130320981|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                126|
|   mean|-1.1336593885480712|
| stddev| 11.184561422877946|
|    min|-26.805483428483072|
|    max|  23.52945433069272|
+-------+-------------------+



In [44]:
## Setup Model Object & Fit
lr = LinearRegression(featuresCol='features', labelCol='label', predictionCol='prediction')
lrModel = lr.fit(train_data)

21/11/11 01:43:21 WARN org.apache.spark.ml.util.Instrumentation: [dc9a7acc] regParam is zero, which might cause numerical instability and overfitting.


In [45]:
## Show Model Values
print(f'Coeffs: {lrModel.coefficients}')
print(f'Intercept: {lrModel.intercept}')
print(f'R^2: {lrModel.summary.r2}')

Coeffs: [0.5918059154882541,1.5074485921726686,-2.092983672586816,3.2111914129063464,0.8256578885505846,1.8989010267402955,-0.06002242466251009,-0.922117578233245,-0.5657647948846599,1.147004045732437]
Intercept: 0.5292901287039765
R^2: 0.07273225877410616


In [49]:
## Evaluate Results
test_results = lrModel.evaluate(test_data)
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|-28.305902730922302|
|-29.490547492772325|
| -22.86936529151847|
|-21.527682553818114|
|-19.687660427789638|
+-------------------+
only showing top 5 rows



In [51]:
## Run model on unlabeled data
unlabeled_data = test_data.select('features')
unlabeled_data.show(5)

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 5 rows



In [52]:
## Predict Results
predictions = lrModel.transform(unlabeled_data)
predictions.show(5)

+--------------------+--------------------+
|            features|          prediction|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|   1.500419302439231|
|(10,[0,1,2,3,4,5,...|   6.540721556576252|
|(10,[0,1,2,3,4,5,...|  1.4369775273526635|
|(10,[0,1,2,3,4,5,...|  1.3156052948594428|
|(10,[0,1,2,3,4,5,...|-0.09510236182489817|
+--------------------+--------------------+
only showing top 5 rows



In [54]:
## Show predictions from eval - Predictions should match
test_results.predictions.show(5)

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
|-26.805483428483072|(10,[0,1,2,3,4,5,...|   1.500419302439231|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|   6.540721556576252|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|  1.4369775273526635|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|  1.3156052948594428|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|-0.09510236182489817|
+-------------------+--------------------+--------------------+
only showing top 5 rows



In [65]:
## Save model to GCS
# LinearRegression.save(lrModel, 'gs://spark-training-data/ml_models/sample_model.model')
lrModel.save('gs://spark-training-data/ml_models/sample_model.model')

                                                                                

In [67]:
## Load model
sameModel = LinearRegressionModel.load('gs://spark-training-data/ml_models/sample_model.model')

In [74]:
## Check Saves Model matches initial model
test_results_same = sameModel.evaluate(test_data)
test_results_same.predictions.show(5)

test_results.predictions.show(5)

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
|-26.805483428483072|(10,[0,1,2,3,4,5,...|   1.500419302439231|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|   6.540721556576252|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|  1.4369775273526635|
|-20.212077258958672|(10,[0,1,2,3,4,5,...|  1.3156052948594428|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|-0.09510236182489817|
+-------------------+--------------------+--------------------+
only showing top 5 rows

+-------------------+--------------------+--------------------+
|              label|            features|          prediction|
+-------------------+--------------------+--------------------+
|-26.805483428483072|(10,[0,1,2,3,4,5,...|   1.500419302439231|
|-22.949825936196074|(10,[0,1,2,3,4,5,...|   6.540721556576252|
|-21.432387764165806|(10,[0,1,2,3,4,5,...|  1.4369775273526635|
|-20.2120772589