# Linear Regression

- 참고 : https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression
- Colaboratory Features : https://colab.research.google.com/notebooks/basic_features_overview.ipynb

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop2.7.tgz 
!tar xf spark-3.1.2-bin-hadoop2.7.tgz
!pip install -q findspark

import os
import findspark

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop2.7"

findspark.init()
findspark.find()

'/content/spark-3.1.2-bin-hadoop2.7'

## First thing to do is start a Spark Session

In [None]:
from pyspark.sql import SparkSession

# Build the session used by every later cell; the application is named 'lr'.
spark = (
    SparkSession.builder
    .appName('lr')
    .getOrCreate()
)

In [None]:
from pyspark.ml.regression import LinearRegression

## Load training data

In [None]:
# Read the LIBSVM-formatted sample dataset that ships inside the unpacked
# Spark distribution.
training = (
    spark.read
    .format("libsvm")
    .load("/content/spark-3.1.2-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt")
)

In [None]:
# Print the raw sample file so its on-disk text format can be inspected.
! cat /content/spark-3.1.2-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt

In [None]:
# Preview rows of the DataFrame that was just loaded.
training.show()

In [None]:
# Configure the estimator: the columns holding the inputs and the target,
# and the name of the column that will receive the model's output.
lr = LinearRegression(
    featuresCol='features',
    labelCol='label',
    predictionCol='predictions',
)

In [None]:
# Show the built-in documentation for the LinearRegression class.
help(LinearRegression)

## Fit the model

In [None]:
# Fit the configured estimator on the full dataset; the result is the trained model.
lrModel = lr.fit(training)

## Print the coefficients and intercept for linear regression

In [None]:
# Fitted coefficients of the model (displayed as the cell's output).
lrModel.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [None]:
# Fitted intercept term of the model (displayed as the cell's output).
lrModel.intercept

0.14228558260358093

## Summarize the model over the training set and print out some metrics

In [None]:
# Summary of the fit on the training data; exposes metrics such as
# rootMeanSquaredError.
training_summary = lrModel.summary

Tip: press Ctrl+Space (or Cmd+Space on a Mac) in a code cell to bring up autocomplete suggestions, for example after typing `training_summary.` to browse the available metrics.

In [None]:
# Root-mean-squared error measured on the same data the model was fitted on.
training_summary.rootMeanSquaredError

10.16309157133015

## train_data, test_data 데이터셋 분리

In [None]:
# Load the same sample file again; this copy is the one that gets split into
# train and test sets.
all_data = spark.read.format("libsvm").load("/content/spark-3.1.2-bin-hadoop2.7/data/mllib/sample_linear_regression_data.txt")

In [None]:
# Randomly split the rows roughly 70% / 30% into train and test sets.
# A fixed seed keeps the split, and every metric computed from it,
# reproducible from one run to the next.
train_data, test_data = all_data.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Display the two DataFrames produced by the split.
[train_data, test_data]

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [None]:
# Summary statistics (count, mean, stddev, min, max) of the training split.
train_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                343|
|   mean|  0.199562451907194|
| stddev| 10.566158352465362|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



In [None]:
# Summary statistics of the held-out test split, for comparison with the
# training split summarized in the previous cell.
test_data.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                343|
|   mean|  0.199562451907194|
| stddev| 10.566158352465362|
|    min|-28.571478869743427|
|    max|  27.78383192005107|
+-------+-------------------+



## Fit the model

In [None]:
# Refit the same estimator configuration, this time on the training split only.
correct_model = lr.fit(train_data)

## Evaluation

In [None]:
# Evaluate the model on the test split, which was not used for fitting.
test_results = correct_model.evaluate(test_data)

In [None]:
# Root-mean-squared error on the test split.
test_results.rootMeanSquaredError

10.015176854211804

## Prepare unlabeled data

In [None]:
# Keep only the feature column (dropping the label) to mimic data whose
# target value is unknown.
unlabled_data = test_data.select('features')

In [None]:
# Preview the feature-only DataFrame.
unlabled_data.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



## Predict using the model

In [None]:
# Apply the model to the feature-only rows; the result carries a
# 'predictions' column alongside 'features'.
predictions = correct_model.transform(unlabled_data)

In [None]:
# Preview the predicted values.
predictions.show()

+--------------------+--------------------+
|            features|         predictions|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...| -2.4585783351046313|
|(10,[0,1,2,3,4,5,...|  -1.463501473519322|
|(10,[0,1,2,3,4,5,...|  1.7212799361336657|
|(10,[0,1,2,3,4,5,...| -0.9676232838788632|
|(10,[0,1,2,3,4,5,...|  3.8126344891877753|
|(10,[0,1,2,3,4,5,...|-0.21441970179464553|
|(10,[0,1,2,3,4,5,...|  0.6107187104540752|
|(10,[0,1,2,3,4,5,...|   2.550195838136547|
|(10,[0,1,2,3,4,5,...|  1.2093665513537133|
|(10,[0,1,2,3,4,5,...| -3.4890882958471985|
|(10,[0,1,2,3,4,5,...|  -1.672622926130968|
|(10,[0,1,2,3,4,5,...|  0.9226121132023659|
|(10,[0,1,2,3,4,5,...|   4.164055905438106|
|(10,[0,1,2,3,4,5,...| -2.6945335760212736|
|(10,[0,1,2,3,4,5,...| -0.5541664563492996|
|(10,[0,1,2,3,4,5,...| -0.5216936299549758|
|(10,[0,1,2,3,4,5,...|   2.777961552761816|
|(10,[0,1,2,3,4,5,...|   2.226116418746598|
|(10,[0,1,2,3,4,5,...| -1.4643607579629372|
|(10,[0,1,2,3,4,5,...|  1.005476