In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import os

# Connect to the Spark master at spark-master:7077; getOrCreate() hands back
# an already-running session instead of building a second one.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("Spark master")
    .getOrCreate()
)
# Keep a direct handle on the session's SparkContext.
sc = spark.sparkContext

In [2]:
# Load the employee records from the Parquet dataset next to the notebook.
df = spark.read.format("parquet").load("employees.parquet")

# Features

In [3]:
from pyspark.ml.feature import VectorAssembler

In [4]:
# Pack the single predictor column `wiek` (Polish: "age") into the vector
# column `features`, the input format Spark ML estimators expect.
va = VectorAssembler(
    inputCols=["wiek"],
    outputCol="features",
)

In [5]:
# Add the assembled `features` column to the employee data.
reg_df = va.transform(dataset=df)

In [8]:
# Peek at the model input (features) next to the regression label.
reg_df.select("features", "wynagrodzenie").show(n=3)

+--------+-------------+
|features|wynagrodzenie|
+--------+-------------+
|  [41.0]|         7700|
|  [28.0]|         5400|
|  [28.0]|         6000|
+--------+-------------+
only showing top 3 rows



# Model

https://spark.apache.org/docs/latest/ml-classification-regression.html
https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.regression.LinearRegression

In [10]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

## train vs test

In [11]:
# Fixed seed so that Restart & Run All reproduces the same train/test split
# (and therefore the same coefficients and metrics) for unchanged input data.
SPLIT_SEED = 42

# Randomly hold out roughly 30% of the rows for evaluation; the weights are
# proportions, so the resulting sizes are approximate.
train_df, test_df = reg_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Linear regression of `wynagrodzenie` (Polish: "salary") on the assembled
# `features` vector, using the estimator's default hyper-parameters.
lr = LinearRegression(featuresCol="features", labelCol="wynagrodzenie")

# Fit on the training split only; the test split stays unseen until evaluation.
lr_model = lr.fit(train_df)

In [12]:
# Report the fitted slope vector and the intercept of the regression line.
print(f"Coefficients: {lr_model.coefficients!s}")
print(f"Intercept: {lr_model.intercept!s}")

Coefficients: [199.39790091388716]
Intercept: 31.94857725223344


$$\hat{Y} = X_{1}\beta_{1} + \beta_{0}$$
```md
$$\hat{Y} = X_{1}\beta_{1} + \beta_{0}$$
```

$$\widehat{\text{wynagrodzenie}} = 199.3979 \cdot \text{wiek} + 31.9486$$

In [13]:
# Summary object of the fitted model, computed on the training data; later
# cells read its metrics (e.g. `r2`).
trainingSummary = lr_model.summary

In [17]:
# Display the concrete class of the summary object.
type(trainingSummary)

pyspark.ml.regression.LinearRegressionTrainingSummary

https://spark.apache.org/docs/2.3.2/api/java/org/apache/spark/ml/regression/LinearRegressionTrainingSummary.html

In [19]:
# Coefficient of determination taken from the training summary.
print("R2: %f" % (trainingSummary.r2,))

R2: 0.981456


In [20]:
# Score the held-out split; the result carries an added `prediction` column.
lr_predictions = lr_model.transform(dataset=test_df)

In [21]:
# Compare predicted and actual salary side by side for the first few test rows.
lr_predictions.select("prediction", "wynagrodzenie", "features").show(n=10)

+-----------------+-------------+--------+
|       prediction|wynagrodzenie|features|
+-----------------+-------------+--------+
|5016.896100099412|         4600|  [25.0]|
|5016.896100099412|         4600|  [25.0]|
|5016.896100099412|         4600|  [25.0]|
|5016.896100099412|         5200|  [25.0]|
|5016.896100099412|         5500|  [25.0]|
|5216.294001013299|         5100|  [26.0]|
|5216.294001013299|         5300|  [26.0]|
|5415.691901927186|         5200|  [27.0]|
|5415.691901927186|         5400|  [27.0]|
|5415.691901927186|         5600|  [27.0]|
+-----------------+-------------+--------+
only showing top 10 rows



## Evaluation on test data

In [22]:
from pyspark.ml.evaluation import RegressionEvaluator

In [23]:
# Evaluator that compares the `prediction` column with the true label and
# reports the coefficient of determination (R^2).
lr_evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="wynagrodzenie",
    predictionCol="prediction",
)

# R^2 of the model's predictions on the held-out test split.
print("R2 on test data = %g" % lr_evaluator.evaluate(lr_predictions))

R2 on test data = 0.981251


In [24]:
# Shut down through the session that owns the context: SparkSession.stop()
# stops the underlying SparkContext (`sc`) as well, so no separate sc.stop()
# call is needed.
spark.stop()