In [1]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Start (or reuse) the Spark session for this notebook.
session_builder = SparkSession.builder.appName("ml_lr_example")
spark = session_builder.getOrCreate()

# Load the customer CSV: the first row holds the column names and the
# column types are inferred from the data.
csv_path = "file:///home/jovyan/work/sample/ecommerce_customers.csv"
df = spark.read.csv(csv_path, header=True, inferSchema=True)

# Inspect the inferred schema.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- avg_session: double (nullable = true)
 |-- time_on_app: double (nullable = true)
 |-- time_on_website: double (nullable = true)
 |-- membership_period: double (nullable = true)
 |-- year_spent: double (nullable = true)



In [3]:
# Peek at the first record; head(1) returns a one-element list of Row objects.
df.head(1)

[Row(name='Ebonie Watson', avg_session=41.39672127013476, time_on_app=12.529094637675133, time_on_website=51.45096842538406, membership_period=4.08272063295296, year_spent=652.6256699049252)]

In [4]:
# List the column names of the loaded frame (same order as the schema).
df.columns

['name',
 'avg_session',
 'time_on_app',
 'time_on_website',
 'membership_period',
 'year_spent']

In [5]:
# pyspark.ml.feature.VectorAssembler(*, inputCols=None, outputCol=None, handleInvalid='error')
# is a feature transformer that merges several columns into one vector column.
feature_cols = [
    'avg_session',
    'time_on_app',
    'time_on_website',
    'membership_period',
]
asbl = VectorAssembler(inputCols=feature_cols, outputCol='feature_vectors')

# Run the assembler over the raw frame; the original columns are kept and
# the vector column is appended.
tr_data = asbl.transform(df)

# The schema should now end with the new vector column.
tr_data.printSchema()

root
 |-- name: string (nullable = true)
 |-- avg_session: double (nullable = true)
 |-- time_on_app: double (nullable = true)
 |-- time_on_website: double (nullable = true)
 |-- membership_period: double (nullable = true)
 |-- year_spent: double (nullable = true)
 |-- feature_vectors: vector (nullable = true)



In [6]:
# Keep only what the regression needs: the assembled features and the label.
model_columns = ['feature_vectors', 'year_spent']
data = tr_data.select(*model_columns)
data.show()

+--------------------+------------------+
|     feature_vectors|        year_spent|
+--------------------+------------------+
|[41.3967212701347...| 652.6256699049252|
|[38.3115264316322...| 435.3474761232019|
|[39.6010977067712...|  541.177730402894|
|[41.1666679557066...| 645.8561018790921|
|[39.9968070283756...|  665.340762170797|
|[40.6452454552103...| 707.1837171857322|
|[38.4259146016643...| 578.9451139811881|
|[39.2869715260563...| 610.3936021768764|
|[40.7853274748227...| 743.9224539496183|
|[38.3238583421386...| 474.1913172338142|
|[40.7910873299444...| 546.7926741169767|
|[40.655232989766,...|  690.794519113699|
|[35.4389147604694...| 453.5907896906171|
|[39.8284008524672...| 636.4916127378395|
|[38.8655710237846...| 522.2025339640601|
|[36.8852644471538...| 512.5766238378153|
|[38.5504642767453...| 508.2109424988935|
|[38.8066791876806...|452.55204777999387|
|[38.6253744551186...| 502.0703997828389|
|[39.1414272753881...| 671.6177530734301|
+--------------------+------------------+

In [7]:
# Split 70/30 into training and test sets. The seed is fixed so that the
# split -- and every metric computed from it in later cells -- is the same
# on each Restart & Run All instead of changing with every execution.
train, test = data.randomSplit([0.7, 0.3], seed=42)

# Confirm the row count (and label distribution) of each split.
train.describe().show()
test.describe().show()

+-------+-----------------+
|summary|       year_spent|
+-------+-----------------+
|  count|              346|
|   mean|554.3986963565693|
| stddev|91.62718547620463|
|    min|284.9043463419622|
|    max|849.7254927521092|
+-------+-----------------+

+-------+-----------------+
|summary|       year_spent|
+-------+-----------------+
|  count|              154|
|   mean|555.3204045074345|
| stddev|81.60362313180785|
|    min|331.6258287266069|
|    max|778.0179716953096|
+-------+-----------------+



In [8]:
# Linear regression estimator: the assembled vector is the input and the
# yearly spend is the target.
lr = LinearRegression(labelCol='year_spent', featuresCol='feature_vectors')

# Fit on the training split only.
lr_model = lr.fit(train)

# Score the held-out split; the returned summary carries the test metrics.
test_output = lr_model.evaluate(test)

# Residuals: actual label minus predicted value, one row per test record.
test_output.residuals.show()

+-------------------+
|          residuals|
+-------------------+
|-13.018712198954631|
| -6.273895306160512|
|-13.895159855309828|
|  25.41519924586987|
| -5.924419421945572|
|-11.548280546710828|
|-19.606939837567893|
| 19.868344484138674|
|  18.60848989422425|
|-29.554095237940146|
|-2.6094229988851225|
| -7.035579759902021|
|-21.026505644483336|
|  9.351672542600795|
|-3.1846417275947942|
|  14.10572862552442|
| -6.417188481901064|
|  -9.72666752549344|
|0.36346898319277443|
| 23.539713334448038|
+-------------------+
only showing top 20 rows



In [9]:
# Root mean squared error of the fitted model on the held-out test split.
print(test_output.rootMeanSquaredError)

14.566620376625236


In [10]:
# Coefficient of determination (R^2) of the fitted model on the test split.
print(test_output.r2)

0.9679278546593447


In [11]:
# Summary statistics for the full (unsplit) data set; the mean and stddev of
# year_spent give a scale against which the test RMSE can be judged.
data.describe().show()

+-------+-----------------+
|summary|       year_spent|
+-------+-----------------+
|  count|              500|
|   mean| 554.682582467036|
| stddev|88.58045911052957|
|    min|284.9043463419622|
|    max|849.7254927521092|
+-------+-----------------+



In [12]:
# Apply the fitted model to the test split; transform() appends a
# 'prediction' column next to the features and the actual label.
predictions = lr_model.transform(test)
predictions.show()

+--------------------+------------------+------------------+
|     feature_vectors|        year_spent|        prediction|
+--------------------+------------------+------------------+
|[36.4718214508146...|355.12104548154537|    368.1397576805|
|[37.2735901880593...| 541.1865584442712| 547.4604537504317|
|[37.2794617939651...| 498.3159554605182|  512.211115315828|
|[37.5401369712698...| 656.8770092624916| 631.4618100166217|
|[37.8309029003618...| 492.8018457589691|498.72626518091465|
|[37.8314374778877...| 454.0949240734952|  465.643204620206|
|[37.8842409951843...| 605.9994962769596| 625.6064361145275|
|[37.9206146403639...| 531.8818651551177|  512.013520670979|
|[37.9318074880675...|493.44556011272994| 474.8370702185057|
|[38.00869860393,1...|  528.054825379968| 557.6089206179081|
|[38.0659828326108...|386.03238856137904|388.64181156026416|
|[38.0690430286142...| 558.7605548896368| 565.7961346495388|
|[38.1797140010391...| 556.2459655690583| 577.2724712135416|
|[38.2215037543300...| 5