**Import Spark**

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

**Start Spark**

In [2]:
spark = SparkSession.builder.appName('lr_ecommerce_customer').getOrCreate()

**Load Data**

In [3]:
data = spark.read.csv('Ecommerce_Customers.csv',inferSchema=True,
                     header=True)

In [4]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [5]:
for i in data.head(2)[1]:
    print (i)

hduke@hotmail.com
4547 Archer CommonDiazchester, CA 06566-8576
DarkGreen
31.92627202636016
11.109460728682564
37.268958868297744
2.66403418213262
392.2049334443264


In [6]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

**Transform Data**

In [7]:
assembler = VectorAssembler(inputCols=['Avg Session Length','Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent'],
  outputCol='features')

In [8]:
output = assembler.transform(data)

In [9]:
output.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- features: vector (nullable = true)



**Show Data**

In [11]:
output.select('features').show(10)

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
+--------------------+
only showing top 10 rows



In [23]:
output.head(2)[1]

Row(Email='hduke@hotmail.com', Address='4547 Archer CommonDiazchester, CA 06566-8576', Avatar='DarkGreen', Avg Session Length=31.92627202636016, Time on App=11.109460728682564, Time on Website=37.268958868297744, Length of Membership=2.66403418213262, Yearly Amount Spent=392.2049334443264, features=DenseVector([31.9263, 11.1095, 37.269, 2.664, 392.2049]))

In [24]:
final_data = output.select('features','Yearly Amount Spent')

In [26]:
final_data.show(9)

+--------------------+-------------------+
|            features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
|[33.8710378793419...|   637.102447915074|
|[32.0215955013870...|  521.5721747578274|
|[32.7391429383803...|  549.9041461052942|
|[33.9877728956856...|  570.2004089636196|
+--------------------+-------------------+
only showing top 9 rows



In [31]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [32]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                340|
|   mean|  499.0310531348243|
| stddev|   78.1755763315523|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



In [33]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                160|
|   mean| 499.91538164659414|
| stddev|  81.92916624839104|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



**Create model**

In [37]:
lr = LinearRegression(labelCol='Yearly Amount Spent')

In [38]:
lr_model = lr.fit(train_data)

In [39]:
test_results = lr_model.evaluate(test_data)

In [42]:
test_results.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|-1.13686837721616...|
|1.477928890381008...|
|7.389644451905042...|
|2.046363078989088...|
|2.501110429875552...|
|-7.95807864051312...|
|5.684341886080801...|
|7.389644451905042...|
|1.023181539494544...|
|1.591615728102624...|
|2.273736754432320...|
|-1.76214598468504...|
|6.821210263296962...|
|2.842170943040400...|
|8.526512829121202...|
|5.684341886080801...|
|3.979039320256561...|
|2.216893335571512...|
|-9.09494701772928...|
|1.136868377216160...|
+--------------------+
only showing top 20 rows



In [43]:
test_results.rootMeanSquaredError

1.1305804485561844e-12

In [44]:
test_results.r2

1.0

In [45]:
unlabeled_data = test_data.select('features')

In [46]:
unlabeled_data.show(9)

+--------------------+
|            features|
+--------------------+
|[29.5324289670579...|
|[30.3931845423455...|
|[30.4925366965402...|
|[30.8162006488763...|
|[31.1239743499119...|
|[31.1695067987115...|
|[31.4252268808548...|
|[31.4474464941278...|
|[31.5261978982398...|
+--------------------+
only showing top 9 rows



In [48]:
predictions = lr_model.transform(unlabeled_data)

In [49]:
predictions.show(10)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[29.5324289670579...| 408.6403510726286|
|[30.3931845423455...|319.92886980319213|
|[30.4925366965402...| 282.4712457199138|
|[30.8162006488763...|266.08634094846695|
|[31.1239743499119...|486.94705383976327|
|[31.1695067987115...| 427.3565308022936|
|[31.4252268808548...| 530.7667186547613|
|[31.4474464941278...| 418.6027420952233|
|[31.5261978982398...| 409.0945261923368|
|[31.5741380228732...| 544.4092721605853|
+--------------------+------------------+
only showing top 10 rows

