In [1]:
# Initialise findspark before pyspark is imported in the next cell.
# NOTE(review): findspark.init() is expected to locate the local Spark install
# and make pyspark importable — confirm SPARK_HOME is set on this machine.
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

In [3]:
# Get (or start) the SparkSession for this notebook, registered under the
# application name 'lr_example'.
spark = (
    SparkSession.builder
    .appName('lr_example')
    .getOrCreate()
)

In [5]:
# Load the Ecommerce Customers CSV into a DataFrame; the first line is the
# header and the column types are inferred from the values.
data = spark.read.csv(
    'Cung cap du lieu buoi 5/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Print the schema of the DataFrame (column names, inferred types, nullability)
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [7]:
# Look at the first row to sanity-check the parsed values.
data.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [23]:
# Import the Spark ML / SQL helpers (VectorAssembler, LinearRegression, etc.)
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import corr
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [9]:
# List the column names so the feature columns can be selected by name.
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [15]:
# Pack the four numeric predictor columns into a single vector column named
# 'features', which is what the regression estimator takes as input.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)

In [16]:
# Apply the assembler: the result carries the new 'features' vector column.
data_pre = assembler.transform(data)

In [17]:
# Show the first two feature vectors without truncating the cell contents.
data_pre.select('features').show(n=2, truncate=False)

+--------------------------------------------------------------------------+
|features                                                                  |
+--------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|
+--------------------------------------------------------------------------+
only showing top 2 rows



In [18]:
# Keep only what the model needs: the feature vector and the label column.
final_data = data_pre.select('features', 'Yearly Amount Spent')

In [19]:
# Randomly split the rows roughly 70% / 30% into training and test sets.
# A fixed seed makes the split repeatable across re-runs of the notebook, so
# the coefficients and metrics reported below can be reproduced.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [20]:
# Summary statistics for the training split (the output lists only the label
# column, 'Yearly Amount Spent').
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                345|
|   mean|  498.7911476143517|
| stddev|   80.6747371139875|
|    min|   266.086340948469|
|    max|  765.5184619388373|
+-------+-------------------+



In [21]:
# Summary statistics for the test split, for comparison with the training
# split (the output lists only the label column, 'Yearly Amount Spent').
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                155|
|   mean|  500.4778916280268|
| stddev|  76.44402178447433|
|    min| 256.67058229005585|
|    max|  712.3963268096637|
+-------+-------------------+



In [24]:
# Configure the linear regression estimator: the input vector column, the
# label column, and the name of the column that will hold the predictions.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
    predictionCol='Predict_Yearly_Amount_Spent',
)

# Train on the training split; the fitted model is kept as lrModel.
lrModel = lr.fit(train_data)

# Report the learned coefficients and the intercept.
print('Coefficients: {} Intercept: {}'.format(lrModel.coefficients,
                                              lrModel.intercept))

Coefficients: [26.02046779132627,38.865890353101676,0.6312124528277805,61.35192088113451] Intercept: -1069.2487316011834


In [25]:
# Evaluate the fitted model on the held-out test split; the returned summary
# provides the residuals and error metrics used in the following cells.
test_results = lrModel.evaluate(test_data)

In [26]:
# Inspect the first five residuals on the test split (actual label minus
# predicted value).
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
| 11.544904302676741|
| 11.217733580402523|
|-3.4701749921925398|
|   7.70328970740843|
| -5.433386912038316|
+-------------------+
only showing top 5 rows



In [27]:
# Test-set error metrics: root mean squared error, mean squared error, and
# the R^2 score. RMSE is in the same units as 'Yearly Amount Spent'.
print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('r2: {}'.format(test_results.r2))

RMSE: 10.862594105059724
MSE: 117.99595069127827
r2: 0.9796768498453684


In [31]:
# Score the test split with the fitted model.
# NOTE(review): despite its name, test_model is not a model — it is the test
# DataFrame with the 'Predict_Yearly_Amount_Spent' column added.
test_model = lrModel.transform(test_data)

In [33]:
# Compare predicted and actual values side by side (first five rows)
test_model.select('Predict_Yearly_Amount_Spent', 'Yearly Amount Spent').show(5)

+---------------------------+-------------------+
|Predict_Yearly_Amount_Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|         397.09544676995074|  408.6403510726275|
|         450.56300861582736|  461.7807421962299|
|          493.6767749770472|  490.2065999848547|
|          486.9353200494843|  494.6386097568927|
|          492.9888449699399|  487.5554580579016|
+---------------------------+-------------------+
only showing top 5 rows



In [34]:
# Persist the fitted model to disk. Going through write().overwrite() lets
# this cell be re-run: a plain save() raises an error when the target path
# already exists from a previous run.
lrModel.write().overwrite().save('lrModel_Ecommerce_Customers')

In [35]:
from pyspark.ml.regression import LinearRegressionModel
# Load the saved model back from the path it was written to
lrModel2 = LinearRegressionModel.load('lrModel_Ecommerce_Customers')

In [36]:
# Predict on new data. Here the test split with the label column dropped
# stands in for unlabeled data.
unlabeled_data = test_data.select('features')

In [37]:
# Run the reloaded model on the unlabeled feature vectors.
predictions = lrModel2.transform(unlabeled_data)

In [38]:
# Show the first five predictions next to their feature vectors.
predictions.show(5)

+--------------------+---------------------------+
|            features|Predict_Yearly_Amount_Spent|
+--------------------+---------------------------+
|[29.5324289670579...|         397.09544676995074|
|[30.7377203726281...|         450.56300861582736|
|[30.8794843441274...|          493.6767749770472|
|[30.9716756438877...|          486.9353200494843|
|[31.0613251567161...|          492.9888449699399|
+--------------------+---------------------------+
only showing top 5 rows

