In [1]:
import findspark
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import corr

In [3]:
sc = SparkContext()
spark = SparkSession(sc)

### Chuẩn bị và chuẩn hóa dữ liệu, xác định inputs, output

In [4]:
# Use Spark to read in the Ecomerce Customers csv file
data = spark.read.csv(["../../Data/Ecommerce_Customers.csv"], header=True, inferSchema=True)

In [5]:
# Print the Schema of the DataFrame
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [6]:
data.show(3)

+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|   Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+---------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|   Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|   Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
+--------------------+--------------------+---------+------------------+----------------

In [7]:
# Import VectorAssembler and Vectors
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [9]:
assembler = VectorAssembler(inputCols=["Avg Session Length", "Time on App", "Time on Website", "Length of Membership"],
                            outputCol="features")

In [10]:
data_pre = assembler.transform(data)

In [11]:
data_pre.select('features').show(2, False)

+--------------------------------------------------------------------------+
|features                                                                  |
+--------------------------------------------------------------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]|
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]|
+--------------------------------------------------------------------------+
only showing top 2 rows



In [12]:
final_data = data_pre.select('features', 'Yearly Amount Spent')

In [13]:
final_data.show(3, truncate=False)

+----------------------------------------------------------------------------+-------------------+
|features                                                                    |Yearly Amount Spent|
+----------------------------------------------------------------------------+-------------------+
|[34.49726772511229,12.65565114916675,39.57766801952616,4.0826206329529615]  |587.9510539684005  |
|[31.92627202636016,11.109460728682564,37.268958868297744,2.66403418213262]  |392.2049334443264  |
|[33.000914755642675,11.330278057777512,37.110597442120856,4.104543202376424]|487.54750486747207 |
+----------------------------------------------------------------------------+-------------------+
only showing top 3 rows



### Chuẩn bị train/test dataset

In [14]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [15]:
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                355|
|   mean| 495.66141184166304|
| stddev|  79.42852076192497|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+



In [16]:
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                145|
|   mean|  508.2566753483105|
| stddev|  78.59209773676399|
|    min| 298.76200786180766|
|    max|  744.2218671047146|
+-------+-------------------+



### Xây dựng model với train dataset

In [17]:
# Create a Linear Regression Model object
lr = LinearRegression(featuresCol='features',
                      labelCol='Yearly Amount Spent',
                      predictionCol='Predict_Yearly Amount Spent')

In [18]:
# Fit the model to the data and call this model lrModel
lrModel = lr.fit(train_data)

In [19]:
# Print the coefficients and intercept for linear regression
print('Coefficients: {}, Intercept: {}'.format(lrModel.coefficients, lrModel.intercept))

Coefficients: [26.139063479429982,38.26568916270118,0.29007241161021946,60.94405642290165], Intercept: -1052.0072548409466


### Đánh giá model với test dataset

In [20]:
test_results = lrModel.evaluate(test_data)

In [21]:
# Interesting results...
test_results.residuals.show(5)

+-------------------+
|          residuals|
+-------------------+
|  11.63891273446859|
| 19.710588216713234|
| -8.606870514542322|
|-0.4723796102163078|
| -4.779150678399276|
+-------------------+
only showing top 5 rows



In [22]:
print('RMSE: {}'.format(test_results.rootMeanSquaredError))
print('MSE: {}'.format(test_results.meanSquaredError))
print('r2: {}'.format(test_results.r2))

RMSE: 9.479169928074818
MSE: 89.85466252531793
r2: 0.9853516615496501


In [23]:
# Check test dataset
test_model = lrModel.transform(test_data)

In [24]:
# Inspect results
test_model.select('Predict_Yearly Amount Spent', 'Yearly Amount Spent').show(5)

+---------------------------+-------------------+
|Predict_Yearly Amount Spent|Yearly Amount Spent|
+---------------------------+-------------------+
|          450.1418294617613|  461.7807421962299|
|          443.8808298112274|  463.5914180279406|
|         417.70139670688013|  409.0945261923378|
|          541.6989635995446|  541.2265839893283|
|         381.11605143532347|  376.3369007569242|
+---------------------------+-------------------+
only showing top 5 rows



### Lưu trữ và tải model

In [25]:
# Save Model
lrModel.save('lrModel_Ecommerce_Customers')

Py4JJavaError: An error occurred while calling o217.save.
: java.io.IOException: Path lrModel_Ecommerce_Customers already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Load model from
lrModel2 = LinearRegressionModel.load('lrModel_Ecommerce_Customers')

### Dự đoán dữ liệu mới

In [None]:
unlabeled_data = test_data.select('features')

In [None]:
predictions = lrModel2.transform(unlabeled_data)

In [None]:
predictions.show(5)