In [1]:
from pyspark.sql.session import SparkSession
from pyspark.context import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [16]:
# Obtain the notebook's Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .master("local")      # run Spark locally rather than on a cluster
    .appName("History")
    .getOrCreate()
)

In [17]:
# Read the customer CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("EcommerceCustomers.csv")
)
# Preview the first five rows.
df.show(5)

+------------------+-----------+---------------+--------------------+-------------------+
|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+------------------+-----------+---------------+--------------------+-------------------+
|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|       33.33067252|12.79518855|     37.5366533|         4.446308318|         599.406092|
+------------------+-----------+---------------+--------------------+-------------------+
only showing top 5 rows



In [27]:
# Predictor columns that get packed into the single "features" vector column.
feature_columns = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
]
assembler_vector = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [28]:
# Append the assembled "features" vector column to the raw frame.
# The assembler defined in this notebook is `assembler_vector`; the previous
# reference to an undefined `assembler` raised NameError on a fresh kernel,
# and the extra transform whose result was thrown away has been dropped.
output = assembler_vector.transform(df)
output.show()

# Keep only what the regression needs: the target renamed to "label",
# plus the assembled "features" vector.
df2 = output.select(col('Yearly Amount Spent').alias('label'), "features")
df2.show()

+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|            features|
+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|[34.30555663,13.7...|
|       33.33067252|12.79518855|     37.5366533|         4.446308318|         599.406092|[33.33067252,12.7...|
|       33.87103788|12.02692534|    34.47687763|         5.493507201|        637.1024479|[33.87103788,12.0...|
|

In [29]:
# Configure the linear regression and fit it on the label/features frame in
# one step, so `fit_model` is only ever bound to the fitted model.
fit_model = LinearRegression(featuresCol="features", labelCol="label").fit(df2)
model_result = fit_model.summary

# Learned parameters: one coefficient per assembled feature, then the intercept.
print(fit_model.coefficients)
print(fit_model.intercept)

# Fit-quality metrics taken from the model summary: RMSE, then R^2.
print(model_result.rootMeanSquaredError)
print(model_result.r2)

[25.734271083497525,38.70915381360397,0.4367388283127819,61.57732374979357]
-1051.5942549969473
9.923256786178925
0.9843155370195906
