In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook under the app name 'linearReg'.
spark = (
    SparkSession.builder
    .appName('linearReg')
    .getOrCreate()
)

In [3]:
from pyspark.ml.regression import LinearRegression
# Load the customer CSV: the first row is the header and column types are
# inferred from the data rather than all being read as strings.
df = spark.read.csv(
    'FileStore/tables/Ecommerce_Customers.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Inspect the column names and the types produced by inferSchema.
df.printSchema()

In [5]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [6]:
# List the available column names (the assembler's inputCols must match these).
df.columns

In [7]:
# Combine the four predictor columns into a single vector column named
# 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
)

In [8]:
# Apply the assembler to the raw frame; the result carries the assembler's
# output column ('features') in addition to the input data.
output = assembler.transform(df)

In [9]:
# Inspect the schema of the assembled frame.
output.printSchema()

In [10]:
# Peek at the first assembled row to sanity-check the result.
output.head(1)

In [11]:
# Keep only the two columns used for modelling: the assembled feature vector
# and 'Yearly Amount Spent'.
final_data = output.select('features', 'Yearly Amount Spent')

In [12]:
# Preview the two-column modelling frame.
final_data.show()

In [13]:
# Split roughly 70/30 into train and test sets. The fixed seed keeps the split,
# and therefore every metric computed from it, the same across
# Restart Kernel -> Run All; without it each run samples different rows.
train_data, test_data = final_data.randomSplit([0.7, 0.3], seed=42)

In [14]:
# Summary statistics for the training split.
train_data.describe().show()

In [15]:
# Summary statistics for the test split, for comparison with the training split.
test_data.describe().show()

In [16]:
# Linear-regression estimator that predicts 'Yearly Amount Spent'.
# featuresCol is spelled out for readability; 'features' is also the
# library default, so this matches the column the assembler produces.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Yearly Amount Spent',
)

In [17]:
# Fit the regression on the training split only.
lr_model = lr.fit(train_data)

In [18]:
# Evaluate the fitted model on the held-out test split; the result object
# exposes the residuals, RMSE and r2 read in the following cells.
test_result = lr_model.evaluate(test_data)

In [19]:
# Display the residuals from the test-split evaluation.
test_result.residuals.show()

In [20]:
# Root-mean-squared error on the test split.
test_result.rootMeanSquaredError

In [21]:
# r2 (coefficient of determination) on the test split.
test_result.r2

In [22]:
# Summary statistics for the full (unsplit) modelling frame; gives context for
# judging the size of the RMSE against the scale of 'Yearly Amount Spent'.
final_data.describe().show()

In [23]:
# Keep only the feature vector from the test split, i.e. drop the label
# column, so the frame looks like unlabelled data.
pred_data = test_data.select('features')

In [24]:
# Run the fitted model over the feature-only frame to produce predictions.
predictions = lr_model.transform(pred_data)

In [25]:
# Preview the model's output for the feature-only test rows.
predictions.show()