In [16]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



### Building the Spark application

In [4]:
# Create (or reuse) a local SparkSession; `spark` is the entry point used to read the dataset.
# NOTE(review): "spark.some.config.option" looks like a placeholder copied from the
# Spark docs — confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("E-Commerce Customer")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)


### Reading dataset

In [26]:
# Load the customer CSV, treating the first line as the header row.
# No schema is supplied or inferred, so every column is read as a string.
# NOTE(review): absolute local path — consider a configurable data directory.
dataset = spark.read.option("header", True).csv('/home/hasan/DATA SET/Ecommerce_Customers.csv')

In [27]:
# Bare expression: displays the DataFrame's repr (column names and their types).
dataset

DataFrame[Email: string, Address: string, Avg Session Length: string, Time on App: string, Time on Website: string, Length of Membership: string, Yearly Amount Spent: string]

In [28]:
# Peek at the first row to sanity-check the parsed values.
# NOTE(review): the recorded output exposes customer emails and addresses —
# clear or redact it before sharing the notebook.
dataset.head()

Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avg Session Length='34.49726773', Time on App='12.65565115', Time on Website='39.57766802', Length of Membership='4.082620633', Yearly Amount Spent='587.951054')

In [29]:
# Tabular preview of the first five rows.
dataset.show(5)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [30]:
# Print the column types; in the recorded output every column is still a string.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: string (nullable = true)
 |-- Time on App: string (nullable = true)
 |-- Time on Website: string (nullable = true)
 |-- Length of Membership: string (nullable = true)
 |-- Yearly Amount Spent: string (nullable = true)



### Changing data types

In [32]:
from pyspark.sql.types import DoubleType, IntegerType

In [36]:
# The numeric columns were read as strings, so cast each one to double.
# withColumn with an existing name replaces that column in place.
numeric_columns = [
    "Avg Session Length",
    "Time on App",
    "Time on Website",
    "Length of Membership",
    "Yearly Amount Spent",
]
for column_name in numeric_columns:
    dataset = dataset.withColumn(column_name, dataset[column_name].cast(DoubleType()))



### VectorAssembler

In [44]:
# Pack the four numeric predictor columns into one vector column,
# 'Independent Features', which the regression below uses as its features column.
featueAssembler = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='Independent Features',
)


In [45]:
# Append the assembled 'Independent Features' vector column to the dataset.
output = featueAssembler.transform(dataset)
# Preview the first five rows, including the new column.
output.show(5)

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [47]:
# Show only the assembled feature vectors (the output is limited to the top 20 rows).
output.select('Independent Features').show()

+--------------------+
|Independent Features|
+--------------------+
|[34.49726773,12.6...|
|[31.92627203,11.1...|
|[33.00091476,11.3...|
|[34.30555663,13.7...|
|[33.33067252,12.7...|
|[33.87103788,12.0...|
|[32.0215955,11.36...|
|[32.73914294,12.3...|
|[33.9877729,13.38...|
|[31.93654862,11.8...|
|[33.99257277,13.3...|
|[33.87936082,11.5...|
|[29.53242897,10.9...|
|[33.19033404,12.9...|
|[32.38797585,13.1...|
|[30.73772037,12.6...|
|[32.1253869,11.73...|
|[32.33889932,12.0...|
|[32.18781205,14.7...|
|[32.61785606,13.9...|
+--------------------+
only showing top 20 rows



In [48]:
# List the column names to confirm the vector column was added.
output.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 'Independent Features']

### Separating only the necessary features and labels

In [49]:
# Keep just the two columns the model needs: the feature vector and the label.
model_columns = ['Independent Features', 'Yearly Amount Spent']
finalize_data = output.select(*model_columns)
finalize_data.show()

+--------------------+-------------------+
|Independent Features|Yearly Amount Spent|
+--------------------+-------------------+
|[34.49726773,12.6...|         587.951054|
|[31.92627203,11.1...|        392.2049334|
|[33.00091476,11.3...|        487.5475049|
|[34.30555663,13.7...|         581.852344|
|[33.33067252,12.7...|         599.406092|
|[33.87103788,12.0...|        637.1024479|
|[32.0215955,11.36...|        521.5721748|
|[32.73914294,12.3...|        549.9041461|
|[33.9877729,13.38...|         570.200409|
|[31.93654862,11.8...|        427.1993849|
|[33.99257277,13.3...|        492.6060127|
|[33.87936082,11.5...|        522.3374046|
|[29.53242897,10.9...|        408.6403511|
|[33.19033404,12.9...|        573.4158673|
|[32.38797585,13.1...|        470.4527333|
|[30.73772037,12.6...|        461.7807422|
|[32.1253869,11.73...|        457.8476959|
|[32.33889932,12.0...|        407.7045475|
|[32.18781205,14.7...|        452.3156755|
|[32.61785606,13.9...|        605.0610388|
+----------

### Splitting the dataset into training and test sets

In [51]:
# Hold out roughly 25% of the rows for evaluation (randomSplit weights are approximate).
# The fixed seed makes the split repeatable, so the fitted coefficients and the
# evaluation results do not change from one run of the notebook to the next.
train_data, test_data = finalize_data.randomSplit([0.75, 0.25], seed=42)


### Model

In [52]:
# Fit a linear regression of yearly spend on the assembled feature vector.
# `regressor` is bound directly to the fitted model, which later cells inspect and evaluate.
regressor = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Yearly Amount Spent",
).fit(train_data)


In [53]:
# Fitted weights: one per assembled input column (four here).
# NOTE(review): presumably in the same order as the VectorAssembler inputCols — confirm.
regressor.coefficients

DenseVector([25.9995, 38.4272, 0.2773, 61.5652])

In [54]:
# Fitted intercept term of the model.
regressor.intercept

-1051.1978820975382

### Evaluation and prediction

In [59]:
# Evaluate the fitted model on the held-out test split and keep the result for inspection.
pre_result = regressor.evaluate(test_data)

In [61]:
# Show the test rows with the actual yearly spend next to the model's prediction.
pre_result.predictions.show()

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[29.53242897,10.9...|        408.6403511| 397.3376002679188|
|[30.73772037,12.6...|        461.7807422|450.32504009840727|
|[31.06621816,11.7...|        448.9332932|461.33362558817043|
|[31.12809005,13.2...|        557.2526867| 563.5622617969902|
|[31.26810421,12.1...|        423.4705332| 426.9402527449979|
|[31.28344748,12.7...|        591.7810894| 568.9820032625466|
|[31.42522688,13.2...|        530.7667187| 533.7756659527408|
|[31.44597248,12.8...|        484.8769649|481.09958635397425|
|[31.44744649,10.1...|        418.6027421| 426.0779771923678|
|[31.51473786,12.5...|         489.812488|494.14301211437964|
|[31.5261979,12.04...|        409.0945262|417.33296269985703|
|[31.53160448,13.3...|        436.5156057|  431.672360116813|
|[31.57613197,12.5...|         541.226584| 542.4740105630506|
|[31.600