In [5]:
from pyspark.sql import SparkSession

In [7]:
# Create (or reuse) the Spark session for this notebook, labelled "Customers".
spark = (
    SparkSession.builder
    .appName("Customers")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/02/15 17:49:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
from pyspark.ml.regression import LinearRegression

In [11]:
# Read the customer CSV, treating the first row as column names and letting
# Spark infer each column's type from the data.
dataset = spark.read.csv(
    "Ecommerce_Customers.csv",
    header=True,
    inferSchema=True,
)
dataset.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.72128268|         3.120178783|         581.852344|
|mstephens@davidso...|14023 Rodriguez P...|       33.33067252|12.79518855|  

In [13]:
# List the column names taken from the CSV header row.
dataset.columns

['Email',
 'Address',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [14]:
# Print each column's name, inferred type, and nullability.
dataset.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [21]:
from pyspark.ml.linalg  import Vectors
from pyspark.ml.feature import VectorAssembler
# Pack the four numeric predictor columns into a single vector column so they
# can be passed to the regression estimator as one features column.
VA = VectorAssembler(
    inputCols=[
        'Avg Session Length',
        'Time on App',
        'Time on Website',
        'Length of Membership',
    ],
    outputCol='Independent Features',
)
data = VA.transform(dataset)
data.show()

+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|               Email|             Address|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent|Independent Features|
+--------------------+--------------------+------------------+-----------+---------------+--------------------+-------------------+--------------------+
|mstephenson@ferna...|835 Frank TunnelW...|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|[34.49726773,12.6...|
|   hduke@hotmail.com|4547 Archer Commo...|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|[31.92627203,11.1...|
|    pallen@yahoo.com|24645 Valerie Uni...|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|[33.00091476,11.3...|
|riverarebecca@gma...|1414 David Throug...|       34.30555663|13.71751367|    36.7

In [27]:
# Keep only the assembled feature vector and the regression target column.
finalized_data = data.select(
    'Independent Features',
    'Yearly Amount Spent',
)
finalized_data

DataFrame[Independent Features: vector, Yearly Amount Spent: double]

In [28]:
# Hold out roughly 30% of the rows for evaluation (the weights are target
# proportions, not exact counts). The fixed seed makes the split repeatable, so
# the fitted coefficients and test predictions do not change from run to run.
train_data, test_data = finalized_data.randomSplit([0.7, 0.3], seed=42)

In [26]:
# Print the schema summary of each split.
# NOTE(review): this cell's execution count (26) is lower than that of the
# cells above it, and its saved output lacks the 'Yearly Amount Spent' column,
# so the output predates the current split — re-run it after the split cell.
print(train_data,test_data)

DataFrame[Independent Features: vector] DataFrame[Independent Features: vector]


In [29]:
# Linear regression of yearly spend on the assembled feature vector, with all
# other hyper-parameters left at their defaults, fitted on the training split.
regression = LinearRegression(
    featuresCol="Independent Features",
    labelCol="Yearly Amount Spent",
)
reg = regression.fit(train_data)

24/02/15 18:06:09 WARN Instrumentation: [acdf73a1] regParam is zero, which might cause numerical instability and overfitting.
24/02/15 18:06:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/02/15 18:06:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/02/15 18:06:11 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


In [30]:
# Fitted weights, one per entry of the 'Independent Features' vector.
reg.coefficients

DenseVector([26.0259, 38.6405, 0.767, 61.7641])

In [31]:
# Fitted intercept (constant term) of the linear model.
reg.intercept

-1073.5134907752288

In [34]:
# Evaluate the fitted model on the held-out split. `pred` is the evaluation
# summary object, not the predictions themselves; its `predictions` DataFrame
# carries the input columns plus a `prediction` column.
pred = reg.evaluate(test_data)
pred.predictions.show()

+--------------------+-------------------+------------------+
|Independent Features|Yearly Amount Spent|        prediction|
+--------------------+-------------------+------------------+
|[30.57436368,11.3...|        442.0644138| 441.1608005736334|
|[31.06621816,11.7...|        448.9332932| 461.0504622334531|
|[31.26064687,13.2...|        421.3266313| 421.0974497910472|
|[31.28344748,12.7...|        591.7810894| 568.8128458752988|
|[31.30919264,11.9...|        432.7207178| 428.7248807414237|
|[31.36621217,11.1...|        430.5888826|426.22369773965806|
|[31.51473786,12.5...|         489.812488|495.48346220421945|
|[31.57020083,13.3...|        545.9454921|  562.804264192745|
|[31.57613197,12.5...|         541.226584| 542.7404152923375|
|[31.65480968,13.0...|        475.2634237|   468.12228326689|
|[31.66104982,11.3...|        416.3583536|416.53347131776195|
|[31.6739155,12.32...|        475.7250679| 501.6409215700571|
|[31.8530748,12.14...|        459.2851235| 461.2159331607777|
|[31.862

                                                                                