In [1]:
## Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler # numerical columns to vectors before modeling
from pyspark.sql.types import StructType, StructField, StringType, DoubleType
from pyspark.ml.regression import LinearRegression

## Set seed (passed to randomSplit so the train/test partition is repeatable)
seed = 42

In [2]:
## Start (or attach to) the Spark session used by every cell below
spark = (
    SparkSession.builder
    .appName('lrCodeAlong')
    .getOrCreate()
)

In [3]:
## Explicit CSV schema: three nullable string columns followed by five
## nullable double columns (the last one, yearly_amount_spent, is the label).
schema = StructType(
    [
        StructField(col_name, StringType(), True)
        for col_name in ('email', 'address', 'avatar')
    ]
    + [
        StructField(col_name, DoubleType(), True)
        for col_name in (
            'avg_session_length_min',
            'time_on_app',
            'time_on_site',
            'length_of_membership',
            'yearly_amount_spent',
        )
    ]
)

In [4]:
## Read the customer CSV with the explicit schema; the first line of the
## file is treated as a header and schema inference stays switched off.
df = (
    spark.read
    .schema(schema)
    .options(header=True, inferSchema=False)
    .csv('gs://spark-training-data/datasets/Ecommerce_Customers.csv')
)
df.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+--------------------+----------------+----------------------+------------------+------------------+--------------------+-------------------+
|               email|             address|          avatar|avg_session_length_min|       time_on_app|      time_on_site|length_of_membership|yearly_amount_spent|
+--------------------+--------------------+----------------+----------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet|     34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen|     31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|    33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|

                                                                                

In [5]:
## Confirm Proper Schema & Cols
# printSchema() prints the column tree; the bare `df.columns` expression is
# the cell's displayed value (the list of column names).
df.printSchema()
df.columns

root
 |-- email: string (nullable = true)
 |-- address: string (nullable = true)
 |-- avatar: string (nullable = true)
 |-- avg_session_length_min: double (nullable = true)
 |-- time_on_app: double (nullable = true)
 |-- time_on_site: double (nullable = true)
 |-- length_of_membership: double (nullable = true)
 |-- yearly_amount_spent: double (nullable = true)



['email',
 'address',
 'avatar',
 'avg_session_length_min',
 'time_on_app',
 'time_on_site',
 'length_of_membership',
 'yearly_amount_spent']

In [7]:
## Pack the chosen numeric columns into a single 'features' vector column,
## which is the column the LinearRegression estimator is pointed at later.
# NOTE(review): only two of the four numeric predictor columns declared in the
# schema are assembled; time_on_site and length_of_membership are left out.
# Confirm this is intentional.
assembler = (
    VectorAssembler()
    .setInputCols(['avg_session_length_min', 'time_on_app'])
    .setOutputCol('features')
)
output_features = assembler.transform(df)
output_features.take(1)  # peek at the first row, including the new vector column

                                                                                

[Row(email='mstephenson@fernandez.com', address='835 Frank TunnelWrightmouth, MI 82180-9605', avatar='Violet', avg_session_length_min=34.49726772511229, time_on_app=12.65565114916675, time_on_site=39.57766801952616, length_of_membership=4.0826206329529615, yearly_amount_spent=587.9510539684005, features=DenseVector([34.4973, 12.6557]))]

In [9]:
## Keep just the assembled feature vector and the label column for modeling
final_data = output_features.select('features', 'yearly_amount_spent')
final_data.show(n=5)

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+-------------------+
|            features|yearly_amount_spent|
+--------------------+-------------------+
|[34.4972677251122...|  587.9510539684005|
|[31.9262720263601...|  392.2049334443264|
|[33.0009147556426...| 487.54750486747207|
|[34.3055566297555...|  581.8523440352177|
|[33.3306725236463...|  599.4060920457634|
+--------------------+-------------------+
only showing top 5 rows



                                                                                

In [10]:
## 70/30 train/test partition, seeded so reruns give the same split;
## then print summary statistics for each side (train first, test second).
train_data, test_data = final_data.randomSplit(weights=[0.7, 0.3], seed=seed)
for split_df in (train_data, test_data):
    split_df.describe().show()

                                                                                

+-------+-------------------+
|summary|yearly_amount_spent|
+-------+-------------------+
|  count|                374|
|   mean| 499.78515710377627|
| stddev|  79.76963867632105|
|    min| 256.67058229005585|
|    max|  765.5184619388373|
+-------+-------------------+

+-------+-------------------+
|summary|yearly_amount_spent|
+-------+-------------------+
|  count|                126|
|   mean|  497.9156378768501|
| stddev|  78.24705567534872|
|    min|  282.4712457199145|
|    max|  744.2218671047146|
+-------+-------------------+



In [16]:
## Configure the linear regression estimator and fit it on the training split
# NOTE(review): no regParam is passed, and Spark logs a warning about that at
# fit time. Decide whether regularization is wanted here.
lr = LinearRegression(
    featuresCol='features',
    labelCol='yearly_amount_spent',
    predictionCol='prediction',
)
lr_model = lr.fit(train_data)

21/11/15 20:55:36 WARN org.apache.spark.ml.util.Instrumentation: [1840ad2c] regParam is zero, which might cause numerical instability and overfitting.


In [19]:
## Score the fitted model on both splits (train first, then test); the
## returned summaries expose the regression metrics read in the next cell.
train_results, test_results = (
    lr_model.evaluate(split) for split in (train_data, test_data)
)

In [21]:
## Report R^2 for the train and test splits, one line each
print(
    f'Train R^2: {train_results.r2}',
    f'Test R^2: {test_results.r2}',
    sep='\n',
)

Train R^2: 0.3822976041111297
Test R^2: 0.38489596969993145


In [24]:
##### Mock Model Deployment #####
# Simulate unseen data by dropping the label and keeping only the feature
# vector from the held-out split.
unlabeled_data = test_data.select(['features'])
unlabeled_data.show(n=5)

+--------------------+
|            features|
+--------------------+
|[30.4925366965402...|
|[30.8364326747734...|
|[30.9716756438877...|
|[31.0472221394875...|
|[31.1280900496166...|
+--------------------+
only showing top 5 rows



In [25]:
## Run the fitted model over the unlabeled rows; transform() appends the
## 'prediction' column next to 'features'.
predictions = lr_model.transform(unlabeled_data)
predictions.show(n=5)

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.4925366965402...| 410.5278154435233|
|[30.8364326747734...|485.69919521812733|
|[30.9716756438877...| 430.3727231212241|
|[31.0472221394875...|409.48563848490505|
|[31.1280900496166...|    501.0623797646|
+--------------------+------------------+
only showing top 5 rows

