# Linear Regression Using PySpark and MLlib

1. Read a CSV into Spark
2. Do some trivial data wrangling with dataframes
3. Perform a linear regression

### 1. Read a CSV into Spark

In [152]:
# Import the pyspark sql functions necessary for Spark DataFrame operations 
from pyspark.sql.types import *
from pyspark.sql import Row

In [153]:
# Read the CSV file into RDD
rdd = sc.textFile('/resources/data/mllibdata/Sacramentorealestatetransactions.csv')
rdd.take(5)

['street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude',
 '3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879',
 '51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028',
 '2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839',
 '2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146']

### 2. Do some trivial data wrangling with dataframes

In [154]:
# Split dataset on the commas
rdd = rdd.map(lambda line: line.split(","))
rdd.take(2)

[['street',
  'city',
  'zip',
  'state',
  'beds',
  'baths',
  'sq__ft',
  'type',
  'sale_date',
  'price',
  'latitude',
  'longitude'],
 ['3526 HIGH ST',
  'SACRAMENTO',
  '95838',
  'CA',
  '2',
  '1',
  '836',
  'Residential',
  'Wed May 21 00:00:00 EDT 2008',
  '59222',
  '38.631913',
  '-121.434879']]

In [155]:
# Strip the header from the RDD
header = rdd.first()
rdd = rdd.filter(lambda line:line != header)
rdd.take(2)

[['3526 HIGH ST',
  'SACRAMENTO',
  '95838',
  'CA',
  '2',
  '1',
  '836',
  'Residential',
  'Wed May 21 00:00:00 EDT 2008',
  '59222',
  '38.631913',
  '-121.434879'],
 ['51 OMAHA CT',
  'SACRAMENTO',
  '95823',
  'CA',
  '3',
  '1',
  '1167',
  'Residential',
  'Wed May 21 00:00:00 EDT 2008',
  '68212',
  '38.478902',
  '-121.431028']]

In [156]:
# Map every line from the RDD to a Row of our soon to be DataFrame
df = rdd.map(lambda line: Row(street = line[0], city = line[1], zip=line[2], beds=line[4], baths=line[5], sqft=line[6], price=line[9])).toDF()
df.take(5)

[Row(baths='1', beds='2', city='SACRAMENTO', price='59222', sqft='836', street='3526 HIGH ST', zip='95838'),
 Row(baths='1', beds='3', city='SACRAMENTO', price='68212', sqft='1167', street='51 OMAHA CT', zip='95823'),
 Row(baths='1', beds='2', city='SACRAMENTO', price='68880', sqft='796', street='2796 BRANCH ST', zip='95815'),
 Row(baths='1', beds='2', city='SACRAMENTO', price='69307', sqft='852', street='2805 JANETTE WAY', zip='95815'),
 Row(baths='1', beds='2', city='SACRAMENTO', price='81900', sqft='797', street='6001 MCMAHON DR', zip='95824')]

In [157]:
# Or a little bit better visualization
df.show(5)

+-----+----+----------+-----+----+----------------+-----+
|baths|beds|      city|price|sqft|          street|  zip|
+-----+----+----------+-----+----+----------------+-----+
|    1|   2|SACRAMENTO|59222| 836|    3526 HIGH ST|95838|
|    1|   3|SACRAMENTO|68212|1167|     51 OMAHA CT|95823|
|    1|   2|SACRAMENTO|68880| 796|  2796 BRANCH ST|95815|
|    1|   2|SACRAMENTO|69307| 852|2805 JANETTE WAY|95815|
|    1|   2|SACRAMENTO|81900| 797| 6001 MCMAHON DR|95824|
+-----+----+----------+-----+----+----------------+-----+
only showing top 5 rows



### 3. Perform a linear regression

In [158]:
# Import the necessary modules in order to use MLlib to do a linear regression
import pyspark.mllib
import pyspark.mllib.regression
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql.functions import *

In [159]:
# Select only the subset of features I'm interested in
# I'm going to predict home price from the number of baths, beds, and square feet
df = df.select('price','baths','beds','sqft')
df.show(5)

+-----+-----+----+----+
|price|baths|beds|sqft|
+-----+-----+----+----+
|59222|    1|   2| 836|
|68212|    1|   3|1167|
|68880|    1|   2| 796|
|69307|    1|   2| 852|
|81900|    1|   2| 797|
+-----+-----+----+----+
only showing top 5 rows



In [160]:
# Remove those rows that have suspicious 0 values for any of the features we want to use for prediction
df = df[df.baths > 0]
df = df[df.beds > 0]
df = df[df.sqft > 0]
df.describe(['baths','beds','price','sqft']).show()

+-------+------------------+------------------+------------------+------------------+
|summary|             baths|              beds|             price|              sqft|
+-------+------------------+------------------+------------------+------------------+
|  count|               814|               814|               814|               814|
|   mean|1.9606879606879606|3.2444717444717446| 229448.3697788698|1591.1461916461917|
| stddev|0.6698038253879438|0.8521372615281976|119825.57606009026| 663.8419297942894|
|    min|                 1|                 1|            100000|              1000|
|    max|                 5|                 8|             99000|               998|
+-------+------------------+------------------+------------------+------------------+



In [161]:
# MLlib requires that our features be expressed with LabeledPoints. 
# The required format for a labeled point is a tuple of the response value and a vector of predictors. 
# We can call 'map' on df in order to return an RDD of LabeledPoints.
temp = df.map(lambda line:LabeledPoint(line[0],[line[1:]]))
temp.take(5)

[LabeledPoint(59222.0, [1.0,2.0,836.0]),
 LabeledPoint(68212.0, [1.0,3.0,1167.0]),
 LabeledPoint(68880.0, [1.0,2.0,796.0]),
 LabeledPoint(69307.0, [1.0,2.0,852.0]),
 LabeledPoint(81900.0, [1.0,2.0,797.0])]

In [162]:
# We'll be using Stochastic Gradient Descent and the scare footage of these houses is quite large 
#   in comparison to the number of bedrooms and bathrooms. 
# We'll need to scale the data first with Spark's 'StandardScaler.'
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

In [163]:
# In order to scale the data, we'll have to use an RDD.
features = df.map(lambda row: row[1:])
features.take(5)

[('1', '2', '836'),
 ('1', '3', '1167'),
 ('1', '2', '796'),
 ('1', '2', '852'),
 ('1', '2', '797')]

In [164]:
# Scale (Normalize) the data
standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
features_transform.take(5)

[DenseVector([1.493, 2.347, 1.2593]),
 DenseVector([1.493, 3.5206, 1.7579]),
 DenseVector([1.493, 2.347, 1.1991]),
 DenseVector([1.493, 2.347, 1.2834]),
 DenseVector([1.493, 2.347, 1.2006])]

In [165]:
# Put our labels and features back together as labeled points.
# Because the labels (prices) are in a DataFrame and the scaled features are in the new RDD we just created.
# I can put them together with 'zip' function, but I'll need the labels to be in an RDD first. 
# A simple mapping that grabs the zero element (price) from each row addresses this problem.
lab = df.map(lambda row: row[0])
lab.take(15)

['59222',
 '68212',
 '68880',
 '69307',
 '81900',
 '89921',
 '90895',
 '91002',
 '94905',
 '98937',
 '100309',
 '106250',
 '106852',
 '107502',
 '108750']

In [166]:
# Now the two RDDs can be put together with 'zip' function
transformedData = lab.zip(features_transform)
transformedData.take(5)

[('59222', DenseVector([1.493, 2.347, 1.2593])),
 ('68212', DenseVector([1.493, 3.5206, 1.7579])),
 ('68880', DenseVector([1.493, 2.347, 1.1991])),
 ('69307', DenseVector([1.493, 2.347, 1.2834])),
 ('81900', DenseVector([1.493, 2.347, 1.2006]))]

In [167]:
# Go back to the LabeledPoint structure before using MLlib
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))
transformedData.take(5)

[LabeledPoint(59222.0, [1.49297445326,2.34703972035,1.25933593899]),
 LabeledPoint(68212.0, [1.49297445326,3.52055958053,1.7579486134]),
 LabeledPoint(68880.0, [1.49297445326,2.34703972035,1.19908063091]),
 LabeledPoint(69307.0, [1.49297445326,2.34703972035,1.28343806223]),
 LabeledPoint(81900.0, [1.49297445326,2.34703972035,1.20058701361])]

In [168]:
# Split the data into training / testing subsets
trainingData, testingData = transformedData.randomSplit([.8, .2],seed=1234)

In [169]:
# Check the full data size
transformedData.count()

814

In [170]:
# Check the training data size
trainingData.count()

652

In [171]:
# Check the testing data size
testingData.count()

162

In [172]:
# Import linear regression with stochastic gradient descent and build a model. 
from pyspark.mllib.regression import LinearRegressionWithSGD

# The number of iterations is specified along with the step size and the data set.
linearModel = LinearRegressionWithSGD.train(trainingData, 1000, .2)

In [173]:
# Get coefficients and intercepts from the model.
linearModel.weights

DenseVector([15911.6446, 4526.9663, 68332.1903])

In [174]:
# Examining the first 10 rows from the testing dataset
testingData.take(10)

[LabeledPoint(100309.0, [2.98594890652,3.52055958053,1.36930187625]),
 LabeledPoint(124100.0, [2.98594890652,3.52055958053,2.41171870613]),
 LabeledPoint(148750.0, [2.98594890652,4.69407944071,2.21739533756]),
 LabeledPoint(150000.0, [1.49297445326,1.17351986018,1.14485085363]),
 LabeledPoint(161500.0, [2.98594890652,4.69407944071,2.3906293483]),
 LabeledPoint(166357.0, [1.49297445326,4.69407944071,2.94497818269]),
 LabeledPoint(168000.0, [2.98594890652,3.52055958053,2.22492725107]),
 LabeledPoint(178480.0, [2.98594890652,3.52055958053,1.78506350204]),
 LabeledPoint(181872.0, [1.49297445326,3.52055958053,1.73535287287]),
 LabeledPoint(182587.0, [4.47892335978,4.69407944071,2.78831438167])]

In [175]:
# Make a prediction on one of the points using our model.
linearModel.predict([1.49297445326,3.52055958053,1.73535287287])

158273.59605366364

In [176]:
# Evaluate the model with metrics available within the evaluation package for MLlib
from pyspark.mllib.evaluation import RegressionMetrics

In [177]:
# Need an RDD that's a tuple of predictions from our model and the original home values.
prediObserRDDin = trainingData.map(lambda row: (float(linearModel.predict(row.features[0])),row.label))
prediObserRDDin.take(5)

[(120433.83152581986, 59222.0),
 (159817.6124950933, 68212.0),
 (116316.45434865377, 68880.0),
 (122080.78239668628, 69307.0),
 (116419.38877808292, 81900.0)]

In [178]:
# Calling RegressionMetrics on this RDD builds a variable that contains a number of metrics. 
metrics = RegressionMetrics(prediObserRDDin)

In [179]:
# Check R squared
metrics.r2

0.466721671313607

In [180]:
# Do the same RDD mapping and RegressionMetrics for the test data
prediObserRDDout = testingData.map(lambda row: (float(linearModel.predict(row.features[0])),row.label))
metrics = RegressionMetrics(prediObserRDDout)
prediObserRDDout.take(5)

[(157016.20856897274, 100309.0),
 (228246.8337339457, 124100.0),
 (220280.77716580936, 148750.0),
 (107298.33006098008, 150000.0),
 (232118.2365501618, 161500.0)]

In [181]:
# Check Root Mean Squared Error (RMSE)
# More info about the Spark evaluation metrics can be found in the documentation for the MLlib evaluation module: 
#   https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#module-pyspark.mllib.evaluation
metrics.rootMeanSquaredError

85023.27994408192

In [182]:
# The End#