### Preparing HDFS
Using magic

Create input folder on HDFS if not exists

Copy from data from local

In [1]:
! hadoop fs -mkdir -p  /tmp/input
! hadoop fs -put   -p  ./../data-clean/*.csv             /tmp/input

put: `/tmp/input/cleanMelbourneData.csv': File exists




## Load Clean Dataset into a DataFrame from HDFS 


In [44]:
// Load Clean Dataset into a DataFrame from HDFS after wrangling is completed
var df_clean = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/input/*.csv")

df_clean: org.apache.spark.sql.DataFrame = [Price: string, MethodOfSale: string ... 11 more fields]


In [45]:
df_clean = df_clean.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("DistanceFromCBD",col("DistanceFromCBD").cast("Double"))
    .withColumn("MethodOfSale",col("MethodOfSale").cast("Int"))
    .withColumn("PropertyType",col("PropertyType").cast("Int"))
    .withColumn("Bathroom",col("Bathroom").cast("Int"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Latitude",col("Latitude").cast("Double"))
    .withColumn("Longtitude",col("Longtitude").cast("Double"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [46]:
df_clean.printSchema()

root
 |-- Price: double (nullable = true)
 |-- MethodOfSale: integer (nullable = true)
 |-- PropertyType: integer (nullable = true)
 |-- DistanceFromCBD: double (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- StreetName: string (nullable = true)



## Change attributes into vectors 

#### Transform Sale Date into a numeric value

In [47]:
df_clean = df_clean.withColumn("Date",unix_timestamp($"Date", "dd/mm/yyyy"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


#### Set FeatureHasher for Suburb, StreetName

In [6]:
import org.apache.spark.ml.feature.{FeatureHasher,OneHotEncoder,StandardScaler,VectorAssembler}

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

import org.apache.spark.ml.feature.{FeatureHasher, OneHotEncoder, StandardScaler, VectorAssembler}
hasher: org.apache.spark.ml.feature.FeatureHasher = featureHasher_c52632897fac


#### Set OneHotEncoders for PropertyType,  MethodOfSale 

In [7]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("MethodOfSale")
      .setOutputCol("m_sale_vec")

val pt_encoder = new OneHotEncoder()
      .setInputCol("PropertyType")
      .setOutputCol("pt_vec")

ms_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_84595a7a6062
pt_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_c0f4033f6254


#### Assemble  the columns and column vectors into a single column - "features"

In [8]:
val columns = Array("Price", "DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")


columns: Array[String] = Array(Price, DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Date, str_name_suburb_vec, m_sale_vec, pt_vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_ec554a896ced


#### Set StandardScaler

In [9]:
var scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("ScaledFeatures")
      .setWithStd(true).setWithMean(true)


scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_ecc6c9d022aa


### Split Data into a Training and a Testing Set


In [10]:
import org.apache.spark.ml.feature.{VectorAssembler,StandardScaler}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._


def train_test_split(data: DataFrame) = {
    
     val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)
    
     (train, test)
}

import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
train_test_split: (data: org.apache.spark.sql.DataFrame)(org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.Dataset[org.apache.spark.sql.Row])


In [11]:

val (train, test) = train_test_split(df_clean)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, MethodOfSale: int ... 11 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, MethodOfSale: int ... 11 more fields]


### 1. Apply Linear Regression

In [30]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline



val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("Predicted Price")
    .setMaxIter(50)

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline
lr: org.apache.spark.ml.regression.LinearRegression = linReg_bd364196d01b


#### Set Stages to transform all columns into feature

In [31]:
// add linear regression to stages
val lrStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            lr
)

lrStages: Array[org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable}}] = Array(featureHasher_c52632897fac, oneHot_84595a7a6062, oneHot_c0f4033f6254, vecAssembler_ec554a896ced, linReg_bd364196d01b)


In [32]:
//Construct the pipeline
val startTimeMillis = System.currentTimeMillis()

val lrPipe = new Pipeline().setStages(lrStages)

//We fit our DataFrame into the pipeline to generate a model
val lrModel = lrPipe.fit(train)

//Make predictions using the model and the test data
val predictions = lrModel.transform(test)

val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)

2020-05-29 22:37:16,724 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-05-29 22:37:16,725 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
pipeline was executed 131

startTimeMillis: Long = 1590791827522
lrPipe: org.apache.spark.ml.Pipeline = pipeline_b442e1ff4d19
lrModel: org.apache.spark.ml.PipelineModel = pipeline_b442e1ff4d19
predictions: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 16 more fields]
endTimeMillis: Long = 1590791958860
durationSeconds: Long = 131


#### Prediction

In [33]:
predictions.columns

res11: Array[String] = Array(Price, MethodOfSale, PropertyType, DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Suburb, Date, StreetName, str_name_suburb_vec, m_sale_vec, pt_vec, features, Predicted Price)


In [34]:
predictions.count()

res12: Long = 3108


In [35]:
predictions

res13: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 16 more fields]


In [36]:
predictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

+--------+---------------+
|   Price|Predicted Price|
+--------+---------------+
|170000.0|       170064.0|
|280000.0|       280013.0|
|280500.0|       280579.0|
|283000.0|       283663.0|
|290000.0|       290036.0|
|300000.0|       300055.0|
|300000.0|       299881.0|
|305000.0|       305248.0|
|310000.0|       310135.0|
|316000.0|       315935.0|
|320000.0|       320027.0|
|320000.0|       320035.0|
|320000.0|       320560.0|
|320000.0|       319303.0|
|325000.0|       325090.0|
|333000.0|       332989.0|
|340000.0|       342929.0|
|345000.0|       345065.0|
|348000.0|       348240.0|
|350000.0|       350108.0|
+--------+---------------+
only showing top 20 rows



In [37]:
def evaluate ( predictions: DataFrame, metric: String) = {
    val eval =  new RegressionEvaluator()
       .setLabelCol("Price")
       .setPredictionCol("Predicted Price")
       .setMetricName(metric)

println("Root Mean Squared Error "+  metric.toUpperCase()+" on test data = " + eval.evaluate(predictions))

    
}

evaluate: (predictions: org.apache.spark.sql.DataFrame, metric: String)Unit


In [38]:
evaluate(predictions,"rmse")


Root Mean Squared Error RMSE on test data = 975.1264181243374


In [39]:
evaluate(predictions,"r2")

Root Mean Squared Error R2 on test data = 0.9999977135307482


#### Testing/Evaluation/ Parameter Tuning

Cross-validation

<span style='color:red'> does not finish run in reasonable time</span>

In [None]:
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline


val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("Prediction")


val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10, 1, 0.1, 0.01, 0.001))
    .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
    .addGrid(lr.maxIter, Array(10000, 250000))
    .build()

val columns = Array("Price", "DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")


val lrStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            lr
)

val lrPipeline = new Pipeline()
    .setStages(lrStages)

val cvLR = new CrossValidator()
    .setEstimator(lrPipeline)
    .setEvaluator(new RegressionEvaluator()
    .setLabelCol("Price")
    .setPredictionCol("Prediction")
    .setMetricName("rmse"))
    .setEstimatorParamMaps(lrParamMap)
    .setNumFolds(5)
    .setParallelism(2)

val startTimeMillis = System.currentTimeMillis()

val cvLRModel = cvLR.fit(train)
val lrPredictionsAndPrice = cvLRModel
    .transform(test)


val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)

In [None]:
lrPredictionsAndPrice.show()
val bestLRModel = cvLRModel.bestModel
    
println(bestLRModel.extractParamMap)

In [None]:
lrPredictionsAndPrice.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

### 2. Apply KNN

#### Training


Pipeline Estimator

#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### 3. Apply Random Forest Regression

#### Build Random Forest model

Specify maxDepth, maxBins, impurity, auto and seed parameters.

**maxDepth** -- Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.

**imaxBins** -- Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node.

**impurity** -- Criterion used for information gain calculation

**iauto** -- Automatically select the number of features to consider for splits at each tree node

**iseed** -- Use a random seed number , allowing to repeat the results



In [18]:
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.Pipeline

val seed = 5043

val rf = new RandomForestRegressor()
  .setMaxBins(4)
  .setMaxDepth(2)
  .setNumTrees(10)
  .setFeatureSubsetStrategy("auto")
  .setSeed(seed)
  .setLabelCol("Price")
  .setFeaturesCol("features")
  .setPredictionCol("Predicted Price")

val rfStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            rf
)

val rfPipe = new Pipeline().setStages(rfStages) 

val startTimeMillis = System.currentTimeMillis()
val rfModel = rfPipe.fit(train)

//Make predictions using the model and the test data
val rfPredictions = rfModel.transform(test)


val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)


pipeline was executed 386

import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.Pipeline
seed: Int = 5043
rf: org.apache.spark.ml.regression.RandomForestRegressor = rfr_7e348110766e
rfStages: Array[org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable{def copy(extra: org.apache.spark.ml.param.ParamMap): org.apache.spark.ml.PipelineStage with org.apache.spark.ml.util.DefaultParamsWritable}}] = Array(featureHasher_c52632897fac, oneHot_84595a7a6062, oneHot_c0f4033f6254, vecAssembler_ec554a896ced, rfr_7e348110766e)
rfPipe: org.apache.spark.ml.Pipeline = pipeline_7e4d1...

In [42]:
rfPredictions.columns

res19: Array[String] = Array(Price, MethodOfSale, PropertyType, DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Suburb, Date, StreetName, str_name_suburb_vec, m_sale_vec, pt_vec, features, Predicted Price)


In [43]:
rfPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

+--------+---------------+
|   Price|Predicted Price|
+--------+---------------+
|170000.0|      1043521.0|
|280000.0|      1072249.0|
|280500.0|       745230.0|
|283000.0|       766050.0|
|290000.0|      1121635.0|
|300000.0|       745230.0|
|300000.0|      1107235.0|
|305000.0|      1107235.0|
|310000.0|      1057849.0|
|316000.0|      1107235.0|
|320000.0|      1121635.0|
|320000.0|      1121635.0|
|320000.0|       824513.0|
|320000.0|      1121635.0|
|325000.0|       784711.0|
|333000.0|       745230.0|
|340000.0|       766050.0|
|345000.0|      1107235.0|
|348000.0|      1022987.0|
|350000.0|       973601.0|
+--------+---------------+
only showing top 20 rows



In [28]:
evaluate(rfPredictions,"rmse")


Root Mean Squared Error RMSE on test data = 434957.1975603789


In [29]:
evaluate(rfPredictions,"r2")

Root Mean Squared Error R2 on test data = 0.5450783935759977


#### Testing/Evaluation/ Parameter Tuning

Cross-validation

<span style='color:red'> to do: 
    
    * finish implementation for Cross-validation 
    * check if finish run in reasonable time
</span>

In [63]:
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline


// Models hypoparameters
val numTrees = Seq(5,10,15)
val maxBins = Seq(2,5,10)
val maxDepth = Seq(2,3,5)
val impurity = Seq("gini","entropy","variance",)
val featureSubsetStrategy = Seq("sqrt")

val rf = new RandomForestRegressor()
  .setLabelCol("Price")
  .setFeaturesCol("features")
  .setPredictionCol("Predicted Price")


val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10, 1, 0.1, 0.01, 0.001))
    .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
    .addGrid(lr.maxIter, Array(10000, 250000))
    .build()

val rfParamMap = new ParamGridBuilder()
                      .addGrid(rf.numTrees, numTrees)
                      .addGrid(rf.maxDepth, maxDepth)
                      .addGrid(rf.impurity, impurity)
                      .addGrid(rf.maxBins, maxBins)
                      .addGrid(rf.featureSubsetStrategy, featureSubsetStrategy)
                      .build()


val columns = Array("Price", "DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")


val rfStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            rf
)

val rfPipeline = new Pipeline()
    .setStages(rfStages)

val cvRF = new CrossValidator()
    .setEstimator(rfPipeline)
    .setEvaluator(new RegressionEvaluator()
    .setLabelCol("Price")
    .setPredictionCol("Prediction")
    .setMetricName("rmse"))
    .setEstimatorParamMaps(rfParamMap)
    .setNumFolds(5)
    .setParallelism(2)

val startTimeMillis = System.currentTimeMillis()

val cvRFModel = cvRF.fit(train)
val rFPredictionsAndPrice = cvRFModel
    .transform(test)


val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)


java.lang.IllegalArgumentException:  rfr_c20e56154079 parameter impurity given invalid value entropy.

In [None]:
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]


#### Prediction

In [None]:
rfPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

// this will add new columns rawPrediction, probability and prediction
val predictionDf = randomForestModel.transform(testData)
predictionDf.show(10)

In [None]:
val cv = new CrossValidator().setNumFolds(10).setEstimator(pipeline).
             setEvaluator(new BinaryClassificationEvaluator)
val cmModel = cv.fit(train)

val rfCVPredictions = cmModel.transform(test)

rfCVPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()


#### Regression metrics


Mean squared error (MSE) is defined as the average of squared differences between the predicted outcome and the true outcome. 

R2 coefficient represents the proportion of variance in the outcome that our model is capable of predicting based on its features.

**Bias vs Variance**
Graph of Error (validation error and training error) versus training set size. They should converge


#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### References

Apache Spark (n.d.). _Spark ML Programming Guide._ Retrieved from https://spark.apache.org/docs/1.2.2/ml-guide.html

Gorczynski M. (2017). _Introduction to machine learning with spark and mllib (dataframe API)._ Retrieved from https://scalac.io/scala-spark-ml-machine-learning-introduction/

Hydrospheredata (2020). _Program creek. Scala Code Examples. Scaler_ Retrieved from https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler

Johnson S (2019). _From sckit-learn to Spark ML._ Retrieved from 
https://towardsdatascience.com/from-scikit-learn-to-spark-ml-f2886fb46852


Johnson S (2019). _Housing Prices - Spark ML Project_ Retrieved from https://github.com/scottdjohnson/HousingPricePredictions/blob/master/HousingPrices-SparkML.ipynb

Masri A. (2019). _FeatureTransformation._ Retrieved from 

https://towardsdatascience.com/apache-spark-mllib-tutorial-7aba8a1dce6e


Scala Doc (n.d.)  Retrieved from https://docs.scala-lang.org

Jen G. (2020) _FeatureHasher._ Retrieved from https://george-jen.gitbook.io/data-science-and-apache-spark/featurehasher


(2019) _Random Forest Classifier with Apache Spark_ Retireved from https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc


In [None]:
import org.apache.spark.ml.feature.FeatureHasher

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

var df_featured = hasher.transform(df_clean)

df_featured = df_featured.drop("StreetName").drop("Suburb")
df_featured.select("str_name_suburb_vec").show(false)