### Preparing HDFS
Using magic

Create input folder on HDFS if not exists

Copy from data from local

In [1]:
! hadoop fs -mkdir -p  /tmp/input
! hadoop fs -put   -p  ./../data-clean/*.csv             /tmp/input         

put: `/tmp/input/cleanMelbourneData.csv': File exists




 ### Check Spark Parameters

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf,SparkContext}

val cs = spark.sparkContext.getConf
sc.getConf.getAll.foreach { println }


Intitializing Scala interpreter ...

Spark Web UI available at http://6228915d44c3:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1590929770377)
SparkSession available as 'spark'


(spark.driver.port,42725)
(spark.repl.class.uri,spark://6228915d44c3:42725/classes)
(spark.repl.class.outputDir,/tmp/tmp7t3xap36)
(spark.executor.id,driver)
(spark.app.id,local-1590929770377)
(spark.app.name,spylon-kernel)
(spark.executor.memory,6g)
(spark.driver.memory,6g)
(spark.rdd.compress,True)
(spark.driver.host,6228915d44c3)
(spark.serializer.objectStreamReset,100)
(spark.master,local[*])
(spark.submit.deployMode,client)
(spark.ui.showConsoleProgress,true)


import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
cs: org.apache.spark.SparkConf = org.apache.spark.SparkConf@76a716c0


In [3]:
// Load Clean Dataset into a DataFrame from HDFS after wrangling is completed
var df_clean = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/input/*.csv")
df_clean.cache()

df_clean: org.apache.spark.sql.DataFrame = [Price: string, MethodOfSale: string ... 11 more fields]
res1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: string, MethodOfSale: string ... 11 more fields]


In [4]:
df_clean = df_clean.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("DistanceFromCBD",col("DistanceFromCBD").cast("Double"))
    .withColumn("MethodOfSale",col("MethodOfSale").cast("Int"))
    .withColumn("PropertyType",col("PropertyType").cast("Int"))
    .withColumn("Bathroom",col("Bathroom").cast("Int"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Latitude",col("Latitude").cast("Double"))
    .withColumn("Longtitude",col("Longtitude").cast("Double"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [5]:
df_clean.cache()
df_clean.printSchema()

root
 |-- Price: double (nullable = true)
 |-- MethodOfSale: integer (nullable = true)
 |-- PropertyType: integer (nullable = true)
 |-- DistanceFromCBD: double (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- StreetName: string (nullable = true)



### Construct vectors from attributes
#### Transform Sale Date into a numeric value

In [7]:
df_clean = df_clean.withColumn("Date",unix_timestamp($"Date", "dd/mm/yyyy"))

df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


#### Set FeatureHasher for Suburb, StreetName

In [8]:
import org.apache.spark.ml.feature.{FeatureHasher,OneHotEncoder,StandardScaler,VectorAssembler}

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

import org.apache.spark.ml.feature.{FeatureHasher, OneHotEncoder, StandardScaler, VectorAssembler}
hasher: org.apache.spark.ml.feature.FeatureHasher = featureHasher_f692100607ed


#### Set OneHotEncoders for PropertyType, MethodOfSale

In [9]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("MethodOfSale")
      .setOutputCol("m_sale_vec")

val pt_encoder = new OneHotEncoder()
      .setInputCol("PropertyType")
      .setOutputCol("pt_vec")


ms_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_2a66214ad440
pt_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_c35c6735200d


#### Assemble the columns and column vectors into a single column - "features"

In [10]:
val columns = Array("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")

val dd = hasher.transform(df_clean).drop("StreetName","Suburb")


columns: Array[String] = Array(DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Date, str_name_suburb_vec, m_sale_vec, pt_vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_2ecf1e388057
dd: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 10 more fields]


In [11]:
val mm = ms_encoder.transform(dd).drop("MethodOfSale")


mm: org.apache.spark.sql.DataFrame = [Price: double, PropertyType: int ... 10 more fields]


In [12]:
val pt = pt_encoder.transform(mm).drop("PropertyType")


pt: org.apache.spark.sql.DataFrame = [Price: double, DistanceFromCBD: double ... 10 more fields]


In [13]:
val feature_ds = assembler.transform(pt).drop("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")
feature_ds.cache()

feature_ds: org.apache.spark.sql.DataFrame = [Price: double, features: vector]
res4: feature_ds.type = [Price: double, features: vector]


In [14]:
feature_ds.show()

+---------+--------------------+
|    Price|            features|
+---------+--------------------+
|1480000.0|(262163,[0,1,2,3,...|
|1035000.0|(262163,[0,1,2,4,...|
|1465000.0|(262163,[0,1,2,4,...|
| 850000.0|(262163,[0,1,2,3,...|
|1600000.0|(262163,[0,1,2,3,...|
| 941000.0|(262163,[0,1,2,4,...|
|1876000.0|(262163,[0,1,2,4,...|
|1636000.0|(262163,[0,1,2,3,...|
|1097000.0|(262163,[0,1,2,3,...|
|1350000.0|(262163,[0,1,2,3,...|
|1172500.0|(262163,[0,1,2,3,...|
|1310000.0|(262163,[0,1,2,3,...|
|1200000.0|(262163,[0,1,2,3,...|
|1176500.0|(262163,[0,1,2,3,...|
| 955000.0|(262163,[0,1,2,4,...|
| 890000.0|(262163,[0,1,2,3,...|
|1330000.0|(262163,[0,1,2,3,...|
|1090000.0|(262163,[0,1,2,3,...|
|1100000.0|(262163,[0,1,2,3,...|
|1315000.0|(262163,[0,1,2,4,...|
+---------+--------------------+
only showing top 20 rows



#### Set StandardScaler

In [15]:
val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true).setWithMean(true)


scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_95f031774569


### Split Data into a Training and a Testing Set

In [16]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._


def train_test_split(data: DataFrame) = {
    
     val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)
    
     (train, test)
}

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
train_test_split: (data: org.apache.spark.sql.DataFrame)(org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.Dataset[org.apache.spark.sql.Row])


In [17]:

val (train, test) = train_test_split(feature_ds)
train.cache()
test.cache()

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, features: vector]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, features: vector]
res6: test.type = [Price: double, features: vector]


### 1. Apply Linear Regression


In [18]:
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
    .setLabelCol("Price")
    .setMaxIter(1500)
    .setRegParam(0.1)

import org.apache.spark.ml.regression.LinearRegression
lr: org.apache.spark.ml.regression.LinearRegression = linReg_f405be71f6ac


In [19]:
lr.explainParams()

res7: String =
aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty (default: 0.0)
epsilon: The shape parameter to control the amount of robustness. Must be > 1.0. (default: 1.35)
featuresCol: features column name (default: features)
fitIntercept: whether to fit an intercept term (default: true)
labelCol: label column name (default: label, current: Price)
loss: The loss function to be optimized. Supported options: squaredError, huber. (Default squaredError) (default: squaredError)
maxIter: maximum number of iterations (>= 0) (default: 100, current: 1500)
predictionCol: prediction column name (default: prediction)
regParam: ...

#### Define time function

In [20]:
def time[R](block: => R): R = {
  val t0 = System.nanoTime()
  val result = block    // call-by-name
  val t1 = System.nanoTime()
  println("Elapsed time: " + (t1 - t0)/1000000000 + " s")
  result
 }

time: [R](block: => R)R


#### Define predictions function

In [21]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.Pipeline


def predictions[R <: Predictor[Vector, R, M],
                M <: PredictionModel[Vector, M]](
    predictor: Predictor[Vector, R, M],
    train: DataFrame, 
    test: DataFrame) = {
    
    val pipeline = new Pipeline()
      .setStages(Array(scaler, predictor))
     
    val result =pipeline.fit(train).transform(test)
    result

}

import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.Pipeline
predictions: [R <: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], M <: org.apache.spark.ml.PredictionModel[org.apache.spark.ml.linalg.Vector,M]](predictor: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], train: org.apache.spark.sql.DataFrame, test: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


#### Prediction

In [22]:
val lrPredictions = time{predictions(lr, train, test)}
lrPredictions.cache()

2020-05-31 12:57:14,304 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-05-31 12:57:14,305 WARN  [Thread-4] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
Elapsed time: 239 s


lrPredictions: org.apache.spark.sql.DataFrame = [Price: double, features: vector ... 2 more fields]
res8: lrPredictions.type = [Price: double, features: vector ... 2 more fields]


In [23]:
lrPredictions.columns

res9: Array[String] = Array(Price, features, scaledFeatures, prediction)


In [24]:
lrPredictions.withColumn("prediction", round($"prediction", 0)).select("Price","prediction").show()

+--------+----------+
|   Price|prediction|
+--------+----------+
|170000.0|  359905.0|
|280000.0| -121507.0|
|280500.0|  411023.0|
|283000.0| -244894.0|
|290000.0|  671624.0|
|300000.0|  576442.0|
|300000.0|  120603.0|
|305000.0|  341858.0|
|310000.0|   -6485.0|
|316000.0|  -33210.0|
|320000.0|  311037.0|
|320000.0|  602258.0|
|320000.0|  468252.0|
|320000.0|   99955.0|
|325000.0|  295665.0|
|333000.0| -229038.0|
|340000.0| -324449.0|
|345000.0|  817134.0|
|348000.0| -249289.0|
|350000.0| 3421399.0|
+--------+----------+
only showing top 20 rows



#### Evaluation

In [25]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

def evaluate ( predictions: DataFrame, metric: String) = {
    val eval =  new RegressionEvaluator()
       .setLabelCol("Price")
       .setPredictionCol("prediction")
       .setMetricName(metric)
println("Root Mean Squared Error "+  metric.toUpperCase()+" on test data = " + eval.evaluate(predictions))
    
}

import org.apache.spark.ml.evaluation.RegressionEvaluator
evaluate: (predictions: org.apache.spark.sql.DataFrame, metric: String)Unit


#### Regression metrics

**Mean squared error (MSE)** -- the average of squared differences between the predicted outcome and the true outcome.

**R2 coefficient** -- the proportion of variance in the outcome that our model is capable of predicting based on its features.


In [26]:
evaluate(lrPredictions,"rmse")

Root Mean Squared Error RMSE on test data = 429164.39732878417


In [27]:
evaluate(lrPredictions,"r2")

Root Mean Squared Error R2 on test data = 0.5571150795620388


#### Testing/Evaluation/ Parameter Tuning

Cross-validation

<span style="color:red">
TO DO: does not finish run in reasonable time
</span>

In [None]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}



def train_eval[R <: Predictor[Vector, R, M],
               M <: PredictionModel[Vector, M]](
    predictor: Predictor[Vector, R, M],
    paramMap: Array[ParamMap],
    train: DataFrame, 
    test: DataFrame) = {

    val pipeline = new Pipeline()
      .setStages(Array(scaler, predictor))
    
    val cv = new CrossValidator()
        .setEstimator(pipeline)
        .setEvaluator(new RegressionEvaluator()
        .setLabelCol("Price")
        .setPredictionCol("prediction")
        .setMetricName("rmse"))
        .setEstimatorParamMaps(paramMap)
        .setNumFolds(5)
        .setParallelism(2)

    val cvModel = cv.fit(train)
    val predictions = cvModel.transform(test)
    
    predictions.cache()
    evaluate(predictions,"rmse")
    evaluate(predictions,"r2")
    
    val bestModel = cvModel.bestModel
    
    println(bestModel.extractParamMap)
    
    bestModel
}


#### Parameter Tuning

In [None]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("prediction")

val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10,1, 0.1, 0.01, 0.001))
    .addGrid(lr.elasticNetParam, Array(0.0,0.5, 1.0))
    .addGrid(lr.maxIter, Array(10, 50, 100, 500, 800))
    .build()

val t0 = System.nanoTime()
val bestLRModel = train_eval(lr, lrParamMap, train, test)
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/(1000000000) + " s")



### 2. Apply KNN

#### Training


Pipeline Estimator

#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### 3. Apply Random Forest Regression

**Build Random Forest model**
Specify maxDepth, maxBins, auto and seed parameters.

**maxDepth** -- Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.

**maxBins** -- Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node.

**auto** -- Automatically select the number of features to consider for splits at each tree node

**seed** -- Use a random seed number , allowing to repeat the results


If the number of trees is 1, then no bootstrapping is used at all. However, if the number of trees is > 1, then the bootstrapping is accomplished. Where, the parameter featureSubsetStrategy signifies the number of features to be considered for splits at each node. The supported values of featureSubsetStrategy are "auto", "all", "sqrt", "log2" and "on third". The supported numerical values, on the other hand, are (0.0-1.0] and [1-n]. However, if featureSubsetStrategy is chosen as "auto", the algorithm chooses the best feature subset strategy automatically


If the numTrees == 1, the featureSubsetStrategy is set to be "all". However, if the numTrees > 1 (i.e., forest), featureSubsetStrategy is set to be "onethird" for regression


Moreover, if a real value "n" is in the range (0, 1.0] is set, n*number_of_features is used consequently. However, if an integer value "n" is in the range (1, the number of features) is set, only n features are used alternatively


The parameter categoricalFeaturesInfo which is a map is used for storing arbitrary of categorical features. An entry (n -> k) indicates that feature n is categorical with k categories indexed from 0: {0, 1,...,k-1}
The impurity criterion used for information gain calculation. The supported values are “gini" and “variance”. The former is the only supported value for classification. The latter is used for regression


The maxDepth is the maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means 1 internal node + 2 leaf nodes). However, the suggested value is 4 to get a better result


The maxBins signifies the maximum number of bins used for splitting the features; where the suggested value is 100 to get better results


Finally, the random seed is used for bootstrapping and choosing feature subsets to avoid the random nature of the results.

In [28]:
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
//import org.apache.spark.ml.Pipeline

val seed = 5043

val rf = new RandomForestRegressor()
  .setMaxBins(100)
  .setMaxDepth(4)
  .setNumTrees(10)
  .setFeatureSubsetStrategy("onethird")
  .setSeed(seed)
  .setLabelCol("Price")

import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
seed: Int = 5043
rf: org.apache.spark.ml.regression.RandomForestRegressor = rfr_361d524d0dd4


In [None]:
val rfPredictions = time{predictions(rf, train, test)}
rfPredictions.cache()



In [None]:
rfPredictions.columns

In [None]:
rfPredictions.withColumn("prediction", round($"prediction", 0)).select("Price","prediction").show()

#### Regression metrics


In [None]:
evaluate(rfPredictions,"rmse")

In [None]:
evaluate(rfPredictions,"r2")

#### Testing/Evaluation/ Parameter Tuning

Cross-validation
<span style="color:red">
TO DO: 
* finish implementation for Cross-validation 
* check if finish run in reasonable time
</span>

In [None]:
import org.apache.spark.ml.regression.RandomForestRegressor

// Models hypoparameters
val numTrees = Seq(5)//,10,15)
val maxBins = Seq(2)//,5,10)
val maxDepth = Seq(2)//,3,5)
//val impurity = Seq("gini")//,"entropy","variance",)
val featureSubsetStrategy = Seq("sqrt")

val rf = new RandomForestRegressor()
  .setLabelCol("Price")
  .setFeaturesCol("features")
  .setPredictionCol("prediction")


val rfParamMap = new ParamGridBuilder()
  .addGrid(rf.numTrees, numTrees)
  .addGrid(rf.maxDepth, maxDepth)
  .addGrid(rf.maxBins, maxBins)
  .addGrid(rf.featureSubsetStrategy, featureSubsetStrategy)
  .build()

val t0 = System.nanoTime()
val best_model = train_eval(rf, lrParamMap, train, test)
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/(1000000000) + " s")


#### Prediction

In [None]:
rfPredictions.withColumn("prediction", round($"prediction", 0)).select("Price","prediction").show()

// this will add new columns rawPrediction, probability and prediction
val predictionDf = randomForestModel.transform(testData)
predictionDf.show(10)

#### Tuning

#### Bias vs Variance Graph of Error (validation error and training error) versus training set size. 


<span style="color:red">
TO DO: 
produce graph -- validation error and training error should converge
</span>


### References

Apache Spark (n.d.). Spark ML Programming Guide. Retrieved from https://spark.apache.org/docs/1.2.2/ml-guide.html

Gorczynski M. (2017). Introduction to machine learning with spark and mllib (dataframe API). Retrieved from https://scalac.io/scala-spark-ml-machine-learning-introduction/

Hydrospheredata (2020). Program creek. Scala Code Examples. Scaler Retrieved from https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler

Jen G. (2020) FeatureHasher. Retrieved from https://george-jen.gitbook.io/data-science-and-apache-spark/featurehasher

Johnson S (2019). From sckit-learn to Spark ML. Retrieved from https://towardsdatascience.com/from-scikit-learn-to-spark-ml-f2886fb46852

Johnson S (2019). Housing Prices - Spark ML Project Retrieved from https://github.com/scottdjohnson/HousingPricePredictions/blob/master/HousingPrices-SparkML.ipynb

Masri A. (2019). FeatureTransformation. Retrieved from
https://towardsdatascience.com/apache-spark-mllib-tutorial-7aba8a1dce6e

Scala Doc (n.d.) Retrieved from https://docs.scala-lang.org


(2019) Random Forest Classifier with Apache Spark Retireved from https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc