### Preparing HDFS
Using magic

Create input folder on HDFS if not exists

Copy from data from local

In [1]:
! hadoop fs -mkdir -p  /tmp/input
! hadoop fs -put   -p  ./../data-clean/*.csv             /tmp/input

In [2]:
// Load Clean Dataset into a DataFrame from HDFS after wrangling is completed
var df_clean = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/input/*.csv")

Intitializing Scala interpreter ...

Spark Web UI available at http://92d0ac546444:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1590763359188)
SparkSession available as 'spark'


df_clean: org.apache.spark.sql.DataFrame = [Price: string, MethodOfSale: string ... 11 more fields]


In [3]:
df_clean = df_clean.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("DistanceFromCBD",col("DistanceFromCBD").cast("Double"))
    .withColumn("MethodOfSale",col("MethodOfSale").cast("Int"))
    .withColumn("PropertyType",col("PropertyType").cast("Int"))
    .withColumn("Bathroom",col("Bathroom").cast("Int"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Latitude",col("Latitude").cast("Double"))
    .withColumn("Longtitude",col("Longtitude").cast("Double"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [4]:
df_clean.printSchema()

root
 |-- Price: double (nullable = true)
 |-- MethodOfSale: integer (nullable = true)
 |-- PropertyType: integer (nullable = true)
 |-- DistanceFromCBD: double (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- StreetName: string (nullable = true)



## Change attributes into vectors 

#### Transform Sale Date into a numeric value

In [5]:
df_clean = df_clean.withColumn("Date",unix_timestamp($"Date", "dd/mm/yyyy"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


#### Set FeatureHasher for Suburb, StreetName

In [6]:
import org.apache.spark.ml.feature.{FeatureHasher,OneHotEncoder,StandardScaler,VectorAssembler}

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

import org.apache.spark.ml.feature.{FeatureHasher, OneHotEncoder, StandardScaler, VectorAssembler}
hasher: org.apache.spark.ml.feature.FeatureHasher = featureHasher_161fe125e6e7


#### Set OneHotEncoders for PropertyType,  MethodOfSale 

In [7]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("MethodOfSale")
      .setOutputCol("m_sale_vec")

val pt_encoder = new OneHotEncoder()
      .setInputCol("PropertyType")
      .setOutputCol("pt_vec")

ms_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_eaecb9672c30
pt_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_658696ede5c3


#### Assemble  the columns and column vectors into a single column - "features"

In [8]:
val columns = Array("Price", "DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")


columns: Array[String] = Array(Price, DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Date, str_name_suburb_vec, m_sale_vec, pt_vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_c03bcb042059


#### Set StandardScaler

In [9]:
var scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("ScaledFeatures")
      .setWithStd(true).setWithMean(true)


scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_71054d0534a7


### Split Data into a Training and a Testing Set


In [10]:
import org.apache.spark.ml.feature.{VectorAssembler,StandardScaler}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._


def train_test_split(data: DataFrame) = {
    
     val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)
    
     (train, test)
}

import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
train_test_split: (data: org.apache.spark.sql.DataFrame)(org.apache.spark.sql.Dataset[org.apache.spark.sql.Row], org.apache.spark.sql.Dataset[org.apache.spark.sql.Row])


In [11]:

val (train, test) = train_test_split(df_clean)

train: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, MethodOfSale: int ... 11 more fields]
test: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: double, MethodOfSale: int ... 11 more fields]


### 1. Apply Linear Regression

In [None]:
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline



val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("Predicted Price")
    .setMaxIter(50)

#### Set Stages to transform all columns into feature

In [None]:
// add linear regression to stages
val lrStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            lr
)

In [None]:
//Construct the pipeline
val startTimeMillis = System.currentTimeMillis()

val lrPipe = new Pipeline().setStages(lrStages)

//We fit our DataFrame into the pipeline to generate a model
val lrModel = lrPipe.fit(train)

//Make predictions using the model and the test data
val predictions = lrModel.transform(test)

val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)

#### Prediction

In [None]:
predictions.columns

In [None]:
predictions.count()

In [None]:
predictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

In [None]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

val rmse = new RegressionEvaluator()
  .setLabelCol("Price")
  .setPredictionCol("Predicted Price")
  .setMetricName("rmse")

println("Root Mean Squared Error (RMSE) on test data = " + rmse.evaluate(predictions))


In [None]:

val r2 = new RegressionEvaluator()
  .setLabelCol("Price")
  .setPredictionCol("Predicted Price")
  .setMetricName("r2")

println("R^2 on test data = " + r2.evaluate(predictions))


#### Testing/Evaluation

Pipeline Model Transformer

In [None]:
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.ml.feature.{VectorAssembler, StandardScaler}
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.Pipeline


val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("Prediction")


val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10, 1, 0.1, 0.01, 0.001))
    .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
    .addGrid(lr.maxIter, Array(10000, 250000))
    .build()

val columns = Array("Price", "DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")


val lrStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            lr
)

val lrPipeline = new Pipeline()
    .setStages(lrStages)

val cvLR = new CrossValidator()
    .setEstimator(lrPipeline)
    .setEvaluator(new RegressionEvaluator()
    .setLabelCol("Price")
    .setPredictionCol("Prediction")
    .setMetricName("rmse"))
    .setEstimatorParamMaps(lrParamMap)
    .setNumFolds(5)
    .setParallelism(2)

val startTimeMillis = System.currentTimeMillis()

val cvLRModel = cvLR.fit(train)
val lrPredictionsAndPrice = cvLRModel
    .transform(test)
    .select("Price", "Prediction")

val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)

2020-05-29 14:43:29,941 WARN  [CrossValidator-thread-pool-0] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-05-29 14:43:29,942 WARN  [CrossValidator-thread-pool-0] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


In [None]:
lrPredictionsAndPrice.show()
val bestLRModel = cvLRModel.bestModel
    
println(bestLRModel.extractParamMap)

In [None]:
lrPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

In [None]:
import org.apache.spark.ml.evaluation
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression

// utilit function to train and evaluate model
// bug
def train_eval[M <: PredictionModel[Vector, M]](
    pipeline: Pipeline,
    paramMap: Array[ParamMap],
    train: DataFrame, 
    test: DataFrame) = {

    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new RegressionEvaluator()
      .setLabelCol("Price")
      .setPredictionCol("Prediction")
      .setMetricName("rmse"))
      .setEstimatorParamMaps(paramMap)
      .setNumFolds(5)
      .setParallelism(2)

    val cvModel = cv.fit(train)
    val predictions = cvModel.transform(test)
        
    println("Root Mean Squared Error (RMSE) on test data = " + rmse.evaluate(predictions))
    println("R^2 on test data = " + r2.evaluate(predictions))

    val bestModel = cvModel.bestModel
    
    println(bestModel.extractParamMap)
    
    bestModel
}


In [None]:

val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10, 1, 0.1, 0.01, 0.001))
    .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
    .addGrid(lr.maxIter, Array(10000, 250000))
    .build()

val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("features")
    .setPredictionCol("Predicted Price")

val lrStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            lr
)

val startTimeMillis = System.currentTimeMillis()
train_eval(new Pipeline().setStages(lrStages), lrParamMap, train, test)

val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)

### 2. Apply KNN

#### Training


Pipeline Estimator

#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

In [None]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("MethodOfSale")
      .setOutputCol("m_sale_vec")

df_featured = ms_encoder.transform(df_featured)
df_featured = df_featured.drop("MethodOfSale")

val pt_encoder = new OneHotEncoder()
      .setInputCol("PropertyType")
      .setOutputCol("pt_vec")

df_featured = pt_encoder.transform(df_featured)
df_featured = df_featured.drop("PropertyType")

df_featured.select("str_name_suburb_vec","m_sale_vec","pt_vec").show(false)

### 3. Apply Random Forest Regression

#### Training

Pipeline Estimator

When you transform a column in your dataframe using pyspark.ml.feature.StringIndexer extra meta-data gets stored in the dataframe that specifically marks the transformed feature as a categorical feature.

When you print the dataframe you will see a numeric value (which is an index that corresponds with one of your categorical values) and if you look at the schema you will see that your new transformed column is of type double. However, this new column you created with pyspark.ml.feature.StringIndexer.transform is not just a normal double column, it has extra meta-data associated with it that is very important. You can inspect this meta-data by looking at the metadata property of the appropriate field in your dataframe's schema (you can access the schema objects of your dataframe by looking at yourdataframe.schema)

This extra metadata has two important implications:

When you call .fit() when using a tree based model, it will scan the meta-data of your dataframe and recognize fields that you encoded as categorical with transformers such as pyspark.ml.feature.StringIndexer (as noted above there are other transformers that will also have this effect such as pyspark.ml.feature.VectorIndexer). Because of this, you DO NOT have to one-hot encode your features after you have transformed them with StringIndxer when using tree-based models in spark ML (however, you still have to perform one-hot encoding when using other models that do not naturally handle categoricals like linear regression, etc.).

Because this metadata is stored in the data frame, you can use pyspark.ml.feature.IndexToString to reverse the numeric indices back to the original categorical values (which are often strings) at any time.

**Build Random Forest model**

Specify maxDepth, maxBins, impurity, auto and seed parameters.
maxDepth: Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.

maxBins: Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node.

impurity:Criterion used for information gain calculation

auto:Automatically select the number of features to consider for splits at each tree node

seed:Use a random seed number , allowing to repeat the results



In [None]:
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.Pipeline

// Models hypoparameters
//val algorithm = Algo.Classification
//val impurity = Gini
//val maximumDepth = 3
//val treeCount = 20
//val featureSubsetStrategy = "auto"
val seed = 5043

estimator = RandomForestRegressor()
evaluator = RegressionEvaluator()
paramGrid = ParamGridBuilder().addGrid(estimator.numTrees, [2,3])\
                              .addGrid(estimator.maxDepth, [2,3])\
                              .addGrid(estimator.impurity, ['variance'])\
                              .addGrid(estimator.featureSubsetStrategy, ['sqrt'])\
                              .build()
pipeline = Pipeline(stages=[estimator])

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cvModel = crossval.fit(result)


val rf = new RandomForestRegressor()
  .setMaxBins(4)
  .setMaxDepth(2)
  .setNumTrees(10)
  .setFeatureSubsetStrategy("auto")
  .setSeed(seed)
  .setLabelCol("Price")
  .setFeaturesCol("features")
  .setPredictionCol("Predicted Price")

val rfStages = Array(
            hasher,
            ms_encoder, 
            pt_encoder,
            assembler,
            //scaler,
            rf
)

val rfPipe = new Pipeline().setStages(rfStages) 

val startTimeMillis = System.currentTimeMillis()
val rfModel = rfPipe.fit(train)

//Make predictions using the model and the test data
val rfPredictions = rfModel.transform(test)


val endTimeMillis = System.currentTimeMillis()
val durationSeconds = (endTimeMillis - startTimeMillis) / 1000
print("pipeline was executed "+durationSeconds)


In [None]:
params = [{p.name: v for p, v in m.items()} for m in cvModel.getEstimatorParamMaps()]
Th

#### Prediction

In [None]:
rfPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()

// this will add new columns rawPrediction, probability and prediction
val predictionDf = randomForestModel.transform(testData)
predictionDf.show(10)

In [None]:
val cv = new CrossValidator().setNumFolds(10).setEstimator(pipeline).
             setEvaluator(new BinaryClassificationEvaluator)
val cmModel = cv.fit(train)

val rfCVPredictions = cmModel.transform(test)

rfCVPredictions.withColumn("Predicted Price", round($"Predicted Price", 0)).select("Price","Predicted Price").show()


## Train/Test/Validation split: 60%/20%20%

shaffle before splitting
train model on train , tune on validation, test on test

#### Regression metrics


Mean squared error (MSE) is defined as the average of squared differences between the predicted outcome and the true outcome. 

R2 coefficient represents the proportion of variance in the outcome that our model is capable of predicting based on its features.

**Bias vs Variance**
Graph of Error (validation error and training error) versus training set size. They should converge


#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### References

Apache Spark (n.d.). _Spark ML Programming Guide._ Retrieved from https://spark.apache.org/docs/1.2.2/ml-guide.html

Gorczynski M. (2017). _Introduction to machine learning with spark and mllib (dataframe API)._ Retrieved from https://scalac.io/scala-spark-ml-machine-learning-introduction/

Hydrospheredata (2020). _Program creek. Scala Code Examples. Scaler_ Retrieved from https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler

Johnson S (2019). _From sckit-learn to Spark ML._ Retrieved from 
https://towardsdatascience.com/from-scikit-learn-to-spark-ml-f2886fb46852


Johnson S (2019). _Housing Prices - Spark ML Project_ Retrieved from https://github.com/scottdjohnson/HousingPricePredictions/blob/master/HousingPrices-SparkML.ipynb

Masri A. (2019). _FeatureTransformation._ Retrieved from 

https://towardsdatascience.com/apache-spark-mllib-tutorial-7aba8a1dce6e


Scala Doc (n.d.)  Retrieved from https://docs.scala-lang.org

Jen G. (2020) _FeatureHasher._ Retrieved from https://george-jen.gitbook.io/data-science-and-apache-spark/featurehasher


(2019) _Random Forest Classifier with Apache Spark_ Retireved from https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc


In [None]:
import org.apache.spark.ml.feature.FeatureHasher

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

var df_featured = hasher.transform(df_clean)

df_featured = df_featured.drop("StreetName").drop("Suburb")
df_featured.select("str_name_suburb_vec").show(false)