### Preparing HDFS
Using magic

Create input folder on HDFS if not exists

Copy from data from local

In [21]:
! hadoop fs -mkdir -p  /tmp/output
! hadoop fs -put   -p  /../data-clean/*.csv             

put: `.': No such file or directory




In [22]:
// Load Clean Dataset into a DataFrame from HDFS after wrangling is completed
var df_clean = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/output/*.csv")

df_clean: org.apache.spark.sql.DataFrame = [Price: string, MethodOfSale: string ... 11 more fields]


In [23]:
df_clean = df_clean.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("DistanceFromCBD",col("DistanceFromCBD").cast("Double"))
    .withColumn("MethodOfSale",col("MethodOfSale").cast("Int"))
    .withColumn("PropertyType",col("PropertyType").cast("Int"))
    .withColumn("Bathroom",col("Bathroom").cast("Int"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Latitude",col("Latitude").cast("Double"))
    .withColumn("Longtitude",col("Longtitude").cast("Double"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [24]:
df_clean.printSchema()

root
 |-- Price: double (nullable = true)
 |-- MethodOfSale: integer (nullable = true)
 |-- PropertyType: integer (nullable = true)
 |-- DistanceFromCBD: double (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- StreetName: string (nullable = true)



### Split Data into a Training and a Testing Set

In [25]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._


def train_test_split(data: DataFrame) = {
    
    val assembler = new VectorAssembler().
       setInputCols(data.drop("Price").columns).
       setOutputCol("features")
    
    val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)

    (assembler.transform(train), assembler.transform(test))
}

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
train_test_split: (data: org.apache.spark.sql.DataFrame)(org.apache.spark.sql.DataFrame, org.apache.spark.sql.DataFrame)


In [26]:

val (train, test) = train_test_split(df_clean)

java.lang.IllegalArgumentException:  Data type string of column Suburb is not supported.

### Scale and Center the data.

Our data is grouped into:

categorical:

* nominal: MethodOfSale, PropertyType, Suburb, StreetName
* ordinal: Date

numerical: Price, DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude



#### Transforming categorical nominal values for Type and Method of Sale to features

In [None]:
import spark.implicits._
case class PropertyType(id: Integer, p_type: String)

val propertyTypeDS = Seq(PropertyType(1, "H"), PropertyType(2, "U"),PropertyType(3, "T")).toDS()

case class MethodOfSale(id: Integer, m_sale: String)

val methodOfSaleDS = Seq(MethodOfSale(1, "S"), MethodOfSale(2, "SP"),MethodOfSale(3, "PI"),
                       MethodOfSale(4, "PN"), MethodOfSale(5, "SN"),MethodOfSale(6, "VB"),
                       MethodOfSale(7, "W"), MethodOfSale(8, "SA"),MethodOfSale(9, "SS")).toDS()

methodOfSaleDS.show()
propertyTypeDS.show()

In [None]:
import org.apache.spark.ml.feature.{CountVectorizer, OneHotEncoder, StringIndexer}
import org.apache.spark.sql.SparkSession

//transforming PropertyType
val pt_indexer = new StringIndexer().setInputCol("p_type")
      .setOutputCol("pt_indexed")
      .fit(propertyTypeDS)
val pt_indexed = pt_indexer.transform(propertyTypeDS)
    
val pt_encoder = new OneHotEncoder()
      .setInputCol("pt_indexed")
      .setOutputCol("pt_vec")

val pt_encoded = pt_encoder.transform(pt_indexed)
    pt_encoded.select("id", "pt_vec").show()


//transforming MethodOfSale
val ms_indexer = new StringIndexer().setInputCol("m_sale")
      .setOutputCol("m_sale_indexed")
      .fit(methodOfSaleDS)
val ms_indexed = ms_indexer.transform(methodOfSaleDS)

val ms_encoder = new OneHotEncoder()
      .setInputCol("m_sale_indexed")
      .setOutputCol("m_sale_vec")

val ms_encoded = ms_encoder.transform(ms_indexed)
    ms_encoded.select("id", "m_sale_vec").show()






#### Scaling numerical values

In [27]:
// StandardScaler https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler


### 1. Apply Linear Regression

#### Training


Pipeline Estimator-implements a method fit(), which accepts a DataFrame and produces a Model-Transformer.




#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

In [28]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression

// utilit function to train and evaluate model
// bug
def train_eval[R <: Predictor[Vector, R, M],
               M <: PredictionModel[Vector, M]](
    predictor: Predictor[Vector, R, M],
    paramMap: Array[ParamMap],
    train: DataFrame, 
    test: DataFrame) = {

    val cv = new CrossValidator()
      .setEstimator( predictor    
                    .setLabelCol("lastsoldprice")
                    .setFeaturesCol("features"))
      .setEvaluator(new RegressionEvaluator()
          .setLabelCol("lastsoldprice")
          .setPredictionCol("prediction")
          .setMetricName("rmse"))
      .setEstimatorParamMaps(paramMap)
      .setNumFolds(5)
      .setParallelism(2)

    val cvModel = cv.fit(train)
    val predictions = cvModel.transform(test)
    
    println("Root Mean Squared Error (RMSE) on test data = " + rmse.evaluate(predictions))
    println("R^2 on test data = " + r2.evaluate(predictions))

    val bestModel = cvModel.bestModel
    
    println(bestModel.extractParamMap)
    
    bestModel
}


<console>: 64: error: not found: type RegressionEvaluator

### 2. Apply KNN

#### Training


Pipeline Estimator

#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### 3. Apply Random Forest Regression

#### Training

Pipeline Estimator

#### Prediction

#### Testing/Evaluation

Pipeline Model Transformer

### References

Apache Spark (n.d.). _Spark ML Programming Guide._ Retrieved from https://spark.apache.org/docs/1.2.2/ml-guide.html

Hydrospheredata (n.d.). _org.apache.spark.ml.feature.StandardScaler Scala Examples._ Retrieved from https://towardsdatascience.com/from-scikit-learn-to-spark-ml-f2886fb46852

Johnson S (2019). _From sckit-learn to Spark ML._ Retrieved from https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler
Masri A. (2019). _FeatureTransformation._ Retrieved from https://towardsdatascience.com/apache-spark-mllib-tutorial-7aba8a1dce6e
