### Preparing HDFS
Using magic

Create input folder on HDFS if not exists

Copy from data from local

In [1]:
! hadoop fs -mkdir -p  /tmp/input
! hadoop fs -put   -p  ./../data-clean/*.csv             /tmp/input         

 ### Check Spark Parameters

In [2]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf,SparkContext}

// Set log level to ERROR (less verbose)
sc.setLogLevel("WARN")
val cs = spark.sparkContext.getConf
sc.getConf.toDebugString

Intitializing Scala interpreter ...

Spark Web UI available at http://b0c0236ce0ed:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1591457837947)
SparkSession available as 'spark'


import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
cs: org.apache.spark.SparkConf = org.apache.spark.SparkConf@6c4d6d28
res0: String =
spark.app.id=local-1591457837947
spark.app.name=spylon-kernel
spark.driver.host=b0c0236ce0ed
spark.driver.memory=8g
spark.driver.port=35567
spark.executor.id=driver
spark.executor.memory=8g
spark.executor.memoryOverhead=8g
spark.master=local[*]
spark.memory.offHeap.enabled=true
spark.memory.offHeap.size=8g
spark.network.timeout=10000000
spark.rdd.compress=True
spark.repl.class.outputDir=/tmp/tmpxteknk8n
spark.repl.class.uri=spark://b0c0236ce0ed:35567/classes
spark.serializer.objectStreamReset=100
spark.storage.memoryFraction=0.4
spark.submit.deployMode=client
spark.ui.showConsoleProgress=true


In [3]:
// Load Clean Dataset into a DataFrame from HDFS after wrangling is completed
var df_clean = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/input/*.csv")
df_clean.cache()

df_clean: org.apache.spark.sql.DataFrame = [Price: string, MethodOfSale: string ... 11 more fields]
res1: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Price: string, MethodOfSale: string ... 11 more fields]


In [4]:
df_clean = df_clean.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("DistanceFromCBD",col("DistanceFromCBD").cast("Double"))
    .withColumn("MethodOfSale",col("MethodOfSale").cast("Int"))
    .withColumn("PropertyType",col("PropertyType").cast("Int"))
    .withColumn("Bathroom",col("Bathroom").cast("Int"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Latitude",col("Latitude").cast("Double"))
    .withColumn("Longtitude",col("Longtitude").cast("Double"))



df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [5]:
df_clean.cache()
df_clean.printSchema()

root
 |-- Price: double (nullable = true)
 |-- MethodOfSale: integer (nullable = true)
 |-- PropertyType: integer (nullable = true)
 |-- DistanceFromCBD: double (nullable = true)
 |-- Rooms: integer (nullable = true)
 |-- Bathroom: integer (nullable = true)
 |-- Car: integer (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Suburb: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- StreetName: string (nullable = true)



### Construct vectors from attributes
#### Transform Sale Date into a numeric value

In [6]:
df_clean = df_clean.withColumn("Date",unix_timestamp($"Date", "dd/mm/yyyy"))

df_clean: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 11 more fields]


In [7]:
df_clean.toDF.createOrReplaceTempView("df_py_clean")

In [8]:
%%python
df_clean_p = spark.sql("select * from df_py_clean")
df_clean_p.show(5)

+---------+------------+------------+---------------+-----+--------+---+--------+--------+----------+----------+----------+-------------+
|    Price|MethodOfSale|PropertyType|DistanceFromCBD|Rooms|Bathroom|Car|Landsize|Latitude|Longtitude|    Suburb|      Date|   StreetName|
+---------+------------+------------+---------------+-----+--------+---+--------+--------+----------+----------+----------+-------------+
|1480000.0|           1|           1|            2.5|    2|       1|  1|   202.0|-37.7996|  144.9984|Abbotsford|1451779920|    Turner St|
|1035000.0|           1|           1|            2.5|    2|       1|  0|   156.0|-37.8079|  144.9934|Abbotsford|1451865720| Bloomburg St|
|1465000.0|           2|           1|            2.5|    3|       2|  0|   134.0|-37.8093|  144.9944|Abbotsford|1483488180|   Charles St|
| 850000.0|           3|           1|            2.5|    3|       2|  1|    94.0|-37.7969|  144.9969|Abbotsford|1483488180|Federation La|
|1600000.0|           6|          

In [9]:
! pip install matplotlib
! pip install pandas

Collecting matplotlib


  Downloading matplotlib-3.2.1-cp36-cp36m-manylinux1_x86_64.whl (12.4 MB)


[?25l

[K     |                                | 10 kB 4.4 MB/s eta 0:00:03

[K     |                                | 20 kB 1.3 MB/s eta 0:00:10

[K     |                                | 30 kB 1.4 MB/s eta 0:00:10

[K     |                                | 40 kB 1.7 MB/s eta 0:00:08

[K     |                                | 51 kB 1.5 MB/s eta 0:00:09

[K     |                                | 61 kB 1.7 MB/s eta 0:00:08

[K     |                                | 71 kB 2.0 MB/s eta 0:00:07

[K     |                                | 81 kB 1.9 MB/s eta 0:00:07

[K     |                                | 92 kB 1.9 MB/s eta 0:00:07

[K     |                                | 102 kB 2.1 MB/s eta 0:00:06

[K     |                                | 112 kB 2.1 MB/s eta 0:00:06

[K     |                                | 122 kB 2.1 MB/s eta 0:00:06

[K     |                            


[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.2 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.3 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.3 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.3 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.3 MB 2.1 MB/s eta 0:00:06

[K     |###                             | 1.3 MB 2.1 MB/s eta 


[K     |######                          | 2.3 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.3 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.4 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.5 MB 2.1 MB/s eta 0:00:05

[K     |######                          | 2.5 MB 2.1 MB/s eta 


[K     |#########                       | 3.5 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.5 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.5 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.5 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.5 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 0:00:05

[K     |#########                       | 3.6 MB 2.1 MB/s eta 


[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.7 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 0:00:03

[K     |############                    | 4.8 MB 3.1 MB/s eta 


[K     |###############                 | 5.8 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.8 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 5.9 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 6.0 MB 3.1 MB/s eta 0:00:03

[K     |###############                 | 6.0 MB 3.1 MB/s eta 


[K     |##################              | 7.0 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.0 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.0 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.0 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.0 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 0:00:02

[K     |##################              | 7.1 MB 3.1 MB/s eta 


[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.2 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 0:00:02

[K     |#####################           | 8.3 MB 3.1 MB/s eta 


[K     |########################        | 9.3 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.3 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 3.1 MB/s eta 0:00:01

[K     |########################        | 9.4 MB 5.2 MB/s eta 0:00:01

[K     |########################        | 9.5 MB 5.2 MB/s eta 0:00:01

[K     |########################        | 9.5 MB 5.2 MB/s eta 0:00:01

[K     |########################        | 9.5 MB 5.2 MB/s eta 


[K     |###########################     | 10.5 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.5 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.5 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.5 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.5 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB 5.2 MB/s eta 0:00:01

[K     |###########################     | 10.6 MB


[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.7 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.8 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.8 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.8 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.8 MB 5.2 MB/s eta 0:00:01

[K     |##############################  | 11.8 MB


[K     |                                | 399 kB 8.8 MB/s eta 0:00:03

[K     |                                | 409 kB 8.8 MB/s eta 0:00:03

[K     |                                | 419 kB 8.8 MB/s eta 0:00:03

[K     |                                | 430 kB 8.8 MB/s eta 0:00:03

[K     |                                | 440 kB 8.8 MB/s eta 0:00:03

[K     |                                | 450 kB 8.8 MB/s eta 0:00:03

[K     |                                | 460 kB 8.8 MB/s eta 0:00:03

[K     |                                | 471 kB 8.8 MB/s eta 0:00:03

[K     |                                | 481 kB 8.8 MB/s eta 0:00:03

[K     |                                | 491 kB 8.8 MB/s eta 0:00:03

[K     |                                | 501 kB 8.8 MB/s eta 0:00:03

[K     |                                | 512 kB 8.8 MB/s eta 0:00:03

[K     |                                | 522 kB 8.8 MB/s eta 0:00:03

[K     |                                | 532 kB 8.8 MB/s eta 


[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.6 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.7 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.7 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.7 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.7 MB 8.8 MB/s eta 0:00:03

[K     |##                              | 1.7 MB 8.8 MB/s eta 


[K     |####                            | 2.7 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.7 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.8 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.9 MB 8.8 MB/s eta 0:00:02

[K     |####                            | 2.9 MB 8.8 MB/s eta 


[K     |######                          | 3.9 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 3.9 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 3.9 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 3.9 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 3.9 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 0:00:02

[K     |######                          | 4.0 MB 8.8 MB/s eta 


[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.1 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 0:00:02

[K     |########                        | 5.2 MB 8.8 MB/s eta 


[K     |#########                       | 6.2 MB 8.1 MB/s eta 0:00:02

[K     |#########                       | 6.2 MB 8.1 MB/s eta 0:00:02

[K     |#########                       | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |#########                       | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |#########                       | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |#########                       | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.3 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.4 MB 8.1 MB/s eta 0:00:02

[K     |##########                      | 6.4 MB 8.1 MB/s eta 


[K     |###########                     | 7.4 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.4 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.4 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.4 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.4 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 0:00:02

[K     |###########                     | 7.5 MB 8.1 MB/s eta 


[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.6 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 0:00:02

[K     |#############                   | 8.7 MB 8.1 MB/s eta 


[K     |###############                 | 9.7 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.7 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.8 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.9 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.9 MB 8.1 MB/s eta 0:00:02

[K     |###############                 | 9.9 MB 8.1 MB/s eta 


[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 10.9 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB 8.1 MB/s eta 0:00:02

[K     |#################               | 11.0 MB


[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:02

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:02

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.1 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.2 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.2 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.2 MB 8.1 MB/s eta 0:00:01

[K     |###################             | 12.2 MB


[K     |####################            | 13.2 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.2 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.2 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.2 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB 7.0 MB/s eta 0:00:01

[K     |#####################           | 13.3 MB


[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.4 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.5 MB 7.0 MB/s eta 0:00:01

[K     |######################          | 14.5 MB 7.0 MB/s eta 0:00:01

[K     |#######################         | 14.5 MB 7.0 MB/s eta 0:00:01

[K     |#######################         | 14.5 MB 7.0 MB/s eta 0:00:01

[K     |#######################         | 14.5 MB


[K     |########################        | 15.5 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.5 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.5 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.6 MB 7.0 MB/s eta 0:00:01

[K     |########################        | 15.7 MB


[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.7 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB 7.0 MB/s eta 0:00:01

[K     |##########################      | 16.8 MB


[K     |############################    | 17.8 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.8 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 17.9 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 18.0 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 18.0 MB 7.0 MB/s eta 0:00:01

[K     |############################    | 18.0 MB


[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.0 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB 6.0 MB/s eta 0:00:01

[K     |##############################  | 19.1 MB







Collecting cycler>=0.10


  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)


Collecting kiwisolver>=1.0.1


  Downloading kiwisolver-1.2.0-cp36-cp36m-manylinux1_x86_64.whl (88 kB)


[?25l

[K     |###                             | 10 kB 4.1 MB/s eta 0:00:01

[K     |#######                         | 20 kB 3.3 MB/s eta 0:00:01

[K     |###########                     | 30 kB 4.7 MB/s eta 0:00:01

[K     |##############                  | 40 kB 5.8 MB/s eta 0:00:01

[K     |##################              | 51 kB 6.8 MB/s eta 0:00:01

[K     |######################          | 61 kB 7.8 MB/s eta 0:00:01

[K     |#########################       | 71 kB 8.0 MB/s eta 0:00:01

[K     |#############################   | 81 kB 8.9 MB/s eta 0:00:01

[K     |################################| 88 kB 5.1 MB/s 




Installing collected packages: numpy, cycler, kiwisolver, matplotlib


Successfully installed cycler-0.10.0 kiwisolver-1.2.0 matplotlib-3.2.1 numpy-1.18.5


Collect


[K     |###                             | 972 kB 2.0 MB/s eta 0:00:05

[K     |###                             | 983 kB 2.0 MB/s eta 0:00:05

[K     |###                             | 993 kB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.0 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.0 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.0 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.0 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.0 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 0:00:05

[K     |###                             | 1.1 MB 2.0 MB/s eta 


[K     |######                          | 2.1 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |######                          | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.2 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.3 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.3 MB 2.0 MB/s eta 0:00:04

[K     |#######                         | 2.3 MB 2.0 MB/s eta 


[K     |##########                      | 3.3 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.3 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.3 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.3 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.3 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 0:00:04

[K     |##########                      | 3.4 MB 2.0 MB/s eta 


[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.5 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 0:00:03

[K     |##############                  | 4.6 MB 2.0 MB/s eta 


[K     |#################               | 5.6 MB 5.7 MB/s eta 0:00:01

[K     |#################               | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |#################               | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |#################               | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |#################               | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.7 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.8 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.8 MB 5.7 MB/s eta 0:00:01

[K     |##################              | 5.8 MB 5.7 MB/s eta 


[K     |#####################           | 6.8 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.8 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.8 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.8 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 0:00:01

[K     |#####################           | 6.9 MB 5.7 MB/s eta 


[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.0 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 0:00:01

[K     |#########################       | 8.1 MB 5.7 MB/s eta 


[K     |############################    | 9.1 MB 5.7 MB/s eta 0:00:01

[K     |############################    | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.2 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.3 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.3 MB 5.7 MB/s eta 0:00:01

[K     |#############################   | 9.3 MB 5.7 MB/s eta 


[K     |###########                     | 184 kB 5.0 MB/s eta 0:00:01

[K     |############                    | 194 kB 5.0 MB/s eta 0:00:01

[K     |############                    | 204 kB 5.0 MB/s eta 0:00:01

[K     |#############                   | 215 kB 5.0 MB/s eta 0:00:01

[K     |##############                  | 225 kB 5.0 MB/s eta 0:00:01

[K     |##############                  | 235 kB 5.0 MB/s eta 0:00:01

[K     |###############                 | 245 kB 5.0 MB/s eta 0:00:01

[K     |################                | 256 kB 5.0 MB/s eta 0:00:01

[K     |################                | 266 kB 5.0 MB/s eta 0:00:01

[K     |#################               | 276 kB 5.0 MB/s eta 0:00:01

[K     |#################               | 286 kB 5.0 MB/s eta 0:00:01

[K     |##################              | 296 kB 5.0 MB/s eta 0:00:01

[K     |###################             | 307 kB 5.0 MB/s eta 0:00:01

[K     |###################             | 317 kB 5.0 MB/s eta 

In [10]:
%matplotlib notebook 
%%python

import pandas as pd
import matplotlib.pyplot as plt

df_clean_pd = df_clean_p.toPandas() 

# 1 -House, 2- Unit, 3 -Townhouse
ax = df_clean_pd.groupby("PropertyType")["Price"].nunique().plot(kind="bar")
ax.set_ylabel("Price in (mln)")
ax.set_xlabel("Propetry Type")
plt.grid()
plt.title("Price Distribution by Property Type")
plt.show()

<IPython.core.display.Javascript object>

#### Set FeatureHasher for Suburb, StreetName

In [11]:
import org.apache.spark.ml.feature.{FeatureHasher,OneHotEncoder,StandardScaler,VectorAssembler}

val hasher = new FeatureHasher()
 .setInputCols("StreetName","Suburb")
 .setOutputCol("str_name_suburb_vec")

import org.apache.spark.ml.feature.{FeatureHasher, OneHotEncoder, StandardScaler, VectorAssembler}
hasher: org.apache.spark.ml.feature.FeatureHasher = featureHasher_92f1c3171840


#### Set OneHotEncoders for PropertyType, MethodOfSale

In [12]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("MethodOfSale")
      .setOutputCol("m_sale_vec")

val pt_encoder = new OneHotEncoder()
      .setInputCol("PropertyType")
      .setOutputCol("pt_vec")


ms_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_cee832cfec0a
pt_encoder: org.apache.spark.ml.feature.OneHotEncoder = oneHot_cef905298d65


#### Assemble the columns and column vectors into a single column - "features"

In [13]:
val columns = Array("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols(columns)
            .setOutputCol("features")

val dd = hasher.transform(df_clean).drop("StreetName","Suburb")


columns: Array[String] = Array(DistanceFromCBD, Rooms, Bathroom, Car, Landsize, Latitude, Longtitude, Date, str_name_suburb_vec, m_sale_vec, pt_vec)
assembler: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_a43d6c282929
dd: org.apache.spark.sql.DataFrame = [Price: double, MethodOfSale: int ... 10 more fields]


In [14]:
val mm = ms_encoder.transform(dd).drop("MethodOfSale")


mm: org.apache.spark.sql.DataFrame = [Price: double, PropertyType: int ... 10 more fields]


In [15]:
val pt = pt_encoder.transform(mm)


pt: org.apache.spark.sql.DataFrame = [Price: double, PropertyType: int ... 11 more fields]


In [16]:
val feature_ds = assembler.transform(pt).drop("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")
feature_ds.cache()

feature_ds: org.apache.spark.sql.DataFrame = [Price: double, PropertyType: int ... 1 more field]
res4: feature_ds.type = [Price: double, PropertyType: int ... 1 more field]


In [17]:
feature_ds.show()

+---------+------------+--------------------+
|    Price|PropertyType|            features|
+---------+------------+--------------------+
|1480000.0|           1|(262163,[0,1,2,3,...|
|1035000.0|           1|(262163,[0,1,2,4,...|
|1465000.0|           1|(262163,[0,1,2,4,...|
| 850000.0|           1|(262163,[0,1,2,3,...|
|1600000.0|           1|(262163,[0,1,2,3,...|
| 941000.0|           1|(262163,[0,1,2,4,...|
|1876000.0|           1|(262163,[0,1,2,4,...|
|1636000.0|           1|(262163,[0,1,2,3,...|
|1097000.0|           1|(262163,[0,1,2,3,...|
|1350000.0|           1|(262163,[0,1,2,3,...|
|1172500.0|           1|(262163,[0,1,2,3,...|
|1310000.0|           1|(262163,[0,1,2,3,...|
|1200000.0|           1|(262163,[0,1,2,3,...|
|1176500.0|           1|(262163,[0,1,2,3,...|
| 955000.0|           1|(262163,[0,1,2,4,...|
| 890000.0|           1|(262163,[0,1,2,3,...|
|1330000.0|           1|(262163,[0,1,2,3,...|
|1090000.0|           2|(262163,[0,1,2,3,...|
|1100000.0|           1|(262163,[0

#### Set StandardScaler

In [18]:
val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")
      .setWithStd(true).setWithMean(true)


scaler: org.apache.spark.ml.feature.StandardScaler = stdScal_d9ef5d637b5d


### Split Data into a Training and a Testing Set

In [19]:
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._


def train_test_split(data: DataFrame, fraction: Double) = {
    
    
    //val sample_data = data.stat.sampleBy("PropertyType",data,36L)
    val train_fractions = Map(1 -> 0.8*fraction,2 ->0.8*fraction, 3 -> 0.8*fraction)
    val test_fractions = Map(1 -> 0.2*fraction,2 ->0.2*fraction, 3 -> 0.2*fraction)
    
    
    var train = data.stat.sampleBy("PropertyType",train_fractions,36L)
    
    //sample from the data without train set 
    var test = data.except(train).stat.sampleBy("PropertyType",test_fractions,36L)
    
    //val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 30)
    train =train.drop("PropertyType")
    test = test.drop("PropertyType")
    (train, test)
}

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
train_test_split: (data: org.apache.spark.sql.DataFrame, fraction: Double)(org.apache.spark.sql.DataFrame, org.apache.spark.sql.DataFrame)


In [20]:

val (train_full, test_full) = train_test_split(feature_ds,1)
train_full.cache()
test_full.cache()

train_full: org.apache.spark.sql.DataFrame = [Price: double, features: vector]
test_full: org.apache.spark.sql.DataFrame = [Price: double, features: vector]
res6: test_full.type = [Price: double, features: vector]


In [21]:
feature_ds.count()

res7: Long = 15728


In [22]:
train_full.count()

res8: Long = 12524


In [23]:
test_full.count()

res9: Long = 618


In [24]:
val (train_sample, test_sample) = train_test_split(feature_ds,0.1)
train_sample.cache()
test_sample.cache()

train_sample: org.apache.spark.sql.DataFrame = [Price: double, features: vector]
test_sample: org.apache.spark.sql.DataFrame = [Price: double, features: vector]
res10: test_sample.type = [Price: double, features: vector]


In [25]:
train_sample.count()

res11: Long = 1285


In [26]:
test_sample.count()

res12: Long = 284


### 1. Apply Linear Regression


#### Estimator

In [27]:
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("scaledFeatures")
    .setPredictionCol("prediction")

import org.apache.spark.ml.regression.LinearRegression
lr: org.apache.spark.ml.regression.LinearRegression = linReg_3fdaa78f59b1


#### Parameters

In [28]:
lr.extractParamMap

res13: org.apache.spark.ml.param.ParamMap =
{
	linReg_3fdaa78f59b1-aggregationDepth: 2,
	linReg_3fdaa78f59b1-elasticNetParam: 0.0,
	linReg_3fdaa78f59b1-epsilon: 1.35,
	linReg_3fdaa78f59b1-featuresCol: scaledFeatures,
	linReg_3fdaa78f59b1-fitIntercept: true,
	linReg_3fdaa78f59b1-labelCol: Price,
	linReg_3fdaa78f59b1-loss: squaredError,
	linReg_3fdaa78f59b1-maxIter: 100,
	linReg_3fdaa78f59b1-predictionCol: prediction,
	linReg_3fdaa78f59b1-regParam: 0.0,
	linReg_3fdaa78f59b1-solver: auto,
	linReg_3fdaa78f59b1-standardization: true,
	linReg_3fdaa78f59b1-tol: 1.0E-6
}


#### Define time function

In [29]:
def time[R](block: => R): R = {
  val t0 = System.nanoTime()
  val result = block    // call-by-name
  val t1 = System.nanoTime()
  println("Elapsed time: " + (t1 - t0)/1000000000 + " s")
  result
 }

time: [R](block: => R)R


#### Define get_predictions function

In [30]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.Pipeline


def get_predictions[R <: Predictor[Vector, R, M],
                M <: PredictionModel[Vector, M]](
    predictor: Predictor[Vector, R, M],
    train: DataFrame, 
    test: DataFrame) = {
    
    val pipeline = new Pipeline()
      .setStages(Array(scaler, predictor))
     
    val result =pipeline.fit(train).transform(test)
    result

}

import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.Pipeline
get_predictions: [R <: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], M <: org.apache.spark.ml.PredictionModel[org.apache.spark.ml.linalg.Vector,M]](predictor: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], train: org.apache.spark.sql.DataFrame, test: org.apache.spark.sql.DataFrame)org.apache.spark.sql.DataFrame


#### Prediction

In [31]:
train_full.show()

+---------+--------------------+
|    Price|            features|
+---------+--------------------+
|1480000.0|(262163,[0,1,2,3,...|
|1035000.0|(262163,[0,1,2,4,...|
| 850000.0|(262163,[0,1,2,3,...|
|1600000.0|(262163,[0,1,2,3,...|
| 941000.0|(262163,[0,1,2,4,...|
|1876000.0|(262163,[0,1,2,4,...|
|1097000.0|(262163,[0,1,2,3,...|
|1172500.0|(262163,[0,1,2,3,...|
|1310000.0|(262163,[0,1,2,3,...|
|1200000.0|(262163,[0,1,2,3,...|
|1176500.0|(262163,[0,1,2,3,...|
| 955000.0|(262163,[0,1,2,4,...|
|1330000.0|(262163,[0,1,2,3,...|
|1090000.0|(262163,[0,1,2,3,...|
|1100000.0|(262163,[0,1,2,3,...|
|1315000.0|(262163,[0,1,2,4,...|
|1447500.0|(262163,[0,1,2,3,...|
|1290000.0|(262163,[0,1,2,3,...|
|1290000.0|(262163,[0,1,2,3,...|
|1195000.0|(262163,[0,1,2,3,...|
+---------+--------------------+
only showing top 20 rows



In [32]:
test_full.show()

+---------+--------------------+
|    Price|            features|
+---------+--------------------+
|1706000.0|(262163,[0,1,2,3,...|
| 730000.0|(262163,[0,1,2,3,...|
|1177000.0|(262163,[0,1,2,3,...|
| 830000.0|(262163,[0,1,2,3,...|
|1080000.0|(262163,[0,1,2,3,...|
| 686000.0|(262163,[0,1,2,3,...|
| 745000.0|(262163,[0,1,2,3,...|
| 660000.0|(262163,[0,1,2,3,...|
| 725000.0|(262163,[0,1,2,3,...|
|1600000.0|(262163,[0,1,2,3,...|
| 852000.0|(262163,[0,1,2,3,...|
| 400000.0|(262163,[0,1,2,3,...|
|3100000.0|(262163,[0,1,2,3,...|
| 940000.0|(262163,[0,1,2,3,...|
| 651500.0|(262163,[0,1,2,3,...|
| 631000.0|(262163,[0,1,2,3,...|
|1225000.0|(262163,[0,1,2,3,...|
| 290000.0|(262163,[0,1,2,3,...|
|1780000.0|(262163,[0,1,2,3,...|
|1605000.0|(262163,[0,1,2,3,...|
+---------+--------------------+
only showing top 20 rows



In [None]:
val lrPredictions = time{get_predictions(lr, train_full, test_full)}
lrPredictions.cache()

In [None]:
lrPredictions.columns

In [None]:
lrPredictions.withColumn("prediction", round($"prediction", 0)).select("Price","prediction").show()

#### Evaluation

In [33]:
import org.apache.spark.ml.evaluation.RegressionEvaluator

def assess ( predictions: DataFrame, metric: String) = {
    val eval =  new RegressionEvaluator()
       .setLabelCol("Price")
       .setPredictionCol("prediction")
       .setMetricName(metric)
println(metric.toUpperCase()+" on test data = " + eval.evaluate(predictions))
    
}

import org.apache.spark.ml.evaluation.RegressionEvaluator
assess: (predictions: org.apache.spark.sql.DataFrame, metric: String)Unit


#### Regression metrics

**Mean squared error (MSE)** -- the average of squared differences between the predicted outcome and the true outcome.

**R2 coefficient** -- the proportion of variance in the outcome that our model is capable of predicting based on its features.


In [None]:
assess(lrPredictions,"rmse")

In [None]:
assess(lrPredictions,"r2")

#### Training/Testing/Evaluating
Cross-validation

<span style="color:red">
TO DO: does not finish run in reasonable time
</span>

In [34]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}



def train_eval[R <: Predictor[Vector, R, M],
               M <: PredictionModel[Vector, M]](
    predictor: Predictor[Vector, R, M],
    paramMap: Array[ParamMap],
    train: DataFrame, 
    test: DataFrame) = {

    val pipeline = new Pipeline()
      .setStages(Array(scaler, predictor))
    
    val cv = new CrossValidator()
        .setEstimator(pipeline)
        .setEvaluator(new RegressionEvaluator()
        .setLabelCol("Price")
        .setPredictionCol("prediction")
        .setMetricName("rmse"))
        .setEstimatorParamMaps(paramMap)
        .setNumFolds(5)
        .setParallelism(2)

    val cvModel = cv.fit(train)
    val predictions = cvModel.transform(test)
    
    predictions.cache()
    assess(predictions,"rmse")
    assess(predictions,"r2")
    
    val bestModel = cvModel.bestModel
    
    println(bestModel.extractParamMap)
    
    bestModel
}


import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
train_eval: [R <: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], M <: org.apache.spark.ml.PredictionModel[org.apache.spark.ml.linalg.Vector,M]](predictor: org.apache.spark.ml.Predictor[org.apache.spark.ml.linalg.Vector,R,M], paramMap: Array[org.apache.spark.ml.param.ParamMap], train: org.apache.spark.sql.DataFrame, test: org.apache.spark.sql.DataFrame)org.apache.spark.ml.Model[_]


#### Parameter Tuning

In [36]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression

val lr = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("scaledFeatures")
    .setPredictionCol("prediction")

val lrParamMap = new ParamGridBuilder()
    .addGrid(lr.regParam, Array(10,0.1))
    .addGrid(lr.elasticNetParam, Array(0.0,0.5, 1.0))
    .addGrid(lr.maxIter, Array(50, 100))
    .build()

val t0 = System.nanoTime()
val bestLRModel = train_eval(lr, lrParamMap, train_sample, test_sample)
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/(1000000000) + " s")



2020-06-06 15:40:33,315 WARN  [Executor task launch worker for task 439] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
2020-06-06 15:40:33,316 WARN  [Executor task launch worker for task 439] netlib.BLAS (BLAS.java:<clinit>(61)) - Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
































































RMSE on test data = 637862.5674174298
R2 on test data = -0.2757758825353258
{

}
Elapsed time: 13406 s


import org.apache.spark.ml.Predictor
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.regression.LinearRegression
lr: org.apache.spark.ml.regression.LinearRegression = linReg_dd239a74c804
lrParamMap: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	linReg_dd239a74c804-elasticNetParam: 0.0,
	linReg_dd239a74c804-maxIter: 50,
	linReg_dd239a74c804-regParam: 10.0
}, {
	linReg_dd239a74c804-elasticNetParam: 0.0,
	linReg_dd239a74c804-maxIter: 50,
	linReg_dd239a74c804-regParam: 0.1
}, {
	linReg_dd239a74c804-elasticNetParam: 0.0,
	linReg_dd239a74c804-maxIter: 100,
	linReg_dd239a74c804-regParam: 10.0
}, {
	linReg_dd239a74c8...

In [42]:
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.LinearRegressionModel

val bestPipelineModel = bestLRModel.asInstanceOf[PipelineModel]

val stages = bestPipelineModel.stages
for (i <- stages) println

val best_lr_model = stages.last.asInstanceOf[LinearRegressionModel]

println("LR params = " + best_lr_model.extractParamMap())




LR params = {
	linReg_dd239a74c804-aggregationDepth: 2,
	linReg_dd239a74c804-elasticNetParam: 1.0,
	linReg_dd239a74c804-epsilon: 1.35,
	linReg_dd239a74c804-featuresCol: scaledFeatures,
	linReg_dd239a74c804-fitIntercept: true,
	linReg_dd239a74c804-labelCol: Price,
	linReg_dd239a74c804-loss: squaredError,
	linReg_dd239a74c804-maxIter: 50,
	linReg_dd239a74c804-predictionCol: prediction,
	linReg_dd239a74c804-regParam: 10.0,
	linReg_dd239a74c804-solver: auto,
	linReg_dd239a74c804-standardization: true,
	linReg_dd239a74c804-tol: 1.0E-6
}


import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.LinearRegressionModel
bestPipelineModel: org.apache.spark.ml.PipelineModel = pipeline_d0e803da0f54
stages: Array[org.apache.spark.ml.Transformer] = Array(stdScal_d9ef5d637b5d, linReg_dd239a74c804)
best_lr_model: org.apache.spark.ml.regression.LinearRegressionModel = linReg_dd239a74c804


In [None]:
bestLRModel
// save model
bestLRModel.write.overwrite()
  .save("./lr-model")

// load model
val bestLRModel = LinearRegression
  .load("./lr-model")

#### Prediction on the Best Model

In [54]:
val lr_best = new LinearRegression()
    .setLabelCol("Price")
    .setFeaturesCol("scaledFeatures")
    .setPredictionCol("prediction")
    .setElasticNetParam(best_lr_model.getElasticNetParam)
    .setMaxIter(best_lr_model.getMaxIter)
    .setRegParam(best_lr_model.getRegParam)

lr_best: org.apache.spark.ml.regression.LinearRegression = linReg_8c080e7267ca


In [56]:
val best_lrPredictions = time{get_predictions(lr_best, train_full, test_full)}
best_lrPredictions.cache()



Elapsed time: 2276 s


best_lrPredictions: org.apache.spark.sql.DataFrame = [Price: double, features: vector ... 2 more fields]
res26: best_lrPredictions.type = [Price: double, features: vector ... 2 more fields]


#### Regression metrics


In [57]:
assess(best_lrPredictions,"rmse")

RMSE on test data = 444670.7252851715


In [58]:
assess(best_lrPredictions,"r2")

R2 on test data = 0.6251742683983778


### 2. Apply Random Forest Regression

**Build Random Forest model**

Specify _numTrees, maxDepth, maxBins, featureSubsetStrategy_ and _seed_ parameters.

* **numTrees** -- Number of trees in the forest

* **maxDepth** -- Maximum depth of a tree. Increasing the depth makes the model more powerful, but deep trees take longer to train.

* **maxBins** -- Maximum number of bins used for discretizing continuous features and for choosing how to split on features at each node.

* **featureSubsetStrategy** -- Automatically select the number of features to consider for splits at each tree node

* **seed** -- Use a random seed number , allowing to repeat the results


If the number of trees is 1, then no bootstrapping is used at all. However, if the number of trees is > 1, then the bootstrapping is accomplished. Where, the parameter _featureSubsetStrategy_ signifies the number of features to be considered for splits at each node. The supported values of _featureSubsetStrategy_ are "auto", "all", "sqrt", "log2" and "on third". The supported numerical values, on the other hand, are $(0.0-1.0]$ and $[1-n]$. However, if _featureSubsetStrategy_ is chosen as $"auto"$, the algorithm chooses the best feature subset strategy automatically


If the $numTrees == 1$, the featureSubsetStrategy is set to be "all". However, if the $numTrees > 1$ (i.e., forest), featureSubsetStrategy is set to be "onethird" for regression


Moreover, if a real value "n" is in the range $(0, 1.0]$ is set, n*number_of_features is used consequently. However, if an integer value "n" is in the range (1, the number of features) is set, only n features are used alternatively


The parameter _categoricalFeaturesInfo_ which is a map is used for storing arbitrary of categorical features. An entry $(n -> k)$ indicates that feature n is categorical with k categories indexed from $0: {0, 1,...,k-1}$
The impurity criterion used for regression is “variance”. 

The _maxDepth_ is the maximum depth of the tree. (e.g., depth 0 means 1 leaf node, depth 1 means 1 internal node + 2 leaf nodes). However, the suggested value is 4 to get a better result


The _maxBins_ signifies the maximum number of bins used for splitting the features; where the suggested value is 100 to get better results


Finally, the random seed is used for bootstrapping and choosing feature subsets to avoid the random nature of the results.

In [None]:
import org.apache.spark.ml.regression.RandomForestRegressor

val rf = new RandomForestRegressor()
  .setSeed(seed)
  .setLabelCol("Price")
  .setFeaturesCol("scaledFeatures")
  .setPredictionCol("prediction")
rf.extractParamMap()

In [None]:
rf.getNumTrees

In [None]:
rf.getMaxDepth

In [None]:
rf.getMaxBins

In [None]:
rf.getImpurity

In [None]:
rf.getNumTrees

In [None]:
val rfPredictions = time{get_predictions(rf, train_full, test_full)}
rfPredictions.cache()

In [None]:
rfPredictions.columns

In [None]:
rfPredictions.withColumn("prediction", round($"prediction", 0)).select("Price","prediction").show()

#### Regression metrics


In [None]:
assess(rfPredictions,"rmse")

In [None]:
assess(rfPredictions,"r2")

####  Finding the Best Model (Parameter Tuning) by Cross-Validation

Cross-validation
<span style="color:red">
TO DO: 
* finish implementation for Cross-validation 
* check if finish run in reasonable time
</span>

In [None]:
import org.apache.spark.ml.regression.RandomForestRegressor

// Models hypoparameters
val numTrees = Seq(5,10,30,50)
val maxBins = Seq(50,100)
val maxDepth = Seq(2,3,5)
//val featureSubsetStrategy = Seq("sqrt","onethird")

val rf = new RandomForestRegressor()
  .setSeed(seed)
  .setLabelCol("Price")
  .setFeaturesCol("scaledFeatures")
  .setPredictionCol("prediction")
  .setImpurity("variance")


val rfParamMap = new ParamGridBuilder()
  .addGrid(rf.numTrees, numTrees)
  .addGrid(rf.maxDepth, maxDepth)
  .addGrid(rf.maxBins, maxBins)
//  .addGrid(rf.featureSubsetStrategy, featureSubsetStrategy)
  .build()

val t0 = System.nanoTime()
val best_model = train_eval(rf, rfParamMap, train_sample, test_sample)
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/(1000000000) + " s")


In [None]:
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.regression.RandomForestRegressionModel

val bestPipelineModel = best_model.asInstanceOf[PipelineModel]

val stages = bestPipelineModel.stages
for (i <- stages) println

val bestRFModel = stages.last.asInstanceOf[RandomForestRegressionModel]

println("RF params = " + bestRFModel.extractParamMap())


#### Prediction on the Best Model

In [None]:
val seed = 5043
val rf_best = new RandomForestRegressor()
  .setSeed(seed)
  .setLabelCol("Price")
  .setNumTrees(50)
  .setMaxDepth(5)
  .setMaxBins(100)

val rf_best_Predictions = time{get_predictions(rf_best, train_full, test_full)}
rfPredictions.cache()

#### Regression metrics


In [None]:
assess(rf_best_Predictions,"rmse")

In [None]:
assess(rf_best_Predictions,"r2")

### 3. Apply K-means Model  

Unsupervised learning problem. To find opimal number of clusters run many models with differnet numer of clusters. When the number of clusters increses the differences between clusters gets smaller while the differences between points inside clusters increase as well. Optimally, WCSSE -  Within Set Sum of Squared Errors, should be as small as possible, while Slhouette coefficient [-1,1] cohesion and separation should get as close to 1 as possible.

* WSSSE -  Within Set Sum of Squared Errors
* Slhouette coefficient  for a data point is  $(b-a)/max(a,b)$, where $a$ - measure of cohesion (mean intra-cluster distance), b - measure of separarion (the distance between a sample and the nearest cluster that the sample is not a part of)


#### Training and Predicting


In [None]:
import org.apache.spark.ml.clustering.{KMeans,KMeansModel}

val kmeans = new KMeans().setSeed(5043L)
                         .setPredictionCol("cluster")
                         .setFeaturesCol("scaledFeatures")


In [None]:
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel
import org.apache.spark.ml.Pipeline


val pipeline = new Pipeline()
      .setStages(Array(scaler, kmeans))
     
val t0 = System.nanoTime()
 
val pipelineModel = pipeline.fit(train_full)



val kmPredictions = pipelineModel.transform(train_full) 
kmPredictions.cache()

val kmModel = pipelineModel.stages.last.asInstanceOf[KMeansModel]
val t1 = System.nanoTime()
println("Elapsed time: " + (t1 - t0)/1000000000 + " s")


In [None]:
kmPredictions.show(10)

In [None]:
kmModel.extractParamMap

In [None]:
//no of categories
val num_clusters = kmPredictions.groupBy("cluster").count().count()

#### Evaluation.

The silhouette metrics ranges between 1 and -1, where a value close to 1 means that the points in a cluster are close to the other points in the same cluster and far from the points of the other clusters.

In [None]:
val num_train_full =train_full.count()

In [None]:
val WSSSE = kmModel.computeCost(kmPredictions)
println(s"WSSSE error = $WSSSE")

In [None]:
val scaled_WSSSE = WSSSE/num_train_full

print(s"Normalised WCSSE error = ")
println("%06.2f".format(scaled_WSSSE))

In [None]:
import org.apache.spark.ml.evaluation.ClusteringEvaluator

// Evaluate clustering by computing Silhouette score
val evaluator = new ClusteringEvaluator()

//assess(kmPredictions,"WCSSE")
//assess(kmPredictions,"silhouette")
evaluator.setPredictionCol("cluster")
val silhouette = evaluator.evaluate(kmPredictions)
println(s"Silhouette with squared euclidean distance =")
println("%06.2f".format(silhouette))


In [None]:
for (i <- Array.range(0, num_clusters.toInt)){
  println("Cluster "+i+":")
  kmPredictions.where($"cluster" === i).describe().select("Summary", "Price").show()
}   

In [None]:
import org.apache.spark.ml.evaluation.ClusteringEvaluator
import org.apache.spark.ml.Predictor
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.PredictionModel

def eval_points(maxK: Int): Map[Int, Double] = {
  val evaluator = new ClusteringEvaluator()
  var points = Map(2 -> WSSSE)
  for (i <- Array.range(3, maxK)){

    val kmeans = new KMeans().setSeed(5043L).setK(i)
    val kmeansModel = kmeans.fit(train_sample)
    val wssse = kmeansModel.computeCost(test_sample)
    points += (i -> wssse)
  }
  points
}

In [None]:
val points = time{eval_points(100)}

In [None]:
val num_sample = test_sample.count()

In [None]:

val pnt_sort = points.toArray.sortWith(_._2 < _._2)

In [None]:
val (best_num_centroids, best_wssse) = pointspoints.toArray.sortWith(_._2 < _._2)(0)
best_num_centroids
best_wssse/num_sample

#### Visualize predictions


In [None]:
%matplotlib notebook 
%%python
from matplotlib import pyplot as plt

-i pnt_sort -o points

fig = plt.figure()
ax = plt.axes()
(x,y) = points
ax.plot(x, y);
ax.grid()
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Costs");

In [None]:
val bestPredictions  = new KMeans().setSeed(1L).setK(best_num_centroids).fit(train_sample)
bestPredictions.cache()
bestPredictions.show(10)

In [None]:
for (i <- Array.range(0, best_num_centroids.toInt)){
  println("Cluster "+i+":")
  bestPredictions.where($"prediction" === i).describe().select("Summary", "Price").show()
}   

In [None]:
bestPredictions.clusterCenters.foreach(println)

In [None]:
bestPredictions.clusterCenters.foreach(println)

In [None]:
val columns = Array("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols("features")
            .setOutputCol(columns)

val unassembled_bc = assembler.transform(bestPredictions)


### Reverse Best Cluster column to data row

In [None]:
val columns = Array("DistanceFromCBD", "Rooms", "Bathroom", "Car", "Landsize", "Latitude", "Longtitude", "Date", 
                    "str_name_suburb_vec", "m_sale_vec", "pt_vec")

val assembler = new VectorAssembler()
            .setInputCols("features")
            .setOutputCol(columns)

val unassembled_bc = assembler.transform(bestClust)


In [None]:
val ms_encoder = new OneHotEncoder()
      .setInputCol("m_sale_vec")
      .setOutputCol("MethodOfSale")

val pt_encoder = new OneHotEncoder()
      .setInputCol("pt_vec")
      .setOutputCol("PropertyType")

val unassembled_ms_pt_bc = pt_encoder.transform(ms_encoder.transform(unassembled_bc))


In [None]:
import org.apache.spark.ml.feature.{FeatureHasher,OneHotEncoder,StandardScaler,VectorAssembler}

val hasher = new FeatureHasher()
 .setInputCols("str_name_suburb_vec")
 .setOutputCol("StreetName","Suburb")

val unhashed_bc = hasher.transform(unassembled_ms_pt_bc)

In [None]:
val reversed_bc = unhashed_bc.drop("str_name_suburb_vec", "m_sale_vec", "pt_vec","features")
reversed_bc.show()

In [None]:
// save model
kmeansModel.write.overwrite()
  .save("./kmeans-model")

// load model
val kmeansModelLoded = KMeansModel
  .load("./kmeans-model")

#### Bias vs Variance Graph of Error (validation error and training error) versus training set size. 


<span style="color:red">
TO DO: 
produce graph -- validation error and training error should converge
</span>


### References

Apache Spark (n.d.). _Spark ML Programming Guide._ Retrieved from https://spark.apache.org/docs/1.2.2/ml-guide.html

Bahadoor N., (2020). _Spark tutorials_ Retrieved from https://allaboutscala.com/big-data/spark/
Gorczynski M. (2017). _Introduction to machine learning with spark and mllib (dataframe API)._ Retrieved from https://scalac.io/scala-spark-ml-machine-learning-introduction/

Hydrospheredata (2020). Program creek. Scala Code Examples. Scaler Retrieved from https://www.programcreek.com/scala/org.apache.spark.ml.feature.StandardScaler

Jen G. (2020) _FeatureHasher_. Retrieved from https://george-jen.gitbook.io/data-science-and-apache-spark/featurehasher

Johnson S (2019). _From sckit-learn to Spark ML._ Retrieved from https://towardsdatascience.com/from-scikit-learn-to-spark-ml-f2886fb46852

Johnson S (2019). _Housing Prices - Spark ML Project_ Retrieved from https://github.com/scottdjohnson/HousingPricePredictions/blob/master/HousingPrices-SparkML.ipynb

Masri A. (2019). _FeatureTransformation. Retrieved from
https://towardsdatascience.com/apache-spark-mllib-tutorial-7aba8a1dce6e

Poddutur S. (). _Distribution of Executors, Cores and Memory for a Spark Application running in Yarn:_ Retrieved from https://spoddutur.github.io/spark-notes/distribution_of_executors_cores_and_memory_for_spark_application.html 

Rai D.,(2018) _Feature Engineering in pyspark — Part I._ Retrieved from https://medium.com/@dhiraj.p.rai/essentials-of-feature-engineering-in-pyspark-part-i-76a57680a85

Sarkar A. (2017). _Learning Spark SQL. Implementing a Spark ML clustering model._ Packt Publishing.

Scala Doc (n.d.) Retrieved from https://docs.scala-lang.org


(2019) _Random Forest Classifier with Apache Spark_ Retireved from https://medium.com/rahasak/random-forest-classifier-with-apache-spark-c63b4a23a7cc

Wagle M.(2020) _Predicting House Prices using Machine Learning_. Retrieved from https://medium.com/@manilwagle/predicting-house-prices-using-machine-learning-cab0b82cd3f


Zecevic P., Bonaci M. (2020) _Spark in Action_ Retireved from https://livebook.manning.com/book/spark-in-action/about-this-book/