## Initialise Spark Session:

In [1]:
val spark = org.apache.spark.sql.SparkSession.builder
        .master("local") 
        .appName("Spark CSV Reader")
        .getOrCreate;

Intitializing Scala interpreter ...

Spark Web UI available at http://356e2a7fd1b6:4040
SparkContext available as 'sc' (version = 2.4.5, master = local[*], app id = local-1590037167430)
SparkSession available as 'spark'




spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@1fc84f07


### Preparing HDFS

In [2]:
!pwd


/home/sandpit/big-data-realestate/scripts



In [3]:
!cat ./../data-raw/Melbourne_housing_FULL.csv| wc -l

34858



In [4]:
! hadoop fs -mkdir -p  /tmp/rs_in
! hadoop fs -put   -p  ./../data-raw/Melbourne_housing_FULL.csv             /tmp/rs_in/mh.csv
! hadoop fs -ls        /tmp/rs_in/

put: `/tmp/rs_in/mh.csv': File exists


Found 1 items


-rw-r--r--   1 root root    5018236 2020-05-15 05:20 /tmp/rs_in/mh.csv




In [5]:
!hadoop fs -cat /tmp/rs_in/mh.csv | wc -l

34858



### Get config info about hdfs:

In [6]:
!hdfs getconf -confKey fs.defaultFS

hdfs://localhost:9000



In [7]:
val df = spark.read.format("csv").option("header", "true").load("hdfs://localhost:9000/tmp/rs_in/mh.csv")

df: org.apache.spark.sql.DataFrame = [Suburb: string, Address: string ... 19 more fields]


### Print schema:

In [8]:
df.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: string (nullable = true)
 |-- Bathroom: string (nullable = true)
 |-- Car: string (nullable = true)
 |-- Landsize: string (nullable = true)
 |-- BuildingArea: string (nullable = true)
 |-- YearBuilt: string (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: string (nullable = true)
 |-- Longtitude: string (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)



In [9]:
df.columns

res1: Array[String] = Array(Suburb, Address, Rooms, Type, Price, Method, SellerG, Date, Distance, Postcode, Bedroom2, Bathroom, Car, Landsize, BuildingArea, YearBuilt, CouncilArea, Lattitude, Longtitude, Regionname, Propertycount)


### Show column types:

In [10]:
df.dtypes

res2: Array[(String, String)] = Array((Suburb,StringType), (Address,StringType), (Rooms,StringType), (Type,StringType), (Price,StringType), (Method,StringType), (SellerG,StringType), (Date,StringType), (Distance,StringType), (Postcode,StringType), (Bedroom2,StringType), (Bathroom,StringType), (Car,StringType), (Landsize,StringType), (BuildingArea,StringType), (YearBuilt,StringType), (CouncilArea,StringType), (Lattitude,StringType), (Longtitude,StringType), (Regionname,StringType), (Propertycount,StringType))


In [11]:
df.dtypes.filter(colTup => colTup._1 == "Suburb")

res3: Array[(String, String)] = Array((Suburb,StringType))


### Display first 12 columns:

In [12]:
df.select("Suburb","Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car").show()

+----------+-------------------+-----+----+-------+------+-------+---------+--------+--------+--------+----+
|    Suburb|            Address|Rooms|Type|  Price|Method|SellerG|     Date|Distance|Postcode|Bathroom| Car|
+----------+-------------------+-----+----+-------+------+-------+---------+--------+--------+--------+----+
|Abbotsford|      68 Studley St|    2|   h|   null|    SS| Jellis|3/09/2016|     2.5|    3067|       1|   1|
|Abbotsford|       85 Turner St|    2|   h|1480000|     S| Biggin|3/12/2016|     2.5|    3067|       1|   1|
|Abbotsford|    25 Bloomburg St|    2|   h|1035000|     S| Biggin|4/02/2016|     2.5|    3067|       1|   0|
|Abbotsford| 18/659 Victoria St|    3|   u|   null|    VB| Rounds|4/02/2016|     2.5|    3067|       2|   1|
|Abbotsford|       5 Charles St|    3|   h|1465000|    SP| Biggin|4/03/2017|     2.5|    3067|       2|   0|
|Abbotsford|   40 Federation La|    3|   h| 850000|    PI| Biggin|4/03/2017|     2.5|    3067|       2|   1|
|Abbotsford|       

### Display last 8 columns:

In [13]:
df.select("Landsize","BuildingArea","YearBuilt","CouncilArea","Lattitude","Longtitude","Regionname","Propertycount").show(10)

+--------+------------+---------+------------------+---------+----------+--------------------+-------------+
|Landsize|BuildingArea|YearBuilt|       CouncilArea|Lattitude|Longtitude|          Regionname|Propertycount|
+--------+------------+---------+------------------+---------+----------+--------------------+-------------+
|     126|        null|     null|Yarra City Council| -37.8014|  144.9958|Northern Metropol...|         4019|
|     202|        null|     null|Yarra City Council| -37.7996|  144.9984|Northern Metropol...|         4019|
|     156|          79|     1900|Yarra City Council| -37.8079|  144.9934|Northern Metropol...|         4019|
|       0|        null|     null|Yarra City Council| -37.8114|  145.0116|Northern Metropol...|         4019|
|     134|         150|     1900|Yarra City Council| -37.8093|  144.9944|Northern Metropol...|         4019|
|      94|        null|     null|Yarra City Council| -37.7969|  144.9969|Northern Metropol...|         4019|
|     120|         

In [100]:
df.describe().select("summary","Price", "Rooms","Distance","Bathroom","Car").show()

+-------+-----------------+------------------+------------------+------------------+------------------+
|summary|            Price|             Rooms|          Distance|          Bathroom|               Car|
+-------+-----------------+------------------+------------------+------------------+------------------+
|  count|            27247|             34857|             34857|             26631|             26129|
|   mean|1050173.344955408|3.0310124221820582|11.184929423916007| 1.624798167549097|1.7288453442535114|
| stddev|641467.1301045999|0.9699329348975204| 6.788892455935938|0.7242120114699068|1.0107707853554244|
|    min|          1000000|                 1|              #N/A|                 0|                 0|
|    max|           999999|                 9|               9.9|                 9|                 9|
+-------+-----------------+------------------+------------------+------------------+------------------+



In [15]:
df.describe().select("summary","Price","Landsize", "BuildingArea").show()

+-------+-----------------+------------------+------------------+
|summary|            Price|          Landsize|      BuildingArea|
+-------+-----------------+------------------+------------------+
|  count|            27247|             23047|             13742|
|   mean|1050173.344955408|  593.598993361392| 160.2564003565711|
| stddev|641467.1301045999|3398.8419464599056|401.26706008485496|
|    min|          1000000|                 0|                 0|
|    max|           999999|               999|               999|
+-------+-----------------+------------------+------------------+



#### Change "#N/A" to null

In [110]:
var df_result =df
for (colName<-df.columns){ 
  df_result = df.withColumn(colName, when(trim(df(colName))==="#N/A",null).otherwise(df(colName)))
  }

df_result: org.apache.spark.sql.DataFrame = [Suburb: string, Address: string ... 19 more fields]


In [113]:
df_result.describe().select("summary","Price", "Rooms","Distance","Bathroom","Car").show()

+-------+-----------------+------------------+------------------+------------------+------------------+
|summary|            Price|             Rooms|          Distance|          Bathroom|               Car|
+-------+-----------------+------------------+------------------+------------------+------------------+
|  count|            27247|             34857|             34856|             26631|             26129|
|   mean|1050173.344955408|3.0310124221820582|11.184929423916007| 1.624798167549097|1.7288453442535114|
| stddev|641467.1301045999|0.9699329348975204| 6.788892455935938|0.7242120114699068|1.0107707853554244|
|    min|          1000000|                 1|                 0|                 0|                 0|
|    max|           999999|                 9|               9.9|                 9|                 9|
+-------+-----------------+------------------+------------------+------------------+------------------+



#### Convert numeric data represented as string into double 

In [152]:
val doubleColNames = df_result.select("Price", "Rooms","Bedroom2","Distance","Bathroom","Car", "Landsize", "BuildingArea","Propertycount",
               "YearBuilt","Lattitude", "Longtitude").columns
//val colNames =df.columns
for (colName<-doubleColNames){ 
    df_result=df_result.withColumn(colName,col(colName).cast("Double"))
}

doubleColNames: Array[String] = Array(Price, Rooms, Bedroom2, Distance, Bathroom, Car, Landsize, BuildingArea, Propertycount, YearBuilt, Lattitude, Longtitude)


In [115]:
df_result.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: double (nullable = true)
 |-- Bathroom: double (nullable = true)
 |-- Car: double (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: double (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: double (nullable = true)



### Filtering

In [116]:
df_result.filter($"Suburb"==="Glen Waverley").select("Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car").show()

+-----------------+-----+----+---------+------+---------+----------+--------+--------+--------+----+
|          Address|Rooms|Type|    Price|Method|  SellerG|      Date|Distance|Postcode|Bathroom| Car|
+-----------------+-----+----+---------+------+---------+----------+--------+--------+--------+----+
|     7 Marbray Dr|  4.0|   h|     null|    SN|Harcourts| 1/07/2017|    16.7|    3150|     1.0| 2.0|
|      24 Owens Av|  4.0|   h|1250000.0|     S|      Ray| 1/07/2017|    16.7|    3150|    null|null|
|515 Springvale Rd|  3.0|   h|     null|    PI|      Ray| 1/07/2017|    16.7|    3150|     1.0| 2.0|
| 22 Stableford Av|  3.0|   h|     null|     S|      Ray| 1/07/2017|    16.7|    3150|    null|null|
|  28 Brentwood Dr|  5.0|   h|     null|    PI|      Ray| 3/06/2017|    16.7|    3150|     5.0| 2.0|
|2/70 Leicester Av|  3.0|   t|     null|    SP|      LLC| 3/06/2017|    16.7|    3150|    null|null|
|    38 Margate Cr|  3.0|   h|     null|    SN| Woodards| 3/06/2017|    16.7|    3150|    n

In [117]:
df_result.where("Suburb = 'Abbotsford'").select("Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car", "Propertycount").show()

+-------------------+-----+----+---------+------+-------+---------+--------+--------+--------+----+-------------+
|            Address|Rooms|Type|    Price|Method|SellerG|     Date|Distance|Postcode|Bathroom| Car|Propertycount|
+-------------------+-----+----+---------+------+-------+---------+--------+--------+--------+----+-------------+
|      68 Studley St|  2.0|   h|     null|    SS| Jellis|3/09/2016|     2.5|    3067|     1.0| 1.0|       4019.0|
|       85 Turner St|  2.0|   h|1480000.0|     S| Biggin|3/12/2016|     2.5|    3067|     1.0| 1.0|       4019.0|
|    25 Bloomburg St|  2.0|   h|1035000.0|     S| Biggin|4/02/2016|     2.5|    3067|     1.0| 0.0|       4019.0|
| 18/659 Victoria St|  3.0|   u|     null|    VB| Rounds|4/02/2016|     2.5|    3067|     2.0| 1.0|       4019.0|
|       5 Charles St|  3.0|   h|1465000.0|    SP| Biggin|4/03/2017|     2.5|    3067|     2.0| 0.0|       4019.0|
|   40 Federation La|  3.0|   h| 850000.0|    PI| Biggin|4/03/2017|     2.5|    3067|   

In [118]:
df_result.where("Price >1000000").filter("Suburb = 'Abbotsford'").select("Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car", "Propertycount").count()

res97: Long = 53


In [119]:
df_result.where("Price >1000000").filter("Suburb = 'Abbotsford'").select("Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car", "Propertycount").show()

+-------------------+-----+----+---------+------+--------+----------+--------+--------+--------+----+-------------+
|            Address|Rooms|Type|    Price|Method| SellerG|      Date|Distance|Postcode|Bathroom| Car|Propertycount|
+-------------------+-----+----+---------+------+--------+----------+--------+--------+--------+----+-------------+
|       85 Turner St|  2.0|   h|1480000.0|     S|  Biggin| 3/12/2016|     2.5|    3067|     1.0| 1.0|       4019.0|
|    25 Bloomburg St|  2.0|   h|1035000.0|     S|  Biggin| 4/02/2016|     2.5|    3067|     1.0| 0.0|       4019.0|
|       5 Charles St|  3.0|   h|1465000.0|    SP|  Biggin| 4/03/2017|     2.5|    3067|     2.0| 0.0|       4019.0|
|        55a Park St|  4.0|   h|1600000.0|    VB|  Nelson| 4/06/2016|     2.5|    3067|     1.0| 2.0|       4019.0|
|       124 Yarra St|  3.0|   h|1876000.0|     S|  Nelson| 7/05/2016|     2.5|    3067|     2.0| 0.0|       4019.0|
|      98 Charles St|  2.0|   h|1636000.0|     S|  Nelson| 8/10/2016|   

In [120]:
df_result.where("Price >1000000").filter("Suburb = 'Abbotsford'").collect()

res99: Array[org.apache.spark.sql.Row] = Array([Abbotsford,85 Turner St,2.0,h,1480000.0,S,Biggin,3/12/2016,2.5,3067,2.0,1.0,1.0,202.0,null,null,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0], [Abbotsford,25 Bloomburg St,2.0,h,1035000.0,S,Biggin,4/02/2016,2.5,3067,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0], [Abbotsford,5 Charles St,3.0,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0], [Abbotsford,55a Park St,4.0,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council,-37.8072,144.9941,Northern Metropolitan,4019.0], [Abbotsford,124 Yarra St,3.0,h,1876000.0,S,Nelson,7/05/2016,2.5,3067,4.0,2.0,0...

In [121]:
df_result.select("Address","Type","Method","SellerG","Postcode","CouncilArea","Regionname").count()

res100: Long = 34857


### Categorical Attributes

#### 1. Address

In [122]:
// rename into street keep only street name
import org.apache.spark.sql.functions.countDistinct
df_result.select("Address").distinct.show()

+-------------------+
|            Address|
+-------------------+
|      557 Orrong Rd|
|      19 Poulter St|
|    43 Riverside Av|
|       11 South Tce|
|  41 Marlborough St|
|          4 Park Cr|
|        3/3 Dega Av|
|        93 Tudor St|
|         10 Kent Rd|
|       18 Thomas St|
|   1/1 Glen Iris Rd|
|      7 Allambee Av|
|    83 Truganini Rd|
|       130 Keele St|
|       8 Winters Wy|
|     36a Mitford St|
|   7/223 Station St|
|1/146 Ascot Vale Rd|
|    5/60 Farnham St|
|      22 Renwick St|
+-------------------+
only showing top 20 rows



import org.apache.spark.sql.functions.countDistinct


In [123]:
df_result.filter("Address IS NULL").count()

res102: Long = 0


In [124]:
df_result.select("Address").distinct.count()

res103: Long = 34009


In [125]:
df_result.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Rooms: double (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: double (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: double (nullable = true)
 |-- Bathroom: double (nullable = true)
 |-- Car: double (nullable = true)
 |-- Landsize: double (nullable = true)
 |-- BuildingArea: double (nullable = true)
 |-- YearBuilt: double (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: double (nullable = true)
 |-- Longtitude: double (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: double (nullable = true)



#### Split Address on Street and Suffix

In [126]:
//split address on Street and Suffix
df_result = df_result.withColumn("Street",split(col("Address")," ").getItem(1)).
                   withColumn("Suffix",split(col("Address")," ").getItem(2)).drop("Address")
df_result.show()

+----------+-----+----+-------+------+-------+---------+--------+--------+--------+--------+----+--------+------------+---------+------------------+---------+----------+--------------------+-------------+----------+------+
|    Suburb|Rooms|Type|  Price|Method|SellerG|     Date|Distance|Postcode|Bedroom2|Bathroom| Car|Landsize|BuildingArea|YearBuilt|       CouncilArea|Lattitude|Longtitude|          Regionname|Propertycount|    Street|Suffix|
+----------+-----+----+-------+------+-------+---------+--------+--------+--------+--------+----+--------+------------+---------+------------------+---------+----------+--------------------+-------------+----------+------+
|Abbotsford|    2|   h|   null|    SS| Jellis|3/09/2016|     2.5|    3067|       2|       1|   1|     126|        null|     null|Yarra City Council| -37.8014|  144.9958|Northern Metropol...|         4019|   Studley|    St|
|Abbotsford|    2|   h|1480000|     S| Biggin|3/12/2016|     2.5|    3067|       2|       1|   1|     202|  

df_result: org.apache.spark.sql.DataFrame = [Suburb: string, Rooms: string ... 20 more fields]


In [127]:
df_result.printSchema()

root
 |-- Suburb: string (nullable = true)
 |-- Rooms: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Method: string (nullable = true)
 |-- SellerG: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Distance: string (nullable = true)
 |-- Postcode: string (nullable = true)
 |-- Bedroom2: string (nullable = true)
 |-- Bathroom: string (nullable = true)
 |-- Car: string (nullable = true)
 |-- Landsize: string (nullable = true)
 |-- BuildingArea: string (nullable = true)
 |-- YearBuilt: string (nullable = true)
 |-- CouncilArea: string (nullable = true)
 |-- Lattitude: string (nullable = true)
 |-- Longtitude: string (nullable = true)
 |-- Regionname: string (nullable = true)
 |-- Propertycount: string (nullable = true)
 |-- Street: string (nullable = true)
 |-- Suffix: string (nullable = true)



In [128]:
df_result.filter("Street IS NULL").count()

res107: Long = 0


#### 2. Postcode

In [129]:
var postcodes = df_result.select("Postcode").distinct()

postcodes: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Postcode: string]


In [130]:
df_result.filter("Postcode IS NULL").count()

res108: Long = 0


In [131]:
postcodes.count()

res109: Long = 212


#### 3. Suburb

In [132]:
val suburbs = df_result.select("Suburb").distinct

suburbs: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Suburb: string]


In [133]:
suburbs.count()

res110: Long = 351


In [134]:

df.filter("Suburb IS NULL").count()

res111: Long = 0


In [135]:
// make first letter of suburb upper case
import org.apache.spark.sql.functions._
df_result = df_result.withColumn("Suburb", initcap(col("Suburb")))
df_result.select("Suburb").distinct.show()


+----------------+
|          Suburb|
+----------------+
|  Brunswick West|
| South Melbourne|
|    Ivanhoe East|
|    Princes Hill|
|      Cranbourne|
|         Ashwood|
|       Brunswick|
|South Kingsville|
|        Brighton|
|        Oak Park|
|         Doveton|
|       Albanvale|
|      Brookfield|
|        Lynbrook|
|     Ferny Creek|
|     Pascoe Vale|
| Blackburn North|
|     Sandringham|
|   Botanic Ridge|
|          Carrum|
+----------------+
only showing top 20 rows



import org.apache.spark.sql.functions._
df_result: org.apache.spark.sql.DataFrame = [Suburb: string, Rooms: string ... 20 more fields]


#### 4. Type 
#### Distinct values 

In [136]:
import org.apache.spark.sql.functions.countDistinct
df_result.select("Type").distinct.show()

+----+
|Type|
+----+
|   h|
|   u|
|   t|
+----+



import org.apache.spark.sql.functions.countDistinct


In [137]:
df_result = df_result.withColumn("Type", initcap(col("Type")))
df_result.select("Type").distinct.show()

+----+
|Type|
+----+
|   T|
|   U|
|   H|
+----+



df_result: org.apache.spark.sql.DataFrame = [Suburb: string, Rooms: string ... 20 more fields]


#### Null values  

In [138]:
df_result.filter("Type IS NULL").count()

res115: Long = 0


#### 5. Method

In [139]:
df_result.select("Method").distinct.show()

+------+
|Method|
+------+
|    PI|
|    SA|
|    SP|
|    VB|
|    PN|
|     W|
|     S|
|    SN|
|    SS|
+------+



#### Null values  

In [140]:
df_result.filter("Method IS NULL").count()

res117: Long = 0


#### 6. SellerG

In [141]:
val sellers = df_result.select("SellerG").distinct

sellers: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [SellerG: string]


In [142]:
sellers.show()

+--------------------+
|             SellerG|
+--------------------+
|              LITTLE|
|                 S&L|
|              Ristic|
|            Langwell|
|             Ruralco|
|             Xynergy|
|               Ryder|
|               iSell|
|               Scott|
|              Wilson|
|          McNaughton|
|           Blackbird|
|hockingstuart/Biggin|
|               Lucas|
|                 One|
|         Buxton/Find|
|                Real|
|            Sterling|
|             Compton|
|           Tiernan's|
+--------------------+
only showing top 20 rows



In [143]:
sellers.count()

res119: Long = 388


In [144]:

df_result.filter("SellerG IS NULL").count()

res120: Long = 0


#### 7. Date

In [145]:
val dates = df_result.select("Date").distinct()

dates: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [Date: string]


In [146]:
dates.count()

res121: Long = 78


In [147]:
df_result.filter("Date IS NULL").count()

res122: Long = 0


In [148]:
dates.show()

+----------+
|      Date|
+----------+
|16/04/2016|
|29/04/2017|
|10/12/2016|
|19/08/2017|
| 7/05/2016|
| 8/07/2017|
| 4/03/2017|
|29/07/2017|
|27/05/2017|
|28/10/2017|
| 9/09/2017|
|26/07/2016|
|12/11/2016|
|25/02/2017|
| 6/05/2017|
|18/11/2017|
| 3/09/2016|
| 3/12/2016|
|25/11/2017|
| 3/06/2017|
+----------+
only showing top 20 rows



#### 8. CouncilArea

In [149]:
val sareas = df_result.select("CouncilArea").distinct()

sareas: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [CouncilArea: string]


In [150]:
sareas.count()

res124: Long = 34


In [151]:
sareas.show(34)

+--------------------+
|         CouncilArea|
+--------------------+
|Bayside City Council|
|Greater Dandenong...|
|   Hume City Council|
|Glen Eira City Co...|
|Kingston City Cou...|
|Nillumbik Shire C...|
| Monash City Council|
|Macedon Ranges Sh...|
|   Knox City Council|
|Wyndham City Council|
|Mitchell Shire Co...|
|Maribyrnong City ...|
|Whittlesea City C...|
|Whitehorse City C...|
|Frankston City Co...|
|Manningham City C...|
|Darebin City Council|
|Moreland City Cou...|
|Cardinia Shire Co...|
|Moonee Valley Cit...|
|Boroondara City C...|
|Yarra Ranges Shir...|
|  Casey City Council|
|Port Phillip City...|
|Brimbank City Cou...|
|                #N/A|
|Hobsons Bay City ...|
|Banyule City Council|
|Stonnington City ...|
| Melton City Council|
|  Yarra City Council|
|Melbourne City Co...|
|Moorabool Shire C...|
|Maroondah City Co...|
+--------------------+



In [98]:

df_result.filter("CouncilArea IS NULL").count()

res80: Long = 0


#### 9. Regionname

In [56]:
df_result.select("Regionname").distinct.show()

+--------------------+
|          Regionname|
+--------------------+
|South-Eastern Met...|
|Western Metropolitan|
|Eastern Metropolitan|
|    Eastern Victoria|
|                #N/A|
|   Northern Victoria|
|Northern Metropol...|
|Southern Metropol...|
|    Western Victoria|
+--------------------+



In [57]:

df_result.filter("Regionname IS NULL").count()

res44: Long = 0


#### 10. YearBuilt

In [None]:
df_result.filter("YearBuilt IS NULL").count()

In [None]:
df_result.select("YearBuilt").distinct.show()

#### 1. Price

In [None]:
df_result.filter("Price IS NULL").count()

In [None]:
//get rid of null in price
df_result=df_result.filter(!df_result("Price").isNull)

In [None]:
df_result.select("Price").distinct().show()

In [None]:
// histogram
val (startValues,counts) = df_result.select($"Price")
    .rdd.map(r => r.getDouble(0))
    .histogram()

In [None]:
val zippedValues = startValues.zip(counts)
case class HistRow(startPoint:Double,count:Long)
val rowRDD = zippedValues.map( value => HistRow(value._1,value._2))
val histDf = org.apache.spark.sql.SparkSession.createDataFrame(rowRDD)
histDf.createOrReplaceTempView("histogramTable")

In [None]:
val _tmpHist = df_result
    .select($"Price" cast "double")
    .rdd.map(r => r.getDouble(0))
    .histogram(thresholds)

// Result DataFrame contains `from`, `to` range and the `value`.
val histogram = sc.parallelize((thresholds, thresholds.tail, _tmpHist).zipped.toList).toDF("from", "to", "value")

In [None]:
df_result.describe().select("summary","Price","Suburb").show()

In [None]:
df.where("Price >10000000").select("Address","Rooms","Type","Price","Method","SellerG","Date","Distance","Postcode","Bathroom","Car", "Propertycount").count()

#### 1. Rooms

In [None]:
df_result.filter("Rooms IS NULL").count()

In [None]:
df_result.select("Rooms").distinct().show()

<span style="font-size:16pt;color: red">TO DO: HISTOGRAMS IS IT POSSIBLE? </span> .



In [None]:
df_result.groupBy("Rooms").count().rdd.histogram()

In [None]:
histogram(df_result, df_result("Rooms"), nbins = 10)

In [None]:
df_result.select("Rooms").createOrReplaceTempView("histogramTable").show()

#### 2. Distance

In [None]:
df_result.filter("Distance IS NULL").count()

In [None]:
df_result.select("Distance").distinct.count()

In [None]:
df_result.select("Distance")

In [None]:
df_result.select("Distance").distinct.show(213)

#### 3. Bathroom

In [None]:
df_result.filter("Bathroom IS NULL").count()

In [None]:
df_result= df_result.filter(!df_result("Bathroom").isNull)
df_result.select("Bathroom").distinct.show()

#### 4. Car

In [None]:
df.filter("Car IS NULL").count()

In [None]:
df_result.select("Car").distinct.show()

In [None]:
df_result= df_result.filter(!df_result("Car").isNull)
df_result.select("Car").distinct.show()

#### 5. Landsize

In [None]:
df_result.filter("Landsize IS NULL").count()

In [None]:
df_result.select("Landsize").distinct.count()

In [None]:
df_result= df_result.filter(!df_result("Landsize").isNull)
df_result.select("Landsize").distinct.count()

#### 6. BuildingArea

In [None]:
df_result.filter("BuildingArea IS NULL").count()

In [None]:
df_result.select("BuildingArea").distinct.count()

In [None]:
df_result= df_result.filter(!df_result("BuildingArea").isNull)
df_result.select("BuildingArea").distinct.show(640)

In [None]:
df_result.count

#### 7. Propertycount

In [None]:
df_result.filter("Propertycount IS NULL").count()

In [None]:
df_result.select("BuildingArea").distinct.count()

In [None]:
df_result.select("Propertycount").distinct.show(639)

In [None]:
df_result.describe().select("summary","Price", "Rooms","Distance","Bathroom","Car").show()

In [None]:
df_result.describe().select("summary","Landsize","Propertycount").show()

In [None]:
df_result.count()

In [None]:
df_result.filter(df_result.isNull)

#### Filtering null values

In [None]:
val df_not_null = df_result.na.drop
df_not_null.count()

In [None]:
df_not_null.printSchema()


#### Imputing null values

### Group By and  Aggregation

In [None]:
df_result.groupBy("Suburb").agg(max("Price")).show()

In [None]:
df_result.groupBy("Suburb").agg(min("Price")).show()

In [None]:
df_result.groupBy("Distance").agg(round(mean("Price"),0)).show()

### Correlation
 <span style="font-size:16pt;color: red">TO DO: FIND OUT CORRELATIONS between theses attributes </span> .

Rooms:
|-- Price:
|-- Distance:
|-- Rooms:
|-- Bathroom:
|-- Car:
|-- Landsize:
|-- BuildingArea:
|-- YearBuilt:

In [None]:
import org.apache.spark.sql.functions.corr
df_result.select(corr("Distance","Price")).show()

In [None]:
df_result.select(corr("Rooms","Price")).show()

In [None]:
df_result.select(corr("Bathroom","Price")).show()

In [None]:
df_result.select(corr("Car","Price")).show()

In [None]:
df_result.select(corr("BuildingArea","Price")).show()

In [None]:
df_result.select(corr("Landsize","Price")).show()

In [None]:
df_result.select(corr("Date","Price")).show()

In [None]:
df_result.select(corr("YearBuilt","Price")).show()

In [None]:
df_result.select(corr("Propertycount","Price")).show()

### Select relevant features:

In [None]:

df_result = df_result.select("Price","Method","Type","Distance","Rooms","Bathroom","Car","Landsize","Propertycount", "Suburb","Street","Date")

#### Write down clean data:

In [None]:
! hadoop fs -mkdir -p /tmp/output

In [None]:
! hadoop fs -ls -R /tmp

In [None]:
df_result.coalesce(1).write.format("csv").option("header","true").mode("overwrite").option("sep",",").save("hdfs://localhost:9000/tmp/output")


In [None]:
df_result.coalesce(1).write.format("csv").option("header","true").mode("overwrite").csv("hdfs://localhost:9000/tmp/output") 

In [None]:
! hadoop fs -ls /tmp/output

In [None]:
!rm ./output.csv 

Save the clean data to disk

In [None]:
! hadoop fs -copyToLocal /tmp/output/\*.csv./../data-clean/mh.csv

## References

Apache Spark (n.d.). _Spark Scala API (Scaladoc). Overview._ https://spark.apache.org/docs/latest/api/java/overview-summary.html

Apache Spark (n.d.). _Basic Statistic._ https://spark.apache.org/docs/latest/ml-statistics.html

Bahadoor N. (2020). _Spark Tutorials_ https://allaboutscala.com/big-data/spark/#dataframe-statistics-correlation

Databricks. (2020). _Introduction to DataFrames - Scala._  https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-scala.html 

Grimaldi E. (2018). _Pandas vs. Spark: how to handle dataframes (Part II.)_  https://towardsdatascience.com/python-pandas-vs-scala-how-to-handle-dataframes-part-ii-d3e5efe8287d 

