### Preparing HDFS
using magic

create input folder on HDFS if not exists
copy from data from local

In [14]:
!pwd
! hadoop fs -mkdir -p  /tmp/rs_input
! hadoop fs -put   -p  ./../data-raw/Melbourne_housing_FULL.csv             /tmp/rs_input/raw.csv
! hadoop fs -ls        /tmp/rs_input/

/home/big-data-realestate-master/scripts

put: `/tmp/rs_input/raw.csv': File exists


Found 1 items


-rwxrwxrwx   1 1000 staff    5018236 2020-05-22 06:09 /tmp/rs_input/raw.csv




In [16]:
//load raw into df
val df_raw = spark
    .read
    .format("csv")
    .option("header", "true")
    .load("hdfs://localhost:9000/tmp/rs_input/raw.csv")

df_raw: org.apache.spark.sql.DataFrame = [Suburb: string, Address: string ... 19 more fields]


In [17]:
//suggest better name
//only select columns we need now
var df_working= df_raw.select("Price",
                          "Method",
                          "Type",
                          "Distance",
                          "Rooms",
                          "Bathroom",
                          "Car",
                          "Landsize",
                          "Propertycount",
                          "Suburb",
                          "Address",
                          "Date")

df_working: org.apache.spark.sql.DataFrame = [Price: string, Method: string ... 10 more fields]


#### Change Remove "#n/a" records

In [18]:
//when profiling there are a number of columns with a "#N/A"
df_working = df_working.filter(($"Distance" =!= "#N/A")||($"Propertycount" =!= "#N/A"))

df_working: org.apache.spark.sql.DataFrame = [Price: string, Method: string ... 10 more fields]


In [22]:
//refactored to remove the for column loop
df_working = df_working.withColumn("Price",col("Price").cast("Double"))
    .withColumn("Rooms",col("Rooms").cast("Int"))
    .withColumn("Distance",col("Distance").cast("Double"))
    .withColumn("Bathroom",col("Bathroom").cast("Double"))
    .withColumn("Car",col("Car").cast("Int"))
    .withColumn("Landsize",col("Landsize").cast("Double"))
    .withColumn("Propertycount",col("Propertycount").cast("Int"))
    

df_working: org.apache.spark.sql.DataFrame = [Price: double, Method: string ... 10 more fields]


#### Split Address on Street and Suffix

In [23]:
//split address on Street
df_working = df_working.withColumn("StreetName",split(col("Address")," ").getItem(1)).
    withColumn("StreetSuffix",split(col("Address")," ").getItem(2)).drop("Address")

df_working: org.apache.spark.sql.DataFrame = [Price: double, Method: string ... 11 more fields]


In [24]:
import org.apache.spark.sql.functions._

// make first letter of suburb upper case
df_working= df_working.withColumn("Suburb", initcap(col("Suburb")))

//make type code upper
df_working = df_working.withColumn("Type", initcap(col("Type")))

import org.apache.spark.sql.functions._
df_working: org.apache.spark.sql.DataFrame = [Price: double, Method: string ... 11 more fields]
df_working: org.apache.spark.sql.DataFrame = [Price: double, Method: string ... 11 more fields]


#### Filtering null values

In [25]:
val df_not_null = df_working.na.drop()
df_not_null.count()

df_not_null: org.apache.spark.sql.DataFrame = [Price: double, Method: string ... 11 more fields]
res1: Long = 17701


#### Write down clean data:

In [26]:
! hadoop fs -mkdir -p /tmp/output

In [27]:
val df_output = df_not_null
   .coalesce(1)
   .write
   .format("csv")
   .option("header","true")
   .mode("overwrite").option("sep",",")
   .save("hdfs://localhost:9000/tmp/output")

df_output: Unit = ()


Save the clean data to disk

In [28]:
! hadoop fs -mkdir -p /tmp/output
! hadoop fs -copyToLocal /tmp/output/\*.csv ./../data-clean/cleanMelbourneData.csv

copyToLocal: `./../data-clean/cleanMelbourneData.csv': File exists




## References

Apache Spark (n.d.). _Spark Scala API (Scaladoc). Overview._ https://spark.apache.org/docs/latest/api/java/overview-summary.html

Apache Spark (n.d.). _Basic Statistic._ https://spark.apache.org/docs/latest/ml-statistics.html

Bahadoor N. (2020). _Spark Tutorials_ https://allaboutscala.com/big-data/spark/#dataframe-statistics-correlation

Databricks. (2020). _Introduction to DataFrames - Scala._  https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-scala.html 

Grimaldi E. (2018). _Pandas vs. Spark: how to handle dataframes (Part II.)_  https://towardsdatascience.com/python-pandas-vs-scala-how-to-handle-dataframes-part-ii-d3e5efe8287d 

