<img src=http://fd.perso.eisti.fr/Logos/TORUS2.png>

Another type of data abstraction which Spark officially provides now to use is DataFrame. 

"a DataFrame is a distributed collection of data organized into named columns. It is conceptually equivalent to a table in a relational database or a data frame in R/Python, but with richer optimizations under the hood. DataFrames can be constructed from a wide array of sources such as: structured data files, tables in Hive, external databases, or existing RDDs." (source : https://databricks.com/blog/2015/02/17/introducing-dataframes-in-spark-for-large-scale-data-science.html)

To understand how DataFrame works, let's take an example from Databricks (https://docs.databricks.com/spark/latest/dataframes-datasets/introduction-to-dataframes-scala.html)

### Firstly, we create a DataFrame 

In [ ]:
// Create the case classes for our domain
case class Department(id: String, name: String)
case class Employee(firstName: String, lastName: String, email: String, salary: Int)
case class DepartmentWithEmployees(department: Department, employees: Seq[Employee])

defined class Department
defined class Employee
defined class DepartmentWithEmployees


In [ ]:
// Create the Departments
val department1 = new Department("123456", "Computer Science")
val department2 = new Department("789012", "Mechanical Engineering")
val department3 = new Department("345678", "Theater and Drama")
val department4 = new Department("901234", "Indoor Recreation")

department1: Department = Department(123456,Computer Science)
department2: Department = Department(789012,Mechanical Engineering)
department3: Department = Department(345678,Theater and Drama)
department4: Department = Department(901234,Indoor Recreation)


In [ ]:
// Create the Employees
val employee1 = new Employee("michael", "armbrust", "michael@berkeley.edu", 100000)
val employee2 = new Employee("xiangrui", "meng", "xiangrui@stanford.edu", 120000)
val employee3 = new Employee("matei", null, "matei@waterloo.edu", 140000)
val employee4 = new Employee(null, "wendell", "wendell@princeton.edu", 160000)

employee1: Employee = Employee(michael,armbrust,michael@berkeley.edu,100000)
employee2: Employee = Employee(xiangrui,meng,xiangrui@stanford.edu,120000)
employee3: Employee = Employee(matei,null,matei@waterloo.edu,140000)
employee4: Employee = Employee(null,wendell,wendell@princeton.edu,160000)


In [ ]:
// Create the DepartmentWithEmployees instances from Departments and Employees
val departmentWithEmployees1 = new DepartmentWithEmployees(department1, Seq(employee1, employee2))
val departmentWithEmployees2 = new DepartmentWithEmployees(department2, Seq(employee3, employee4))
val departmentWithEmployees3 = new DepartmentWithEmployees(department3, Seq(employee1, employee4))
val departmentWithEmployees4 = new DepartmentWithEmployees(department4, Seq(employee2, employee3))

departmentWithEmployees1: DepartmentWithEmployees = DepartmentWithEmployees(Department(123456,Computer Science),List(Employee(michael,armbrust,michael@berkeley.edu,100000), Employee(xiangrui,meng,xiangrui@stanford.edu,120000)))
departmentWithEmployees2: DepartmentWithEmployees = DepartmentWithEmployees(Department(789012,Mechanical Engineering),List(Employee(matei,null,matei@waterloo.edu,140000), Employee(null,wendell,wendell@princeton.edu,160000)))
departmentWithEmployees3: DepartmentWithEmployees = DepartmentWithEmployees(Department(345678,Theater and Drama),List(Employee(michael,armbrust,michael@berkeley.edu,100000), Employee(null,wendell,wendell@princeton.edu,160000)))
departmentWithEmployees4: DepartmentWithEmployees = DepartmentWithEmployees(Department(901234,Indoor Recreation),Lis...

In [ ]:
val departmentsWithEmployeesSeq1 = Seq(departmentWithEmployees1, departmentWithEmployees2)
val dataframe1 = departmentsWithEmployeesSeq1.toDF()

departmentsWithEmployeesSeq1: Seq[DepartmentWithEmployees] = List(DepartmentWithEmployees(Department(123456,Computer Science),List(Employee(michael,armbrust,michael@berkeley.edu,100000), Employee(xiangrui,meng,xiangrui@stanford.edu,120000))), DepartmentWithEmployees(Department(789012,Mechanical Engineering),List(Employee(matei,null,matei@waterloo.edu,140000), Employee(null,wendell,wendell@princeton.edu,160000))))
dataframe1: org.apache.spark.sql.DataFrame = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>>]


### To display dataframe1 

In [ ]:
display(dataframe1)

res6: notebook.front.Widget = <Tabs widget>


### To print the schema of dataframe1 

In [ ]:
dataframe1.printSchema()

root
 |-- department: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- name: string (nullable = true)
 |-- employees: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- firstName: string (nullable = true)
 |    |    |-- lastName: string (nullable = true)
 |    |    |-- email: string (nullable = true)
 |    |    |-- salary: integer (nullable = false)



### We create another dataframe... 

In [ ]:
val departmentsWithEmployeesSeq2 = Seq(departmentWithEmployees3, departmentWithEmployees4)
val dataframe2 = departmentsWithEmployeesSeq2.toDF()
display(dataframe2)

departmentsWithEmployeesSeq2: Seq[DepartmentWithEmployees] = List(DepartmentWithEmployees(Department(345678,Theater and Drama),List(Employee(michael,armbrust,michael@berkeley.edu,100000), Employee(null,wendell,wendell@princeton.edu,160000))), DepartmentWithEmployees(Department(901234,Indoor Recreation),List(Employee(xiangrui,meng,xiangrui@stanford.edu,120000), Employee(matei,null,matei@waterloo.edu,140000))))
dataframe2: org.apache.spark.sql.DataFrame = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>>]
res10: notebook.front.Widget = <Tabs widget>


### We can union 2 dataframes 

In [ ]:
val unionDF = dataframe1.union(dataframe2)
display(unionDF)

unionDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>>]
res12: notebook.front.Widget = <Tabs widget>


### To flatten the schema... 

In [ ]:
val explodeDF = unionDF.explode($"employees") {
  case Row(employee: Seq[Row]) => employee.map{ employee =>
    val firstName = employee(0).asInstanceOf[String]
    val lastName = employee(1).asInstanceOf[String]
    val email = employee(2).asInstanceOf[String]
    val salary = employee(3).asInstanceOf[Int]
    Employee(firstName, lastName, email, salary)
  }
}
display(explodeDF)

         case Row(employee: Seq[Row]) => employee.map{ employee =>
                            ^
       val explodeDF = unionDF.explode($"employees") {
                               ^
explodeDF: org.apache.spark.sql.DataFrame = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>> ... 4 more fields]
res14: notebook.front.Widget = <Tabs widget>


### We can also select the first columns 

In [ ]:
explodeDF.select("email")

res16: org.apache.spark.sql.DataFrame = [email: string]


### or filter and sort... 

In [ ]:
val filterDF = explodeDF
  .filter($"firstName" === "xiangrui" || $"firstName" === "michael")
  .sort($"lastName".asc)
display(filterDF)

filterDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>> ... 4 more fields]
res18: notebook.front.Widget = <Tabs widget>


### We can apply "where" directly inside Dataframe

In [ ]:
val whereDF = explodeDF.where(($"firstName" === "xiangrui") || ($"firstName" === "michael")).sort($"lastName".asc)
display(whereDF)

whereDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [department: struct<id: string, name: string>, employees: array<struct<firstName:string,lastName:string,email:string,salary:int>> ... 4 more fields]
res20: notebook.front.Widget = <Tabs widget>


### From DataFrame to Dataset

For example, if we want to get 2 columns firstName and lastName and convert it to an RDD, we should know the type of theses 2 columns.

In [ ]:
val dataset = explodeDF.map(l => (l(2).asInstanceOf[String], l(3).asInstanceOf[String]))

dataset: org.apache.spark.sql.Dataset[(String, String)] = [_1: string, _2: string]


### We can also save a DataFrame with format Parquet

In [ ]:
val savePath = "hdfs://hupi-factory-02-01-01-01/user/hupi/dataset_torusVN/formation2_parquetEx"

unionDF.write.parquet(savePath)

org.apache.spark.sql.AnalysisException: path hdfs://hupi-factory-02-01-01-01/user/hupi/dataset_torusVN/formation2_parquetEx already exists.;
  at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:80)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:58)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:56)
  at org.apache.spark.sql.execution.command.ExecutedCommandExec.doExecute(commands.scala:74)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
  at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
  at org.apache.spark.sql.execution.SparkPlan.executeQuery(S

In [ ]:
// Then we can read this parquetFile
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val parquetDF = sqlContext.read.parquet(savePath)

<console>:69: error: not found: value savePath
       val parquetDF = sqlContext.read.parquet(savePath)
                                               ^


In [ ]:
parquetDF.show()

<console>:69: error: not found: value parquetDF
       parquetDF.show()
       ^
