In [None]:
%%configure -f 
{ "numExecutors":4, "executorMemory":"1G", "executorCores":1, "driverMemory":"1G", "driverCores":1 }

In [None]:
// Record type for one example student; instances of it are generated below
// and turned into the `students` DataFrame.
case class Student(
    name: String,
    subject: String,
    major: String,
    school: String,
    year: Int
)

### Initialize the data for creating an example Student DataFrame

In [None]:
import scala.collection.mutable.ListBuffer

// Upper-case letters used as suffixes for the generated student names.
val alphabets: Array[Char] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray

// Candidate values for each Student field. Every buffer is filled in one
// expression instead of an append loop; contents and order are unchanged.

// "NameA" .. "NameZ"
val nameListBuffer: ListBuffer[String] =
    ListBuffer(alphabets.map(letter => "Name" + letter): _*)

// "Subject0" .. "Subject14"
val subjectListBuffer: ListBuffer[String] =
    ListBuffer.tabulate(15)(subjectIndex => "Subject" + subjectIndex)

// "Major0" .. "Major9"
val majorListBuffer: ListBuffer[String] =
    ListBuffer.tabulate(10)(majorIndex => "Major" + majorIndex)

// "School0" .. "School4"
val schoolListBuffer: ListBuffer[String] =
    ListBuffer.tabulate(5)(schoolIndex => "School" + schoolIndex)

// 1, 2, 3, 4 (the upper bound 5 is exclusive)
val yearListBuffer: ListBuffer[Int] = ListBuffer.range(1, 5)

### Generate the data for creating the example Student DataFrame

In [None]:
// Shared `scala.util.Random` instance. No seed is set here, so every run
// produces a different set of students.
val randomGenerator = scala.util.Random

// Accumulates the generated records; read by the DataFrame-creation cell.
val studentListBuffer: ListBuffer[Student] = new ListBuffer

// Picks one element at random from `values`. The index bound is always the
// size of the buffer actually being indexed, so a candidate list can change
// length without going out of range or leaving entries unreachable.
// Throws IllegalArgumentException (from nextInt) when `values` is empty.
def pickRandom[T](values: ListBuffer[T]): T =
    values(randomGenerator.nextInt(values.size))

// Generate 100 students, drawing each field independently.
for (_ <- 1 to 100) {

    studentListBuffer += Student(
        pickRandom(nameListBuffer),
        pickRandom(subjectListBuffer),
        pickRandom(majorListBuffer),
        pickRandom(schoolListBuffer),
        pickRandom(yearListBuffer)
    )
}

### Create the Student DataFrame

In [None]:
// Freeze the generated records into an immutable list before handing them to Spark.
val studentRecords = studentListBuffer.toList

// DataFrame built from the Student records.
val students = sqlContext.createDataFrame(studentRecords)

// Fetch the first five rows as a quick look at the data.
students.take(5)

### Partition DataFrame vertically on one or more column boundaries for caching

In [None]:
// Requested number of vertical (column-wise) partitions. Must be positive
// because it is used as a divisor below.
val verticalPartitionCount = 2
require(verticalPartitionCount > 0, "verticalPartitionCount must be positive")

// Width, in columns, of one vertical partition. Integer division truncates,
// so this is floor(columns / verticalPartitionCount); leftover columns form
// extra, narrower partition(s) at the end, which means the number of
// partitions produced can exceed verticalPartitionCount. Clamped to at least
// 1 so that asking for more partitions than there are columns still works.
val columnCountPerPartition: Int = math.max(1, students.columns.size / verticalPartitionCount)

// `grouped` walks the column names in chunks of that width. It never yields
// an empty chunk (so `.head` below is safe) and always covers every column,
// including a trailing remainder.
for (selectedColumns <- students.columns.grouped(columnCountPerPartition)) {

    val verticalPartition = students.select(selectedColumns.head, selectedColumns.tail: _*)

    // Cache this column group while its columns are analysed one after another.
    verticalPartition.persist()

    try {

        //Alternatively, one can do any other transformation and/or store the output in persistent store

        // For each column, print its five most frequent values with their counts.
        for (columnName <- selectedColumns) {

            verticalPartition.select(columnName).rdd.map((_, 1)).reduceByKey(_ + _)
            .sortBy(_._2, ascending = false)
            .take(5)
            .foreach(println)
        }

    } finally {

        // Release the cached data even if one of the jobs above fails.
        verticalPartition.unpersist()
    }
}

### Partition DataFrame vertically on one or more column boundaries and horizontally on one or more row boundaries for caching

In [None]:
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.rdd.RDD

// Requested number of horizontal (row-wise) partitions. Must be positive
// because it is used as a divisor below.
val horizontalPartitionCount = 3
require(horizontalPartitionCount > 0, "horizontalPartitionCount must be positive")

val rowCount = students.count

// Height, in rows, of one horizontal chunk. Integer division truncates, so
// this is floor(rowCount / horizontalPartitionCount); leftover rows form
// extra, shorter chunk(s) at the end. Clamped to at least 1 so the row range
// below always advances.
val rowCountPerPartition: Int = math.max(1L, rowCount / horizontalPartitionCount).toInt

// Walk the columns in groups of `columnCountPerPartition` (defined in the
// vertical-partitioning cell). `grouped` never yields an empty group and never
// drops trailing columns; the max(1, ...) guards against a zero width.
for (selectedColumns <- students.columns.grouped(math.max(1, columnCountPerPartition))) {

    val verticalPartition = students.select(selectedColumns.head, selectedColumns.tail: _*)

    // Prefix every row with its zipWithIndex position, exposed as a non-nullable
    // Long column named "index", so that row ranges can be selected with a filter.
    val indexedRows = verticalPartition.rdd.zipWithIndex().map {
        case (row, rowIndex) => Row.fromSeq(Seq(rowIndex) ++ row.toSeq)
    }
    val indexedSchema = StructType(
        Array(StructField("index", LongType, false)) ++ verticalPartition.schema.fields)
    val verticalPartitionWithIndex = sqlContext.createDataFrame(indexedRows, indexedSchema)

    // Running (value -> occurrences) counts per column, merged across row chunks.
    val columnNameOutputMap = scala.collection.mutable.Map[String, RDD[(Row, Int)]]()

    // Step through the rows one chunk at a time. Stepping by the chunk height up
    // to rowCount covers every row exactly once and produces no empty chunk.
    for (startRowIndex <- 0L until rowCount by rowCountPerPartition.toLong) {

        val endRowIndex: Long = math.min(startRowIndex + rowCountPerPartition, rowCount)

        val blockPartition = verticalPartitionWithIndex.filter(s"index >= $startRowIndex and index < $endRowIndex")

        // NOTE(review): the RDD operations inside this bracket are lazy and the
        // first action on them is the take(5) after the chunk loop, by which time
        // the block has already been unpersisted - so this cache is likely never
        // populated. Materialise the per-block result inside the bracket if
        // caching each block is the goal.
        blockPartition.persist()

        try {

            for (columnName <- selectedColumns) {

                //Alternatively, one can do any other transformation and/or store the output in persistent store

                val blockCounts = blockPartition.select(columnName).rdd.map((_, 1)).reduceByKey(_ + _)

                // Fold this chunk's counts into the running total for the column.
                columnNameOutputMap(columnName) = columnNameOutputMap.get(columnName) match {
                    case Some(runningCounts) => runningCounts.union(blockCounts).reduceByKey(_ + _)
                    case None => blockCounts
                }
            }

        } finally {

            blockPartition.unpersist()
        }
    }

    // Print the five most frequent values of each column. A column has no entry
    // when the DataFrame is empty, in which case nothing is printed.
    for (columnName <- selectedColumns) {

        columnNameOutputMap.get(columnName).foreach { columnCounts =>
            columnCounts
            .sortBy(_._2, ascending = false)
            .take(5)
            .foreach(println)
        }
    }
}