In [1]:
//create a case class
case class Flight(DEST_COUNTRY_NAME: String,ORIGIN_COUNTRY_NAME: String, count: BigInt)

defined class Flight


In [2]:
//read data using above case class
val flightsDF = spark.read.parquet("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/parquet/2010-summary.parquet/")
val flights = flightsDF.as[Flight]

flightsDF = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]
flights = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [1]:
flights.show(2)
flights.first.DEST_COUNTRY_NAME

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|    1|
|    United States|            Ireland|  264|
+-----------------+-------------------+-----+
only showing top 2 rows



United States

In [3]:
//filter , these filters are expensive compared to SQL filters
def originIsDestination(flight_row: Flight): Boolean = {
return flight_row.ORIGIN_COUNTRY_NAME == flight_row.DEST_COUNTRY_NAME}
flights.filter(flight_row => originIsDestination(flight_row)).first()

originIsDestination: (flight_row: Flight)Boolean


Flight(United States,United States,348113)

In [4]:
flights.map(f => f.DEST_COUNTRY_NAME).take(5)

Array(United States, United States, United States, Egypt, Equatorial Guinea)

In [7]:
//join
import spark.implicits._
case class FlightMetadata(count: BigInt, randomData: BigInt)
val flightsMeta = spark.range(500).map(x => (x, scala.util.Random.nextLong))
.withColumnRenamed("_1", "count")
.withColumnRenamed("_2", "randomData")
.as[FlightMetadata]

val flights2 = flights.joinWith(flightsMeta, flights.col("count") === flightsMeta.col("count")).show(2)//dataset way
val flights2 = flights.join(flightsMeta, Seq("count")).show(2)//dataframe way
val flights2 = flights.join(flightsMeta.toDF(), Seq("count"))//mixed way

Name: Compile Error
Message: <console>:16: error: stable identifier required, but this.$line7$read.spark.implicits found.
       import spark.implicits._
                    ^

StackTrace: 

In [11]:
//Grouping and Aggregations
flights.groupBy("DEST_COUNTRY_NAME").count().show(2)//returns dataframe
flights.groupByKey(x => x.DEST_COUNTRY_NAME).count().show(2)//returns dataset

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|         Anguilla|    1|
|           Russia|    1|
+-----------------+-----+
only showing top 2 rows

+--------+--------+
|   value|count(1)|
+--------+--------+
|Anguilla|       1|
|  Russia|       1|
+--------+--------+
only showing top 2 rows



In [12]:
def grpSum(countryName:String, values: Iterator[Flight]) = {values.dropWhile(_.count < 5).map(x => (countryName, x))}
flights.groupByKey(x => x.DEST_COUNTRY_NAME).flatMapGroups(grpSum).show(2)// the function works on a dataset

grpSum: (countryName: String, values: Iterator[Flight])Iterator[(String, Flight)]


+--------+--------------------+
|      _1|                  _2|
+--------+--------------------+
|Anguilla|[Anguilla, United...|
|Paraguay|[Paraguay, United...|
+--------+--------------------+
only showing top 2 rows



In [14]:
def sum2(left:Flight, right:Flight) = {Flight(left.DEST_COUNTRY_NAME, null, left.count + right.count)}
flights.groupByKey(x => x.DEST_COUNTRY_NAME).reduceGroups((l, r) => sum2(l, r)).show(2)

sum2: (left: Flight, right: Flight)Flight


+--------+----------------------------------------------+
|   value|ReduceAggregator($line13.$read$$iw$$iw$Flight)|
+--------+----------------------------------------------+
|Anguilla|                          [Anguilla, United...|
|  Russia|                          [Russia, United S...|
+--------+----------------------------------------------+
only showing top 2 rows

