In [None]:
//creating a DataFrame
val df1 = spark.read.format("json")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")

In [66]:
df1.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [67]:
df1.first()

[United States,Romania,15]

In [None]:
//creating a DataFrame another way
val df2 = spark.read.json("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")

In [68]:
df2.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [None]:
df2.first()

In [None]:
//getting  the DataFrame schema of json data
val jsonSchema = spark.read.format("json")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json").schema

In [None]:
//getting  the DataFrame schema of csv(additional information to read the first line as header)
val csvSchema = spark.read.format("csv").option("header", "true")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2015-summary.csv").schema

In [None]:
//creating a DataFrame with  custom schema of json and printing the schema
import org.apache.spark.sql.types._
val myManualSchema = new StructType(Array(
new StructField("DEST_COUNTRY_NAME", StringType, true),
new StructField("ORIGIN_COUNTRY_NAME", StringType, true),
new StructField("count", LongType, false)))// just to illustrate flipping ))
val df3 = spark.read.format("json").schema(myManualSchema)
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")

In [69]:
df3.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [None]:
//Columns
spark.read.format("json")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")
.columns

In [None]:
spark.read.format("CSV").option("header", "true")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/csv/2015-summary.csv")
.columns

In [None]:
//creating a custom row 
import org.apache.spark.sql.Row
val myRow = Row("Hello", null, 1, false)

In [70]:
myRow(0)

Hello

In [71]:
myRow.getString(0)

Hello

In [72]:
myRow.getInt(2)

1

In [None]:
val df = spark.read.format("json")
.load("/home/koushik/git/spark/input/Spark-The-Definitive-Guide/flight-data/json/2015-summary.json")
df.createOrReplaceTempView("dfTable")

In [None]:
val newRows = Seq(Row("New Country", "Other Country", 5L),Row("New Country 2", "Other Country 3", 1L))
val parallelizedRows = spark.sparkContext.parallelize(newRows)
val schema = df.schema
val newDF = spark.createDataFrame(parallelizedRows, schema)

In [None]:
//create DataFrames on the fly by taking a set of rows and converting them to a DataFrame.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType,StringType, LongType}
val myManualSchema = new StructType(Array(
new StructField("some", StringType, true),
new StructField("col", StringType, true),
new StructField("names", LongType, false)  ))
val myRows = Seq(Row("Hello", null, 1L))
val myRDD = spark.sparkContext.parallelize(myRows)
val myDf = spark.createDataFrame(myRDD, myManualSchema)

In [73]:
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



In [None]:
//Taking advantage of Spark’s implicits in the console (and if you import them in your jar code), by running toDF on a Seq type.
//This does not play well with null types, so it’s not necessarily recommended for production use cases.
val myDF = Seq(("Hello", 2, 1L)).toDF()

In [74]:
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



In [None]:
//working with dataframe.use the select method and pass in the column names as string that you would like to work with.
df.select("DEST_COUNTRY_NAME").show(2)

In [None]:
df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(2)

In [75]:
//Using all different  ways to access a column, all gives same result.
import org.apache.spark.sql.functions.{expr, col, column}
df.select(
df.col("DEST_COUNTRY_NAME"),
col("DEST_COUNTRY_NAME"),
column("DEST_COUNTRY_NAME"),
'DEST_COUNTRY_NAME,//"DEST_COUNTRY_NAME" will throw an error 
$"DEST_COUNTRY_NAME",
expr("DEST_COUNTRY_NAME")
).show(2)

+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|DEST_COUNTRY_NAME|
+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
|    United States|    United States|    United States|    United States|    United States|    United States|
|    United States|    United States|    United States|    United States|    United States|    United States|
+-----------------+-----------------+-----------------+-----------------+-----------------+-----------------+
only showing top 2 rows



In [76]:
//expr is the most flexible reference that we can use. It can refer to a plain column or a string manipulation of a column. 
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [77]:
//with expr let’s change our column name, then change it back as an example using the AS keyword and then the alias method on the column.
df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST_COUNTRY_NAME")).show(2)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
+-----------------+
only showing top 2 rows



In [78]:
//Because select followed by a series of expr is such a common pattern, Spark has a shorthand for doing so efficiently: selectExpr.
df.select(expr("DEST_COUNTRY_NAME as newColumnName"),expr("DEST_COUNTRY_NAME")).show(2)
df.selectExpr("DEST_COUNTRY_NAME as newColumnName","DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [79]:
//We can treat selectExpr as a simple way to build up complex expressions that create new DataFrames. 
//In fact, we can add any valid non-aggregating SQL statement and as long as the columns resolve - it will be valid!
df.selectExpr(
"*", // all original columns
"(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry"
).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [80]:
//pass explicit values into Spark that aren’t a new column but are just a value.(like constant in java)
//if was in SQL => SELECT *,1 as One FROM dfTable LIMIT 2
import org.apache.spark.sql.functions.lit
df.select(
expr("*"),
lit(1).as("something")
).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|something|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [81]:
//There’s also a more formal way of adding a new column to a DataFrame using the withColumn method on our DataFrame.
// in SQL => SELECT *,1 as One FROM dfTable LIMIT 2
df.withColumn("numberOne", lit(1)).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|numberOne|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [82]:
//self explanatory from above
df.withColumn("withinCountry",expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [83]:
//self explanatory from above
df.withColumn("Destination",df.col("DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|  Destination|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|United States|
|    United States|            Croatia|    1|United States|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [None]:
//withColumnRenamed method. This will rename the column with the name of the string in the first argument, to the string in the second argument.
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

In [84]:
//self explanatory from above
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").show(2)

+-------------+-------------------+-----+
|         dest|ORIGIN_COUNTRY_NAME|count|
+-------------+-------------------+-----+
|United States|            Romania|   15|
|United States|            Croatia|    1|
+-------------+-------------------+-----+
only showing top 2 rows



In [None]:
import org.apache.spark.sql.functions.expr
val dfWithLongColName = df.withColumn("This Long Column-Name",expr("ORIGIN_COUNTRY_NAME"))
//while coming across reserved characters like spaces or dashes in column names. Handling these means escaping column names
//appropriately. In Spark this is done with backtick (`) characters.

In [86]:
dfWithLongColName.show(2)

+-----------------+-------------------+-----+---------------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|This Long Column-Name|
+-----------------+-------------------+-----+---------------------+
|    United States|            Romania|   15|              Romania|
|    United States|            Croatia|    1|              Croatia|
+-----------------+-------------------+-----+---------------------+
only showing top 2 rows



In [87]:
dfWithLongColName.selectExpr("`This Long Column-Name`","`This Long Column-Name` as `new col`").show(2)

+---------------------+-------+
|This Long Column-Name|new col|
+---------------------+-------+
|              Romania|Romania|
|              Croatia|Croatia|
+---------------------+-------+
only showing top 2 rows



In [None]:
//remove columns from DataFrames.( not from the actual data)
df.drop("ORIGIN_COUNTRY_NAME").columns

In [88]:
//Changing a Column’s Type (cast)
df.printSchema()
df.withColumn("count", col("count").cast("int")).printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [89]:
//Filtering Rows with "filter" and/or "where"
df.filter(col("count") < 2).show(2)
df.where("count < 2").show(2)
df.where(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia").show(2)
df.filter(col("count") < 2).where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia").show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|          Singapore|    1|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+----

In [90]:
//Distinct
println(df.select("*").count())
println(df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").count())

256
256


In [91]:
println(df.select("ORIGIN_COUNTRY_NAME").distinct().count())
println(df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count())

125
256


In [None]:
//sample
val seed = 50
val withReplacement = false
val fraction = 0.5

In [92]:
df.sample(withReplacement, fraction, seed).count()

125

In [None]:
//Random splits, to take a sample of data randomly. mainly used for testing 
val dataFrames = df.randomSplit(Array(0.25, 0.75), seed)

In [94]:
println(dataFrames(0).count())
println(dataFrames(1).count())

61
195


In [96]:
//Union( from 17 & 18)
df.union(newDF)
.where("count = 1")
.where($"ORIGIN_COUNTRY_NAME" =!= "United States")
.show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
+-----------------+-------------------+-----+
only showing top 2 rows



In [97]:
//sort and orderBy work the exact same way.
import org.apache.spark.sql.functions.{desc, asc}
df.sort("count").show(2)
df.orderBy("count", "DEST_COUNTRY_NAME").show(2)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(2)
df.orderBy(expr("count desc")).show(2)
df.orderBy(desc("count"), asc("DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|          Moldova|      United States|    1|
|    United States|            Croatia|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
+-----------------+-------------------+-----+
only showing top 2 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+----

In [None]:
//optimization: partition the data according to some frequently filtered columns(like hadoop partition) which controls 
//the physical layout of data across the cluster including the partitioning scheme and the number of partitions.

//Repartition will incur a full shuffle of the data, regardless of whether or not one is necessary. This means that you 
//should typically only repartition when the future number of partitions is greater than your current number of partitions 
//or when you are looking to partition by a set of columns.
df.rdd.getNumPartitions
df.repartition(5)
//If we know we are going to be filtering by a certain column often, it can be worth repartitioning based on that column.
df.repartition(col("DEST_COUNTRY_NAME"))
df.repartition(5, col("DEST_COUNTRY_NAME"))

In [None]:
//Coalesce on the other hand will not incur a full shuffle and will try to combine partitions. This operation will shuffle our 
//data into 5 partitions based on the destination country name, then coalesce them (without a full shuffle).
df.repartition(5, col("DEST_COUNTRY_NAME")).coalesce(2)

In [None]:
//Spark has a Driver that maintains cluster information and runs user code. This means that when we call some method to 
//collect data, this is collected to the Spark Driver.
val collectDF = df.limit(10)
collectDF.take(5) // take works with an Integer count
collectDF.collect()

In [99]:
collectDF.show() // this prints it out nicely
collectDF.show(5, false)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|United States    |Romania            |15   |
|United States    |Croatia            |1    |
|United States    |Ireland            |344  |
|Egypt            |United States      |15   |
|United States    |India         