In [1]:
// Evaluate the notebook-provided session handle; the cell output is the SparkSession instance.
spark

org.apache.spark.sql.SparkSession@64555905

In [4]:
// Read the 2015 flight-summary JSON file into a DataFrame; no schema is supplied here.
val df = spark.read.format("json").load("flight-data/2015-summary.json")

df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


lastException: Throwable = null


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [5]:
// Print the DataFrame's column names, types and nullability as a tree.
df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: long (nullable = true)



In [6]:
// Read the same file again and return its schema as a StructType value instead of printing it.
spark.read.format("json")
        .load("flight-data/2015-summary.json")
        .schema

StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,true))

## Defining a schema manually

In [10]:
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}

In [12]:
// Hand-written schema for the flight data: two string columns and one long column.
// `count` is declared non-nullable here (the inferred schema had it nullable),
// just to illustrate flipping the flag.
val myManualSchema = StructType(Seq(
  StructField("DEST_COUNTRY_NAME", StringType, nullable = true),
  StructField("ORIGIN_COUNTRY_NAME", StringType, nullable = true),
  StructField("count", LongType, nullable = false)
))
// Read the JSON file with the explicit schema instead of reading it without one.
val df = spark.read
  .format("json")
  .schema(myManualSchema)
  .load("flight-data/2015-summary.json")

myManualSchema = StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,false))
df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


lastException: Throwable = null


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

## Invoking columns

In [13]:
import org.apache.spark.sql.functions.{expr, col}

In [16]:
// List the DataFrame's column names.
df.columns

Array(DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count)

In [17]:
// Look up a column of this specific DataFrame by name.
df.col("DEST_COUNTRY_NAME")

DEST_COUNTRY_NAME

In [15]:
// Parse a SQL-style expression string into a column expression. someCol/otherCol are not
// columns of df; the expression is only constructed here, not applied to any data.
expr("(((someCol + 5) * 200) - 6) < otherCol")

((((someCol + 5) * 200) - 6) < otherCol)

## Invoking Rows

In [18]:
import org.apache.spark.sql.Row

In [19]:
// Build a Row by hand from positional values of mixed types, including a null.
val myRow = Row("Hello", null, 1, false)

myRow = [Hello,null,1,false]


[Hello,null,1,false]

In [23]:
// Take the first Row of df.
// NOTE(review): this is `myrow`, a different val from the hand-built `myRow` (names differ only in case).
val myrow = df.first()

myrow = [United States,Romania,15]


[United States,Romania,15]

In [28]:
// Positional access: index 0 is the first field of the row (DEST_COUNTRY_NAME for this DataFrame).
myrow(0)

United States

## Creating DataFrames

In [29]:
// Reload the flight data without a manual schema, rebinding df for the examples that follow.
val df = spark.read.format("json").load("flight-data/2015-summary.json")

df = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [31]:
// Register df as a temporary view named dfTable (replacing any existing view of that name)
// so it can be queried with SQL.
df.createOrReplaceTempView("dfTable")

In [32]:
%%SQL

-- Query every column of the dfTable temporary view.
SELECT *
FROM dfTable

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 10 rows



In [33]:
// DataFrames can also be built on the fly from a collection of Rows plus an explicit schema.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructField, StructType, StringType, LongType}

// Three columns; only `names` is declared non-nullable, just to illustrate flipping the flag.
val myManualSchema = StructType(Seq(
  StructField("some", StringType, nullable = true),
  StructField("col", StringType, nullable = true),
  StructField("names", LongType, nullable = false)
))
// A single row; the null lands in the nullable `col` column.
val myRows = Seq(Row("Hello", null, 1L))
// Distribute the local rows as an RDD, then attach the schema to obtain a DataFrame.
val myRDD = spark.sparkContext.parallelize(myRows)
val myDf = spark.createDataFrame(myRDD, myManualSchema)
myDf.show()

+-----+----+-----+
| some| col|names|
+-----+----+-----+
|Hello|null|    1|
+-----+----+-----+



myManualSchema = StructType(StructField(some,StringType,true), StructField(col,StringType,true), StructField(names,LongType,false))
myRows = List([Hello,null,1])
myRDD = ParallelCollectionRDD[21] at parallelize at <console>:44
myDf = [some: string, col: string ... 1 more field]


[some: string, col: string ... 1 more field]

In [35]:
// Selecting columns: project a single column and print its first 10 rows.
df.select(col("DEST_COUNTRY_NAME")).show(10)

+-----------------+
|DEST_COUNTRY_NAME|
+-----------------+
|    United States|
|    United States|
|    United States|
|            Egypt|
|    United States|
|    United States|
|    United States|
|       Costa Rica|
|          Senegal|
|          Moldova|
+-----------------+
only showing top 10 rows



In [37]:
// Select via a SQL expression string: the destination column is aliased to `destination`.
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [38]:
// selectExpr takes SQL expression strings directly: alias the destination column
// and keep the original column alongside it.
df.selectExpr("DEST_COUNTRY_NAME as newColumnName", "DEST_COUNTRY_NAME").show(2)

+-------------+-----------------+
|newColumnName|DEST_COUNTRY_NAME|
+-------------+-----------------+
|United States|    United States|
|United States|    United States|
+-------------+-----------------+
only showing top 2 rows



In [40]:
// Keep every original column, add a boolean flag for flights whose destination equals
// their origin, then show only the flagged rows.
df.selectExpr("*", "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) as withinCountry")
  .where("withinCountry==True")
  .show(2)

+-----------------+-------------------+------+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME| count|withinCountry|
+-----------------+-------------------+------+-------------+
|    United States|      United States|370002|         true|
+-----------------+-------------------+------+-------------+



### Aggregations on a dataframe

In [43]:
// Aggregate over the whole DataFrame: the average of `count` and the number of
// distinct destination countries, returned as a single row.
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show(10)

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



## Spark Literals

In [44]:
import org.apache.spark.sql.functions.lit
// Keep all columns and append a constant column named "something" holding the literal 1.
df.select(expr("*"), lit(1).as("something")).show(2)

+-----------------+-------------------+-----+---------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|something|
+-----------------+-------------------+-----+---------+
|    United States|            Romania|   15|        1|
|    United States|            Croatia|    1|        1|
+-----------------+-------------------+-----+---------+
only showing top 2 rows



In [45]:
// Adding columns: append a boolean `withinCountry` column comparing origin and destination.
df.withColumn("withinCountry", expr("ORIGIN_COUNTRY_NAME == DEST_COUNTRY_NAME")).show(2)

+-----------------+-------------------+-----+-------------+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|withinCountry|
+-----------------+-------------------+-----+-------------+
|    United States|            Romania|   15|        false|
|    United States|            Croatia|    1|        false|
+-----------------+-------------------+-----+-------------+
only showing top 2 rows



In [46]:
// Rename columns: DEST_COUNTRY_NAME becomes `dest` in the returned DataFrame;
// `.columns` lists the resulting names.
df.withColumnRenamed("DEST_COUNTRY_NAME", "dest").columns

Array(dest, ORIGIN_COUNTRY_NAME, count)

In [47]:
// Remove columns: the returned DataFrame no longer has ORIGIN_COUNTRY_NAME;
// `.columns` lists the names that remain.
df.drop("ORIGIN_COUNTRY_NAME").columns

Array(DEST_COUNTRY_NAME, count)

In [48]:
// Filtering rows: keep rows where count < 2 and bring two of them back as an array of Rows.
val conditional = df.where("count < 2").take(2)

conditional = Array([United States,Croatia,1], [United States,Singapore,1])


Array([United States,Croatia,1], [United States,Singapore,1])

In [50]:
// Chained where calls apply both conditions: count < 2 and an origin other than Croatia.
// take(200) returns at most 200 of the matching rows.
df.where("count < 2").where(col("ORIGIN_COUNTRY_NAME") =!= "Croatia").take(200)

Array([United States,Singapore,1], [Moldova,United States,1], [Malta,United States,1], [United States,Gibraltar,1], [Saint Vincent and the Grenadines,United States,1], [Suriname,United States,1], [United States,Cyprus,1], [Burkina Faso,United States,1], [Djibouti,United States,1], [United States,Estonia,1], [Zambia,United States,1], [Cyprus,United States,1], [United States,Lithuania,1], [United States,Bulgaria,1], [United States,Georgia,1], [United States,Bahrain,1], [Cote d'Ivoire,United States,1], [United States,Papua New Guinea,1], [Kosovo,United States,1], [Iraq,United States,1], [Indonesia,United States,1], [New Caledonia,United States,1], [United States,Montenegro,1], [United States,Namibia,1])

In [51]:
// Getting unique rows: count the distinct ORIGIN_COUNTRY_NAME values.
df.select("ORIGIN_COUNTRY_NAME").distinct().count()

125

In [55]:
// Random splits: split df into two DataFrames using weights 0.25 and 0.75 and seed 29.
// The resulting array is not assigned here.
df.randomSplit(Array(0.25, 0.75), 29)

Array([DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field], [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field])

## Concatenating and Appending rows to a dataframe

In [56]:
// First simulate the new rows that will be appended to df.

import org.apache.spark.sql.Row
// Reuse df's schema so the new rows line up with the existing columns.
val schema = df.schema
val newRows = Seq(
                Row("New Country", "Other Country", 5L),
                Row("New Country 2", "Other Country 3", 1L)
                )
// Distribute the local rows as an RDD so they can be turned into a DataFrame.
val parallelizedRows = spark.sparkContext.parallelize(newRows)

schema = StructType(StructField(DEST_COUNTRY_NAME,StringType,true), StructField(ORIGIN_COUNTRY_NAME,StringType,true), StructField(count,LongType,true))
newRows = List([New Country,Other Country,5], [New Country 2,Other Country 3,1])
parallelizedRows = ParallelCollectionRDD[82] at parallelize at <console>:44


ParallelCollectionRDD[82] at parallelize at <console>:44

In [58]:
// Turn the RDD of new rows into a DataFrame that uses df's schema.
val newDF = spark.createDataFrame(parallelizedRows, schema)

newDF = [DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]


[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [59]:
// Append the new rows to df, then keep rows with count = 1 whose origin is not the United States.
// Of the two appended rows only "New Country 2" has count 1, so it is the one that survives
// the filter and shows up at the end of the output.
// col(...) is used (as in the other cells) rather than the $"..." interpolator, which
// relies on spark.implicits._ being in scope.
df.union(newDF)
  .where("count = 1")
  .where(col("ORIGIN_COUNTRY_NAME") =!= "United States")
  .show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Croatia|    1|
|    United States|          Singapore|    1|
|    United States|          Gibraltar|    1|
|    United States|             Cyprus|    1|
|    United States|            Estonia|    1|
|    United States|          Lithuania|    1|
|    United States|           Bulgaria|    1|
|    United States|            Georgia|    1|
|    United States|            Bahrain|    1|
|    United States|   Papua New Guinea|    1|
|    United States|         Montenegro|    1|
|    United States|            Namibia|    1|
|    New Country 2|    Other Country 3|    1|
+-----------------+-------------------+-----+



## Sorting

In [60]:
// Three equivalent ways to request a sort: one column name, several column names,
// and Column objects. Each shows the five rows with the smallest counts.
df.sort("count").show(5)
df.orderBy("count", "DEST_COUNTRY_NAME").show(5)
df.orderBy(col("count"), col("DEST_COUNTRY_NAME")).show(5)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|               Malta|      United States|    1|
|Saint Vincent and...|      United States|    1|
|       United States|            Croatia|    1|
|       United States|          Gibraltar|    1|
|       United States|          Singapore|    1|
+--------------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|     Burkina Faso|      United States|    1|
|    Cote d'Ivoire|      United States|    1|
|           Cyprus|      United States|    1|
|         Djibouti|      United States|    1|
|        Indonesia|      United States|    1|
+-----------------+-------------------+-----+
only showing top 5 rows

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--

In [61]:
// Sort by count inside each partition (not a global sort) and print up to 100 rows.
df.sortWithinPartitions("count").show(100)

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Croatia|    1|
|       United States|          Singapore|    1|
|             Moldova|      United States|    1|
|               Malta|      United States|    1|
|       United States|          Gibraltar|    1|
|Saint Vincent and...|      United States|    1|
|            Suriname|      United States|    1|
|       United States|             Cyprus|    1|
|        Burkina Faso|      United States|    1|
|            Djibouti|      United States|    1|
|       United States|            Estonia|    1|
|              Zambia|      United States|    1|
|              Cyprus|      United States|    1|
|       United States|          Lithuania|    1|
|       United States|           Bulgaria|    1|
|       United States|            Georgia|    1|
|       United States|            Bahrain|    1|
|       Cote d'Ivoir

In [62]:
// Working with only 5 rows: limit(5) returns a DataFrame of five rows, which show() prints.

df.limit(5).show()

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
+-----------------+-------------------+-----+



## Repartition and Coalesce

In [63]:
// Number of partitions of the RDD underlying df.
df.rdd.getNumPartitions

1

In [64]:
// NOTE(review): repartition returns a new DataFrame and the result is not assigned here,
// so `df` itself is unchanged; checking df.rdd.getNumPartitions afterwards still gives 1.
df.repartition(col("DEST_COUNTRY_NAME"))

[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string ... 1 more field]

In [65]:
// Re-check df's partition count; this inspects `df` itself, not any DataFrame derived from it.
df.rdd.getNumPartitions

1