# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 _all-spark-notebook_ 

### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> Spark SQL

In [1]:
from os import getcwd, path, popen

# Parent of the notebook's working directory; the data set paths used by the
# cells below are built relative to it. It is computed with the standard
# library instead of shelling out to `dirname $PWD`, so no POSIX shell is
# needed and no pipe handle is left open.
PARENT_DIR = path.dirname(getcwd())

In [3]:
# In Python
from pyspark.sql import SparkSession

# Build (or fetch) the session used by every query in this section.
session_builder = SparkSession.builder.appName("SparkSQLExampleApp")
spark = session_builder.getOrCreate()

# Location of the flight-delays CSV, relative to PARENT_DIR.
csv_file = path.join(
    PARENT_DIR,
    "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
)

# Read the CSV using its header row for column names and let Spark infer the
# column types (for larger files you may want to specify the schema instead).
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("inferSchema", "true")
csv_reader = csv_reader.option("header", "true")
df = csv_reader.load(csv_file)

# Register the DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("us_delay_flights_tbl")

In [6]:
# Flights longer than 1000 miles, longest first. Rows with equal distance
# have no further ordering key in this query.
long_distance_sql = """SELECT distance, origin, destination 
FROM us_delay_flights_tbl WHERE distance > 1000 
ORDER BY distance DESC"""
long_distance_df = spark.sql(long_distance_sql)
long_distance_df.show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
|    4330|   JFK|        HNL|
+--------+------+-----------+
only showing top 10 rows



In [8]:
# SFO -> ORD flights delayed by more than 120, worst delays first.
sfo_ord_sql = """SELECT date, delay, origin, destination 
FROM us_delay_flights_tbl 
WHERE delay > 120 AND ORIGIN = 'SFO' AND DESTINATION = 'ORD' 
ORDER by delay DESC"""
sfo_ord_df = spark.sql(sfo_ord_sql)
sfo_ord_df.show(10)

+-------+-----+------+-----------+
|   date|delay|origin|destination|
+-------+-----+------+-----------+
|2190925| 1638|   SFO|        ORD|
|1031755|  396|   SFO|        ORD|
|1022330|  326|   SFO|        ORD|
|1051205|  320|   SFO|        ORD|
|1190925|  297|   SFO|        ORD|
|2171115|  296|   SFO|        ORD|
|1071040|  279|   SFO|        ORD|
|1051550|  274|   SFO|        ORD|
|3120730|  266|   SFO|        ORD|
|1261104|  258|   SFO|        ORD|
+-------+-----+------+-----------+
only showing top 10 rows



In [9]:
# Label every flight with a human-readable delay category.
#
# CASE branches are evaluated top to bottom and the first match wins, so each
# branch only needs its lower bound. The previous version also had strict
# upper bounds (e.g. `delay > 120 AND delay < 360`), which left delays of
# exactly 60, 120 and 360 unmatched; they fell through to ELSE and were
# mislabelled 'Early'.
#
# Resulting buckets:
#   delay > 360        -> 'Very Long Delays'
#   120 < delay <= 360 -> 'Long Delays'
#   60 < delay <= 120  -> 'Short Delays'
#   0 < delay <= 60    -> 'Tolerable Delays'
#   delay = 0          -> 'No Delays'
#   otherwise          -> 'Early'
spark.sql(
    """SELECT delay, origin, destination,
              CASE
                  WHEN delay > 360 THEN 'Very Long Delays'
                  WHEN delay > 120 THEN 'Long Delays'
                  WHEN delay > 60 THEN 'Short Delays'
                  WHEN delay > 0 THEN 'Tolerable Delays'
                  WHEN delay = 0 THEN 'No Delays'
                  ELSE 'Early'
               END AS Flight_Delays
               FROM us_delay_flights_tbl
               ORDER BY origin, delay DESC"""
).show(10)

+-----+------+-----------+-------------+
|delay|origin|destination|Flight_Delays|
+-----+------+-----------+-------------+
|  333|   ABE|        ATL|  Long Delays|
|  305|   ABE|        ATL|  Long Delays|
|  275|   ABE|        ATL|  Long Delays|
|  257|   ABE|        ATL|  Long Delays|
|  247|   ABE|        ATL|  Long Delays|
|  247|   ABE|        DTW|  Long Delays|
|  219|   ABE|        ORD|  Long Delays|
|  211|   ABE|        ATL|  Long Delays|
|  197|   ABE|        DTW|  Long Delays|
|  192|   ABE|        ORD|  Long Delays|
+-----+------+-----------+-------------+
only showing top 10 rows



### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> SQL-like queries with the DataFrame API

In [10]:
# In Python
from pyspark.sql import SparkSession

# Obtain the session for the DataFrame API examples.
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

# Flight-delays CSV, located relative to PARENT_DIR.
flights_rel_path = "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
csv_file = path.join(PARENT_DIR, flights_rel_path)

# Read the CSV with its header row and inferred column types (note that for
# larger files you may want to specify the schema).
flights_reader = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
)
df = flights_reader.load(csv_file)

# Make the data queryable by name from SQL as well.
df.createOrReplaceTempView("us_delay_flights_tbl")

In [9]:
# In Python
from pyspark.sql.functions import col, desc

# DataFrame API counterpart of the SQL query: project three columns, keep
# flights with distance above 1000, and sort by distance descending.
long_distance = df.select("distance", "origin", "destination")
long_distance = long_distance.where(col("distance") > 1000)
long_distance = long_distance.orderBy(desc("distance"))
long_distance.show(10)

+--------+------+-----------+
|distance|origin|destination|
+--------+------+-----------+
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
|    4330|   HNL|        JFK|
+--------+------+-----------+
only showing top 10 rows



### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> Managed and unmanaged tables (using DataFrame API)

In [1]:
# Managed table: written with saveAsTable and no explicit path.
from os import getcwd, path, popen
from shutil import rmtree

from pyspark.sql import SparkSession

# Parent of the working directory, computed without shelling out.
PARENT_DIR = path.dirname(getcwd())

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

# Start from a clean slate: remove the directory backing the first database
# reported by the catalog. locationUri looks like 'file:/some/dir'; the part
# after the last ':' is used as the filesystem path.
warehouse_dir = spark.catalog.listDatabases()[0].locationUri.split(':')[-1]
rmtree(warehouse_dir, ignore_errors=True)

# IF NOT EXISTS keeps the cell re-runnable within the same session.
spark.sql("CREATE DATABASE IF NOT EXISTS learn_spark_db")
spark.sql("USE learn_spark_db")

# Path to our US flight delays CSV file
csv_file = path.join(
    PARENT_DIR,
    "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
)

# Explicit schema; `date` stays a string.
schema = "date STRING, delay INT, distance INT, origin STRING, destination STRING"

# header=True: the file's first line holds the column names (the other cells
# read it with option("header", "true")). Without it that line would be
# loaded as a data row.
flights_df = spark.read.csv(csv_file, schema=schema, header=True)

# mode="overwrite" lets the cell be re-run after the table has already been
# registered in this session.
flights_df.write.saveAsTable("managed_us_delay_flights_tbl", mode="overwrite")

In [3]:
# Unmanaged table: the data is written to an explicit path.
#
# The data goes into its own "us_flights_delay" sub-directory. Previously the
# path was the warehouse root itself, so mode='overwrite' targeted the whole
# warehouse directory, which also holds the managed tables.
warehouse_dir = spark.catalog.listDatabases()[0].locationUri.split(':')[-1]
unmanaged_dir = path.join(warehouse_dir, "us_flights_delay")
(
    flights_df.write
    .option("path", unmanaged_dir)
    .saveAsTable("us_delay_flights_tbl", mode='overwrite')
)

### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> Temporary and global temporary views

In [5]:
from os import getcwd, path, popen
from pyspark.sql import SparkSession

# Parent of the working directory, computed with the standard library rather
# than by shelling out to `dirname $PWD`.
PARENT_DIR = path.dirname(getcwd())

# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

# Path to data set
csv_file = path.join(
    PARENT_DIR,
    "databricks-datasets/learning-spark-v2/flights/departuredelays.csv"
)

# Read the CSV with its header row and inferred column types (note that for
# larger files you may want to specify the schema), then expose it to SQL.
df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load(csv_file)
)
df.createOrReplaceTempView("us_delay_flights_tbl")

# One DataFrame per origin airport. The SQL text is built from adjacent
# string literals so that no line-continuation whitespace ends up inside it.
df_sfo = spark.sql(
    "SELECT date, delay, origin, destination "
    "FROM us_delay_flights_tbl WHERE origin = 'SFO'"
)
df_jfk = spark.sql(
    "SELECT date, delay, origin, destination "
    "FROM us_delay_flights_tbl WHERE origin = 'JFK'"
)

# SFO flights become a global temporary view, JFK flights a regular
# (session-scoped) temporary view.
df_sfo.createOrReplaceGlobalTempView("us_origin_airport_SFO_global_tmp_view")
df_jfk.createOrReplaceTempView("us_origin_airport_JFK_tmp_view")

In [7]:
# Display a sample of the temporary view. The query used to be built and then
# discarded, so the cell produced no output.
spark.sql("SELECT * FROM us_origin_airport_JFK_tmp_view").show(10)

# Clean up: drop the global temporary view and the temporary view.
spark.catalog.dropGlobalTempView("us_origin_airport_SFO_global_tmp_view")
spark.catalog.dropTempView("us_origin_airport_JFK_tmp_view")

### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> Catalog: Access metadata

In [13]:
from os import getcwd, path, popen
from pyspark.sql import SparkSession

# Parent of the working directory, computed with the standard library rather
# than by shelling out to `dirname $PWD`.
PARENT_DIR = path.dirname(getcwd())

# Session used for the catalog examples (SparkSession is imported once; the
# duplicate import was redundant).
spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

[Database(name='default', description='default database', locationUri='file:/wd/notebooks/spark-warehouse')]


Type:        property
String form: <property object at 0x7f50302a6310>
Docstring:  
Returns a :class:`DataFrameReader` that can be used to read data
in as a :class:`DataFrame`.

:return: :class:`DataFrameReader`

.. versionadded:: 2.0


### [Chapter Four](https://learning.oreilly.com/library/view/learning-spark-2nd/9781492050032/ch04.html)
> Read Parquet

In [30]:
import os
from pyspark.sql import SparkSession

# Parent of the working directory. This cell only imports `os`, so the bare
# `popen(...)` call it used before worked only if an earlier section had
# already imported that name; os.path/os.getcwd need nothing else.
PARENT_DIR = os.path.dirname(os.getcwd())

spark = SparkSession.builder.appName("SparkSQLExampleApp").getOrCreate()

In [26]:
# Parquet into DataFrame
# Directory of the managed table's Parquet files under the working directory.
# Uses os.path / os.getcwd explicitly: the bare `path` and `popen` names are
# not imported in this section, and `path` is rebound to a plain string by
# the "Parquet into SQL" cell, which made this cell fail when re-run.
file = os.path.join(
    os.getcwd(),
    'spark-warehouse/learn_spark_db.db/managed_us_delay_flights_tbl/'
)
df = spark.read.format("parquet").load(file)

In [31]:
# Parquet into SQL

# Directory of the managed table's Parquet files. It is stored in
# `table_path` rather than `path`, because other cells use `path` as the
# `os.path` module alias and rebinding it to a string breaks them.
table_path = os.path.join(
    os.getcwd(),
    'spark-warehouse/learn_spark_db.db/managed_us_delay_flights_tbl'
)

# Define a temporary view backed directly by the Parquet files.
spark.sql(
    "CREATE OR REPLACE TEMPORARY VIEW us_delay_flights_tbl "
    f'USING parquet OPTIONS (path "{table_path}")'
)

spark.sql("SELECT * FROM us_delay_flights_tbl").show()

+--------+-----+--------+------+-----------+
|    date|delay|distance|origin|destination|
+--------+-----+--------+------+-----------+
|02151800|  108|     290|   ORD|        MSP|
|02151800|  142|     772|   ORD|        DEN|
|02151303|   16|    1516|   ORD|        LAX|
|02151157|    7|    1316|   ORD|        LAS|
|02151818|   55|    1511|   ORD|        PDX|
|02151033|   12|     873|   ORD|        MCO|
|02150941|    0|    1499|   ORD|        SNA|
|02151320|   17|    1604|   ORD|        SFO|
|02151804|    2|    1497|   ORD|        SAN|
|02152000|   17|     119|   ORD|        GRR|
|02151502|   15|     260|   ORD|        DSM|
|02151935|   73|     879|   ORD|        TPA|
|02151315|   -1|     879|   ORD|        TPA|
|02150957|    0|     939|   ORD|        MTJ|
|02152001|  123|     804|   ORD|        IAH|
|02151945|   42|     558|   ORD|        RIC|
|02151502|   89|     625|   ORD|        EWR|
|02150653|   -5|     974|   ORD|        RSW|
|02150947|   -1|    1316|   ORD|        LAS|
|02152030|

In [32]:
# WriteFile
# Save df as snappy-compressed Parquet, replacing any existing output at the
# target location.
parquet_writer = df.write.format("parquet")
parquet_writer = parquet_writer.mode("overwrite")
parquet_writer = parquet_writer.option("compression", "snappy")
parquet_writer.save("/tmp/data/parquet/df_parquet")