# [Learning Spark Second Edition](https://github.com/databricks/LearningSparkV2)


 _all-spark-notebook_ 

### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Define schema using DDL 

In [1]:
from os import path,popen
SPARK_HOME = popen('echo $SPARK_HOME').read().strip()
PARENT_DIR = popen('dirname $PWD').read().strip()

In [2]:
EXAMPLE_3_6_FILE = path.join(PARENT_DIR,"py/chapter3/Example-3_6.py")
assert path.isfile(EXAMPLE_3_6_FILE)

In [3]:
#inspect the module
!cat {EXAMPLE_3_6_FILE}

from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# define schema for our data
schema = StructType([
   StructField("Id", IntegerType(), False),
   StructField("First", StringType(), False),
   StructField("Last", StringType(), False),
   StructField("Url", StringType(), False),
   StructField("Published", StringType(), False),
   StructField("Hits", IntegerType(), False),
   StructField("Campaigns", ArrayType(StringType()), False)])

#create our data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter", "LinkedIn"]],
       [2, "Brooke","Wenig","https://tinyurl.2", "5/5/2018", 8908, ["twitter", "LinkedIn"]],
       [3, "Denny", "Lee", "https://tinyurl.3","6/7/2019",7659, ["web", "twitter", "FB", "LinkedIn"]],
       [4, "Tathagata", "Das","https://tinyurl.4", "5/12/2018", 10568, ["twitter", "FB"]],
       [5, "Matei","Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web", "twitter", "FB", "LinkedIn"

In [5]:
!{SPARK_HOME+'/bin/spark-submit'} {EXAMPLE_3_6_FILE} 2> /dev/null

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIn]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIn]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIn]|
+---+---------+-------+-----------------+---------+-----+--------------------+


root
 |-- Id: integer (nullable = false)
 |-- First: string (nullable = false)
 |-- Last: string (nullable = false)
 |-- Url: string (nullable = false)
 |-- Published: string (nullable = false)
 |-- Hits: int

### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Rows

In [7]:
# Simple Row
from pyspark.sql import Row
blog_row = Row(6, "Reynold", "Xin", "https://tinyurl.6", 255568, "3/2/2015", 
  ["twitter", "LinkedIn"])
# access using index for individual items
blog_row[1]

'Reynold'

In [8]:
# Dataframe
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession
        .builder
	.appName("Authors")
	.getOrCreate())

schema = StructType([
	StructField("Author" , StringType(), False), 
	StructField("State", StringType(), False)])

rows = [Row("Matei Zaharia", "CA"), Row("Reynold Xin", "CA")]
authors_df = spark.createDataFrame(rows, schema)
authors_df.show()

+-------------+-----+
|       Author|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Schema from Dataframe sample

In [1]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("Authors")
         .getOrCreate())

In [20]:
#Infer schema from a sample of the dataset

In [14]:
sf_fire_file = '../databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv'

sample_df = spark.read.option('samplingRation',0.001).option('header',True).csv(sf_fire_file)

In [19]:
for col in sample_df.schema:
    print(col)

StructField(CallNumber,StringType,true)
StructField(UnitID,StringType,true)
StructField(IncidentNumber,StringType,true)
StructField(CallType,StringType,true)
StructField(CallDate,StringType,true)
StructField(WatchDate,StringType,true)
StructField(CallFinalDisposition,StringType,true)
StructField(AvailableDtTm,StringType,true)
StructField(Address,StringType,true)
StructField(City,StringType,true)
StructField(Zipcode,StringType,true)
StructField(Battalion,StringType,true)
StructField(StationArea,StringType,true)
StructField(Box,StringType,true)
StructField(OriginalPriority,StringType,true)
StructField(Priority,StringType,true)
StructField(FinalPriority,StringType,true)
StructField(ALSUnit,StringType,true)
StructField(CallTypeGroup,StringType,true)
StructField(NumAlarms,StringType,true)
StructField(UnitType,StringType,true)
StructField(UnitSequenceInCallDispatch,StringType,true)
StructField(FirePreventionDistrict,StringType,true)
StructField(SupervisorDistrict,StringType,true)
StructField

### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Projections

In [4]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("Authors")
         .getOrCreate())

sf_fire_file = '../databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv'
sample_df = spark.read.option('samplingRation',0.001).option('header',True).csv(sf_fire_file)
fire_df = spark.read.csv(sf_fire_file, header=True, schema=sample_df.schema)

In [5]:
few_fire_df = (fire_df
  .select("IncidentNumber", "AvailableDtTm", "CallType") 
  .where(fire_df.CallType != "Medical Incident"))
few_fire_df.show(5, truncate=False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [6]:
# return number of distinct types of calls using countDistinct()
from pyspark.sql.functions import *
(fire_df
  .select("CallType")
  .where(col("CallType").isNotNull())
  .agg(countDistinct("CallType").alias("DistinctCallTypes"))
  .show())

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+



In [7]:
#filter for only distinct non-null CallTypes from all the rows
(fire_df
  .select("CallType")
  .where(col("CallType").isNotNull())
  .distinct()
  .show(10, False))

+-----------------------------------+
|CallType                           |
+-----------------------------------+
|Elevator / Escalator Rescue        |
|Marine Fire                        |
|Aircraft Emergency                 |
|Confined Space / Structure Collapse|
|Administrative                     |
|Alarms                             |
|Odor (Strange / Unknown)           |
|Citizen Assist / Service Call      |
|HazMat                             |
|Watercraft in Distress             |
+-----------------------------------+
only showing top 10 rows



In [8]:
# Renaming, adding, and dropping columns
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")
(new_fire_df
  .select("ResponseDelayedinMins")
  .where(col("ResponseDelayedinMins") > 5)
  .show(5, False))

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|6.25                 |
|7.25                 |
|11.916667            |
|8.633333             |
|95.28333             |
+---------------------+
only showing top 5 rows



In [9]:
# Converting columns
fire_ts_df = (new_fire_df
  .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
  .drop("CallDate") 
  .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
  .drop("WatchDate") 
  .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), 
  "MM/dd/yyyy hh:mm:ss a"))
  .drop("AvailableDtTm"))

# Select the converted columns
(fire_ts_df
  .select("IncidentDate", "OnWatchDate", "AvailableDtTS")
  .show(5, False))

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows



In [10]:
# Query with the new formats
(fire_ts_df
  .select(year('IncidentDate'))
  .distinct()
  .orderBy(year('IncidentDate'))
  .show())

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|              2005|
|              2006|
|              2007|
|              2008|
|              2009|
|              2010|
|              2011|
|              2012|
|              2013|
|              2014|
|              2015|
|              2016|
|              2017|
|              2018|
+------------------+



### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Aggregations

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("Authors")
         .getOrCreate())

sf_fire_file = '../databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv'
sample_df = spark.read.option('samplingRation',0.001).option('header',True).csv(sf_fire_file)
fire_df = spark.read.csv(sf_fire_file, header=True, schema=sample_df.schema)
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")

fire_ts_df = (new_fire_df
  .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
  .drop("CallDate") 
  .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
  .drop("WatchDate") 
  .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), 
  "MM/dd/yyyy hh:mm:ss a"))
  .drop("AvailableDtTm"))


In [2]:

(fire_ts_df
  .select("CallType")
  .where(col("CallType").isNotNull())
  .groupBy("CallType")
  .count()
  .orderBy("count", ascending=False)
  .show(n=10, truncate=False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



### [Chapter Three](https://learning.oreilly.com/library/view/Learning+Spark,+2nd+Edition/9781492050032/ch03.html#the_table_like_format_of_a_dataframe)
> Other common DataFrame operations

In [1]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SparkSession

spark = (SparkSession
         .builder
         .appName("Authors")
         .getOrCreate())

sf_fire_file = '../databricks-datasets/learning-spark-v2/sf-fire/sf-fire-calls.csv'
sample_df = spark.read.option('samplingRation',0.001).option('header',True).csv(sf_fire_file)
fire_df = spark.read.csv(sf_fire_file, header=True, schema=sample_df.schema)
new_fire_df = fire_df.withColumnRenamed("Delay", "ResponseDelayedinMins")

fire_ts_df = (new_fire_df
  .withColumn("IncidentDate", to_timestamp(col("CallDate"), "MM/dd/yyyy"))
  .drop("CallDate") 
  .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
  .drop("WatchDate") 
  .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTm"), 
  "MM/dd/yyyy hh:mm:ss a"))
  .drop("AvailableDtTm"))


In [2]:
import pyspark.sql.functions as F
(fire_ts_df
  .select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
    F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
  .show())

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|      176170.0|        3.8923641541750134|               0.016666668|                      99.9|
+--------------+--------------------------+--------------------------+--------------------------+

