In [0]:
# Example of expressivity and composability. We use the low-level RDD API for this, telling Spark how to do it.
dataRDD=sc.parallelize([("Brooke",20),("Denny",31),("Jules",30),("TD",35),("Brooke",25)])
agesRDD=(dataRDD.map(lambda x: (x[0],(x[1],1))).reduceByKey(lambda x, y: (x[0]+y[0],x[1]+y[1])).map(lambda x: (x[0], x[1][0]/x[1][1])))

In [0]:
# Now, we express the last query with high-level DSL operatos and the DF API, instructing Spark what to do.
#######################################################################################################################

from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

# Create a DF using SparkSession
spark = (SparkSession
        .builder
        .appName("AuthorsAges")
        .getOrCreate())

# Create a DF
data_df = spark.createDataFrame([("Brooke",20),("Denny",31),("Jules",30),("TD",35),("Brooke",25)],["name","age"])

# Group the same names together, aggregate their ages and compute an average
avg_df = data_df.groupBy("name").agg(avg("age"))

# Show the results of the final execution
avg_df.show()

+------+--------+
|  name|avg(age)|
+------+--------+
|Brooke|    22.5|
| Denny|    31.0|
| Jules|    30.0|
|    TD|    35.0|
+------+--------+



In [0]:
# DEFINING SCHEMAS.
# Firsty, we are going to define a schema programmatically for a DF with three named columns, author, title, and pages; using the Spark DF API.
################################################################################################################################################

from pyspark.sql.types import *
schema=StructType([StructField("author",StringType(),False),StructField("title",StringType(),False),StructField("pages",IntegerType(),False)])

# Secondly, we define the same schema using DDL (Data Definition Language)
schema2="author STRING, title STRING, pages INT"

In [0]:
# Another example of the usage of schemas.
##################################################################################################################################

from pyspark.sql import SparkSession
# Define the schema
schema= "`Id` INT, `First` STRING, `Last` STRING, `Url` STRING, `Published` STRING, `Hits` INT, `Campaigns` ARRAY<STRING>"
# Create our static data
data = [[1, "Jules", "Damji", "https://tinyurl.1", "1/4/2016", 4535, ["twitter","LinkedIN"]],
        [2, "Brooke", "Wenig", "https://tinyurl.2", "5/5/2018", 8908, ["twitter","LinkedIN"]],
        [3, "Denny", "Lee", "https://tinyurl.3", "6/7/2019", 7659, ["web","twitter","FB","LinkedIN"]],
        [4, "Tathagata", "Das", "https://tinyurl.4", "5/12/2018", 10568, ["twitter","FB"]], 
        [5, "Matei", "Zaharia", "https://tinyurl.5", "5/14/2014", 40578, ["web","twitter","FB","LinkedIN"]],
        [6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter","LinkedIN"]]
       ]
# Main program
if __name__=="__main__":
    # Create a SparkSession
    spark= (SparkSession
           .builder
           .appName("Example-3_6")
           .getOrCreate())
    #Create a DF using the schema defined above
    blogs_df=spark.createDataFrame(data,schema)
    blogs_df.show()
    print(blogs_df.printSchema())
    blogs_df.schema# 

+---+---------+-------+-----------------+---------+-----+--------------------+
| Id|    First|   Last|              Url|Published| Hits|           Campaigns|
+---+---------+-------+-----------------+---------+-----+--------------------+
|  1|    Jules|  Damji|https://tinyurl.1| 1/4/2016| 4535| [twitter, LinkedIN]|
|  2|   Brooke|  Wenig|https://tinyurl.2| 5/5/2018| 8908| [twitter, LinkedIN]|
|  3|    Denny|    Lee|https://tinyurl.3| 6/7/2019| 7659|[web, twitter, FB...|
|  4|Tathagata|    Das|https://tinyurl.4|5/12/2018|10568|       [twitter, FB]|
|  5|    Matei|Zaharia|https://tinyurl.5|5/14/2014|40578|[web, twitter, FB...|
|  6|  Reynold|    Xin|https://tinyurl.6| 3/2/2015|25568| [twitter, LinkedIN]|
+---+---------+-------+-----------------+---------+-----+--------------------+

root
 |-- Id: integer (nullable = true)
 |-- First: string (nullable = true)
 |-- Last: string (nullable = true)
 |-- Url: string (nullable = true)
 |-- Published: string (nullable = true)
 |-- Hits: integer (

In [0]:
# ROWS
#################################################################33

from pyspark.sql import Row
blog_row=Row(6, "Reynold", "Xin", "https://tinyurl.6", "3/2/2015", 25568, ["twitter","LinkedIN"])
blog_row[1]

# Row objects can be used to create DF if you need them for quick interactivity and exploration.
rows=[Row("Matei Zaharia", "CA"), Row("Reynold Xin","CA")]
authors_df=spark.createDataFrame(rows,["Authos","State"])
authors_df.show()

+-------------+-----+
|       Authos|State|
+-------------+-----+
|Matei Zaharia|   CA|
|  Reynold Xin|   CA|
+-------------+-----+



In [0]:
# DATAFRAME READER
###################################################################################################################################

from pyspark.sql.types import *
# Programmatic way to define the schema (as we've seen above)
fire_schema = StructType([StructField('CallNumber', IntegerType(), True),
 StructField('UnitID', StringType(), True),
 StructField('IncidentNumber', IntegerType(), True),
 StructField('CallType', StringType(), True), 
 StructField('CallDate', StringType(), True), 
 StructField('WatchDate', StringType(), True),
 StructField('CallFinalDisposition', StringType(), True),
 StructField('AvailableDtTm', StringType(), True),
 StructField('Address', StringType(), True), 
 StructField('City', StringType(), True), 
 StructField('Zipcode', IntegerType(), True), 
 StructField('Battalion', StringType(), True), 
 StructField('StationArea', StringType(), True), 
 StructField('Box', StringType(), True), 
 StructField('OriginalPriority', StringType(), True), 
 StructField('Priority', StringType(), True), 
 StructField('FinalPriority', IntegerType(), True), 
 StructField('ALSUnit', BooleanType(), True), 
 StructField('CallTypeGroup', StringType(), True),
 StructField('NumAlarms', IntegerType(), True),
 StructField('UnitType', StringType(), True),
 StructField('UnitSequenceInCallDispatch', IntegerType(), True),
 StructField('FirePreventionDistrict', StringType(), True),
 StructField('SupervisorDistrict', StringType(), True),
 StructField('Neighborhood', StringType(), True),
 StructField('Location', StringType(), True),
 StructField('RowID', StringType(), True),
 StructField('Delay', FloatType(), True)])

# Use the DFReader interface to read a CSV file
sf_fire_file="/FileStore/tables/sf_fire_calls-1.csv"
fire_df=spark.read.csv(sf_fire_file,header=True,schema=fire_schema)


In [0]:
# Saving a DF as a Parquet file or SQL table
###################################################################################################

parquet_table= #nombre_tabla
fire_df.write.format("parquet").saveAsTable(parquet_table)

In [0]:
# Projections and filters.
# We can use these techniques to examine specific aspects of our SF Fire Department data set
###################################################################################################################################
from pyspark.sql.functions import col
few_fire_df=(fire_df
            .select("IncidentNumber", "AvailableDtTm", "CallType")
            .where(col("CallType") != "Medical Incident"))
few_fire_df.show(5, False)

+--------------+----------------------+--------------+
|IncidentNumber|AvailableDtTm         |CallType      |
+--------------+----------------------+--------------+
|2003235       |01/11/2002 01:51:44 AM|Structure Fire|
|2003250       |01/11/2002 04:16:46 AM|Vehicle Fire  |
|2003259       |01/11/2002 06:01:58 AM|Alarms        |
|2003279       |01/11/2002 08:03:26 AM|Structure Fire|
|2003301       |01/11/2002 09:46:44 AM|Alarms        |
+--------------+----------------------+--------------+
only showing top 5 rows



In [0]:
# What if we want to know how many distinct CallTypes were recorded as the causes of the fire calls? These simple and expressive queries do the job:
from pyspark.sql.functions import *
(fire_df
 .select("CallType")
.where(col("CallType").isNotNull())
.agg(countDistinct("CallType").alias("DistinctCallTypes"))
.show())

# We also can list the distinct call types in the data set using these queries:
(fire_df
 .select("CallType")
.where(col("CallType").isNotNull())
.distinct()
.show(10, False))

+-----------------+
|DistinctCallTypes|
+-----------------+
|               30|
+-----------------+

+-----------------------------+
|CallType                     |
+-----------------------------+
|Elevator / Escalator Rescue  |
|Alarms                       |
|Odor (Strange / Unknown)     |
|Citizen Assist / Service Call|
|Vehicle Fire                 |
|Other                        |
|Outside Fire                 |
|Electrical Hazard            |
|Structure Fire               |
|Medical Incident             |
+-----------------------------+
only showing top 10 rows



In [0]:
# RENAMING, ADDING AND DROPPIN COLUMNS.
###############################################################################################################################
from pyspark.sql.functions import *

new_fire_df=fire_df.withColumnRenamed("Delay","ResponseDelayedinMins")
(new_fire_df
.select("ResponseDelayedinMins")
.where(col("ResponseDelayedinMins")>5)
.show(5,False))

# Date formats
fire_ts_df=(new_fire_df
           .withColumn("IncidentDate",to_timestamp(col("CallDate"),"MM/dd/yyyy"))
           .drop("CallDate")
           .withColumn("OnWatchDate", to_timestamp(col("WatchDate"), "MM/dd/yyyy"))
           .drop("WatchDate")
           .withColumn("AvailableDtTS", to_timestamp(col("AvailableDtTM"), "MM/dd/yyyy hh:mm:ss a"))
           .drop("AvailableDtTM"))
(fire_ts_df
.select("IncidentDate","OnWatchDate","AvailableDtTS")
.show(5,False))

(fire_ts_df
.select(year("IncidentDate"))
.distinct()
.orderBy(year("IncidentDate"))
.show())

+---------------------+
|ResponseDelayedinMins|
+---------------------+
|5.35                 |
|6.25                 |
|5.2                  |
|5.6                  |
|7.25                 |
+---------------------+
only showing top 5 rows

+-------------------+-------------------+-------------------+
|IncidentDate       |OnWatchDate        |AvailableDtTS      |
+-------------------+-------------------+-------------------+
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 01:51:44|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 03:01:18|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 02:39:50|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 04:16:46|
|2002-01-11 00:00:00|2002-01-10 00:00:00|2002-01-11 06:01:58|
+-------------------+-------------------+-------------------+
only showing top 5 rows

+------------------+
|year(IncidentDate)|
+------------------+
|              2000|
|              2001|
|              2002|
|              2003|
|              2004|
|       

In [0]:
# What were the most common types of fire calls?
#######################################################################################################################################
(fire_ts_df
.select("CallType")
.where(col("CallType").isNotNull())
.groupBy("CallType")
.count()
.orderBy("count",ascending=False)
.show(n=10,truncate=False))

+-------------------------------+------+
|CallType                       |count |
+-------------------------------+------+
|Medical Incident               |113794|
|Structure Fire                 |23319 |
|Alarms                         |19406 |
|Traffic Collision              |7013  |
|Citizen Assist / Service Call  |2524  |
|Other                          |2166  |
|Outside Fire                   |2094  |
|Vehicle Fire                   |854   |
|Gas Leak (Natural and LP Gases)|764   |
|Water Rescue                   |755   |
+-------------------------------+------+
only showing top 10 rows



In [0]:
# Other common DF operations
############################################################################################################################

import pyspark.sql.functions as F

(fire_ts_df
.select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"), F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))
.show())

+--------------+--------------------------+--------------------------+--------------------------+
|sum(NumAlarms)|avg(ResponseDelayedinMins)|min(ResponseDelayedinMins)|max(ResponseDelayedinMins)|
+--------------+--------------------------+--------------------------+--------------------------+
|        176170|         3.892364154521585|               0.016666668|                   1844.55|
+--------------+--------------------------+--------------------------+--------------------------+



In [0]:
# END-TO-END DF EXAMPLE
#########################################################################################################################3
# 1. What were all the different types of fire calls in 2018?
(fire_ts_df
.select("CallType")
.where(year("IncidentDate") == 2018)
.distinct()
.show(20,False))

# 2. What months within the year 2018 saw the highest number of fire calls?
from pyspark.sql.functions import col

(fire_ts_df
.select("NumAlarms", month("IncidentDate"),"CallType")
.where(year("IncidentDate")== 2018)
 .where(col("CallType").like("%Fire%"))
.orderBy("NumAlarms",ascending=False)
.show(n=5,truncate=False))

# 3. Which neighbourhood in San Francisco generated the most fire calls in 2018?
(fire_ts_df
.select("Neighborhood", "City", "NumAlarms", year("IncidentDate"), "CallType")
.where(col("City") == 'San Francisco')
.where(year("IncidentDate") == 2018)
.where(col("CallType").like("%Fire%"))
.orderBy("NumAlarms",ascending=False)
.show(n=3, truncate=False))

# 4. Which neighborhood had the worst response times to fire calls in 2018?
(fire_ts_df
.select("Neighborhood", "City", "ResponseDelayedinMins", year("IncidentDate"), "CallType")
.where(year("IncidentDate") == 2018)
.where(col("CallType").like("%Fire%"))
.orderBy("ResponseDelayedinMins",ascending=False)
.show(n=3, truncate=False))

# 5. Which week in the year 2018 had the most fire calls?

(fire_ts_df
.select(weekofyear("IncidentDate"), "NumAlarms", "City", "IncidentDate", "CallType")
.where(year("IncidentDate") == 2018)
.where(col("CallType").like("%Fire%"))
.orderBy("NumAlarms",ascending=False)
.show(n=3,truncate=False))

+-------------------------------+
|CallType                       |
+-------------------------------+
|Elevator / Escalator Rescue    |
|Alarms                         |
|Odor (Strange / Unknown)       |
|Citizen Assist / Service Call  |
|HazMat                         |
|Vehicle Fire                   |
|Other                          |
|Outside Fire                   |
|Traffic Collision              |
|Assist Police                  |
|Gas Leak (Natural and LP Gases)|
|Water Rescue                   |
|Electrical Hazard              |
|Structure Fire                 |
|Medical Incident               |
|Fuel Spill                     |
|Smoke Investigation (Outside)  |
|Train / Rail Incident          |
|Explosion                      |
|Suspicious Package             |
+-------------------------------+

+---------+-------------------+--------------+
|NumAlarms|month(IncidentDate)|CallType      |
+---------+-------------------+--------------+
|4        |3                  |Structure F

In [0]:
# Typed Objects, Untyped Objects and Generic Rows
#############################################################################################################

from pyspark.sql import Row
row=Row(350,True, "Learning Spark 2E", None)
row[0]
row[1]
row[2]

Out[73]: 'Learning Spark 2E'

In [0]:
# END TO END DATASET EXAMPLE
co2ds=spark.read.json("/FileStore/tables/iot_devices.json")
# Detect failing devices with battery levels below a threshold
(co2ds
.select("device_name","device_id","battery_level")
.where(col("battery_level")<5)
.show(n=30))

# Identify offending countries with high levels of CO2 emissions

(co2ds
.select("cn","c02_level")
.where(col("c02_level")> '1000')
.show(n=20,truncate=False))

# Compute the min and max values for temperature, battery level, CO2 and humidity
(co2ds
.select(min('temp'),max('temp'),min("battery_level"),max("battery_level"),min("c02_level"),max("c02_level"),min("humidity"),max("humidity")).show())



+--------------------+---------+-------------+
|         device_name|device_id|battery_level|
+--------------------+---------+-------------+
| device-mac-36TWSKiT|        3|            2|
|therm-stick-5gimp...|        5|            4|
|sensor-pad-6al7RT...|        6|            3|
|meter-gauge-7GeDoanM|        7|            3|
|sensor-pad-8xUD6p...|        8|            0|
| device-mac-9GcjZ2pw|        9|            3|
|meter-gauge-11dlM...|       11|            3|
|sensor-pad-12Y2kIm0o|       12|            0|
|sensor-pad-14QL93...|       14|            1|
|sensor-pad-16aXmI...|       16|            4|
|meter-gauge-17zb8...|       17|            0|
|sensor-pad-18XULN9Xv|       18|            4|
|therm-stick-25kK6...|       25|            4|
|sensor-pad-28Tsud...|       28|            3|
|device-mac-33B94G...|       33|            3|
|sensor-pad-36VQv8...|       36|            1|
|device-mac-39iklY...|       39|            2|
| sensor-pad-40NjeMqS|       40|            2|
|meter-gauge-

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-149898280422770>[0m in [0;36m<module>[0;34m[0m
[1;32m     19[0m [0;34m[0m[0m
[1;32m     20[0m [0;31m# Sort and group by temperature, c02, humidity and country[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 21[0;31m (co2ds
[0m[1;32m     22[0m [0;34m.[0m[0mselect[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     23[0m [0;34m.[0m[0mgroupBy[0m[0;34m([0m[0;34m"temp"[0m[0;34m,[0m[0;34m"c02_level"[0m[0;34m,[0m[0;34m"humidity"[0m[0;34m,[0m[0;34m"cn"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/spark/python/pyspark/sql/group.py[0m in [0;36m_api[0;34m(self, *cols)[0m
[1;32m     39[0m     [0;32mdef[0m [0m_api[0m[0;34m([0m[0mself[0m[0;34m,[0m [0;34m*[0m[0mcols[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[