In [0]:
# Build a one-column DataFrame called "number" holding the integers 0 through 999.
my_range = spark.range(0, 1000).toDF("number")

In [0]:
# Keep only the even values (`filter` and `where` are the same operation).
divisBy2 = my_range.filter("number % 2 = 0")

In [0]:
# Count the rows that passed the even-number filter defined on divisBy2.
divisBy2.count()

Out[3]: 500

In [0]:
# Load the 2015 flight summary CSV. The first row supplies the column names
# and the column types are inferred from the data.
flightData2015 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("/databricks-datasets/definitive-guide/data/flight-data/csv/2015-summary.csv")
)

In [0]:
# Peek at the first three rows as a list of Row objects.
flightData2015.head(3)

Out[12]: [Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [0]:
# Print the physical plan Spark would use to order the rows by "count".
flightData2015.orderBy("count").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#98 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#98 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=153]
      +- FileScan csv [DEST_COUNTRY_NAME#96,ORIGIN_COUNTRY_NAME#97,count#98] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/databricks-datasets/definitive-guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [0]:
# Set the shuffle-partition count to 5 for this session, then fetch the two
# rows with the smallest "count".
spark.conf.set("spark.sql.shuffle.partitions", "5")
flightData2015.orderBy("count").take(2)

Out[14]: [Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [0]:
# Same query with the shuffle-partition count raised to 10.
spark.conf.set("spark.sql.shuffle.partitions", "10")
flightData2015.orderBy("count").take(2)

Out[15]: [Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [0]:
# Same query with a single shuffle partition.
# NOTE: this setting stays in effect for the rest of the session; the plans
# printed further down show exchanges partitioned into 1 as a result.
spark.conf.set("spark.sql.shuffle.partitions", "1")
flightData2015.orderBy("count").take(2)

Out[16]: [Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [0]:
# SQL access: register the DataFrame as a temporary view so it can be queried
# by the name "flight_data_2015" through spark.sql.

flightData2015.createOrReplaceTempView("flight_data_2015")

In [0]:
# Row count per destination country, expressed as a SQL query on the temp view.
sqlWay = spark.sql("""
    select DEST_COUNTRY_NAME, count(1)
    from flight_data_2015
    group by DEST_COUNTRY_NAME
""")

# The same aggregation expressed with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans, separated by a divider, for side-by-side comparison.
sqlWay.explain()
print("------------------------------")
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functions=[finalmerge_count(merge count#143L) AS count(1)#131L])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#96, 1), ENSURE_REQUIREMENTS, [plan_id=197]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functions=[partial_count(1) AS count#143L])
         +- FileScan csv [DEST_COUNTRY_NAME#96] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/databricks-datasets/definitive-guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


------------------------------
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functions=[finalmerge_count(merge count#145L) AS count(1)#138L])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#96, 1), ENSURE_REQUIREMENTS, [plan_id=218]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functio

In [0]:
# Largest "count" value, computed through SQL; returned as a one-element list of Row.
spark.sql("select max(count) from flight_data_2015").head(1)

Out[22]: [Row(max(count)=370002)]

In [0]:
# Import the functions module under an alias rather than `from ... import max`:
# the bare import rebinds Python's built-in `max` for every later cell that
# runs in this kernel.
from pyspark.sql import functions as F

# Largest "count" value via the DataFrame API; returned as a one-element list
# of Row whose column is still named "max(count)".
flightData2015.select(F.max("count")).take(1)

Out[26]: [Row(max(count)=370002)]

In [0]:
from pyspark.sql.functions import desc

# Total the "count" column per destination country, rename the aggregate
# column, and display the five destinations with the largest totals.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .orderBy(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [0]:
# Build the top-five-destinations query without running it, so that its
# physical plan can be inspected.
example = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .orderBy(desc("destination_total"))
    .limit(5)
)

example.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#235L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#96,destination_total#235L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functions=[finalmerge_sum(merge sum#239L) AS sum(count#98)#231L])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#96, 1), ENSURE_REQUIREMENTS, [plan_id=347]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#96], functions=[partial_sum(count#98) AS sum#239L])
            +- FileScan csv [DEST_COUNTRY_NAME#96,count#98] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/databricks-datasets/definitive-guide/data/flight-data/csv/2015-s..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>


