In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (application name 'Chapter II').
spark = (
    SparkSession.builder
    .appName('Chapter II')
    .getOrCreate()
)

24/08/04 13:55:56 WARN Utils: Your hostname, Khanhs-MAC.local resolves to a loopback address: 127.0.0.1; using 192.168.0.100 instead (on interface en0)
24/08/04 13:55:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/04 13:55:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Build a one-column DataFrame from spark.range(1000) and name the column "number".
myRange = (
    spark
    .range(1000)
    .toDF('number')
)
# Preview only the first five rows.
myRange.show(n=5)

                                                                                

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
+------+
only showing top 5 rows



In [3]:
# Keep only the even numbers; the predicate is a SQL expression string.
even_predicate = "number % 2 = 0"
divisBy2 = myRange.filter(even_predicate)
divisBy2.show(n=5)

+------+
|number|
+------+
|     0|
|     2|
|     4|
|     6|
|     8|
+------+
only showing top 5 rows



## Actions

In [4]:
# Action: return the number of rows in divisBy2 (the even numbers kept above).
divisBy2.count()

500

## An End-to-End Example

In [5]:
# Load the 2015 flight summary CSV: the first line supplies the column names
# and column types are inferred from the data.
flightData2015 = (
    spark.read
    .option('inferSchema', "true")
    .option('header', "true")
    .csv('../data/flight-data/csv/2015-summary.csv')
)

In [6]:
# Fetch the first three rows to the driver as a list of Row objects.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [7]:
# Print the physical plan Spark builds for sorting the data by the count column.
flightData2015.sort("count").explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [count#39 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(count#39 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [plan_id=99]
      +- FileScan csv [DEST_COUNTRY_NAME#37,ORIGIN_COUNTRY_NAME#38,count#39] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/khanhnn/Developer/DE/spark/practice_spark/data/flight-data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [8]:
# Use 5 shuffle partitions for subsequent queries (the sort plan printed earlier shows 200).
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [11]:
# Sort ascending by count and fetch the first two rows, i.e. the smallest counts.
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

## DataFrames and SQL

In [12]:
# Register the DataFrame as a temporary view so spark.sql can query it by name.
flightData2015.createOrReplaceTempView("flight_data_2015")

In [17]:
# The same aggregation written twice: once as SQL against the temporary view ...
sqlWay = spark.sql("""
  SELECT DEST_COUNTRY_NAME, COUNT(1)
  FROM flight_data_2015
  GROUP BY DEST_COUNTRY_NAME
""")

# ... and once with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# Print both physical plans so they can be compared side by side.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#37, 5), ENSURE_REQUIREMENTS, [plan_id=156]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#37] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/khanhnn/Developer/DE/spark/practice_spark/data/flight-data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[count(1)])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#37, 5), ENSURE_REQUIREMENTS, [plan_id=169]
      +- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[partial_count(1)])
         +- FileScan csv [DEST_COUNTRY_NAME#37] Batched: false, DataFilters: [], Format: CSV, Location: InMe

In [18]:
from pyspark.sql.functions import max

In [19]:
flightData2015.select(max("count")).take(1)

[Row(max(count)=370002)]

In [21]:
# Top five destination countries by summed count, expressed in SQL.
top_destinations_query = """
  SELECT DEST_COUNTRY_NAME, SUM(count) as destination_total
  FROM flight_data_2015
  GROUP BY DEST_COUNTRY_NAME
  ORDER BY SUM(count) DESC
  LIMIT(5)
"""
maxSQL = spark.sql(top_destinations_query)
maxSQL.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [24]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-five query: total count per destination,
# renamed to destination_total, sorted descending, limited to five rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [25]:
# Same top-five pipeline as the previous cell, but print its physical plan
# instead of showing the rows.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[destination_total#153L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#37,destination_total#153L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[sum(count#39)])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#37, 5), ENSURE_REQUIREMENTS, [plan_id=307]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#37], functions=[partial_sum(count#39)])
            +- FileScan csv [DEST_COUNTRY_NAME#37,count#39] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[file:/Users/khanhnn/Developer/DE/spark/practice_spark/data/flight-data..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




24/08/05 02:09:51 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 919786 ms exceeds timeout 120000 ms
24/08/05 02:09:51 WARN SparkContext: Killing executors is not supported by current scheduler.
24/08/05 02:09:54 WARN Executor: Issue communicating with driver in heartbeater
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:101)
	at org.apache.spark.rpc.RpcEndpointRef.askSync(RpcEndpointRef.scala:85)
	at org.apache.spark.storage.BlockManagerMaster.registerBlockManager(BlockManagerMaster.scala:80)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:642)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1223)
	at o