In [1]:
# findspark.init() has to run before the pyspark imports below so that the
# local Spark installation is importable from this kernel.
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession

# Create (or reuse) the notebook-wide session: app name "001", "local" master,
# and an explicit warehouse directory.
spark = (
    SparkSession.builder
    .appName("001")
    .master("local")
    .config("spark.sql.warehouse.dir", "file:///C:/tmp/hive")
    .getOrCreate()
)

# Use 5 shuffle partitions for this session instead of Spark's built-in default.
spark.conf.set("spark.sql.shuffle.partitions", "5")
print("SparkSession created with name as 'spark'")

SparkSession created with name as 'spark'


In [2]:
# Show the kernel's current working directory; the relative data path used when
# loading the CSV is resolved against it.
# os.getcwd() is used instead of a bare `pwd` because the bare form depends on
# IPython automagic (single-line cells only, and shadowed by any variable
# named `pwd`).
import os

os.getcwd()

'C:\\Users\\edge\\Desktop\\spark\\pyspark\\jupyter\\SparkTheDefinitiveGuide'

In [4]:
# Load the 2015 flight summary CSV: the first line supplies the column names
# ("header") and column types are inferred from the data ("inferSchema").
#
# The path is written with forward slashes. The earlier backslash form only
# worked because "\.", "\d", "\S" and "\c" are unrecognised escape sequences
# that Python currently leaves untouched (with a deprecation warning), and it
# could not run on non-Windows hosts.
# NOTE(review): the path is relative to the kernel's working directory -
# confirm it matches where the data actually lives.
flightData2015 = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("../../data/SparkTheDefinitiveGuide/flight-data/csv/2015-summary.csv")
)

AnalysisException: Path does not exist: file:/C:/Users/edge/Desktop/spark/data/SparkTheDefinitiveGuide/flight-data/csv/2015-summary.csv;

In [32]:
# Fetch the first three rows to the driver as a list of Row objects.
flightData2015.take(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
# Print the physical plan for ordering the flight data by its "count" column.
(
    flightData2015
    .sort("count")
    .explain()
)

== Physical Plan ==
*(1) Sort [count#16 ASC NULLS FIRST], true, 0
+- *(1) Project [DEST_COUNTRY_NAME#14, ORIGIN_COUNTRY_NAME#15, count#16]
   +- BatchScan[DEST_COUNTRY_NAME#14, ORIGIN_COUNTRY_NAME#15, count#16] CSVScan Location: InMemoryFileIndex[file:/D:/hadoopedge/bigdata/modules/spark/data/SparkTheDefinitiveGuide/flight-d..., ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>




In [6]:
# Sort by "count" (ascending) and bring the first two rows back to the driver.
(
    flightData2015
    .sort("count")
    .take(2)
)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

#### DataFrame vs SQL: the underlying plan and execution are the same.

In [7]:
# DataFrame-API version of the aggregation: number of rows per destination country.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [16]:
# Register the DataFrame under a temp-view name so it can be queried with SQL.
flightData2015.createOrReplaceTempView("flight_data_2015")

# SQL version of the aggregation: number of rows per destination country.
sqlWay = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME"""
)

In [17]:
# Print the physical plan of the SQL query first, then of the DataFrame
# expression, so the two plans can be compared.
sqlWay.explain()
dataFrameWay.explain()

== Physical Plan ==
*(1) Sort [DEST_COUNTRY_NAME#14 ASC NULLS FIRST], true, 0
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
      +- *(1) Project [DEST_COUNTRY_NAME#14]
         +- BatchScan[DEST_COUNTRY_NAME#14] CSVScan Location: InMemoryFileIndex[file:/D:/hadoopedge/bigdata/modules/spark/data/SparkTheDefinitiveGuide/flight-d..., ReadSchema: struct<DEST_COUNTRY_NAME:string>


== Physical Plan ==
*(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[count(1)])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_count(1)])
   +- *(1) Project [DEST_COUNTRY_NAME#14]
      +- BatchScan[DEST_COUNTRY_NAME#14] CSVScan Location: InMemoryFileIndex[file:/D:/hadoopedge/bigdata/modules/spark/data/SparkTheDefinitiveGuide/flight-d..., ReadSchema: struct<DEST_COUNTRY_NAME:string>




In [20]:
# Import the functions module under an alias instead of
# `from pyspark.sql.functions import max`, which would replace Python's
# built-in max() with the Spark column function for every later cell.
from pyspark.sql import functions as F

# Largest value in the "count" column, returned as a one-element list of Row.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [23]:
# Top five destination countries by total flight count, expressed in SQL
# against the "flight_data_2015" temp view.
maxSql = spark.sql(
    """
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
"""
)
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [31]:
from pyspark.sql.functions import desc

# DataFrame-API version of the "top five destinations" query: sum "count" per
# destination, rename the aggregate column, order descending, keep five rows.
maxdest = (
    flightData2015
    .groupby("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
)
maxdest.explain()

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#247L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#14,destination_total#247L])
+- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[sum(cast(count#16 as bigint))])
   +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#14], functions=[partial_sum(cast(count#16 as bigint))])
      +- *(1) Project [DEST_COUNTRY_NAME#14, count#16]
         +- BatchScan[DEST_COUNTRY_NAME#14, count#16] CSVScan Location: InMemoryFileIndex[file:/D:/hadoopedge/bigdata/modules/spark/data/SparkTheDefinitiveGuide/flight-d..., ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>




In [48]:
# Left disabled on purpose: uncomment to stop the SparkSession created in the
# first cell once the notebook is finished.
#spark.stop()