In [2]:
import findspark
findspark.init()

In [4]:
import pyspark
from pyspark.sql import SparkSession

In [5]:
# Create the spark session:
spark = SparkSession \
        .builder \
        .appName("ch01") \
        .master("yarn") \
        .config("spark.submit.deployMode", "client") \
        .config("spark.executor.memory", "2g") \
        .config("spark.driver.memory", "2g") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/10 14:27:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/10 14:27:54 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [6]:
myrange = spark.range(1000)

In [None]:
myrange1 = spark.range(1000).toDF("number") 
divby2 = myrange1.where("number % 2 == 0")   # transformation

In [18]:
divby2.take(3)     # action

[Row(number=0), Row(number=2), Row(number=4)]

In [None]:
divby2.explain() # the logical transformation plan

== Physical Plan ==
*(1) Project [id#12L AS number#14L]
+- *(1) Filter ((id#12L % 2) = 0)
   +- *(1) Range (0, 1000, step=1, splits=2)




### End to End Example

In [None]:
flightData2015 = spark.read.format("csv").option("header", True).option("inferSchema", True).load("hdfs:///data/flight-data/csv/2015-summary.csv")

                                                                                

In [29]:
type(flightData2015)

pyspark.sql.dataframe.DataFrame

In [30]:
flightData2015.columns

['DEST_COUNTRY_NAME', 'ORIGIN_COUNTRY_NAME', 'count']

In [67]:
# print schema
flightData2015.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: string (nullable = true)



In [None]:
flightData2015.head(5)     # action

                                                                                

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count='15'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count='344'),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count='15'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count='62')]

In [38]:
flightData2015.count()

256

In [33]:
flightData2015.sort("count").take(5)

                                                                                

[Row(DEST_COUNTRY_NAME='Malta', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Gibraltar', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count='1'),
 Row(DEST_COUNTRY_NAME='Saint Vincent and the Grenadines', ORIGIN_COUNTRY_NAME='United States', count='1'),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count='1')]

### Dataframe and Sql

In [34]:
flightData2015.createOrReplaceTempView("flight_data_2015")

In [46]:
sql_way = spark.sql("""
    select DEST_COUNTRY_NAME, count(*) as cnt
    from flight_data_2015
    group by DEST_COUNTRY_NAME
""")

In [47]:
sql_way.take(5)

[Row(DEST_COUNTRY_NAME='Anguilla', cnt=1),
 Row(DEST_COUNTRY_NAME='Russia', cnt=1),
 Row(DEST_COUNTRY_NAME='Paraguay', cnt=1),
 Row(DEST_COUNTRY_NAME='Senegal', cnt=1),
 Row(DEST_COUNTRY_NAME='Sweden', cnt=1)]

In [50]:
data_frame_way = flightData2015.groupBy("DEST_COUNTRY_NAME").count()
data_frame_way.take(5)

[Row(DEST_COUNTRY_NAME='Anguilla', count=1),
 Row(DEST_COUNTRY_NAME='Russia', count=1),
 Row(DEST_COUNTRY_NAME='Paraguay', count=1),
 Row(DEST_COUNTRY_NAME='Senegal', count=1),
 Row(DEST_COUNTRY_NAME='Sweden', count=1)]

In [69]:
from pyspark.sql.functions import max

In [70]:
flightData2015.select(max("count")).take(1)

[Row(max(count)='986')]

In [56]:
spark.sql("""
    select max(count)
    from flight_data_2015
""").take(1)

[Row(max(count)='986')]

##### Find Top five destination country

In [68]:
### Sql way

spark.sql("""
    select DEST_COUNTRY_NAME, sum(cast(count as int)) as cnt
    from flight_data_2015
    group by DEST_COUNTRY_NAME
    order by cnt desc
    limit 5 
""").show()

+-----------------+------+
|DEST_COUNTRY_NAME|   cnt|
+-----------------+------+
|    United States|411352|
|           Canada|  8399|
|           Mexico|  7140|
|   United Kingdom|  2025|
|            Japan|  1548|
+-----------------+------+



In [75]:
### dataframe Way
from pyspark.sql.functions import col, sum, desc
flightData2015.groupBy("DEST_COUNTRY_NAME") \
    .agg(sum(col("count").cast("long")).alias("cnt")) \
    .sort(desc("cnt")) \
    .limit(5) \
    .show()


+-----------------+------+
|DEST_COUNTRY_NAME|   cnt|
+-----------------+------+
|    United States|411352|
|           Canada|  8399|
|           Mexico|  7140|
|   United Kingdom|  2025|
|            Japan|  1548|
+-----------------+------+



In [76]:
from pyspark.sql.functions import col, sum, desc
flightData2015.groupBy("DEST_COUNTRY_NAME") \
    .agg(sum(col("count").cast("long")).alias("cnt")) \
    .sort(desc("cnt")) \
    .limit(5) \
    .explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- TakeOrderedAndProject(limit=5, orderBy=[cnt#292L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#33,cnt#292L])
   +- HashAggregate(keys=[DEST_COUNTRY_NAME#33], functions=[sum(cast(count#35 as bigint))])
      +- Exchange hashpartitioning(DEST_COUNTRY_NAME#33, 200), ENSURE_REQUIREMENTS, [plan_id=937]
         +- HashAggregate(keys=[DEST_COUNTRY_NAME#33], functions=[partial_sum(cast(count#35 as bigint))])
            +- FileScan csv [DEST_COUNTRY_NAME#33,count#35] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[hdfs://kumar-rke2-1:9000/data/flight-data/csv/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:string>


