# Spark API

In [1]:
import pydataset # for demonstration
import pyspark

In [2]:
# Reuse the active SparkSession if there is one; otherwise create it.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
# Load the example `tips` dataset. At this point `tips` is a pandas DataFrame.
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Build the Spark DataFrame straight from the source dataset (createDataFrame accepts
# any pandas DataFrame). Reading the dataset here, rather than converting whatever
# `tips` currently holds, keeps this cell safe to re-run after `tips` has already
# been rebound to a Spark DataFrame.
tips = spark.createDataFrame(pydataset.data('tips'))

In [5]:
# `.show()` prints rows as a side effect and returns None, so `tips2` ends up
# holding None rather than a DataFrame (confirmed in the next two cells).
tips2 = tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [6]:
# Displays nothing: `tips2` is the return value of `.show()`, which is None.
tips2

In [7]:
# Confirms that `.show()` returned None instead of a DataFrame.
type(tips2)

NoneType

In [8]:
# Unlike `.show()`, `.head(5)` returns the first five rows as an indexable sequence.
first_five_rows = tips.head(5)

In [9]:
# Pick the first of the returned rows.
first_row = first_five_rows[0]

In [10]:
# Fields of a returned row can be read as attributes.
first_row.total_bill

16.99

In [11]:
# The repr of a Spark DataFrame lists only the schema (column names and types), not the rows.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [12]:
# Project three columns and print them (`.show()` prints the first 20 rows by default).
(
    tips
    .select('total_bill', 'smoker', 'day')
    .show()
)

+----------+------+---+
|total_bill|smoker|day|
+----------+------+---+
|     16.99|    No|Sun|
|     10.34|    No|Sun|
|     21.01|    No|Sun|
|     23.68|    No|Sun|
|     24.59|    No|Sun|
|     25.29|    No|Sun|
|      8.77|    No|Sun|
|     26.88|    No|Sun|
|     15.04|    No|Sun|
|     14.78|    No|Sun|
|     10.27|    No|Sun|
|     35.26|    No|Sun|
|     15.42|    No|Sun|
|     18.43|    No|Sun|
|     14.83|    No|Sun|
|     21.58|    No|Sun|
|     10.33|    No|Sun|
|     16.29|    No|Sun|
|     16.97|    No|Sun|
|     20.65|    No|Sat|
+----------+------+---+
only showing top 20 rows



In [13]:
# Arithmetic on columns yields a new Column expression; nothing is displayed here.
tip_percentage = tips['tip'] / tips['total_bill']

In [14]:
# `.alias` returns a renamed Column expression; `tip_percentage` itself is not
# modified (the next cell shows it is still unnamed).
tip_percentage.alias('tip_perc')

Column<'(tip / total_bill) AS tip_perc'>

In [15]:
# Still without a name: the `.alias(...)` call above did not change this object.
tip_percentage

Column<'(tip / total_bill)'>

In [16]:
# Schema before the new column is added.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [17]:
# Spark DataFrame methods return a new DataFrame, so the result must be re-assigned
# to keep the new column. `withColumn` appends `tip_perc`, or replaces it if it is
# already present, so re-running this cell cannot produce a duplicate column.
tips = tips.withColumn('tip_perc', tip_percentage)

In [18]:
# The schema now ends with the new `tip_perc` column.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint, tip_perc: double]

Spark DataFrames are not modified in place: methods such as `.select` return a new DataFrame, so to keep a change you must re-assign the result!

Can we use pandas/numpy methods (e.g. `np.where`) with a Spark DataFrame? No, but there are equivalent ways to do it with the Spark API (e.g. `when(...).otherwise(...)`, shown below).

In [19]:
# Cast a column to another type. Both result columns are named `size` (see the
# schema below); add `.alias(...)` to the cast column if they need to be told apart.
tips.select('size', tips.size.cast('string'))

DataFrame[size: bigint, size: string]

In [20]:
from pyspark.sql.functions import regexp_replace, regexp_extract

In [21]:
# Builds a Column expression from a column name alone; no DataFrame is involved yet.
regexp_extract('time', r'^(\w)', 1)

Column<'regexp_extract(time, ^(\w), 1)'>

In [22]:
# Apply both regex helpers to the `time` column and show ten rows.
tips.select(
    'time',
    # (column, pattern, replacement): drop the last three characters, e.g. "Dinner" -> "Din".
    # This column is not aliased, so its header is the generated expression text.
    regexp_replace('time', r'.{3}$', ''),
    # (column, pattern, group index): group 1 captures the first word character.
    regexp_extract('time', r'^(\w)', 1).alias('first_letter')
).show(10)

+------+--------------------------------+------------+
|  time|regexp_replace(time, .{3}$, , 1)|first_letter|
+------+--------------------------------+------------+
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
+------+--------------------------------+------------+
only showing top 10 rows



In [23]:
# Size of the DataFrame as (number of columns, number of rows).
len(tips.columns), tips.count()

(8, 244)

In [24]:
# NOTE: this `sum` is Spark's column aggregate. From here on it shadows Python's
# built-in `sum`, so `sum([1, 2, 3])` will no longer work in this notebook.
from pyspark.sql.functions import mean, sum

In [25]:
# Aggregate over the whole DataFrame; `mean` appears as `avg(total_bill)` in the header.
tips.select(mean('total_bill')).show()

+------------------+
|   avg(total_bill)|
+------------------+
|19.785942622950813|
+------------------+



In [26]:
from pyspark.sql.functions import concat, lit

In [27]:
# Build labels such as "Party of 4" / "Party of 3".
# `lit` wraps the constant text; the bare string 'size' is treated as a column name.
tips.select(
    'size',
    concat(lit('Party of '), 'size'),
).show()

+----+-----------------------+
|size|concat(Party of , size)|
+----+-----------------------+
|   2|             Party of 2|
|   3|             Party of 3|
|   3|             Party of 3|
|   2|             Party of 2|
|   4|             Party of 4|
|   4|             Party of 4|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   2|             Party of 2|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   2|             Party of 2|
|   3|             Party of 3|
|   3|             Party of 3|
|   3|             Party of 3|
|   3|             Party of 3|
+----+-----------------------+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import when

In [29]:
# Conditional column logic: `when(...).otherwise(...)` renders as a CASE WHEN expression.
when(tips.tip_perc > .20, 'good tip!').otherwise('not a good tip')

Column<'CASE WHEN (tip_perc > 0.2) THEN good tip! ELSE not a good tip END'>

In [30]:
# Label each row according to whether the tip was more than 20% of the bill.
tips.select(
    'tip_perc',
    when(tips.tip_perc > .20, 'good tip!').otherwise('not a good tip').alias('tip_description')
).show()

+-------------------+---------------+
|           tip_perc|tip_description|
+-------------------+---------------+
|0.05944673337257211| not a good tip|
|0.16054158607350097| not a good tip|
|0.16658733936220846| not a good tip|
| 0.1397804054054054| not a good tip|
|0.14680764538430255| not a good tip|
|0.18623962040332148| not a good tip|
|0.22805017103762829|      good tip!|
|0.11607142857142858| not a good tip|
|0.13031914893617022| not a good tip|
| 0.2185385656292287|      good tip!|
| 0.1665043816942551| not a good tip|
|0.14180374361883155| not a good tip|
|0.10181582360570687| not a good tip|
|0.16277807921866522| not a good tip|
|0.20364126770060686|      good tip!|
|0.18164967562557924| not a good tip|
| 0.1616650532429816| not a good tip|
|0.22774708410067526|      good tip!|
|0.20624631703005306|      good tip!|
|0.16222760290556903| not a good tip|
+-------------------+---------------+
only showing top 20 rows



In [31]:
# Keep only rows where the tip was more than 20% of the bill.
# `.where` and `.filter` (next cell) produce the same output.
tips.where(tips.tip_perc > .2).show(10)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|0.20364126770060686|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|0.22774708410067526|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|0.20624631703005306|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|0.22767857142857142|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|0.21951219512195122|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|0.22492127755285649|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2| 0.2526724975704568|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|0.20557681793329688|
+----------+----+------+------+---+------+----+-------------------+
only showing top 10 rows



In [32]:
# Same condition written with `.filter`; the rows match the `.where` cell above.
tips.filter(tips.tip_perc > .2).show(10)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|0.20364126770060686|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|0.22774708410067526|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|0.20624631703005306|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|0.22767857142857142|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|0.21951219512195122|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|0.22492127755285649|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2| 0.2526724975704568|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|0.20557681793329688|
+----------+----+------+------+---+------+----+-------------------+
only showing top 10 rows



In [33]:
# Ascending sort by party size. Rows that tie on `size` have no fixed order:
# compare the tied rows here with the `.orderBy` output in the next cell.
tips.sort(tips.size).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|0.13793103448275862|
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|0.32573289902280134|
|     10.07|1.83|Female|    No|Thur| Lunch|   1| 0.1817279046673287|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|0.22377622377622378|
|     19.81|4.19|Female|   Yes|Thur| Lunch|   2|0.21150933871781932|
|     12.76|2.23|Female|   Yes| Sat|Dinner|   2|0.17476489028213166|
|     15.69| 1.5|  Male|   Yes| Sun|Dinner|   2|0.09560229445506692|
|      7.56|1.44|  Male|    No|Thur| Lunch|   2|0.19047619047619047|
|     13.81| 2.0|  Male|   Yes| Sat|Dinner|   2| 0.1448225923244026|
|     15.48|2.02|  Male|   Yes|Thur| Lunch|   2|0.13049095607235142|
|     16.58| 4.0|  Male|   Yes|Thur| Lunch|   2|0.24125452352231608|
|     23.33|5.65|  Male|   Yes| Su

In [34]:
# `.orderBy` sorts just like `.sort`; only the order among tied rows differs
# from the previous cell's output.
tips.orderBy(tips.size).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|0.32573289902280134|
|     10.07|1.83|Female|    No|Thur| Lunch|   1| 0.1817279046673287|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|0.13793103448275862|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|0.22377622377622378|
|     13.81| 2.0|  Male|   Yes| Sat|Dinner|   2| 0.1448225923244026|
|     15.01|2.09|  Male|   Yes| Sat|Dinner|   2|0.13924050632911392|
|     16.45|2.47|Female|    No| Sat|Dinner|   2| 0.1501519756838906|
|     17.29|2.71|  Male|    No|Thur| Lunch|   2|  0.156737998843262|
|     11.02|1.98|  Male|   Yes| Sat|Dinner|   2|0.17967332123411978|
|     14.73| 2.2|Female|    No| Sat|Dinner|   2|0.14935505770536323|
|     20.23|2.01|  Male|    No| Sat|Dinner|   2|0.09935739001482945|
|     19.44| 3.0|  Male|   Yes|Thu

Filtering and Sorting:

- `.filter` or `.where` to filter rows based on a condition
- `.sort` or `.orderBy` to sort the data

In [35]:
# `.desc()` on a column sorts largest-first.
tips.orderBy(tips.size.desc()).show(5)

+----------+---+------+------+----+------+----+-------------------+
|total_bill|tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+---+------+------+----+------+----+-------------------+
|     27.05|5.0|Female|    No|Thur| Lunch|   6|0.18484288354898337|
|      34.3|6.7|  Male|    No|Thur| Lunch|   6|0.19533527696793004|
|      29.8|4.2|Female|    No|Thur| Lunch|   6|0.14093959731543623|
|     48.17|5.0|  Male|    No| Sun|Dinner|   6|0.10379904504878555|
|     41.19|5.0|  Male|    No|Thur| Lunch|   5|0.12138868657441128|
+----------+---+------+------+----+------+----+-------------------+
only showing top 5 rows



In [36]:
from pyspark.sql.functions import asc, desc

In [37]:
# Sort on several keys: largest parties first, then smallest bill first within each size.
tips.orderBy(desc('size'), asc('total_bill')).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|     27.05| 5.0|Female|    No|Thur| Lunch|   6|0.18484288354898337|
|      29.8| 4.2|Female|    No|Thur| Lunch|   6|0.14093959731543623|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|0.19533527696793004|
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|0.10379904504878555|
|     20.69| 5.0|  Male|    No| Sun|Dinner|   5| 0.2416626389560174|
|     28.15| 3.0|  Male|   Yes| Sat|Dinner|   5|0.10657193605683837|
|     29.85|5.14|Female|    No| Sun|Dinner|   5| 0.1721943048576214|
|     30.46| 2.0|  Male|   Yes| Sun|Dinner|   5|0.06565988181221273|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|0.12138868657441128|
|     16.49| 2.0|  Male|    No| Sun|Dinner|   4|0.12128562765312312|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|0.13138686131386862|
|     18.29|3.76|  Male|   Yes| Sa

## Part 2: Aggregating and More Examples

In [40]:
# Reminder of the current columns, including the derived `tip_perc`.
tips.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [42]:
# Average bill for each meal time.
(
    tips
    .groupBy('time')
    .agg(mean('total_bill'))
    .show()
)

+------+------------------+
|  time|   avg(total_bill)|
+------+------------------+
|Dinner| 20.79715909090909|
| Lunch|17.168676470588235|
+------+------------------+



In [45]:
# Average bill for each (time, day) pair, ordered by the grouping keys.
(
    tips
    .groupBy('time', 'day')
    .agg(mean('total_bill'))
    .orderBy('time', 'day')
    .show()
)

+------+----+------------------+
|  time| day|   avg(total_bill)|
+------+----+------------------+
|Dinner| Fri| 19.66333333333333|
|Dinner| Sat|20.441379310344825|
|Dinner| Sun|21.409999999999997|
|Dinner|Thur|             18.78|
| Lunch| Fri|12.845714285714285|
| Lunch|Thur|17.664754098360653|
+------+----+------------------+



In [47]:
# NOTE: these are Spark column aggregates. From here on they shadow Python's
# built-in `min` and `max` in this notebook.
from pyspark.sql.functions import min, max

In [49]:
# Several aggregates per group; `.alias` gives each result column a readable name.
tips.groupBy('time', 'day').agg(
    mean('total_bill').alias('avg_bill'),
    min('tip').alias('smallest_tip'),
    max('tip').alias('largest_tip')
).show()

+------+----+------------------+------------+-----------+
|  time| day|          avg_bill|smallest_tip|largest_tip|
+------+----+------------------+------------+-----------+
|Dinner| Sun|21.409999999999997|        1.01|        6.5|
|Dinner| Sat|20.441379310344825|         1.0|       10.0|
| Lunch|Thur|17.664754098360653|        1.25|        6.7|
|Dinner| Fri| 19.66333333333333|         1.0|       4.73|
|Dinner|Thur|             18.78|         3.0|        3.0|
| Lunch| Fri|12.845714285714285|        1.58|       3.48|
+------+----+------------------+------------+-----------+



In [50]:
# Pivot: one row per `time`, one column per `day`; each cell is the mean total_bill
# (null where a time/day combination has no rows).
tips.groupBy('time').pivot('day').agg(mean('total_bill')).show()

+------+------------------+------------------+------------------+------------------+
|  time|               Fri|               Sat|               Sun|              Thur|
+------+------------------+------------------+------------------+------------------+
| Lunch|12.845714285714285|              null|              null|17.664754098360653|
|Dinner| 19.66333333333333|20.441379310344825|21.409999999999997|             18.78|
+------+------------------+------------------+------------------+------------------+



In [51]:
# Row counts for every day/time combination.
tips.crosstab('day', 'time').show()

+--------+------+-----+
|day_time|Dinner|Lunch|
+--------+------+-----+
|     Sat|    87|    0|
|     Sun|    76|    0|
|    Thur|     1|   61|
|     Fri|    12|    7|
+--------+------+-----+



## Spark Miscellanea

In [52]:
# Register the DataFrame under the name `tips` so SQL queries can refer to it.
tips.createOrReplaceTempView('tips')

In [55]:
# Query the registered view with SQL; the result supports `.show` like any other DataFrame here.
spark.sql('select * from tips').show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [57]:
# The same per-(time, day) aggregation as the earlier `groupBy('time', 'day').agg(...)`
# cell, expressed in SQL against the `tips` view.
spark.sql('''
SELECT
    time,
    day,
    mean(total_bill) AS avg_bill,
    min(tip) AS smallest_tip,
    max(tip) AS biggest_tip
FROM tips
GROUP BY time, day
''').show()

+------+----+------------------+------------+-----------+
|  time| day|          avg_bill|smallest_tip|biggest_tip|
+------+----+------------------+------------+-----------+
|Dinner| Sun|21.409999999999997|        1.01|        6.5|
|Dinner| Sat|20.441379310344825|         1.0|       10.0|
| Lunch|Thur|17.664754098360653|        1.25|        6.7|
|Dinner| Fri| 19.66333333333333|         1.0|       4.73|
|Dinner|Thur|             18.78|         3.0|        3.0|
| Lunch| Fri|12.845714285714285|        1.58|       3.48|
+------+----+------------------+------------+-----------+



In [58]:
from pyspark.sql.functions import expr

In [60]:
# `expr` turns a SQL expression string (including its `AS` alias) into a column.
tips.select('*', expr('total_bill / size AS price_per_person')).show()

+----------+----+------+------+---+------+----+-------------------+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|  price_per_person|
+----------+----+------+------+---+------+----+-------------------+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|             8.495|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|3.4466666666666668|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846| 7.003333333333334|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|             11.84|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|            6.1475|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|            6.3225|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|             4.385|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|              6.72|
|     15.04|1.96|  Male|    No|Sun|Dinner| 

In [61]:
# `expr` can also supply an aggregate expression to `.agg`.
tips.groupBy('smoker').agg(expr('MAX(total_bill) AS biggest_bill')).show()

+------+------------+
|smoker|biggest_bill|
+------+------------+
|    No|       48.33|
|   Yes|       50.81|
+------+------------+



In [63]:
# Second example dataset, loaded and converted to a Spark DataFrame in one step.
mpg = spark.createDataFrame(pydataset.data('mpg'))

In [64]:
# Print the physical plan; for the untouched DataFrame it is a single scan step.
mpg.explain()

== Physical Plan ==
*(1) Scan ExistingRDD[manufacturer#1092,model#1093,displ#1094,year#1095L,cyl#1096L,trans#1097,drv#1098,cty#1099L,hwy#1100L,fl#1101,class#1102]




In [66]:
# Selecting and renaming a column adds a Project step above the scan.
mpg.select(mpg.hwy.alias('highway_mileage')).explain()

== Physical Plan ==
*(1) Project [hwy#1100L AS highway_mileage#1116L]
+- *(1) Scan ExistingRDD[manufacturer#1092,model#1093,displ#1094,year#1095L,cyl#1096L,trans#1097,drv#1098,cty#1099L,hwy#1100L,fl#1101,class#1102]




In [68]:
# Filter, then select: the plan applies the Filter first and the Project on top of it.
mpg.filter(mpg.cyl == 4).select(mpg.hwy.alias('highway_mileage')).explain()

== Physical Plan ==
*(1) Project [hwy#1100L AS highway_mileage#1120L]
+- *(1) Filter (isnotnull(cyl#1096L) AND (cyl#1096L = 4))
   +- *(1) Scan ExistingRDD[manufacturer#1092,model#1093,displ#1094,year#1095L,cyl#1096L,trans#1097,drv#1098,cty#1099L,hwy#1100L,fl#1101,class#1102]




In [69]:
# Select, then filter: the plan still places the Filter beneath the Project, the
# same shape as the previous cell even though the calls were made in the other order.
mpg.select('cyl', mpg.hwy.alias('highway_mileage')).filter(mpg.cyl == 4).explain()

== Physical Plan ==
*(1) Project [cyl#1096L, hwy#1100L AS highway_mileage#1122L]
+- *(1) Filter (isnotnull(cyl#1096L) AND (cyl#1096L = 4))
   +- *(1) Scan ExistingRDD[manufacturer#1092,model#1093,displ#1094,year#1095L,cyl#1096L,trans#1097,drv#1098,cty#1099L,hwy#1100L,fl#1101,class#1102]




## More Examples

In [70]:
from vega_datasets import data

# Load the Seattle weather data and hand it to Spark in a single step, so `weather`
# is only ever bound to a Spark DataFrame (same pattern as the `mpg` cell).
# `date` is converted to a string before the data is passed to Spark.
weather = spark.createDataFrame(
    data.seattle_weather().assign(date=lambda df: df.date.astype(str))
)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



In [79]:
# Schema check: `date` is a string because it was converted with `astype(str)` when loading.
weather

DataFrame[date: string, precipitation: double, temp_max: double, temp_min: double, wind: double, weather: string]

In [76]:
# `.withColumn` appends a derived column, here a boolean flag for any precipitation.
# The result is not assigned, so `weather` itself keeps its original columns.
weather.withColumn('did_rain', weather.precipitation > 0).show(5)

+----------+-------------+--------+--------+----+-------+--------+
|      date|precipitation|temp_max|temp_min|wind|weather|did_rain|
+----------+-------------+--------+--------+----+-------+--------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|   false|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|    true|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|    true|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|    true|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|    true|
+----------+-------------+--------+--------+----+-------+--------+
only showing top 5 rows



In [80]:
from pyspark.sql.functions import month, year, quarter

# Total precipitation for each (year, quarter), smallest total first.
# `sum` is the Spark aggregate imported in an earlier cell, not Python's built-in.
(
    weather
    .groupBy(year("date").alias("year"), quarter("date").alias("quarter"))
    .agg(sum("precipitation").alias("total_rainfall"))
    .orderBy("total_rainfall")
    .show()
)

+----+-------+------------------+
|year|quarter|    total_rainfall|
+----+-------+------------------+
|2012|      3|27.200000000000003|
|2015|      2| 72.29999999999997|
|2015|      3|106.69999999999999|
|2014|      3|122.29999999999998|
|2013|      4|             177.9|
|2013|      3|             191.2|
|2012|      2|195.39999999999998|
|2014|      2|204.90000000000006|
|2013|      1|215.70000000000005|
|2013|      2|243.20000000000005|
|2015|      1|340.70000000000016|
|2014|      4|416.40000000000015|
|2012|      1| 448.6000000000001|
|2014|      1| 489.2000000000001|
|2012|      4| 554.8000000000001|
|2015|      4| 619.4999999999999|
+----+-------+------------------+



Number of days with freezing temperatures for each month in 2013 (a day counts as freezing when the average of `temp_min` and `temp_max` is at or below 0)

In [81]:
weather.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [84]:
from pyspark.sql.functions import col

In [86]:
# Column reference obtained through attribute access on the DataFrame.
weather.temp_min

Column<'temp_min'>

In [87]:
# `col` builds the same-looking column reference from the name alone.
col('temp_min')

Column<'temp_min'>

In [89]:
# `col` does not check the name against any DataFrame, so an unknown name still yields a Column.
col('abcd')

Column<'abcd'>

In [88]:
# Attribute access is checked against the DataFrame, so an unknown column name raises
# AttributeError. The error is caught and printed so the notebook still runs top to bottom.
try:
    weather.abcd
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'DataFrame' object has no attribute 'abcd'

In [93]:
# Count, for each month of 2013, the days with freezing temperatures.
# `temp_avg` is the midpoint of temp_min and temp_max; `freezing_temps` is 1 when that
# midpoint is <= 0 and 0 otherwise, so summing it per month counts the days.
# `sum` is pyspark.sql.functions.sum (imported in an earlier cell), not Python's built-in.
(
    weather.filter(year("date") == 2013)
    .withColumn("temp_avg", (weather.temp_min + weather.temp_max) / 2)
    .withColumn("freezing_temps", (col('temp_avg') <= 0).cast("int"))
    .withColumn("month", month("date"))
    .groupBy("month")
    .agg(sum("freezing_temps").alias("no_of_days_with_freezing_temps"))
    .sort("month")
    .show()
)

+-----+------------------------------+
|month|no_of_days_with_freezing_temps|
+-----+------------------------------+
|    1|                             3|
|    2|                             0|
|    3|                             0|
|    4|                             0|
|    5|                             0|
|    6|                             0|
|    7|                             0|
|    8|                             0|
|    9|                             0|
|   10|                             0|
|   11|                             0|
|   12|                             5|
+-----+------------------------------+



In [94]:
# The same freezing-days query, ending in `.explain()` to print the physical plan
# instead of the result. The plan shows the string `date` column being cast to a date.
(
    weather.filter(year("date") == 2013)
    .withColumn("temp_avg", (weather.temp_min + weather.temp_max) / 2)
    .withColumn("freezing_temps", (col('temp_avg') <= 0).cast("int"))
    .withColumn("month", month("date"))
    .groupBy("month")
    .agg(sum("freezing_temps").alias("no_of_days_with_freezing_temps"))
    .sort("month")
    .explain()
)

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Sort [month#1640 ASC NULLS FIRST], true, 0
   +- Exchange rangepartitioning(month#1640 ASC NULLS FIRST, 200), ENSURE_REQUIREMENTS, [id=#1304]
      +- HashAggregate(keys=[month#1640], functions=[sum(freezing_temps#1631)])
         +- Exchange hashpartitioning(month#1640, 200), ENSURE_REQUIREMENTS, [id=#1301]
            +- HashAggregate(keys=[month#1640], functions=[partial_sum(freezing_temps#1631)])
               +- Project [cast((((temp_min#1128 + temp_max#1127) / 2.0) <= 0.0) as int) AS freezing_temps#1631, month(cast(date#1125 as date)) AS month#1640]
                  +- Filter (isnotnull(date#1125) AND (year(cast(date#1125 as date)) = 2013))
                     +- Scan ExistingRDD[date#1125,precipitation#1126,temp_max#1127,temp_min#1128,wind#1129,weather#1130]




In [95]:
# The same freezing-days query, ending in `.toPandas()` to bring the (small) aggregated
# result back as a pandas DataFrame.
(
    weather.filter(year("date") == 2013)
    .withColumn("temp_avg", (weather.temp_min + weather.temp_max) / 2)
    .withColumn("freezing_temps", (col('temp_avg') <= 0).cast("int"))
    .withColumn("month", month("date"))
    .groupBy("month")
    .agg(sum("freezing_temps").alias("no_of_days_with_freezing_temps"))
    .sort("month")
    .toPandas()
)

Unnamed: 0,month,no_of_days_with_freezing_temps
0,1,3
1,2,0
2,3,0
3,4,0
4,5,0
5,6,0
6,7,0
7,8,0
8,9,0
9,10,0
