In [98]:
import multiprocessing
import pyspark
import pandas as pd
import numpy as np

nprocs = multiprocessing.cpu_count()
import pyspark.sql.functions as F

from pyspark.sql.functions import *
from pyspark.sql.functions import when

# Create (or reuse) the notebook's SparkSession.
# A bare 'local' master runs Spark with a single worker thread; local[nprocs]
# lets the local master use the same number of cores that the driver-core and
# shuffle-partition settings below are sized for.
spark = (pyspark.sql.SparkSession.builder
 .master('local[{}]'.format(nprocs))
 # NOTE(review): no visible cell reads from MySQL — confirm this connector is still needed
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 .config('spark.driver.cores', nprocs)
 # one shuffle partition per available core
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

## Exercise 1:

Create a Jupyter notebook or Python script named `spark101` for this exercise.

Create a Spark DataFrame that contains your favorite programming languages.

- Create a dataframe with one column named `language`
> Hint: Start with a pandas dataframe. Maybe use a dictionary?
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [99]:
# Favorite programming languages, held in a one-column pandas DataFrame
favorite_languages = [
    'python',
    'javascript',
    'java',
    'c#',
    'typescript',
    'r',
    'swift',
]
languages = pd.DataFrame({'language': favorite_languages})
languages

Unnamed: 0,language
0,python
1,javascript
2,java
3,c#
4,typescript
5,r
6,swift


In [100]:
# Hand the pandas frame to the Spark session to obtain a Spark DataFrame
df = spark.createDataFrame(data=languages)
df

DataFrame[language: string]

In [101]:
# Print each column's name, Spark type, and nullability for the languages frame
df.printSchema()

root
 |-- language: string (nullable = true)



In [102]:
# Spark DataFrames have no .shape attribute: count the rows with an action
# and take the column count from the column list.
n_rows = df.count()
n_cols = len(df.columns)
print("DataFrame shape: ", n_rows, " x ", n_cols)

DataFrame shape:  7  x  1


In [103]:
# Display the first five rows of the languages frame
df.show(n=5)

+----------+
|  language|
+----------+
|    python|
|javascript|
|      java|
|        c#|
|typescript|
+----------+
only showing top 5 rows



## Exercise 2:

Load the `mpg` dataset as a spark dataframe.

a. Create 1 column of output that contains a message like the one below for each record:

    The 1999 audi a4 has a 4 cylinder engine.

> Hint: You will need to concatenate values that already exist in the data with string literals

b. Transform the `trans` column so that it contains only `manual` or `auto`.

> Hint: Consider spark string methods and `when().otherwise()` chaining

In [104]:
from pydataset import data

# pydataset returns a pandas frame; convert it to a Spark DataFrame
mpg_pandas = data("mpg")
mpg = spark.createDataFrame(mpg_pandas)
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [105]:
from pyspark.sql.functions import lit

# a. Build one sentence per vehicle by interleaving column values with
# string literals (lit wraps plain strings so concat can treat them as columns).
description = concat(
    lit('The '), col('year'),
    lit(' '), col('manufacturer'),
    lit(' '), col('model'),
    lit(' has a '), col('cyl'),
    lit(' cylinder engine.'),
).alias('vehicle_cylinder_description')

# truncate=False keeps the full sentence visible in the printed table
mpg.select(description).show(5, truncate=False)

+-----------------------------------------+
|vehicle_cylinder_description             |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



In [106]:
# Preview the first five mpg rows again; the raw `trans` values shown here are recoded in the next cell
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [107]:
# b. Transform the trans column so that it only contains either manual or auto.

# Values starting with 'a' are labelled auto; everything else falls through to manual.
trans_type = (
    when(col('trans').like('a%'), 'auto')
    .otherwise('manual')
    .alias('trans_type')
)
mpg.select('trans', trans_type).show(5)

+----------+----------+
|     trans|trans_type|
+----------+----------+
|  auto(l5)|      auto|
|manual(m5)|    manual|
|manual(m6)|    manual|
|  auto(av)|      auto|
|  auto(l5)|      auto|
+----------+----------+
only showing top 5 rows



## Exercise 3: 

Load the `tips` dataset as a spark dataframe.

a. What percentage of observations are smokers?

b. Create a column that contains the tip percentage

c. Calculate the average tip percentage for each combination of sex and smoker.

In [108]:
# Load the tips dataset through pydataset (pandas) and convert it to Spark
tips_pandas = data("tips")
tips = spark.createDataFrame(tips_pandas)
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [109]:
# a. Share of observations per smoker status, as a whole-number percentage
total_rows = tips.count()
smoker_pct = round(col('count') / total_rows * 100, 0)
tips.groupBy('smoker').count().withColumn('pct', smoker_pct).show()

+------+-----+----+
|smoker|count| pct|
+------+-----+----+
|    No|  151|62.0|
|   Yes|   93|38.0|
+------+-----+----+



In [110]:
# b. Tip as a fraction of the total bill (not multiplied by 100)
tip_share = col('tip') / col('total_bill')
tips = tips.withColumn('tip_pct', tip_share)
tips.show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

In [111]:
# c. Mean tip_pct for every sex / smoker combination:
# one row per sex, one column per smoker value.
(
    tips
    .groupBy('sex')
    .pivot('smoker')
    .agg(F.mean('tip_pct'))
    .show()
)

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941032|
|  Male|0.1606687151291298|0.15277117520248512|
+------+------------------+-------------------+



---
## Exercise 4:

Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to Fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What are the average high and low temperatures on sunny days in July of 2013 and 2014?
- What percentage of days were rainy in Q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [112]:
# Alias the vega_datasets loader so it does not shadow pydataset's `data`,
# which the earlier mpg/tips cells call; without the alias, re-running those
# cells after this one would hit the wrong loader.
from vega_datasets import data as vega_data

# Keep the pandas frame under its own name so `df` is only ever the Spark DataFrame
weather_pandas = vega_data.seattle_weather()
df = spark.createDataFrame(weather_pandas)
df.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [113]:
# Convert the temperatures to fahrenheit
# NOTE: temp_max / temp_min are overwritten in place on `df`, so running this
# cell a second time would apply the conversion to already-converted values.
for temp_col in ('temp_max', 'temp_min'):
    df = df.withColumn(temp_col, round(col(temp_col) * 1.8 + 32))
df.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    55.0|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    51.0|    37.0| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    53.0|    45.0| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    54.0|    42.0| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|    48.0|    37.0| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [123]:
# Which month has the most rain, on average?
# Average precipitation per calendar month, then take the top row.
rain_by_month = (
    df.withColumn('month', month(col('date')))
    .groupBy('month')
    .agg(avg('precipitation').alias('avg_rain'))
)
rain_by_month.orderBy(col('avg_rain').desc()).first()

Row(month=11, avg_rain=5.354166666666667)

In [124]:
# Which year was the windiest?
# Windiness here is the yearly total of the daily wind readings.
wind_by_year = (
    df.withColumn('year', year(col('date')))
    .groupBy('year')
    .agg(F.sum('wind').alias('windiness'))
)
wind_by_year.orderBy(col('windiness').desc()).first()

Row(year=2012, windiness=1244.7)

In [129]:
# What is the most frequent type of weather in January?
# Keep only January rows, count rows per weather label, take the largest count.
january = df.withColumn('month', month('date')).filter(col('month') == 1)
weather_counts = january.groupBy('weather').count()
weather_counts.orderBy(col('count').desc()).first()

Row(weather='fog', count=38)

In [142]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?
sunny_july = (
    df.withColumn('month', month('date'))
    .withColumn('year', year('date'))
    # year() produces integers, so match against int literals instead of
    # relying on Spark implicitly casting the strings '2013' / '2014'
    .filter(col('year').isin(2013, 2014))
    .filter(col('month') == 7)
    .filter(col('weather') == 'sun')
)
# Single-row result: mean of the daily highs and mean of the daily lows
sunny_july.agg(
    avg('temp_max').alias('avg_high_temp'),
    avg('temp_min').alias('avg_low_temp'),
).show()

+-----------------+-----------------+
|    avg_high_temp|     avg_low_temp|
+-----------------+-----------------+
|80.28846153846153|57.53846153846154|
+-----------------+-----------------+



In [157]:
# What percentage of days were rainy in q3 of 2015?
# A day counts as rainy when its weather label is 'rain'; the mean of the
# 0/1 flag is the share of such days (reported as a fraction, not x100).
q3_2015 = df.filter((year('date') == 2015) & (quarter('date') == 3))
is_rain = when(col('weather') == 'rain', 1).otherwise(0).alias('rain')
q3_2015.select(is_rain).agg(round(mean('rain'), 2)).show()

+-------------------+
|round(avg(rain), 2)|
+-------------------+
|               0.02|
+-------------------+



In [160]:
# Preview the weather frame again (temp_max / temp_min were already converted in an earlier cell)
df.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    55.0|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    51.0|    37.0| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    53.0|    45.0| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    54.0|    42.0| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|    48.0|    37.0| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [161]:
# For each year, find what percentage of days it rained (had non-zero precipitation).
# Flag each day 1/0 on precipitation > 0; the per-year mean of the flag is the
# share of rainy days (reported as a fraction, not x100).
rained = when(col('precipitation') > 0, 1).otherwise(0).alias('rain')
(
    df.withColumn('year', year('date'))
    .select(rained, 'year')
    .groupBy('year')
    .agg(round(mean('rain'), 2))
    .show()
)

+----+-------------------+
|year|round(avg(rain), 2)|
+----+-------------------+
|2012|               0.48|
|2013|               0.42|
|2014|               0.41|
|2015|               0.39|
+----+-------------------+

