# Exercises

In [52]:
import pandas as pd
import numpy as np

import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

from pyspark.sql.functions import col, expr, concat, lit, ceil, round, avg, max, mean, sum

1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language

In [2]:
data = {"language": ['python', 'javascript', 'c#', 'julia']}
language = pd.DataFrame(data)
language

Unnamed: 0,language
0,python
1,javascript
2,c#
3,julia


In [3]:
df = spark.createDataFrame(language)
df

DataFrame[language: string]

- View the schema of the dataframe

In [4]:
df.printSchema()

root
 |-- language: string (nullable = true)



- Output the shape of the dataframe

In [5]:
def spark_shape(self):
    return (self.count(), len(self.columns))
pyspark.sql.dataframe.DataFrame.shape = spark_shape

In [6]:
df.shape()

(4, 1)

- Show the first 5 records in the dataframe

In [7]:
df.show(5)

+----------+
|  language|
+----------+
|    python|
|javascript|
|        c#|
|     julia|
+----------+



2. Load the mpg dataset as a spark dataframe.

In [8]:
from pydataset import data

mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



    a. Create 1 column of output that contains a message like the one below:


> The 1999 audi a4 has a 4 cylinder engine.

For each vehicle.

col1 = expr("The " + mpg.year + " " + mpg.manufacturer + " " + mpg.model + " " + "has a " + mpg.cyl + " cylinder engine.")







    b. Transform the trans column so that it only contains either manual or auto.

In [9]:
from pyspark.sql.functions import regexp_extract, regexp_replace
mpg.select(
    regexp_extract("trans", r"^(\w+)", 1).alias("transmission"),
).show(truncate=False)

+------------+
|transmission|
+------------+
|auto        |
|manual      |
|manual      |
|auto        |
|auto        |
|manual      |
|auto        |
|manual      |
|auto        |
|manual      |
|auto        |
|auto        |
|manual      |
|auto        |
|manual      |
|auto        |
|auto        |
|auto        |
|auto        |
|auto        |
+------------+
only showing top 20 rows



3. Load the tips dataset as a spark dataframe.

    a. What percentage of observations are smokers?

In [10]:
tips = spark.createDataFrame(data('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



    b. Create a column that contains the tip percentage

In [11]:
col1 = (tips.tip / tips.total_bill).alias("tip_percentage") 

tips.select(col1).show()

+-------------------+
|     tip_percentage|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
|0.18623962040332148|
|0.22805017103762829|
|0.11607142857142858|
|0.13031914893617022|
| 0.2185385656292287|
| 0.1665043816942551|
|0.14180374361883155|
|0.10181582360570687|
|0.16277807921866522|
|0.20364126770060686|
|0.18164967562557924|
| 0.1616650532429816|
|0.22774708410067526|
|0.20624631703005306|
|0.16222760290556903|
+-------------------+
only showing top 20 rows



    c. Calculate the average tip percentage for each combination of sex and smoker.

In [12]:
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [13]:
tips.createOrReplaceTempView("tips")

In [14]:
col1 = spark.sql(
    """
    SELECT AVG((tip / total_bill)) AS non_smoker_female_tip_percentage
    FROM tips
    WHERE sex like 'Female' 
    AND smoker like 'No'
    """
).show()

col2 = spark.sql(
    """
    SELECT AVG((tip / total_bill)) AS smoker_female_tip_percentage
    FROM tips
    WHERE sex like 'Female' 
    AND smoker like 'Yes'
    """
).show()

col3 = spark.sql(
    """
    SELECT AVG((tip / total_bill)) AS non_smoker_male_tip_percentage
    FROM tips
    WHERE sex like 'Male' 
    AND smoker like 'No'
    """
).show()

col4 = spark.sql(
    """
    SELECT AVG((tip / total_bill)) AS smoker_male_tip_percentage
    FROM tips
    WHERE sex like 'Male' 
    AND smoker like 'Yes'
    """
).show()

+--------------------------------+
|non_smoker_female_tip_percentage|
+--------------------------------+
|              0.1569209707691836|
+--------------------------------+

+----------------------------+
|smoker_female_tip_percentage|
+----------------------------+
|         0.18215035269941035|
+----------------------------+

+------------------------------+
|non_smoker_male_tip_percentage|
+------------------------------+
|            0.1606687151291298|
+------------------------------+

+--------------------------+
|smoker_male_tip_percentage|
+--------------------------+
|        0.1527711752024851|
+--------------------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [15]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)

In [16]:
weather.show()

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12|          0.0|     6.1|    -1.7| 1.9|    sun|
|2012-01-13|          0.0|     5.0|    -2.8| 1.3|    sun|
|2012-01-14|          4.1|     4.4|     0.6| 5.3|   snow|
|2012-01-15|  

- Convert the temperatures to farenheight.

In [17]:
fahrenheit = weather.select(
    weather.date,
    weather.precipitation,
    round(weather.temp_max * (9/5) + 32).alias("max_fahr_temp"),
    round(weather.temp_min * (9/5) + 32).alias("low_fahr_temp"),
    weather.wind,
    weather.weather,
).show()

+----------+-------------+-------------+-------------+----+-------+
|      date|precipitation|max_fahr_temp|low_fahr_temp|wind|weather|
+----------+-------------+-------------+-------------+----+-------+
|2012-01-01|          0.0|         55.0|         41.0| 4.7|drizzle|
|2012-01-02|         10.9|         51.0|         37.0| 4.5|   rain|
|2012-01-03|          0.8|         53.0|         45.0| 2.3|   rain|
|2012-01-04|         20.3|         54.0|         42.0| 4.7|   rain|
|2012-01-05|          1.3|         48.0|         37.0| 6.1|   rain|
|2012-01-06|          2.5|         40.0|         36.0| 2.2|   rain|
|2012-01-07|          0.0|         45.0|         37.0| 2.3|   rain|
|2012-01-08|          0.0|         50.0|         37.0| 2.0|    sun|
|2012-01-09|          4.3|         49.0|         41.0| 3.4|   rain|
|2012-01-10|          1.0|         43.0|         33.0| 3.4|   rain|
|2012-01-11|          0.0|         43.0|         30.0| 5.1|    sun|
|2012-01-12|          0.0|         43.0|        

- Which month has the most rain, on average?

In [18]:
most_rain = weather.select(
    regexp_extract("date", r"\d+-(\d{2})", 1).alias("month"),
    weather.precipitation,
    weather.weather,
)

In [19]:
most_rain.show()

+-----+-------------+-------+
|month|precipitation|weather|
+-----+-------------+-------+
|   01|          0.0|drizzle|
|   01|         10.9|   rain|
|   01|          0.8|   rain|
|   01|         20.3|   rain|
|   01|          1.3|   rain|
|   01|          2.5|   rain|
|   01|          0.0|   rain|
|   01|          0.0|    sun|
|   01|          4.3|   rain|
|   01|          1.0|   rain|
|   01|          0.0|    sun|
|   01|          0.0|    sun|
|   01|          0.0|    sun|
|   01|          4.1|   snow|
|   01|          5.3|   snow|
|   01|          2.5|   snow|
|   01|          8.1|   snow|
|   01|         19.8|   snow|
|   01|         15.2|   snow|
|   01|         13.5|   snow|
+-----+-------------+-------+
only showing top 20 rows



In [20]:
most_rain.groupBy("month").agg(avg("precipitation").alias("avg_rainfall")).sort("month").show()

+-----+-------------------+
|month|       avg_rainfall|
+-----+-------------------+
|   01| 3.7580645161290316|
|   02|  3.734513274336283|
|   03|  4.888709677419355|
|   04| 3.1283333333333325|
|   05| 1.6733870967741935|
|   06| 1.1075000000000002|
|   07|0.38870967741935486|
|   08| 1.3201612903225806|
|   09| 1.9624999999999997|
|   10|  4.059677419354839|
|   11|  5.354166666666667|
|   12|  5.021774193548388|
+-----+-------------------+



- Which year was the windiest?

In [22]:
from pyspark.sql.functions import month, year, quarter

In [55]:
(
weather.withColumn("year", year("date"))
.groupBy("year")
.agg(sum("wind")
.alias("wind_by_year"))
.show()
)

+----+------------------+
|year|      wind_by_year|
+----+------------------+
|2015|            1153.3|
|2013|1100.7999999999997|
|2014|            1236.5|
|2012|1244.7000000000003|
+----+------------------+



- What is the most frequent type of weather in January?

In [57]:
(
    weather.filter(month("date") == 1)
    .withColumn("month", month("date"))
    .groupBy("weather")
    .count()
    .show()
 )

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|drizzle|   10|
|   rain|   35|
|    sun|   33|
|   snow|    8|
+-------+-----+



- What is the average high and low tempurature on sunny days in July in 2013 and 2014?

In [61]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") == 2013)
    .filter(year("date") == 2014)
    .withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("weather")
    .agg(mean("temp_max"))
    .show()
)

+-------+-------------+
|weather|avg(temp_max)|
+-------+-------------+
+-------+-------------+



- What percentage of days were rainy in q3 of 2015?

- For each year, find what percentage of days it rained (had non-zero precipitation).