# Imports

In [28]:
import multiprocessing
import pyspark
import pandas as pd
import numpy as np
from pyspark.sql import functions as F
session = pyspark.sql.SparkSession.builder.getOrCreate()

# 1. Create a spark data frame that contains your favorite programming languages.

- 1a. The name of the column should be language
- 1b. View the schema of the dataframe
- 1c. Output the shape of the dataframe
- 1d. Show the first 5 records in the dataframe

In [10]:
# 1a. The name of the column should be language
# 1b. View the schema of the dataframe
data = ['Python','C++','Javascript','Java','Ruby','R','Scala','Cuda','SQL']

df = pd.DataFrame(data, columns=['languages'])
df

Unnamed: 0,languages
0,Python
1,C++
2,Javascript
3,Java
4,Ruby
5,R
6,Scala
7,Cuda
8,SQL


In [11]:
df = session.createDataFrame(df)

In [15]:
# 1c. Output the shape of the dataframe
df.count(), len(df.columns)

(9, 1)

In [18]:
# 1d. Show the first 5 records in the dataframe
df.show(5)

+----------+
| languages|
+----------+
|    Python|
|       C++|
|Javascript|
|      Java|
|      Ruby|
+----------+
only showing top 5 rows



# 2. Load the mpg dataset as a spark dataframe.

    Create 1 column of output that contains a message like the one below:


        ~ The 1999 audi a4 has a 4 cylinder engine.

In [19]:
from pydataset import data

In [20]:
mpg = data('mpg')

In [25]:
mpg = session.createDataFrame(mpg)

In [27]:
mpg.show(2)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 2 rows



In [58]:
# 2
mpg.select(
    (F.concat
     (
         F.lit('The '),mpg['year'].cast('string'),F.lit(' '),
      'manufacturer',F.lit(' '),F.col('model'),F.lit(' has a '),
      F.col('cyl'),F.lit(' cylinder engine.')
      )
      .alias('Requested String')),
      mpg.drv,
      mpg.cty,
      mpg.hwy
      ).show(5,truncate=False)

+-----------------------------------------+---+---+---+
|Requested String                         |drv|cty|hwy|
+-----------------------------------------+---+---+---+
|The 1999 audi a4 has a 4 cylinder engine.|f  |18 |29 |
|The 1999 audi a4 has a 4 cylinder engine.|f  |21 |29 |
|The 2008 audi a4 has a 4 cylinder engine.|f  |20 |31 |
|The 2008 audi a4 has a 4 cylinder engine.|f  |21 |30 |
|The 1999 audi a4 has a 6 cylinder engine.|f  |16 |26 |
+-----------------------------------------+---+---+---+
only showing top 5 rows



# 3. Load the tips dataset as a spark dataframe.

- 3a. What percentage of observations are smokers?
- 3b. Create a column that contains the tip percentage
- 3c. Calculate the average tip percentage for each combination of sex and smoker.

In [59]:
tips = data('tips')

In [60]:
tips = session.createDataFrame(tips)

In [62]:
tips.show(4)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 4 rows



In [65]:
tips.count(), len(tips.columns)

(244, 7)

In [102]:
# 3a. What percentage of observations are smokers?
num_smokers = tips.filter(tips.smoker == 'Yes', ).count()

print(f'Percentage of Smokers is {round((num_smokers / tips.count()) * 100, 2)}%')

Percentage of Smokers is 38.11%


In [None]:
tips.filter((tips.smoker == 'Yes') & (tips.sex == 'Male'))

In [182]:
# 3b. Create a column that contains the tip percentage
tips.withColumn('tip (%)', F.round((F.col('total_bill') / tips.tip), 2)).show(5)

+----------+----+------+------+---+------+----+-------+
|total_bill| tip|   sex|smoker|day|  time|size|tip (%)|
+----------+----+------+------+---+------+----+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|  16.82|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|   6.23|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    6.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|   7.15|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|   6.81|
+----------+----+------+------+---+------+----+-------+
only showing top 5 rows



In [150]:
# 3c. Calculate the average tip percentage for each combination of sex and smoker.
tip_smoker = round((num_smokers / tips.count()) * 100, 2)
tip_non_smoker = round(((tips.count() - num_smokers) / tips.count()) * 100, 2)

In [230]:
male_smokers = tips.filter((tips.smoker == 'Yes') & (tips.sex == 'Male'))
male_smokers_tip = male_smokers.withColumn('tip (%)', F.round((F.col('total_bill') / tips.tip), 2))

male_non_smokers = tips.filter((tips.smoker != 'Yes') & (tips.sex == 'Male'))
male_non_smokers_tip = male_non_smokers.withColumn('tip (%)', F.round((F.col('total_bill') / tips.tip), 2))


female_smokers = tips.filter((tips.smoker == 'Yes') & (tips.sex != 'Male'))
female_smokers_tip = female_smokers.withColumn('tip (%)', F.round((F.col('total_bill') / tips.tip), 2))

female_non_smokers = tips.filter((tips.smoker != 'Yes') & (tips.sex != 'Male'))
female_non_smokers_tip = female_non_smokers.withColumn('tip (%)', F.round((F.col('total_bill') / tips.tip), 2))

In [232]:
print(f'Avg Tip for Male Smokers is {male_smokers_tip.select(F.round(F.avg(male_smokers_tip["tip (%)"]), 2)).collect()[0][0]}%')
print(f'Avg Tip for Male Non-Smokers is {male_non_smokers_tip.select(F.round(F.avg(male_non_smokers_tip["tip (%)"]), 2)).collect()[0][0]}%')
print(f'Avg Tip for Female Smokers is {female_smokers_tip.select(F.round(F.avg(female_smokers_tip["tip (%)"]), 2)).collect()[0][0]}%')
print(f'Avg Tip for Female Non-Smokers is {female_non_smokers_tip.select(F.round(F.avg(female_non_smokers_tip["tip (%)"]), 2)).collect()[0][0]}%')


Avg Tip for Male Smokers is 8.12%
Avg Tip for Male Non-Smokers is 6.69%
Avg Tip for Female Smokers is 6.45%
Avg Tip for Female Non-Smokers is 6.88%


4. Use the Seattle weather dataset referenced in the lesson to answer the questions below.

- 4a. Convert the temperatures to fahrenheit.
- 4b. Which month has the most rain, on average?
- 4c. Which year was the windiest?
- 4d. What is the most frequent type of weather in January?
- 4e. What is the average high and low temperature on sunny days in July in 2013 and 2014?
- 4f. What percentage of days were rainy in Q3 of 2015?
- 4g. For each year, find what percentage of days it rained (had non-zero precipitation).

In [263]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = session.createDataFrame(weather)

In [265]:
weather_fahrenheit = weather.withColumn('temp_max', F.round((F.col('temp_max') * 9/5) + 32, 2))
weather_fahrenheit = weather_fahrenheit.withColumn('temp_min', F.round((F.col('temp_min') * 9/5) + 32, 2))

In [267]:
# 4a. Convert the temperatures to fahrenheit.
weather_fahrenheit.show(3)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 3 rows



In [273]:
# 4b. Which month has the most rain, on average?
grouprain = weather.withColumn(
    'month', F.month(F.col('date')
                    )
).groupBy(
    'month', 'weather'
).agg(
    F.mean(weather.precipitation).alias('total_rain')
).sort('total_rain',ascending=False)

In [298]:
grouprain.show(3)

+-----+-------+------------------+
|month|weather|        total_rain|
+-----+-------+------------------+
|    3|   snow|11.766666666666666|
|   12|   snow|11.680000000000001|
|   10|   rain|             9.675|
+-----+-------+------------------+
only showing top 3 rows



In [276]:
rain = grouprain.filter(grouprain.weather == 'rain')

In [299]:
rain.groupby('month','total_rain').mean().show(3)

+-----+-----------------+----------+-----------------+
|month|       total_rain|avg(month)|  avg(total_rain)|
+-----+-----------------+----------+-----------------+
|   10|            9.675|      10.0|            9.675|
|   11|             8.42|      11.0|             8.42|
|    1|6.614285714285715|       1.0|6.614285714285715|
+-----+-----------------+----------+-----------------+
only showing top 3 rows



In [291]:
# 4c. Which year was the windiest?
groupwind = weather.withColumn(
    'year', F.year(F.col('date')
                    )
).groupBy(
    'year'
).agg(
    F.mean(weather.wind).alias('total_wind')
).sort('total_wind',ascending=False)

In [297]:
groupwind.show(1)

+----+-----------------+
|year|       total_wind|
+----+-----------------+
|2012|3.400819672131147|
+----+-----------------+
only showing top 1 row



In [None]:
# 4d. What is the most frequent type of weather in January?
