In [52]:
import multiprocessing
import pyspark
import pandas as pd
import numpy as np

nprocs = multiprocessing.cpu_count()
import pyspark.sql.functions as F

from pyspark.sql.functions import *
from pyspark.sql.functions import when

# create spark object
spark = (pyspark.sql.SparkSession.builder
 .master('local')
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 .config('spark.driver.cores', nprocs)
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

## Exercise 1:

Create a jupyter notebook or python script named `spark101` for this exercise.

Create a spark data frame that contains your favorite programming languages.

- Create a dataframe with one column named `language`
> Hint: Start with a pandas dataframe. Maybe use a dictionary?
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [12]:
# create dataframe of programming languages and assign to a variable
languages = pd.DataFrame(
    {'language':['python', 'javascript', 'java', 'c#', 'typescript', 'r', 'swift']})
languages

Unnamed: 0,language
0,python
1,javascript
2,java
3,c#
4,typescript
5,r
6,swift


In [13]:
# convert dataframe to a spark object
df = spark.createDataFrame(languages)
df

DataFrame[language: string]

In [16]:
# check the schema
df.printSchema()

root
 |-- language: string (nullable = true)



In [17]:
# check the shape of the dataframe
print("DataFrame shape: ", df.count(), " x ", len(df.columns))

DataFrame shape:  7  x  1


In [18]:
# check the first 5 records
df.show(5)

+----------+
|  language|
+----------+
|    python|
|javascript|
|      java|
|        c#|
|typescript|
+----------+
only showing top 5 rows



## Exercise 2:

Load the `mpg` dataset as a spark dataframe.

a. Create 1 column of output that contains a message like the one below for each record:

    The 1999 audi a4 has a 4 cylinder engine.

> Hint: You will need to concatenate values that already exist in the data with string literals

b. Transform the trans column so that it only contains either manual or auto.

> Hint: Consider spark string methods and `when().otherwise()` chaining

In [19]:
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [39]:
from pyspark.sql.functions import lit

# The 1999 audi a4 has a 4 cylinder engine.

mpg.select(concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer, lit(' '), mpg.model, lit(' has a '), mpg.cyl, lit(' cylinder engine.')).alias('vehicle_cylinder_description')).show(5, truncate=False)

+-----------------------------------------+
|vehicle_cylinder_description             |
+-----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 2008 audi a4 has a 4 cylinder engine.|
|The 1999 audi a4 has a 6 cylinder engine.|
+-----------------------------------------+
only showing top 5 rows



In [28]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [40]:
# b. Transform the trans column so that it only contains either manual or auto.
# Hint: Consider spark string methods and when().otherwise() chaining

# mpg.withColumn("trans",
#    when($"trans".startsWith("auto"),regexp_replace($"trans", "auto"))

# mpg.select(col('trans'), when(col('trans').startsWith('auto'), 'auto')).show(5)
           
# #            .otherwise('manual').alias('trans')).show(10)

## Exercise 3: 

Load the `tips` dataset as a spark dataframe.

a. What percentage of observations are smokers?

b. Create a column that contains the tip percentage

c. Calculate the average tip percentage for each combination of sex and smoker.

In [46]:
tips = spark.createDataFrame(data("tips"))
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [53]:

tips.groupBy('smoker').count().show().withColumn('pct', 
                                            round(col('count')/tips.count()*100,2)).show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



AttributeError: 'NoneType' object has no attribute 'withColumn'

In [None]:
tips.withColumn('tip_pct', col('tip')/col('total_bill')).show()

In [None]:
tips.withColumn('tip_pct', col('tip')/col('total_bill'))).groupby('sex').pivot().show()