# Jonathan Halverson
# Friday, December 23, 2016
# More with Spark SQL

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Create (or reuse) a local-mode Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("Nail Play")
    .master("local")
    .getOrCreate()
)

In [55]:
# Nail salon permit records managed by the Boston Public Health Commission.
# The first CSV row holds the column names; column types are inferred by Spark.
nails = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Nail_Salon_Permits.csv')
)
nails.printSchema()

root
 |-- Salon_BusinessID: integer (nullable = true)
 |-- Permit_Number: string (nullable = true)
 |-- SalonName: string (nullable = true)
 |-- Salon St No: integer (nullable = true)
 |-- SalonStName: string (nullable = true)
 |-- Salon St Ext: string (nullable = true)
 |-- Salon Neighborhood: string (nullable = true)
 |-- Salon State: string (nullable = true)
 |-- Salon Zip: integer (nullable = true)
 |-- Salon Closed: integer (nullable = true)
 |-- Services Nails: integer (nullable = true)
 |-- Services Tanning: integer (nullable = true)
 |-- Services Hair: integer (nullable = true)
 |-- Services Spa: integer (nullable = true)
 |-- Services Wax: integer (nullable = true)
 |-- Services Other: string (nullable = true)
 |-- Number Tables: integer (nullable = true)
 |-- Number Baths: integer (nullable = true)
 |-- Salon_First_Visit: string (nullable = true)
 |-- Salon_Last_Visit: string (nullable = true)
 |-- Permit Start: string (nullable = true)
 |-- Permit End: string (nullable = tru

In [4]:
# Inspect a single record as a Row object (None if the frame is empty).
nails.head()

Row(Salon_BusinessID=730, Permit_Number=u'2011N-00057', SalonName=u'Wonder Nails', Salon St No=13, SalonStName=u'Poplar', Salon St Ext=u'ST', Salon Neighborhood=u'Roslindale', Salon State=u'MA', Salon Zip=2131, Salon Closed=0, Services Nails=1, Services Tanning=0, Services Hair=0, Services Spa=0, Services Wax=0, Services Other=None, Number Tables=3, Number Baths=3, Salon_First_Visit=u'01/20/2012 12:00:00 AM', Salon_Last_Visit=None, Permit Start=u'01/20/2012 12:00:00 AM', Permit End=u'01/19/2013 12:00:00 AM', Application Date=u'09/20/2011 12:00:00 AM', Application Type=u'Renewal', Previous Permit=u'2012N-00057', Permit_Use and Occupancy=1, Permit_Flammable Storage=1, Permit_Salon BOC License=1, Permit_Cosmetologist Licenses=1, Permit_Ventilation=1, Permit_Work with BPHC=0, Permit Fee=140, Date Passed Inspection=u'02/12/2013 12:00:00 AM')

In [5]:
# Tabular preview of the first three records.
nails.show(n=3)

+----------------+-------------+------------+-----------+-----------+------------+------------------+-----------+---------+------------+--------------+----------------+-------------+------------+------------+--------------+-------------+------------+--------------------+----------------+--------------------+--------------------+--------------------+----------------+---------------+------------------------+------------------------+------------------------+-----------------------------+------------------+---------------------+----------+----------------------+
|Salon_BusinessID|Permit_Number|   SalonName|Salon St No|SalonStName|Salon St Ext|Salon Neighborhood|Salon State|Salon Zip|Salon Closed|Services Nails|Services Tanning|Services Hair|Services Spa|Services Wax|Services Other|Number Tables|Number Baths|   Salon_First_Visit|Salon_Last_Visit|        Permit Start|          Permit End|    Application Date|Application Type|Previous Permit|Permit_Use and Occupancy|Permit_Flammable Storage|P

In [18]:
# Summary statistics (count, mean, stddev, min, max) for two numeric columns.
(
    nails
    .describe('Number Tables', 'Salon_BusinessID')
    .show()
)

+-------+-----------------+-----------------+
|summary|    Number Tables| Salon_BusinessID|
+-------+-----------------+-----------------+
|  count|              542|              573|
|   mean|5.226937269372693|850.4607329842931|
| stddev|2.994476343047401|73.43121950072631|
|    min|                1|              730|
|    max|               21|              983|
+-------+-----------------+-----------------+



Check for null values:

In [16]:
# Number of rows where 'Number Tables' is present.
nails.where(F.col('Number Tables').isNotNull()).count()

542

In [17]:
# Number of rows where 'Number Tables' is missing.
nails.where(F.col('Number Tables').isNull()).count()

31

In [21]:
# Number of rows where 'Salon Neighborhood' is missing.
nails.where(F.col('Salon Neighborhood').isNull()).count()

0

In [19]:
# A few identifying columns, ordered by business ID and printed without
# truncating long cell values.
(
    nails
    .select(
        'Salon_BusinessID',
        'SalonName',
        'Salon Neighborhood',
        'Number Tables',
        'Salon_First_Visit',
    )
    .sort('Salon_BusinessID')
    .show(truncate=False)
)

+----------------+--------------------------+------------------+-------------+----------------------+
|Salon_BusinessID|SalonName                 |Salon Neighborhood|Number Tables|Salon_First_Visit     |
+----------------+--------------------------+------------------+-------------+----------------------+
|730             |Wonder Nails              |Roslindale        |3            |01/20/2012 12:00:00 AM|
|730             |Wonder Nails              |Roslindale        |3            |01/20/2012 12:00:00 AM|
|730             |Wonder Nails              |Roslindale        |3            |01/20/2012 12:00:00 AM|
|731             |Tina's Nail Design        |Roslindale        |7            |11/18/2011 12:00:00 AM|
|731             |Tina's Nail Design        |Roslindale        |7            |11/18/2011 12:00:00 AM|
|731             |Tina's Nail Design        |Roslindale        |7            |11/18/2011 12:00:00 AM|
|732             |Beauty Nails And Skin Care|Roslindale        |4            |null

In [8]:
# Row count per neighborhood. A business ID can appear on several rows of this
# data set, so these are record counts rather than distinct-salon counts.
(
    nails
    .groupBy('Salon Neighborhood')
    .count()
    .show()
)

+--------------------+-----+
|  Salon Neighborhood|count|
+--------------------+-----+
|             Roxbury|   15|
|Back Bay-Beacon H...|  128|
|    North Dorchester|   24|
|           South End|   29|
|          Roslindale|   28|
|         Charlestown|   15|
|  North End-Downtown|   38|
|            Mattapan|   16|
|              Fenway|   30|
|    Allston-Brighton|   46|
|           Hyde Park|   10|
|        South Boston|   30|
|        West Roxbury|   34|
|       Jamaica Plain|   26|
|    South Dorchester|   75|
|         East Boston|   29|
+--------------------+-----+



Which neighborhoods have the highest average number of tables per salon record:

In [54]:
# Mean 'Number Tables' per neighborhood, rounded to one decimal place;
# show the five neighborhoods with the largest averages.
(
    nails
    .groupBy('Salon Neighborhood')
    .agg(F.round(F.mean('Number Tables'), 1).alias('Avg Num Tables'))
    .sort(F.desc('Avg Num Tables'))
    .show(5)
)

+------------------+--------------+
|Salon Neighborhood|Avg Num Tables|
+------------------+--------------+
|  South Dorchester|           7.5|
|           Roxbury|           7.4|
|      South Boston|           6.2|
|         South End|           5.7|
|      West Roxbury|           5.4|
+------------------+--------------+
only showing top 5 rows



In [53]:
# Distinct names of salons flagged for both hair and wax services,
# in alphabetical order (first five shown).
(
    nails
    .where((nails['Services Hair'] == 1) & (nails['Services Wax'] == 1))
    .select('SalonName')
    .distinct()
    .orderBy('SalonName')
    .show(5)
)

+--------------------+
|           SalonName|
+--------------------+
|           Adara Spa|
|Athena Salon And ...|
|   BPHC Construction|
|Beaucage Salon An...|
|    Chez Bella Salon|
+--------------------+
only showing top 5 rows

