# Learning Objectives
In this notebook, you will learn Spark Dataframe APIs.

# Question List

Solve the following questions using Spark Dataframe APIs

### Join

1. easy - https://pgexercises.com/questions/joins/simplejoin.html
2. easy - https://pgexercises.com/questions/joins/simplejoin2.html
3. easy - https://pgexercises.com/questions/joins/self2.html 
4. medium - https://pgexercises.com/questions/joins/threejoin.html (three join)
5. medium - https://pgexercises.com/questions/joins/sub.html (subquery and join)

### Aggregation

1. easy - https://pgexercises.com/questions/aggregates/count3.html Group by order by
2. easy - https://pgexercises.com/questions/aggregates/fachours.html group by order by
3. easy - https://pgexercises.com/questions/aggregates/fachoursbymonth.html group by with condition 
4. easy - https://pgexercises.com/questions/aggregates/fachoursbymonth2.html group by multi col
5. easy - https://pgexercises.com/questions/aggregates/members1.html count distinct
6. med - https://pgexercises.com/questions/aggregates/nbooking.html group by multiple cols, join

### String & Date

1. easy - https://pgexercises.com/questions/string/concat.html format string
2. easy - https://pgexercises.com/questions/string/case.html WHERE + string function
3. easy - https://pgexercises.com/questions/string/reg.html WHERE + string function
4. easy - https://pgexercises.com/questions/string/substr.html group by, substr
5. easy - https://pgexercises.com/questions/date/series.html generate ts
6. easy - https://pgexercises.com/questions/date/bookingspermonth.html extract month from ts

### Question

How can you produce a list of the start times for bookings by members named 'David Farrell'?

https://pgexercises.com/questions/joins/simplejoin.html

In [0]:
# Write you solution here
# hint: you might need to re-run `0 - ETL pgexercieses CSV files` notebook to init tables

df = spark.sql("select * from bookings")
# display(df)

In [0]:
#1. Simple Join 1
david_farrell_bookings = spark.sql("""
    SELECT starttime
    FROM bookings bks
    JOIN members mems ON bks.memid = mems.memid
    WHERE mems.firstname = 'David' AND mems.surname = 'Farrell'
""")
display(david_farrell_bookings)                 

starttime
2012-09-18T09:00:00.000+0000
2012-09-18T17:30:00.000+0000
2012-09-18T13:30:00.000+0000
2012-09-18T20:00:00.000+0000
2012-09-19T09:30:00.000+0000
2012-09-19T15:00:00.000+0000
2012-09-19T12:00:00.000+0000
2012-09-20T15:30:00.000+0000
2012-09-20T11:30:00.000+0000
2012-09-20T14:00:00.000+0000


In [0]:
#2. Simple Join 2
tennis_bookings = spark.sql("""
    SELECT bks.starttime, facs.name AS facility
    FROM bookings bks
    JOIN facilities facs ON bks.facid = facs.facid
    WHERE facs.name LIKE 'Tennis Court%' AND bks.starttime >= '2012-09-21'
    ORDER BY bks.starttime
""")
display(tennis_bookings)                        

starttime,facility
2012-09-21T08:00:00.000+0000,Tennis Court 1
2012-09-21T08:00:00.000+0000,Tennis Court 2
2012-09-21T09:30:00.000+0000,Tennis Court 1
2012-09-21T10:00:00.000+0000,Tennis Court 2
2012-09-21T11:30:00.000+0000,Tennis Court 2
2012-09-21T12:00:00.000+0000,Tennis Court 1
2012-09-21T13:30:00.000+0000,Tennis Court 1
2012-09-21T14:00:00.000+0000,Tennis Court 2
2012-09-21T15:30:00.000+0000,Tennis Court 1
2012-09-21T16:00:00.000+0000,Tennis Court 2


In [0]:
#3. Self Join
recommended_members = spark.sql("""
    SELECT 
        CONCAT(mems.firstname, ' ', mems.surname) AS member_name,
        CASE 
            WHEN mems.recommendedby IS NOT NULL THEN CONCAT(recs.firstname, ' ', recs.surname)
            ELSE 'No recommender'
        END AS recommender_name
    FROM members mems
    LEFT JOIN members recs ON mems.recommendedby = recs.memid
    ORDER BY mems.surname, mems.firstname
""")
display(recommended_members)
                     

member_name,recommender_name
Florence Bader,Ponder Stibbons
Anne Baker,Ponder Stibbons
Timothy Baker,Jemima Farrell
Tim Boothe,Tim Rownam
Gerald Butters,Darren Smith
Joan Coplin,Timothy Baker
Erica Crumpet,Tracy Smith
Nancy Dare,Janice Joplette
David Farrell,No recommender
Jemima Farrell,No recommender


In [0]:
#4. Three Join
used_tennis_courts = spark.sql("""
    SELECT DISTINCT 
        CONCAT(mems.firstname, ' ', mems.surname) AS member_name,
        facs.name AS facility_name
    FROM bookings bks
    JOIN members mems ON bks.memid = mems.memid
    JOIN facilities facs ON bks.facid = facs.facid
    WHERE facs.name LIKE 'Tennis Court%'
    ORDER BY member_name, facility_name
""")
display(used_tennis_courts)                     

member_name,facility_name
Anne Baker,Tennis Court 1
Anne Baker,Tennis Court 2
Burton Tracy,Tennis Court 1
Burton Tracy,Tennis Court 2
Charles Owen,Tennis Court 1
Charles Owen,Tennis Court 2
Darren Smith,Tennis Court 2
David Farrell,Tennis Court 1
David Farrell,Tennis Court 2
David Jones,Tennis Court 1


In [0]:
#5. Subquery and join

recommenders_list = spark.sql("""
    SELECT DISTINCT
        CONCAT(mems.firstname, ' ', mems.surname) AS member_name,
        COALESCE(CONCAT(recs.firstname, ' ', recs.surname), 'No recommender') AS recommender_name
    FROM members mems
    LEFT JOIN members recs ON mems.recommendedby = recs.memid
    ORDER BY member_name
""")
display(recommenders_list)



member_name,recommender_name
Anna Mackenzie,Darren Smith
Anne Baker,Ponder Stibbons
Burton Tracy,No recommender
Charles Owen,Darren Smith
Darren Smith,No recommender
David Farrell,No recommender
David Jones,Janice Joplette
David Pinker,Jemima Farrell
Douglas Jones,David Jones
Erica Crumpet,Tracy Smith


AGGREGATION

In [0]:
#1 GROUP BY, ORDER BY

recommendation_count = spark.sql("""
    SELECT recommendedby AS member_id, COUNT(*) AS recommendation_count
    FROM members
    WHERE recommendedby IS NOT NULL
    GROUP BY recommendedby
    ORDER BY member_id
""")
display(recommendation_count)


member_id,recommendation_count
1,5
2,3
3,1
4,2
5,1
6,1
9,2
11,1
13,2
15,1


In [0]:
#2 

slots_per_facility = spark.sql("""
    SELECT facid AS facility_id, SUM(slots) AS total_slots
    FROM bookings
    GROUP BY facid
    ORDER BY facility_id
""")
display(slots_per_facility)


facility_id,total_slots
0,1320
1,1278
2,1209
3,830
4,1404
5,228
6,1104
7,908
8,911


In [0]:
#3 with condition

slots_per_facility_september = spark.sql("""
    SELECT facid AS facility_id, SUM(slots) AS total_slots
    FROM bookings
    WHERE starttime >= '2012-09-01' AND starttime < '2012-10-01'
    GROUP BY facid
    ORDER BY total_slots DESC
""")
display(slots_per_facility_september)


facility_id,total_slots
4,648
0,591
1,588
2,570
6,540
8,471
7,426
3,422
5,122


In [0]:
#4 
slots_per_facility_per_month_2012 = spark.sql("""
    SELECT facid AS facility_id, 
           YEAR(starttime) AS year, 
           MONTH(starttime) AS month, 
           SUM(slots) AS total_slots
    FROM bookings
    WHERE YEAR(starttime) = 2012
    GROUP BY facid, YEAR(starttime), MONTH(starttime)
    ORDER BY facility_id, month
""")
display(slots_per_facility_per_month_2012)

facility_id,year,month,total_slots
0,2012,7,270
0,2012,8,459
0,2012,9,591
1,2012,7,207
1,2012,8,483
1,2012,9,588
2,2012,7,180
2,2012,8,459
2,2012,9,570
3,2012,7,104


In [0]:
#5 

total_members_with_bookings = spark.sql("""
    SELECT COUNT(DISTINCT memid) AS total_members
    FROM bookings
""")
total_members_with_bookings.show()


+-------------+
|total_members|
+-------------+
|           30|
+-------------+



In [0]:
#6

first_booking_after_september = spark.sql("""
    SELECT mems.memid, 
           CONCAT(mems.firstname, ' ', mems.surname) AS member_name, 
           MIN(bks.starttime) AS first_booking_date
    FROM members mems
    JOIN bookings bks ON mems.memid = bks.memid
    WHERE bks.starttime > '2012-09-01'
    GROUP BY mems.memid, mems.firstname, mems.surname
    ORDER BY mems.memid
""")
first_booking_after_september.show()


+-----+---------------+-------------------+
|memid|    member_name| first_booking_date|
+-----+---------------+-------------------+
|    0|    GUEST GUEST|2012-09-01 08:00:00|
|    1|   Darren Smith|2012-09-01 09:00:00|
|    2|    Tracy Smith|2012-09-01 11:30:00|
|    3|     Tim Rownam|2012-09-01 16:00:00|
|    4|Janice Joplette|2012-09-01 15:00:00|
|    5| Gerald Butters|2012-09-02 12:30:00|
|    6|   Burton Tracy|2012-09-01 15:00:00|
|    7|     Nancy Dare|2012-09-01 12:30:00|
|    8|     Tim Boothe|2012-09-01 08:30:00|
|    9|Ponder Stibbons|2012-09-01 11:00:00|
|   10|   Charles Owen|2012-09-01 11:00:00|
|   11|    David Jones|2012-09-01 09:30:00|
|   12|     Anne Baker|2012-09-01 14:30:00|
|   13| Jemima Farrell|2012-09-01 09:30:00|
|   14|     Jack Smith|2012-09-01 11:00:00|
|   15| Florence Bader|2012-09-01 10:30:00|
|   16|  Timothy Baker|2012-09-01 15:00:00|
|   17|   David Pinker|2012-09-01 08:30:00|
|   20|Matthew Genting|2012-09-01 18:00:00|
|   21| Anna Mackenzie|2012-09-0

STRING AND DATE

In [0]:
#1 

members_names = spark.sql("""
    SELECT CONCAT(surname, ', ', firstname) AS name
    FROM members
""")
display(members_names)


name
"GUEST, GUEST"
"Smith, Darren"
"Smith, Tracy"
"Rownam, Tim"
"Joplette, Janice"
"Butters, Gerald"
"Tracy, Burton"
"Dare, Nancy"
"Boothe, Tim"
"Stibbons, Ponder"


In [0]:
#2 

facilities_tennis = spark.sql("""
    SELECT *
    FROM facilities
    WHERE name ILIKE 'tennis%'
""")
display(facilities_tennis)


facid,name,membercost,guestcost,initialoutlay,monthlymaintenance
0,Tennis Court 1,5.0,25.0,10000.0,200.0
1,Tennis Court 2,5.0,25.0,8000.0,200.0


In [0]:
#3

telephone_with_parentheses = spark.sql("""
    SELECT memid, telephone
    FROM members
    WHERE telephone REGEXP '\\(.*\\)'
    ORDER BY memid
""")
display(telephone_with_parentheses)


memid,telephone
0,(000) 000-0000
1,555-555-5555
2,555-555-5555
3,(844) 693-0723
4,(833) 942-4710
5,(844) 078-4130
6,(822) 354-9973
7,(833) 776-4001
8,(811) 433-2547
9,(833) 160-3900


In [0]:
#4 

surname_count = spark.sql("""
    SELECT UPPER(SUBSTRING(surname, 1, 1)) AS first_letter, COUNT(*) AS member_count
    FROM members
    GROUP BY first_letter
    ORDER BY first_letter
""")
display(surname_count)


first_letter,member_count
B,5
C,2
D,1
F,2
G,2
H,1
J,3
M,1
O,1
P,2


In [0]:
#5

october_dates = spark.sql("""
    WITH date_range AS (
        SELECT EXPLODE(SEQUENCE(DATE '2012-10-01', DATE '2012-10-31', INTERVAL 1 DAY)) AS date
    )
    SELECT date FROM date_range
    ORDER BY date
""")
display(october_dates)



date
2012-10-01
2012-10-02
2012-10-03
2012-10-04
2012-10-05
2012-10-06
2012-10-07
2012-10-08
2012-10-09
2012-10-10


In [0]:
#6

booking_counts_by_month = spark.sql("""
    SELECT 
        DATE_FORMAT(starttime, 'yyyy-MM') AS month,
        COUNT(*) AS booking_count
    FROM bookings
    GROUP BY month
    ORDER BY month
""")
display(booking_counts_by_month)


month,booking_count
2012-07,658
2012-08,1472
2012-09,1913
2013-01,1


### Question