### Question
1. How can you produce a list of the start times for bookings by members named 'David Farrell'?

In [0]:
from pyspark.sql.functions import col 

df_booking = spark.table('bookings')
df_facilities = spark.table('facilities')
result = df_booking.join(df_facilities, df_booking.memid == df_facilities.memid, "inner")
result = result.filter((col('firstname')=="David") & (col('surname') == "Farrell")).select(col('starttime'))
result.show()

+-------------------+
|          starttime|
+-------------------+
|2012-09-18 09:00:00|
|2012-09-18 17:30:00|
|2012-09-18 13:30:00|
|2012-09-18 20:00:00|
|2012-09-19 09:30:00|
|2012-09-19 15:00:00|
|2012-09-19 12:00:00|
|2012-09-20 15:30:00|
|2012-09-20 11:30:00|
|2012-09-20 14:00:00|
|2012-09-21 10:30:00|
|2012-09-21 14:00:00|
|2012-09-22 08:30:00|
|2012-09-22 17:00:00|
|2012-09-23 08:30:00|
|2012-09-23 17:30:00|
|2012-09-23 19:00:00|
|2012-09-24 08:00:00|
|2012-09-24 16:30:00|
|2012-09-24 12:30:00|
+-------------------+
only showing top 20 rows



### **Question**
2. How can you produce a list of the start times for bookings for tennis courts, for the date '2012-09-21'? Return a list of start time and facility name pairings, ordered by the time.

In [0]:
from pyspark.sql.functions import col 

df_booking = spark.table('bookings')
df_members = spark.table('members')
result = df_booking.join(df_members,df_booking.memid == df_members.facid, "inner")
result = result.filter((col('name').contains ("Tennis")) & (col('starttime') >= "2012-09-21") & (col('starttime') < "2012-09-22")).select( 'starttime', 'name')
result.show()

+-------------------+--------------+
|          starttime|          name|
+-------------------+--------------+
|2012-09-21 11:30:00|Tennis Court 2|
|2012-09-21 14:00:00|Tennis Court 1|
|2012-09-21 19:30:00|Tennis Court 2|
|2012-09-21 10:30:00|Tennis Court 1|
|2012-09-21 13:00:00|Tennis Court 1|
|2012-09-21 15:30:00|Tennis Court 1|
|2012-09-21 17:30:00|Tennis Court 2|
|2012-09-21 18:30:00|  Table Tennis|
|2012-09-21 17:30:00|Tennis Court 1|
|2012-09-21 09:30:00|Tennis Court 1|
|2012-09-21 11:30:00|Tennis Court 1|
|2012-09-21 14:00:00|Tennis Court 1|
|2012-09-21 11:00:00|  Table Tennis|
|2012-09-21 16:00:00|  Table Tennis|
|2012-09-21 19:30:00|  Table Tennis|
+-------------------+--------------+



### Question
3. How can you output a list of all members, including the individual who recommended them (if any)? Ensure that results are ordered by (surname, firstname).

In [0]:
result = df_facilities.alias('fac1').join(df_facilities.alias('fac2'), col('fac1.recommendedby') == col('fac2.memid'), "left").select (col('fac1.firstname').alias ('First name'), col('fac1.surname').alias ('Surname'), col('fac2.firstname').alias('recommendedby firstname'),col( 'fac2.surname').alias('recommendedby surname')).orderBy ('fac1.surname','fac1.firstname')

result.show()

+----------+---------+-----------------------+---------------------+
|First name|  Surname|recommendedby firstname|recommendedby surname|
+----------+---------+-----------------------+---------------------+
|  Florence|    Bader|                 Ponder|             Stibbons|
|      Anne|    Baker|                 Ponder|             Stibbons|
|   Timothy|    Baker|                 Jemima|              Farrell|
|       Tim|   Boothe|                    Tim|               Rownam|
|    Gerald|  Butters|                 Darren|                Smith|
|      Joan|   Coplin|                Timothy|                Baker|
|     Erica|  Crumpet|                  Tracy|                Smith|
|     Nancy|     Dare|                 Janice|             Joplette|
|     David|  Farrell|                   null|                 null|
|    Jemima|  Farrell|                   null|                 null|
|     GUEST|    GUEST|                   null|                 null|
|   Matthew|  Genting|            

### Question
4. How can you produce a list of all members who have used a tennis court? Include in your output the name of the court, and the name of the member formatted as a single column. Ensure no duplicate data, and order by the member name followed by the facility name.

In [0]:
from pyspark.sql.functions import concat, col, lit

result = df_facilities.join(df_booking, df_facilities.memid == df_booking.memid, 'inner').join(df_members, df_booking.facid == df_members.facid, 'inner').select(concat(df_facilities.firstname, lit(' '), df_facilities.surname).alias('member'),df_members.name.alias('facility')).filter(col('facility').contains('Tennis Court')).distinct().orderBy('member', 'facility')

result.show()

+--------------+--------------+
|        member|      facility|
+--------------+--------------+
|    Anne Baker|Tennis Court 1|
|    Anne Baker|Tennis Court 2|
|  Burton Tracy|Tennis Court 1|
|  Burton Tracy|Tennis Court 2|
|  Charles Owen|Tennis Court 1|
|  Charles Owen|Tennis Court 2|
|  Darren Smith|Tennis Court 2|
| David Farrell|Tennis Court 1|
| David Farrell|Tennis Court 2|
|   David Jones|Tennis Court 1|
|   David Jones|Tennis Court 2|
|  David Pinker|Tennis Court 1|
| Douglas Jones|Tennis Court 1|
| Erica Crumpet|Tennis Court 1|
|Florence Bader|Tennis Court 1|
|Florence Bader|Tennis Court 2|
|   GUEST GUEST|Tennis Court 1|
|   GUEST GUEST|Tennis Court 2|
|Gerald Butters|Tennis Court 1|
|Gerald Butters|Tennis Court 2|
+--------------+--------------+
only showing top 20 rows



### Question
5. How can you output a list of all members, including the individual who recommended them (if any), without using any joins? Ensure that there are no duplicates in the list, and that each firstname + surname pairing is formatted as a column and ordered.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

df_m = df_facilities.alias("m")
df_r = df_facilities.alias("r")

result = (df_m
    .join(df_r, F.col("m.recommendedby") == F.col("r.memid"), "left")
    .select(
        F.concat(F.col("m.firstname"), F.lit(" "), F.col("m.surname")).alias("member"),
        F.concat(F.col("r.firstname"), F.lit(" "), F.col("r.surname")).alias("recommender")
    )
    .distinct()
    .orderBy("member"))

result.show()


+--------------------+---------------+
|              member|    recommender|
+--------------------+---------------+
|      Anna Mackenzie|   Darren Smith|
|          Anne Baker|Ponder Stibbons|
|        Burton Tracy|           null|
|        Charles Owen|   Darren Smith|
|        Darren Smith|           null|
|       David Farrell|           null|
|         David Jones|Janice Joplette|
|        David Pinker| Jemima Farrell|
|       Douglas Jones|    David Jones|
|       Erica Crumpet|    Tracy Smith|
|      Florence Bader|Ponder Stibbons|
|         GUEST GUEST|           null|
|      Gerald Butters|   Darren Smith|
|    Henrietta Rumney|Matthew Genting|
|Henry Worthington...|    Tracy Smith|
| Hyacinth Tupperware|           null|
|          Jack Smith|   Darren Smith|
|     Janice Joplette|   Darren Smith|
|      Jemima Farrell|           null|
|         Joan Coplin|  Timothy Baker|
+--------------------+---------------+
only showing top 20 rows



## Aggregation

### Question
1. Produce a count of the number of recommendations each member has made. Order by member ID.

easy - https://pgexercises.com/questions/aggregates/count3.html Group by order by

In [0]:
from pyspark.sql import functions as F

df = (df_facilities
      .where(F.col("recommendedby").isNotNull())
      .groupBy("recommendedby")
      .count()
      .orderBy("recommendedby"))

df.show()


+-------------+-----+
|recommendedby|count|
+-------------+-----+
|            1|    5|
|            2|    3|
|            3|    1|
|            4|    2|
|            5|    1|
|            6|    1|
|            9|    2|
|           11|    1|
|           13|    2|
|           15|    1|
|           16|    1|
|           20|    1|
|           30|    1|
+-------------+-----+



### Question
2. Produce a list of the total number of slots booked per facility. For now, just produce an output table consisting of facility id and slots, sorted by facility id.

easy - https://pgexercises.com/questions/aggregates/fachours.html group by order by

In [0]:

from pyspark.sql import functions as F

df = (df_booking
      .groupBy("facid")
      .agg(F.sum("slots").alias("Total Slots"))
      .orderBy("facid"))

df.show()


+-----+-----------+
|facid|Total Slots|
+-----+-----------+
|    0|       1320|
|    1|       1278|
|    2|       1209|
|    3|        830|
|    4|       1404|
|    5|        228|
|    6|       1104|
|    7|        908|
|    8|        911|
+-----+-----------+



### Question
3. Produce a list of the total number of slots booked per facility in the month of September 2012. Produce an output table consisting of facility id and slots, sorted by the number of slots.

easy - https://pgexercises.com/questions/aggregates/fachoursbymonth.html group by with condition



In [0]:
from pyspark.sql import functions as F

df = (df_booking
      .filter((F.col("starttime") >= F.to_date(F.lit('01-09-2012'), 'dd-MM-yyyy')) & (F.col("starttime") < F.to_date(F.lit('01-10-2012'), 'dd-MM-yyyy')))
      .groupBy("facid")
      .agg(F.sum("slots"))
      .orderBy("facid"))

df.show()

+-----+----------+
|facid|sum(slots)|
+-----+----------+
|    0|       591|
|    1|       588|
|    2|       570|
|    3|       422|
|    4|       648|
|    5|       122|
|    6|       540|
|    7|       426|
|    8|       471|
+-----+----------+



### Question
4. Produce a list of the total number of slots booked per facility per month in the year of 2012. Produce an output table consisting of facility id and slots, sorted by the id and month.

easy - https://pgexercises.com/questions/aggregates/fachoursbymonth2.html group by multi col


In [0]:
from pyspark.sql import functions as F

df = (df_booking
      .filter((F.col("starttime") >= F.to_date(F.lit('01-01-2012'), 'dd-MM-yyyy')) & (F.col("starttime") < F.to_date(F.lit('01-01-2013'), 'dd-MM-yyyy')) )
      .select(
          "facid",
          F.date_format(F.col("starttime"), "MM").alias("month"),
          "slots"
      )
      .groupBy("facid","month")
      .agg(F.sum("slots"))
      .orderBy("facid",  "month"))

df.show()

+-----+-----+----------+
|facid|month|sum(slots)|
+-----+-----+----------+
|    0|   07|       270|
|    0|   08|       459|
|    0|   09|       591|
|    1|   07|       207|
|    1|   08|       483|
|    1|   09|       588|
|    2|   07|       180|
|    2|   08|       459|
|    2|   09|       570|
|    3|   07|       104|
|    3|   08|       304|
|    3|   09|       422|
|    4|   07|       264|
|    4|   08|       492|
|    4|   09|       648|
|    5|   07|        24|
|    5|   08|        82|
|    5|   09|       122|
|    6|   07|       164|
|    6|   08|       400|
+-----+-----+----------+
only showing top 20 rows



### Question
5. Find the total number of members (including guests) who have made at least one booking.

easy - https://pgexercises.com/questions/aggregates/members1.html count distinct

In [0]:
from pyspark.sql import functions as F

result = df_booking.select('memid').distinct().count()
print(result)

30


### Question
6.Produce a list of each member name, id, and their first booking after September 1st 2012. Order by member ID.

med - https://pgexercises.com/questions/aggregates/nbooking.html group by multiple cols, join


In [0]:
from pyspark.sql import functions as F

result = (
    df_facilities
    .join(df_booking, df_facilities.memid == df_booking.memid, 'inner')
    .filter(df_booking.starttime >= F.to_date(F.lit('01-09-2012'), 'dd-MM-yyyy'))
    .groupBy(df_facilities.surname, df_facilities.firstname, df_facilities.memid)
    .agg(F.min(df_booking.starttime).alias("first_booking"))
    .orderBy(df_facilities.memid)
)

result.show()




+---------+---------+-----+-------------------+
|  surname|firstname|memid|      first_booking|
+---------+---------+-----+-------------------+
|    GUEST|    GUEST|    0|2012-09-01 08:00:00|
|    Smith|   Darren|    1|2012-09-01 09:00:00|
|    Smith|    Tracy|    2|2012-09-01 11:30:00|
|   Rownam|      Tim|    3|2012-09-01 16:00:00|
| Joplette|   Janice|    4|2012-09-01 15:00:00|
|  Butters|   Gerald|    5|2012-09-02 12:30:00|
|    Tracy|   Burton|    6|2012-09-01 15:00:00|
|     Dare|    Nancy|    7|2012-09-01 12:30:00|
|   Boothe|      Tim|    8|2012-09-01 08:30:00|
| Stibbons|   Ponder|    9|2012-09-01 11:00:00|
|     Owen|  Charles|   10|2012-09-01 11:00:00|
|    Jones|    David|   11|2012-09-01 09:30:00|
|    Baker|     Anne|   12|2012-09-01 14:30:00|
|  Farrell|   Jemima|   13|2012-09-01 09:30:00|
|    Smith|     Jack|   14|2012-09-01 11:00:00|
|    Bader| Florence|   15|2012-09-01 10:30:00|
|    Baker|  Timothy|   16|2012-09-01 15:00:00|
|   Pinker|    David|   17|2012-09-01 08

## String and Date

### Question
1. Output the names of all members, formatted as 'Surname, Firstname'

easy - https://pgexercises.com/questions/string/concat.html format string


In [0]:
from pyspark.sql import functions as F

result = df_facilities.select(F.concat_ws(', ', F.col("surname"), F.col("firstname")).alias("name")) \
    .filter(F.col("memid").isNotNull())

result.show()


+----------------+
|            name|
+----------------+
|    GUEST, GUEST|
|   Smith, Darren|
|    Smith, Tracy|
|     Rownam, Tim|
|Joplette, Janice|
| Butters, Gerald|
|   Tracy, Burton|
|     Dare, Nancy|
|     Boothe, Tim|
|Stibbons, Ponder|
|   Owen, Charles|
|    Jones, David|
|     Baker, Anne|
| Farrell, Jemima|
|     Smith, Jack|
| Bader, Florence|
|  Baker, Timothy|
|   Pinker, David|
|Genting, Matthew|
| Mackenzie, Anna|
+----------------+
only showing top 20 rows



### Question
2. Perform a case-insensitive search to find all facilities whose name begins with 'tennis'. Retrieve all columns.

easy - https://pgexercises.com/questions/string/case.html WHERE + string function

select * from cd.facilities where upper(name) like 'TENNIS%';


In [0]:
from pyspark.sql import functions as F

result = df_members.filter(F.upper(F.col("name")).like("TENNIS%"))

result.show()

+-----+--------------+----------+---------+-------------+------------------+
|facid|          name|membercost|guestcost|initialoutlay|monthlymaintenance|
+-----+--------------+----------+---------+-------------+------------------+
|    0|Tennis Court 1|       5.0|     25.0|        10000|               200|
|    1|Tennis Court 2|       5.0|     25.0|         8000|               200|
+-----+--------------+----------+---------+-------------+------------------+



### Question
3. You've noticed that the club's member table has telephone numbers with very inconsistent formatting. You'd like to find all the telephone numbers that contain parentheses, returning the member ID and telephone number sorted by member ID.

easy - https://pgexercises.com/questions/string/reg.html WHERE + string function



In [0]:
from pyspark.sql import functions as F

result = df_facilities.filter(F.upper(F.col("telephone")).like("%")).select('memid', 'telephone')


result.show()

+-----+--------------+
|memid|     telephone|
+-----+--------------+
|    0|(000) 000-0000|
|    1|  555-555-5555|
|    2|  555-555-5555|
|    3|(844) 693-0723|
|    4|(833) 942-4710|
|    5|(844) 078-4130|
|    6|(822) 354-9973|
|    7|(833) 776-4001|
|    8|(811) 433-2547|
|    9|(833) 160-3900|
|   10|(855) 542-5251|
|   11|(844) 536-8036|
|   12|  844-076-5141|
|   13|(855) 016-0163|
|   14|(822) 163-3254|
|   15|(833) 499-3527|
|   16|  833-941-0824|
|   17|  811 409-6734|
|   20|(811) 972-1377|
|   21|(822) 661-2898|
+-----+--------------+
only showing top 20 rows



### Question
4. You'd like to produce a count of how many members you have whose surname starts with each letter of the alphabet. Sort by the letter, and don't worry about printing out a letter if the count is 0.

easy - https://pgexercises.com/questions/string/substr.html group by, substr

In [0]:
from pyspark.sql.functions import substring, upper, col, count

result = (
    df_facilities.withColumn('First Letter', upper(substring(col('surname'), 1, 1)))
    .groupBy('First Letter')
    .agg(count('*').alias('count'))
    .orderBy('First Letter')
)

result.show()


+------------+-----+
|First Letter|count|
+------------+-----+
|           B|    5|
|           C|    2|
|           D|    1|
|           F|    2|
|           G|    2|
|           H|    1|
|           J|    3|
|           M|    1|
|           O|    1|
|           P|    2|
|           R|    2|
|           S|    6|
|           T|    2|
|           W|    1|
+------------+-----+




### Question
5. Produce a list of all the dates in October 2012. They can be output as a timestamp (with time set to midnight) or a date.

easy - https://pgexercises.com/questions/date/series.html generate ts


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sequence, explode, to_date, lit, date_add


df = spark.createDataFrame([(1,)], ["id"])  

start_date = to_date(lit("2012-09-30"))

df = spark.range(31).toDF("noOfDays")

df = df.select(col("noOfDays").cast("int"))

df = df.withColumn("October", date_add(start_date, col("noOfDays")))

df = df.select(col('October'))
df.show()

+----------+
|   October|
+----------+
|2012-09-30|
|2012-10-01|
|2012-10-02|
|2012-10-03|
|2012-10-04|
|2012-10-05|
|2012-10-06|
|2012-10-07|
|2012-10-08|
|2012-10-09|
|2012-10-10|
|2012-10-11|
|2012-10-12|
|2012-10-13|
|2012-10-14|
|2012-10-15|
|2012-10-16|
|2012-10-17|
|2012-10-18|
|2012-10-19|
+----------+
only showing top 20 rows



6. Question
Return a count of bookings for each month, sorted by month

easy - https://pgexercises.com/questions/date/bookingspermonth.html extract month from ts



In [0]:

from pyspark.sql.functions import date_trunc, count, to_date

result = df_booking.groupBy(
    to_date(date_trunc('month', 'starttime')).alias('month')
).agg(
    count('*').alias('count')
).orderBy('month')

result.show()

+----------+-----+
|     month|count|
+----------+-----+
|2012-07-01|  658|
|2012-08-01| 1472|
|2012-09-01| 1913|
|2013-01-01|    1|
+----------+-----+

