# Imports

In [0]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import *

## Join 1

In [0]:
# Load the three club tables once; the cells below reuse these DataFrames.
bookings = spark.sql("SELECT * FROM bookings")
members = spark.sql("SELECT * FROM members")
facilities = spark.sql("SELECT * FROM facilities")

# Start times of every booking made by the member named David Farrell.
res = (
    members
    .join(bookings, members["memid"] == bookings["memid"])
    .where(members["surname"] == "Farrell")
    .where(members["firstname"] == "David")
    .select(bookings["starttime"])
)
display(res)

starttime
2012-09-18T09:00:00.000+0000
2012-09-18T17:30:00.000+0000
2012-09-18T13:30:00.000+0000
2012-09-18T20:00:00.000+0000
2012-09-19T09:30:00.000+0000
2012-09-19T15:00:00.000+0000
2012-09-19T12:00:00.000+0000
2012-09-20T15:30:00.000+0000
2012-09-20T11:30:00.000+0000
2012-09-20T14:00:00.000+0000


## Join 2

In [0]:

# Start time and facility name of every booking on 2012-09-21 whose facility
# name matches "Tennis Court", ordered by start time.
res = (
    bookings
    .join(facilities, bookings.facid == facilities["facid"])
    # Reduce the timestamp to a calendar date so it can be compared to one day.
    .withColumn("date", bookings.starttime.cast(DateType()))
    .filter(col("date") == "2012-09-21")
    .filter(col("name").rlike("Tennis Court"))
    .withColumnRenamed("starttime", "start")
    .select("start", "name")
    # Sort on the column that actually exists in the projected output.
    # "starttime" was renamed to "start" above, so ordering by the old name
    # only worked through the analyzer resolving a dropped column.
    .orderBy("start")
)

display(res)

start,name
2012-09-21T08:00:00.000+0000,Tennis Court 1
2012-09-21T08:00:00.000+0000,Tennis Court 2
2012-09-21T09:30:00.000+0000,Tennis Court 1
2012-09-21T10:00:00.000+0000,Tennis Court 2
2012-09-21T11:30:00.000+0000,Tennis Court 2
2012-09-21T12:00:00.000+0000,Tennis Court 1
2012-09-21T13:30:00.000+0000,Tennis Court 1
2012-09-21T14:00:00.000+0000,Tennis Court 2
2012-09-21T15:30:00.000+0000,Tennis Court 1
2012-09-21T16:00:00.000+0000,Tennis Court 2


## Join 3

In [0]:

# Self-join of members: pair each member (mem1) with the member (mem2) whose
# memid equals mem1's recommendedby, and show both names.
# The join is an inner join (the default), so members without a matching
# recommender do not appear.
res = (
    members.alias("mem1")
    .join(
        members.alias("mem2"),
        col("mem1.recommendedby") == col("mem2.memid"),
    )
    .select(
        col("mem1.firstname").alias("memfname"),
        col("mem1.surname").alias("memsname"),
        col("mem2.firstname").alias("recfname"),
        col("mem2.surname").alias("recsname"),
    )
)
display(res)

memfname,memsname,recfname,recsname
Janice,Joplette,Darren,Smith
Gerald,Butters,Darren,Smith
Nancy,Dare,Janice,Joplette
Tim,Boothe,Tim,Rownam
Ponder,Stibbons,Burton,Tracy
Charles,Owen,Darren,Smith
David,Jones,Janice,Joplette
Anne,Baker,Ponder,Stibbons
Jack,Smith,Darren,Smith
Florence,Bader,Ponder,Stibbons


## Join 4

In [0]:
# Distinct (member, facility) pairs for bookings of a facility whose name
# matches "Tennis Court". The member is shown as "firstname surname" and the
# result is sorted by member, then facility.
res = (
    members
    .join(bookings, members["memid"] == bookings["memid"])
    .join(facilities, bookings["facid"] == facilities["facid"])
    .where(col("name").rlike("Tennis Court"))
    .withColumn("member", concat("firstname", lit(" "), "surname"))
    .withColumnRenamed("name", "facility")
    .select("member", "facility")
    .distinct()
    .orderBy("member", "facility")
)

display(res)

member,facility
Anne Baker,Tennis Court 1
Anne Baker,Tennis Court 2
Burton Tracy,Tennis Court 1
Burton Tracy,Tennis Court 2
Charles Owen,Tennis Court 1
Charles Owen,Tennis Court 2
Darren Smith,Tennis Court 2
David Farrell,Tennis Court 1
David Farrell,Tennis Court 2
David Jones,Tennis Court 1


## Join 5

## Aggregation 1

In [0]:
# Number of members per recommendedby value, skipping the group where no
# recommender is recorded, in ascending recommendedby order.
res = (
    members
    .groupBy(col("members.recommendedby"))
    .count()
    .filter(col("members.recommendedby").isNotNull())
    .orderBy(col("members.recommendedby"))
)
display(res)

recommendedby,count
1,5
2,3
3,1
4,2
5,1
6,1
9,2
11,1
13,2
15,1


## Aggregation 2

In [0]:
# Total number of slots booked per facility, ordered by facility id.
res = (
    facilities
    .join(bookings, facilities["facid"] == bookings["facid"])
    .groupBy(col("facilities.facid"))
    .sum("slots")
    # GroupedData.sum names its output "sum(slots)"; give it a readable header.
    .withColumnRenamed("sum(slots)", "Total Slots")
    .orderBy("facid")
)
display(res)

facid,Total Slots
0,1320
1,1278
2,1209
3,830
4,1404
5,228
6,1104
7,908
8,911


## Aggregation 3

In [0]:


# Slots booked per facility for bookings whose start date falls in
# [2012-09-01, 2012-10-01), ordered from the smallest total to the largest.
res = (
    facilities
    .join(
        # Add the calendar date of each booking before joining.
        bookings.withColumn("startdate", bookings["starttime"].cast(DateType())),
        facilities["facid"] == bookings["facid"],
    )
    .where(col("startdate") >= "2012-09-01")
    .where(col("startdate") < "2012-10-01")
    .groupBy(facilities["facid"])
    .sum("slots")
    .withColumnRenamed("sum(slots)", "Total Slots")
    .orderBy("Total Slots")
)
display(res)

facid,Total Slots
5,122
3,422
7,426
8,471
6,540
2,570
1,588
0,591
4,648


## Aggregation 4

In [0]:
# Slots booked per facility and month for bookings that start in 2012,
# ordered by facility id and then month number.
res = (
    bookings
    .where(year("starttime") == 2012)
    .withColumn("month", month("starttime"))
    .groupBy("facid", "month")
    .sum("slots")
    .withColumnRenamed("sum(slots)", "Total Slots")
    .orderBy("facid", "month")
)
display(res)

facid,month,Total Slots
0,7,270
0,8,459
0,9,591
1,7,207
1,8,483
1,9,588
2,7,180
2,8,459
2,9,570
3,7,104


## Aggregation 5

In [0]:
# Count how many different memid values appear in bookings.
res = bookings.select(
    countDistinct(bookings["memid"])
)
display(res)

count(DISTINCT memid)
30


## Aggregation 6

In [0]:
# Earliest booking start time on or after 2012-09-01 for each memid.
# `min` here is the Spark column function brought in by the wildcard import
# of pyspark.sql.functions, not Python's builtin.
sub = (
    bookings
    .where(col("starttime") >= "2012-09-01")
    .groupBy("memid")
    .agg(min("starttime").alias("starttime"))
)

# Attach member names to those first bookings and list them by member id.
res = (
    members
    .join(sub, members["memid"] == sub["memid"])
    .select("surname", "firstname", "members.memid", "starttime")
    .orderBy("memid")
)
display(res)

surname,firstname,memid,starttime
GUEST,GUEST,0,2012-09-01T08:00:00.000+0000
Smith,Darren,1,2012-09-01T09:00:00.000+0000
Smith,Tracy,2,2012-09-01T11:30:00.000+0000
Rownam,Tim,3,2012-09-01T16:00:00.000+0000
Joplette,Janice,4,2012-09-01T15:00:00.000+0000
Butters,Gerald,5,2012-09-02T12:30:00.000+0000
Tracy,Burton,6,2012-09-01T15:00:00.000+0000
Dare,Nancy,7,2012-09-01T12:30:00.000+0000
Boothe,Tim,8,2012-09-01T08:30:00.000+0000
Stibbons,Ponder,9,2012-09-01T11:00:00.000+0000


## String & Date 1

In [0]:
# Member names formatted as "surname, firstname".
# The expression is not aliased, so the output column keeps the name Spark
# generates for the concat expression.
res = members.select(
    concat(col("surname"), lit(", "), col("firstname"))
)
display(res)

"concat(surname, , , firstname)"
"GUEST, GUEST"
"Smith, Darren"
"Smith, Tracy"
"Rownam, Tim"
"Joplette, Janice"
"Butters, Gerald"
"Tracy, Burton"
"Dare, Nancy"
"Boothe, Tim"
"Stibbons, Ponder"


## String & Date 2

In [0]:
# Facilities whose name begins with "tennis"; the (?i) flag makes the regex
# match case-insensitively.
res = facilities.where(
    col("name").rlike("(?i)^tennis")
)

display(res)

facid,name,membercost,guestcost,initialoutlay,monthlymaintenance
0,Tennis Court 1,5.0,25.0,10000.0,200.0
1,Tennis Court 2,5.0,25.0,8000.0,200.0


## String & Date 3

In [0]:
# Members whose telephone value contains an opening or closing parenthesis,
# listed by member id.
res = (
    members
    .where(col("telephone").rlike("[()]"))
    .select("memid", "telephone")
    .orderBy("memid")
)
display(res)

memid,telephone
0,(000) 000-0000
3,(844) 693-0723
4,(833) 942-4710
5,(844) 078-4130
6,(822) 354-9973
7,(833) 776-4001
8,(811) 433-2547
9,(833) 160-3900
10,(855) 542-5251
11,(844) 536-8036


## String & Date 4

In [0]:
# Count members by the first letter of their surname, in alphabetical order.
res = (
    members
    # Spark's substring() is 1-based, so the first character is at position 1.
    # Position 0 happens to return the same character, but it relies on
    # lenient handling of an out-of-range index.
    .withColumn("letter", substring("surname", 1, 1))
    .groupBy("letter")
    .count()
    .orderBy("letter")
)
display(res)

letter,count
B,5
C,2
D,1
F,2
G,2
H,1
J,3
M,1
O,1
P,2


## String & Date 5

In [0]:
# One row per calendar day from 2012-10-01 through 2012-10-31, in column "ts".
res = (
    # A single-row DataFrame is only needed as a carrier for the literal
    # expression below. spark.range(1) provides that row without building a
    # DataFrame from a Python dict whose "date" column was never used.
    spark.range(1)
    .select(
        # sequence() builds an array of dates with a one-day step;
        # explode() turns that array into one row per date.
        explode(
            sequence(
                to_date(lit("2012-10-01")),
                to_date(lit("2012-10-31")),
                expr("INTERVAL 1 day"),
            )
        ).alias("ts")
    )
)

display(res)


ts
2012-10-01
2012-10-02
2012-10-03
2012-10-04
2012-10-05
2012-10-06
2012-10-07
2012-10-08
2012-10-09
2012-10-10


## String & Date 6

In [0]:
# Number of bookings per month: each start time is truncated to the start of
# its month, then rows are counted per truncated value in chronological order.
res = (
    bookings
    .withColumn("month", date_trunc("month", bookings["starttime"]))
    .groupBy("month")
    .count()
    .orderBy("month")
)
display(res)

month,count
2012-07-01T00:00:00.000+0000,658
2012-08-01T00:00:00.000+0000,1472
2012-09-01T00:00:00.000+0000,1913
2013-01-01T00:00:00.000+0000,1
