In [90]:
import psycopg2 as pg2
import pandas as pd

In [157]:
# Open a connection to the `dvdrental` database and create the cursor that
# every query cell below executes against.
# NOTE(review): no conn.close() / cur.close() is visible in this part of the notebook — confirm it is closed at the end.
conn = pg2.connect(
    database='dvdrental',
    user='kevindeeboman',
)
cur = conn.cursor()

In [92]:
def show_res(cur, head=False):
    """Fetch all pending rows from ``cur`` and display them as a DataFrame.

    Parameters
    ----------
    cur : cursor
        Cursor on which a row-returning statement has already been executed.
        Its ``fetchall()`` result supplies the rows and the first item of
        each ``description`` entry supplies the column names.
    head : bool, default False
        When truthy, only the first 3 rows are displayed; otherwise the
        whole result is displayed.

    Returns
    -------
    None
        The frame is rendered through ``display`` (provided by the notebook
        environment); nothing is returned.
    """
    rows = cur.fetchall()
    col_names = [desc[0] for desc in cur.description]
    df = pd.DataFrame(data=rows, columns=col_names)
    # Plain truthiness instead of `== True`; a single display call for both cases.
    display(df.head(3) if head else df)

#### Most common aggregate functions
* AVG() - Average value
* COUNT() - Returns the number of values
* MAX() - Returns the maximum value
* MIN() - Returns the minimum value
* SUM() - Returns the sum of all values
* ROUND() - Rounds a value to a set number of decimal places (not an aggregate itself, but often wrapped around one)

In [93]:
# Example of wrapping an aggregate (AVG) in ROUND to show the average with 2 decimals #
cur.execute(
    """
    SELECT ROUND(AVG(replacement_cost), 2) FROM film;
    """
)
show_res(cur, True)

Unnamed: 0,round
0,19.98


In [94]:
# GROUP BY # OBS! GROUP BY must be written directly after the FROM (or WHERE) clause #
cur.execute(
    """
    SELECT rating, ROUND(AVG(replacement_cost), 2) as avg_rc 
    FROM film
    WHERE rating <> 'R'
    GROUP BY rating
    ORDER BY avg_rc DESC;
    """
)
show_res(cur, head=False)
# IMPORTANT # Every selected column must either appear in GROUP BY or be wrapped in an aggregate function #
# LIMIT is always written last #
cur.execute(
    """
    SELECT rating, rental_rate, ROUND(AVG(replacement_cost), 2)
    FROM film
    WHERE rental_rate <> 0.99
    GROUP BY rating, rental_rate
    ORDER BY rating, rental_rate DESC
    LIMIT 8;
    """
)
show_res(cur, head=False)
# Rows with rental_rate 0.99 are filtered out, the rest is grouped by rating and rental_rate, #
# and the groups are ordered by rating and then by rental_rate (descending) #
# WHERE filters rows before grouping, so it must not reference an aggregate function #

Unnamed: 0,rating,avg_rc
0,PG-13,20.4
1,NC-17,20.14
2,G,20.12
3,PG,18.96


Unnamed: 0,rating,rental_rate,round
0,G,4.99,19.34
1,G,2.99,19.74
2,PG,4.99,19.58
3,PG,2.99,18.27
4,PG-13,4.99,20.73
5,PG-13,2.99,19.75
6,R,4.99,19.62
7,R,2.99,21.12


In [98]:
# Top 10 customers by total amount paid, with their average payment and number of rentals #
cur.execute(
    """
    SELECT customer_id, SUM(amount) as total, ROUND(AVG(amount), 2) as avg_amount, COUNT(rental_id) as number_rented
    FROM payment
    GROUP BY customer_id
    ORDER BY total DESC
    LIMIT 10
    """
)
show_res(cur, head=False)

Unnamed: 0,customer_id,total,avg_amount,number_rented
0,148,211.55,4.7,45
1,526,208.58,4.97,42
2,178,194.61,4.99,39
3,137,191.62,5.04,38
4,144,189.6,4.74,40
5,459,183.63,4.96,37
6,181,167.67,5.08,33
7,410,167.62,4.41,38
8,236,166.61,4.27,39
9,403,162.67,4.93,33


In [108]:
# payment_date is stored with a time component, e.g. 2007-02-17 04:32:51.996577 #
# DATE() drops the time part so that payments can be grouped per calendar day #
cur.execute(
    """
    SELECT DATE(payment_date) as dates, SUM(amount) as total
    FROM payment
    GROUP BY dates
    ORDER BY total DESC
    LIMIT 5
    """
)
show_res(cur, head=False)
# The result is the 5 dates with the highest total sales #

Unnamed: 0,dates,total
0,2007-04-30,5723.89
1,2007-03-21,2868.27
2,2007-03-01,2808.24
3,2007-04-29,2717.6
4,2007-03-18,2701.76


In [122]:
# Which staff member has sold the most? #
# NOTE(review): the alias `sales_count` holds SUM(amount), i.e. the total sales amount; #
# the number of payments is the unaliased COUNT(payment_id) column #
cur.execute(
    """
    SELECT staff_id, COUNT(payment_id), ROUND(AVG(amount), 2), SUM(amount) as sales_count
    FROM payment
    GROUP BY staff_id
    ORDER BY sales_count DESC;
    """
)
show_res(cur, head=False)

# Average replacement cost per film rating, highest first #
cur.execute(
    """
    SELECT rating, ROUND(AVG(replacement_cost), 2) as avg_cost
    FROM film
    GROUP BY rating
    ORDER BY avg_cost DESC
    """
)
show_res(cur, head=False)

Unnamed: 0,staff_id,count,round,sales_count
0,2,7304,4.25,31059.92
1,1,7292,4.15,30252.12


Unnamed: 0,rating,avg_cost
0,PG-13,20.4
1,R,20.23
2,NC-17,20.14
3,G,20.12
4,PG,18.96


In [149]:
# HAVING clause # Filters AFTER the GROUP BY has been performed, in contrast to WHERE which filters before it #
# HAVING does not work with self-assigned column aliases, so the aggregate expression is repeated #
cur.execute(
    """
    SELECT customer_id, SUM(amount) as total_spent
    FROM payment
    GROUP BY customer_id
    HAVING SUM(amount) > 200
    ORDER BY total_spent DESC
    ;
    """
)
show_res(cur, head=False)
# Only customers who spent over 200 are returned; filtering on an aggregate of a group requires HAVING #

cur.execute(
    """
    SELECT district, COUNT(district) as num
    FROM address 
    GROUP BY district
    HAVING COUNT(district) >= 8
    ORDER BY num DESC
    LIMIT 3
    ;
    """
)
show_res(cur, head=False)
# Sorting happens after grouping and HAVING; LIMIT is then applied to the sorted rows #
# Clause order as written --> SELECT > FROM > WHERE > GROUP BY > HAVING > ORDER BY > LIMIT #
# NOTE(review): districts tied on num have no tie-breaker in ORDER BY, so which tied rows survive LIMIT 3 is not guaranteed #

Unnamed: 0,customer_id,total_spent
0,148,211.55
1,526,208.58


Unnamed: 0,district,num
0,Buenos Aires,10
1,West Bengali,9
2,Shandong,9


In [162]:
# Customers with 40 or more payment transactions #
cur.execute(
    """
    SELECT customer_id, COUNT(*) as counts
    FROM payment
    GROUP BY customer_id
    HAVING COUNT(*) >= 40
    ORDER BY counts DESC
    ;
    """
)
show_res(cur, head=False)
# Customers who have paid at least 100 in total to staff_id = 2, highest total first #
cur.execute(
    """
    SELECT customer_id, staff_id, SUM(amount) as sums
    FROM payment
    WHERE staff_id = 2
    GROUP BY customer_id, staff_id
    HAVING SUM(amount) >= 100
    ORDER BY sums DESC
    ;
    """
)
show_res(cur, head=False)

Unnamed: 0,customer_id,counts
0,148,45
1,526,42
2,144,40


Unnamed: 0,customer_id,staff_id,sums
0,187,2,110.81
1,148,2,110.78
2,211,2,108.77
3,522,2,102.8
4,526,2,101.78


In [167]:
# How many film titles start with the letter 'J'? (ILIKE matches case-insensitively) #
cur.execute(
    """
    SELECT COUNT(title)
    FROM film
    WHERE title ILIKE 'J%'
    ;
    """
)
show_res(cur, head=False)
# Which customer has the highest customer ID among those whose first name starts with 'E' #
# and whose address ID is lower than 500? #
cur.execute(
    """
    SELECT customer_id, first_name, last_name
    FROM customer
    WHERE first_name ILIKE 'E%' AND address_id < 500
    ORDER BY customer_id DESC
    LIMIT 1
    ;
    """
)
show_res(cur, head=False)

Unnamed: 0,count
0,20


Unnamed: 0,customer_id,first_name,last_name
0,434,Eddie,Tomlin
