# SQL notes

# SELECT * and LIMIT

In [None]:
-- Preview 10 rows of people with every column.
-- There is no ORDER BY, so which 10 rows come back is not guaranteed.
SELECT
    *
FROM people
LIMIT 10;

In [None]:
-- Return only the three named columns for every film.
SELECT
    title,
    release_year,
    country
FROM films;

# SELECT COUNT(DISTINCT... 
- COUNT(column) does not count NULL values; COUNT(*) counts every row

In [None]:
-- SELECT DISTINCT: return each country once (duplicates removed).
SELECT DISTINCT
    country
FROM films;

In [None]:
-- COUNT(*): number of rows in the table, NULLs included.
SELECT COUNT(*)
FROM people;

-- COUNT(column): number of rows whose birthdate is not NULL.
SELECT COUNT(birthdate)
FROM people;

-- COUNT(DISTINCT column): number of different non-NULL birthdates.
SELECT COUNT(DISTINCT birthdate)
FROM people;



# Filtering with WHERE
- after the FROM statement
- Important: in PostgreSQL (the version of SQL we're using), text values compared in a WHERE clause must be wrapped in single quotes.

WHERE keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:

'=' equal

'<>' not equal - Note that in this course we will use <> and not != for the not equal operator, as per the SQL standard.

'<' less than

'>' greater than

'<=' less than or equal to

'>=' greater than or equal to

In [None]:
-- Text filter: the string literal is written in single quotes.
SELECT
    title
FROM films
WHERE title = 'Metropolis';

In [None]:
-- Exact-match filter on a text column (same query as the previous cell).
SELECT
    title
FROM films
WHERE title = 'Metropolis';

# WHERE AND

In [None]:
-- Films released strictly between 1994 and 2000 (both bounds excluded).
SELECT
    title
FROM films
WHERE release_year > 1994
    AND release_year < 2000;

# WHERE OR, WHERE (x OR y) AND (a OR b)

In [None]:
-- Films released in either of the two years; a row passes when any OR branch is true.
SELECT
    title
FROM films
WHERE release_year = 1994
    OR release_year = 2000;

In [None]:
-- Wrap each OR group in parentheses when mixing AND and OR:
-- AND binds tighter than OR, so without them the filter means something else.
SELECT
    title
FROM films
WHERE (release_year = 1994 OR release_year = 1995)
    AND (certification = 'PG' OR certification = 'R');

In [None]:
-- French or Spanish films released 1990-1999 that grossed more than 2,000,000.
SELECT
    title,
    release_year
FROM films
WHERE (release_year > 1989 AND release_year < 2000)
    AND (language = 'French' OR language = 'Spanish')
    AND gross > 2000000;

# BETWEEN
- inclusive
- Similar to the WHERE clause, the BETWEEN clause can be used with multiple AND and OR operators to build more complex filters.

In [None]:
-- BETWEEN is inclusive: 1994 and 2000 themselves are part of the result.
SELECT
    title
FROM films
WHERE release_year BETWEEN 1994 AND 2000;

# WHERE IN
- The IN operator allows you to specify multiple values in a WHERE clause, making it easier and quicker to specify multiple OR conditions!

In [None]:
-- IN is shorthand for a chain of equality tests joined by OR.
SELECT
    name
FROM kids
WHERE age IN (2, 4, 6, 8, 10);

In [None]:
-- Films from exactly 1990 or 2000 (IN lists values, it is not a range)
-- whose duration is above 120.
SELECT
    title,
    release_year
FROM films
WHERE release_year IN (1990, 2000)
    AND duration > 120;

# NULL and IS NULL, IS NOT NULL

In [None]:
-- Count rows with a missing birthdate.
-- NULL must be tested with IS NULL; "= NULL" never matches.
SELECT COUNT(*)
FROM people
WHERE birthdate IS NULL;

In [None]:
-- Names of people whose birthdate is recorded.
SELECT
    name
FROM people
WHERE birthdate IS NOT NULL;

# LIKE and NOT LIKE

Often you'll want to search for a pattern rather than a specific text string.

In SQL, the LIKE operator can be used in a WHERE clause to search for a pattern in a column. To accomplish this, you use something called a wildcard as a placeholder for some other values. There are two wildcards you can use with LIKE:

The `%` wildcard will match zero, one, or many characters in text. For example, the following query matches companies like 'Data', 'DataC', 'DataCamp', 'DataMind', and so on:

SELECT name
FROM companies
WHERE name LIKE 'Data%';

The `_` wildcard will match a single character. For example, the following query matches companies like 'DataCamp', 'DataComp', and so on:

SELECT name
FROM companies
WHERE name LIKE 'DataC_mp';
You can also use the NOT LIKE operator to find records that don't match the pattern you specify.

In [None]:
-- Names beginning with 'B': % matches any run of characters, including none.
SELECT
    name
FROM people
WHERE name LIKE 'B%';

# Aggregate functions

In [None]:
-- Each aggregate collapses the whole budget column into a single value.

-- mean budget
SELECT
    AVG(budget)
FROM films;

-- largest budget
SELECT
    MAX(budget)
FROM films;

-- total of all budgets
SELECT
    SUM(budget)
FROM films;

-- smallest budget
SELECT
    MIN(budget)
FROM films;

## Combine aggregate functions with WHERE

In [None]:
-- WHERE is applied first, so only films from 2010 onward are summed.
SELECT
    SUM(budget)
FROM films
WHERE release_year >= 2010;

In [None]:
-- Average gross of films whose title starts with 'A'.
SELECT
    AVG(gross)
FROM films
WHERE title LIKE 'A%';

In [None]:
-- Highest gross among films released from 2000 through 2012 (inclusive).
SELECT
    MAX(gross)
FROM films
WHERE release_year BETWEEN 2000 AND 2012;

## Arithmetic
- `+`, `-`, `*`, and `/`

In [None]:
-- Integer * integer stays an integer.
SELECT (4 * 3);
-- result: 12

-- Integer / integer is integer division: the fractional part is discarded.
SELECT (4 / 3);
-- result: 1

-- With decimal operands the fraction is kept.
SELECT (4.0 / 3.0) AS result;
-- result: 1.333...

### Make sure when dividing you use at least 1 float type

In [None]:
-- 100.0 makes the product a decimal, so the division keeps its fraction.
SELECT
    45 * 100.0 / 10;

# Aliasing with AS

In [None]:
-- AS names a result column; without it two MAX() columns would share a name.
SELECT
    MAX(budget) AS max_budget,
    MAX(duration) AS max_duration
FROM films;

-- Alias on a per-row calculation.
SELECT
    title,
    (gross - budget) AS net_profit
FROM films;

-- Alias on an aggregate expression; 60.0 keeps the division non-integer.
SELECT
    AVG(duration) / 60.0 AS avg_duration_hours
FROM films;

In [None]:
-- Percentage of people with a recorded deathdate.
-- COUNT(deathdate) skips NULLs while COUNT(*) counts every row;
-- multiplying by 100.0 first forces decimal (not integer) division.
SELECT
    COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people;

-- Difference between the latest and the earliest release year.
SELECT
    MAX(release_year) - MIN(release_year) AS difference
FROM films;

-- Parenthesize the numerator so the subtraction happens before the division.
SELECT
    (MAX(release_year) - MIN(release_year)) / 10.0 AS number_of_decades
FROM films;

# Commenting with --

-- get the count(deathdate) and multiply by 100.0

-- then divide by count(*)

# ORDER BY
- default ascending order
- `DESC` for descending order
- NOTE - ORDER BY sorts text column alphabetically A to Z by default

In [None]:
-- DESC: newest films first (the sort column does not have to be selected).
SELECT
    title
FROM films
ORDER BY release_year DESC;

-- Default ascending order; text sorts A to Z.
SELECT
    name
FROM people
ORDER BY name;

-- Ascending by birthdate.
SELECT
    birthdate,
    name
FROM people
ORDER BY birthdate;

-- ORDER BY comes after WHERE: filter to the two years, then sort.
SELECT
    title
FROM films
WHERE release_year IN (2000, 2012)
ORDER BY release_year;

-- Every film not released in 2015, shortest duration first.
SELECT
    *
FROM films
WHERE release_year <> 2015
ORDER BY duration;

-- Titles starting with 'M', alphabetically.
SELECT
    title,
    gross
FROM films
WHERE title LIKE 'M%'
ORDER BY title;

## ORDER BY for multiple columns

In [None]:
-- Sort by birthdate first; name only breaks ties between equal birthdates.
SELECT
    birthdate,
    name
FROM people
ORDER BY
    birthdate,
    name;


# GROUP BY
- Commonly, `GROUP BY` is used with aggregate functions like `COUNT()` or `MAX()`. Note that `GROUP BY` always goes after the `FROM` clause!
- Note also that `ORDER BY` always goes after `GROUP BY`. Let's try some exercises!

In [None]:
-- One output row per distinct sex, with the number of employees in each group.
SELECT
    sex,
    COUNT(*)
FROM employees
GROUP BY sex;

In [None]:
/*
A word of warning: SQL will return an error if you try to SELECT
a field that is not in your GROUP BY clause without using it to
calculate some kind of value about the entire group.

Note that you can combine GROUP BY with ORDER BY to group your
results, calculate something about them, and then order your results.
For example:
*/

-- "count" is the name PostgreSQL gives the unaliased COUNT(*) column,
-- which is why ORDER BY can refer to it.
SELECT
    sex,
    COUNT(*)
FROM employees
GROUP BY sex
ORDER BY count DESC;

-- Lowest gross per release year.
SELECT
    release_year,
    MIN(gross)
FROM films
GROUP BY release_year;

-- Total gross per language.
SELECT
    language,
    SUM(gross)
FROM films
GROUP BY language;

-- Grouping by two columns: one row per (release_year, country) pair.
SELECT
    release_year,
    country,
    MAX(budget)
FROM films
GROUP BY
    release_year,
    country
ORDER BY
    release_year,
    country;


# HAVING
- In SQL, aggregate functions can't be used in WHERE clauses.


In [None]:
-- Only the years in which more than 10 films were released.
-- The filter is on an aggregate, so it must be HAVING rather than WHERE.
SELECT
    release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

In [None]:
-- Per-year row counts, kept only for years with more than 200 rows.
-- The year itself is not selected, so the output is just the counts.
SELECT
    COUNT(release_year)
FROM films
GROUP BY release_year
HAVING COUNT(release_year) > 200;

In [None]:
-- In how many different years were more than 200 movies released?
-- The query lists those years; the number of rows returned is the answer.
SELECT
    release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 200;

# Example - ORDER BY, GROUP BY, HAVING

In [None]:
-- Clause order: WHERE filters rows, GROUP BY forms groups,
-- HAVING filters groups, ORDER BY sorts what is left.
SELECT
    release_year,
    AVG(budget) AS avg_budget,
    AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000
ORDER BY avg_gross DESC;

In [None]:
-- Average budget and average gross per country, for countries with more
-- than 10 titles; first 5 countries alphabetically.
SELECT
    country,
    AVG(budget) AS avg_budget,
    AVG(gross) AS avg_gross
FROM films
GROUP BY country
-- count titles (not country) so the filter matches the stated rule
HAVING COUNT(title) > 10
ORDER BY country
LIMIT 5;

# JOIN
- note jupyter notebook on SQL_joins

In [None]:
-- Pair each film with its review row via films.id = reviews.film_id,
-- then keep only the one title.
SELECT
    title,
    imdb_score
FROM films
INNER JOIN reviews
    ON films.id = reviews.film_id
WHERE title = 'To Kill a Mockingbird';

In [None]:
-- Join on two conditions at once: rows pair up only when both the
-- song id and the dance level match. First 5 names alphabetically.
SELECT
    name
FROM hit_tracks AS t
INNER JOIN features AS f
    ON t.id = f.song_id
    AND t.dance = f.dance_level
ORDER BY name
LIMIT 5;

# Intermediate SQL skills...

# `CASE WHEN` statement
Applications
- create categorical variables
- aggregate data into a single column with multiple filtering conditions
- calculate counts and %

`CASE` statements - create categories in a field
- `WHEN`, `THEN`, `ELSE`, `END AS` alias

In [None]:
-- Look up the API ids of FC Schalke 04 and FC Bayern Munich
-- so they can be used in later CASE statements.
SELECT
    team_long_name,
    team_api_id
FROM teams_germany
WHERE team_long_name IN ('FC Schalke 04', 'FC Bayern Munich');



In [None]:
-- Label each match in Germany by its home team (FC Bayern Munich,
-- FC Schalke 04, or 'Other') and count the matches per label.
SELECT
    CASE
        WHEN hometeam_id = 10189 THEN 'FC Schalke 04'
        WHEN hometeam_id = 9823 THEN 'FC Bayern Munich'
        ELSE 'Other'
    END AS home_team,
    COUNT(id) AS total_matches
FROM matches_germany
-- group by the alias of the CASE expression
GROUP BY home_team;

## CASE - comparing column values

In [None]:
/*
Creating a list of matches where Barcelona was the home team
(the original note says 2011/2012 season; no season filter appears
here, so matches_spain is presumably already limited to it - confirm).

In 3 steps, you will build a query that identifies a match's winner,
identifies the identity of the opponent, and finally filters
for Barcelona as the home team.
*/

-- Step 1: categorize every match as a home win, home loss, or tie.
SELECT
    date,
    CASE
        WHEN home_goal > away_goal THEN 'Home win!'
        WHEN home_goal < away_goal THEN 'Home loss :('
        ELSE 'Tie'
    END AS outcome
FROM matches_spain;

-- Step 2: add the opponent's name.
SELECT
    m.date,
    t.team_long_name AS opponent,
    CASE
        WHEN m.home_goal > m.away_goal THEN 'Home win!'
        WHEN m.home_goal < m.away_goal THEN 'Home loss :('
        ELSE 'Tie'
    END AS outcome
FROM matches_spain AS m
-- joining on the away team id is what identifies the opponent
LEFT JOIN teams_spain AS t
    ON m.awayteam_id = t.team_api_id;

-- Step 3: keep only the matches where Barcelona (8634) is the home team.
SELECT
    m.date,
    t.team_long_name AS opponent,
    CASE
        WHEN m.home_goal > m.away_goal THEN 'Barcelona win!'
        WHEN m.home_goal < m.away_goal THEN 'Barcelona loss :('
        ELSE 'Tie'
    END AS outcome
FROM matches_spain AS m
LEFT JOIN teams_spain AS t
    ON m.awayteam_id = t.team_api_id
WHERE m.hometeam_id = 8634;

## CASE - compare 2 column values

In [None]:
-- Matches where Barcelona (8634) was the away team.
-- The comparisons are mirrored: Barcelona wins when the away side outscores
-- the home side, and the opponent is now the home team.
SELECT
    m.date,
    t.team_long_name AS opponent,
    CASE
        WHEN m.home_goal < m.away_goal THEN 'Barcelona win!'
        WHEN m.home_goal > m.away_goal THEN 'Barcelona loss :('
        ELSE 'Tie'
    END AS outcome
FROM matches_spain AS m
LEFT JOIN teams_spain AS t
    ON m.hometeam_id = t.team_api_id
WHERE m.awayteam_id = 8634;

## CASE WHEN...AND (inside WHEN clause) - for multiple conditions
- may need `WHERE` clause for specific filters

In [None]:
-- Outcome of every Chelsea (8455) match. Each WHEN pairs "which side was
-- Chelsea" with "which side scored more"; everything else falls to ELSE.
SELECT
    date,
    hometeam_id,
    awayteam_id,
    CASE
        WHEN hometeam_id = 8455 AND home_goal > away_goal THEN 'Chelsea home win!'
        WHEN awayteam_id = 8455 AND home_goal < away_goal THEN 'Chelsea away win!'
        ELSE 'Loss or tie :('
    END AS outcome
FROM match
-- without this filter, matches not involving Chelsea would be labelled 'Loss or tie :('
WHERE hometeam_id = 8455
    OR awayteam_id = 8455;

In [None]:
-- Matches between Barcelona (8634) and Real Madrid (8633).
-- The WHERE clause guarantees both teams take part, so "not Barcelona"
-- can safely be labelled as Real Madrid in the home/away columns.
SELECT
    date,
    CASE
        WHEN hometeam_id = 8634 THEN 'FC Barcelona'
        ELSE 'Real Madrid CF'
    END AS home,
    CASE
        WHEN awayteam_id = 8634 THEN 'FC Barcelona'
        ELSE 'Real Madrid CF'
    END AS away,
    -- four winning combinations; anything left over is a tie
    CASE
        WHEN home_goal > away_goal AND hometeam_id = 8634 THEN 'Barcelona win!'
        WHEN home_goal > away_goal AND hometeam_id = 8633 THEN 'Real Madrid win!'
        WHEN home_goal < away_goal AND awayteam_id = 8634 THEN 'Barcelona win!'
        WHEN home_goal < away_goal AND awayteam_id = 8633 THEN 'Real Madrid win!'
        ELSE 'Tie!'
    END AS outcome
FROM matches_spain
WHERE (awayteam_id = 8634 OR hometeam_id = 8634)
    AND (awayteam_id = 8633 OR hometeam_id = 8633);

## CASE WHEN...AND...WHERE - drop NULL
- you can use the CASE statement as a filtering column like any other column in your database. The only difference is that you don't alias the statement in `WHERE`.
- do not alias the CASE

In [None]:
-- Keep only the matches Chelsea (8455) won.
-- The CASE is repeated in WHERE (without an alias) and deliberately has no
-- ELSE: any match that is not a Chelsea win evaluates to NULL and is
-- removed by IS NOT NULL. With an ELSE the CASE is never NULL and the
-- filter would keep every row.
SELECT
    date,
    hometeam_id,
    awayteam_id,
    CASE
        WHEN hometeam_id = 8455 AND home_goal > away_goal THEN 'Chelsea home win!'
        WHEN awayteam_id = 8455 AND home_goal < away_goal THEN 'Chelsea away win!'
    END AS outcome
FROM match
WHERE
    CASE
        WHEN hometeam_id = 8455 AND home_goal > away_goal THEN 'Chelsea home win!'
        WHEN awayteam_id = 8455 AND home_goal < away_goal THEN 'Chelsea away win!'
    END IS NOT NULL;

In [None]:
-- Matches won by Bologna (9857), at home or away.
-- The CASE has no ELSE, so every other match yields NULL and is
-- filtered out by IS NOT NULL.
SELECT
    season,
    date,
    home_goal,
    away_goal
FROM matches_italy
WHERE
    CASE
        WHEN hometeam_id = 9857 AND home_goal > away_goal THEN 'Bologna Win'
        WHEN awayteam_id = 9857 AND away_goal > home_goal THEN 'Bologna Win'
    END IS NOT NULL;

## CASE WHEN with aggregate functions
- like COUNT, SUM, AVG

### COUNT(CASE WHEN ...) AS alias

In [None]:
-- Home and away wins per season for team 8650.
-- COUNT only counts non-NULL values, so what THEN returns is irrelevant
-- (a number or a string both work); rows that match no WHEN are NULL
-- and are not counted.
SELECT
    season,
    COUNT(
        CASE
            WHEN hometeam_id = 8650 AND home_goal > away_goal THEN 54321
        END
    ) AS home_wins,
    COUNT(
        CASE
            WHEN awayteam_id = 8650 AND away_goal > home_goal THEN 'Some random text'
        END
    ) AS away_wins
FROM match
GROUP BY season;

### SUM(CASE WHEN ...) AS alias
- In SQL, `SUM` cannot add up true/false conditions directly; turn each condition into a number first with `CASE WHEN ... THEN 1 ELSE 0 END`
- R/Python can sum logical values like True/False

In [None]:
-- Goals scored by team 8650 per season, split into home and away.
-- With no ELSE, non-matching rows yield NULL, which SUM ignores.
SELECT
    season,
    SUM(CASE WHEN hometeam_id = 8650 THEN home_goal END) AS home_goals,
    SUM(CASE WHEN awayteam_id = 8650 THEN away_goal END) AS away_goals
FROM match
GROUP BY season;

In [None]:
-- Home wins per country, one column per season.
-- SUM cannot add up a true/false condition directly, so each CASE turns
-- the condition into 1 (home win in that season) or 0 before summing.
SELECT
    c.name AS country,
    SUM(
        CASE
            WHEN m.season = '2012/2013' AND m.home_goal > m.away_goal THEN 1
            ELSE 0
        END
    ) AS matches_2012_2013,
    SUM(
        CASE
            WHEN m.season = '2013/2014' AND m.home_goal > m.away_goal THEN 1
            ELSE 0
        END
    ) AS matches_2013_2014,
    SUM(
        CASE
            WHEN m.season = '2014/2015' AND m.home_goal > m.away_goal THEN 1
            ELSE 0
        END
    ) AS matches_2014_2015
FROM country AS c
LEFT JOIN match AS m
    ON c.id = m.country_id
-- group by the country name alias
GROUP BY country;

### ROUND(AVG(CASE WHEN...

In [None]:
-- Average goals per match for team 8650 by season, rounded to 2 decimals.
-- Rows where the team is not on that side are NULL and left out of the average.
SELECT
    season,
    ROUND(
        AVG(CASE WHEN hometeam_id = 8650 THEN home_goal END),
        2
    ) AS home_goals,
    ROUND(
        AVG(CASE WHEN awayteam_id = 8650 THEN away_goal END),
        2
    ) AS away_goals
FROM match
GROUP BY season;

### Percentages with CASE and AVG

In [None]:
-- Share of wins for team 8455 per season, at home and away.
-- AVG over 1 (win) and 0 (loss) gives the win fraction. Rows that match
-- neither WHEN (ties, or matches without this team on that side) are NULL
-- and are left out of the average.
SELECT
    season,
    AVG(
        CASE
            WHEN hometeam_id = 8455 AND home_goal > away_goal THEN 1
            WHEN hometeam_id = 8455 AND home_goal < away_goal THEN 0
        END
    ) AS pct_homewins,
    AVG(
        CASE
            WHEN awayteam_id = 8455 AND away_goal > home_goal THEN 1
            WHEN awayteam_id = 8455 AND away_goal < home_goal THEN 0
        END
    ) AS pct_awaywins
FROM match
GROUP BY season;

In [None]:
-- Fraction of tied games per country for two seasons, rounded to 2 decimals.
-- 1 = tie, 0 = not a tie; matches from other seasons are NULL and ignored by AVG.
SELECT
    c.name AS country,
    ROUND(
        AVG(
            CASE
                WHEN m.season = '2013/2014' AND m.home_goal = m.away_goal THEN 1
                WHEN m.season = '2013/2014' AND m.home_goal <> m.away_goal THEN 0
            END
        ),
        2
    ) AS pct_ties_2013_2014,
    ROUND(
        AVG(
            CASE
                WHEN m.season = '2014/2015' AND m.home_goal = m.away_goal THEN 1
                WHEN m.season = '2014/2015' AND m.home_goal <> m.away_goal THEN 0
            END
        ),
        2
    ) AS pct_ties_2014_2015
FROM country AS c
LEFT JOIN matches AS m
    ON c.id = m.country_id
GROUP BY country;

# Subqueries - extract and transform data
- subquery = nested query
- also in SQL_joins notebook
- subqueries are found most commonly in `WHERE`, then `SELECT`, then `FROM`
- also in `GROUP BY` clause

Application example
- comparing groups to summarized values
- reshaping data - highest monthly avg in some group
- combining data that can not be joined - get both home and away team names in a table to match results

## Simple subqueries - in the `WHERE` clause

In [None]:
-- 2012/2013 matches whose home goals beat the average home goals.
-- The subquery has no season filter, so the average is over all of match.
SELECT
    date,
    hometeam_id,
    awayteam_id,
    home_goal,
    away_goal
FROM match
WHERE season = '2012/2013'
    AND home_goal > (
        SELECT AVG(home_goal)
        FROM match
    );

## Simple subquery - filtering list with IN

In [None]:
-- Teams that appear as the home team in any match with country_id 15722.
-- The subquery produces the list of ids that IN filters against.
SELECT
    team_long_name,
    team_short_name AS abbr
FROM team
WHERE team_api_id IN (
    SELECT hometeam_id
    FROM match
    WHERE country_id = 15722
);

## Subquery in `FROM` statement
Applications
- restructure and transform data
    - transform data from long to wide format
    - prefiltering data
- calculate aggregates of aggregates
    - ie. which 3 teams have the highest avg of home goals scored
        1. calculate AVG for each team
        1. get 3 highest of the AVG values

Note
- can create multiple subqueries in one `FROM` statement. 
    - Use alias for each subquery.
    - Join them
- can join a subquery to a table in `FROM`
    - include a joining column in both tables


In [None]:
-- Three teams with the highest average home goals in 2011/2012.
SELECT
    team,
    home_avg
FROM (
    -- per-team average of home goals for the season
    SELECT
        t.team_long_name AS team,
        AVG(m.home_goal) AS home_avg
    FROM match AS m
    LEFT JOIN team AS t
        ON m.hometeam_id = t.team_api_id
    WHERE season = '2011/2012'
    GROUP BY team
) AS subquery
ORDER BY home_avg DESC
LIMIT 3;

In [None]:
-- Matches with 10 or more goals in total.
-- total_goals is computed in the subquery so the outer WHERE can filter
-- on it; it is not part of the final output.
SELECT
    country,
    date,
    home_goal,
    away_goal
FROM (
    SELECT
        c.name AS country,
        m.date,
        m.home_goal,
        m.away_goal,
        (m.home_goal + m.away_goal) AS total_goals
    FROM match AS m
    LEFT JOIN country AS c
        ON m.country_id = c.id
) AS subquery
WHERE total_goals >= 10;

## Subquery in `SELECT`
Applications
- return single value
    - ie. include aggregate values to compare to individual values
- use in math calculations
    - ie. deviation from the avg

Note
- need to return a SINGLE value, else error
- REQUIRED = ensure all filters in the right place
    - properly filter both the main query and subquery (same WHERE clause in subquery and query)

In [None]:
-- Matches per season next to the overall match count.
-- The scalar subquery is not affected by GROUP BY, so it returns the
-- same single value on every row.
SELECT
    season,
    COUNT(id) AS matches,
    (SELECT COUNT(id) FROM match) AS total_matches
FROM match
GROUP BY season;

In [None]:
-- Each 2011/2012 match's total goals and its deviation from that season's average.
-- The season filter appears twice on purpose: the outer WHERE does not
-- apply inside the subquery.
SELECT
    date,
    (home_goal + away_goal) AS goals,
    (home_goal + away_goal) - (
        SELECT AVG(home_goal + away_goal)
        FROM match
        WHERE season = '2011/2012'
    ) AS diff
FROM match
WHERE season = '2011/2012';

In [None]:
-- Average goals per league in 2013/2014 beside the overall 2013/2014 average.
SELECT
    l.name AS league,
    -- league-level average (one value per group)
    ROUND(AVG(m.home_goal + m.away_goal), 2) AS avg_goals,
    -- overall average; needs its own season filter to be comparable
    (
        SELECT ROUND(AVG(home_goal + away_goal), 2)
        FROM match
        WHERE season = '2013/2014'
    ) AS overall_avg
FROM league AS l
LEFT JOIN match AS m
    ON l.country_id = m.country_id
WHERE season = '2013/2014'
GROUP BY league;

In [None]:
-- Calculation with a subquery in SELECT: how far each league's 2013/2014
-- average is from the overall 2013/2014 average.
SELECT
    l.name AS league,
    ROUND(AVG(m.home_goal + m.away_goal), 2) AS avg_goals,
    -- league average minus overall average, rounded after the subtraction
    ROUND(
        AVG(m.home_goal + m.away_goal) - (
            SELECT AVG(home_goal + away_goal)
            FROM match
            WHERE season = '2013/2014'
        ),
        2
    ) AS diff
FROM league AS l
LEFT JOIN match AS m
    ON l.country_id = m.country_id
-- same season filter as the subquery
WHERE season = '2013/2014'
GROUP BY l.name;

## Subquery best practice
1. Format your queries
    - line up `SELECT`, `FROM`, `WHERE`, and `GROUP BY`
1. Annotate your queries with comments
    - multi-line comments with `/* ... */`
    - inline comments with `--`
1. Indent your subquery
1. Resources
    - Holywell's SQL style guide
1. Is subquery necessary
    - requires computing power
    - longer time for bigger database, bigger table
1. Match filter for each subquery
    - WHERE statement should match from subquery to query
    

## Subquery example - subquery each in SELECT, FROM, and WHERE clause

In [None]:
-- Subqueries in FROM and WHERE: 2012/2013 stages whose average goals
-- exceed the overall 2012/2013 average.
SELECT
    s.stage,
    ROUND(s.avg_goals, 2) AS avg_goals
FROM (
    -- average goals per stage in 2012/2013
    SELECT
        stage,
        AVG(home_goal + away_goal) AS avg_goals
    FROM match
    WHERE season = '2012/2013'
    GROUP BY stage
) AS s
WHERE s.avg_goals > (
    -- overall average for the same season
    SELECT AVG(home_goal + away_goal)
    FROM match
    WHERE season = '2012/2013'
);

In [None]:
-- Subqueries in all 3 clauses: above-average 2012/2013 stages, with the
-- overall average shown alongside.
SELECT
    s.stage,
    ROUND(s.avg_goals, 2) AS avg_goal,
    -- subquery in SELECT: overall average for 2012/2013
    (
        SELECT AVG(home_goal + away_goal)
        FROM match
        WHERE season = '2012/2013'
    ) AS overall_avg
FROM (
    -- subquery in FROM: average goals per stage in 2012/2013
    SELECT
        stage,
        AVG(home_goal + away_goal) AS avg_goals
    FROM match
    WHERE season = '2012/2013'
    GROUP BY stage
) AS s
WHERE s.avg_goals > (
    -- subquery in WHERE: the same overall average, used as the threshold
    SELECT AVG(home_goal + away_goal)
    FROM match
    WHERE season = '2012/2013'
);

# Correlated Queries, Nested Queries, and Common Table Expressions

# Correlated subquery
- uses values from outer query to generate result
- re-run for every row generated in final data set
- used for advanced joining, filtering, and evaluating data

Correlated subqueries (vs Simple subquery)
- Dependent on the main query to execute (vs a simple subquery, which can run independently of the main query)
- evaluated in loops (vs once in whole query)
    - significantly slows down query runtime


In [None]:
-- Correlated subquery example: average total goals per match for every country.
-- No GROUP BY is used: the subquery already returns one value per row of
-- country, and grouping by the name would leave c.id ungrouped.
SELECT
    c.name AS country,
    (
        SELECT AVG(m.home_goal + m.away_goal)
        FROM match AS m
        -- correlated part: c.id comes from the current row of the outer query
        WHERE m.country_id = c.id
    ) AS avg_goals
FROM country AS c;

In [None]:
-- Matches whose total goals exceed 3 times the average total goals
-- of the same country.
SELECT
    main.country_id,
    main.date,
    main.home_goal,
    main.away_goal
FROM match AS main
WHERE (home_goal + away_goal) > (
    SELECT AVG((sub.home_goal + sub.away_goal) * 3)
    FROM match AS sub
    -- correlated part: restrict the average to the outer row's country
    WHERE main.country_id = sub.country_id
);

## Correlated subquery with multiple conditions

In [None]:
/*
In this exercise, you're going to add an additional column
for matching to answer the question -- what was the
highest scoring match for each country, in each season?
*/
SELECT
    main.country_id,
    main.date,
    main.home_goal,
    main.away_goal
FROM match AS main
-- keep a match only if its total equals the maximum total
-- for the same country and the same season
WHERE (home_goal + away_goal) = (
    SELECT MAX(sub.home_goal + sub.away_goal)
    FROM match AS sub
    WHERE main.country_id = sub.country_id
        AND main.season = sub.season
);

# Nested subquery
- can be correlated or uncorrelated, or combo of both


In [None]:
-- Total goals per calendar month and the difference from the average
-- monthly total.
SELECT
    EXTRACT(MONTH FROM date) AS month,
    SUM(m.home_goal + m.away_goal) AS total_goals,
    SUM(m.home_goal + m.away_goal) - (
        -- outer subquery: average of the monthly totals
        SELECT AVG(goals)
        FROM (
            -- inner subquery: total goals for each month
            SELECT
                EXTRACT(MONTH FROM date) AS month,
                SUM(home_goal + away_goal) AS goals
            FROM match
            GROUP BY month
        ) AS s
    ) AS diff
FROM match AS m
GROUP BY month;

In [None]:
/*
Nested subquery to examine the highest total number of goals
in each season, overall, and during July across all seasons.
*/
SELECT
    season,
    -- highest total in a match within this season
    MAX(home_goal + away_goal) AS max_goals,
    -- highest total in any match (same value on every row)
    (SELECT MAX(home_goal + away_goal) FROM match) AS overall_max_goals,
    -- highest total in any match played in July; the inner subquery
    -- supplies the ids of the July matches
    (
        SELECT MAX(home_goal + away_goal)
        FROM match
        WHERE id IN (
            SELECT id
            FROM match
            WHERE EXTRACT(MONTH FROM date) = 07
        )
    ) AS july_max_goals
FROM match
GROUP BY season;

In [None]:
-- Per country: average number of high-scoring matches per season, where
-- "high-scoring" means either side scored 5 or more.
SELECT
    c.name AS country,
    AVG(outer_s.matches) AS avg_seasonal_high_scores
FROM country AS c
LEFT JOIN (
    -- outer subquery: count the high-scoring matches per country and season
    SELECT
        country_id,
        season,
        COUNT(id) AS matches
    FROM (
        -- inner subquery: the high-scoring matches themselves
        SELECT
            country_id,
            season,
            id
        FROM match
        WHERE home_goal >= 5
            OR away_goal >= 5
    ) AS inner_s
    GROUP BY
        country_id,
        season
) AS outer_s
    ON c.id = outer_s.country_id
GROUP BY country;

# Common Table Expressions (CTE)
- table is declared before the main query
- named and referenced later in `FROM` statement

CTE Advantages
- executed once - CTE stored in memory and improves query performance
- improve organization of queries
- reference other CTEs (ie. 3rd CTE can reference 1st and 2nd CTE)
- CTE can reference itself (SELF JOIN) = recursive CTE


In [None]:
/*
CTE syntax template (placeholders only, not runnable as-is):

WITH cte_name AS (
    SELECT ...
    FROM ...
    WHERE ...
),
-- a further CTE is separated by a comma; WITH is not repeated
cte_name2 AS (
    SELECT ...
    FROM ...
)
-- main query
SELECT ...
FROM ...
INNER JOIN cte_name
    ON ...
INNER JOIN cte_name2
    ON ...
GROUP BY ...;
*/

In [None]:
-- Minimal CTE: declare the named result first, then query it like a table.
-- some_table, col1 and col2 are placeholders (TABLE itself is a reserved
-- word and cannot be used as an unquoted table name).
WITH cte AS (
    SELECT
        col1,
        col2
    FROM some_table
)
SELECT
    AVG(col1) AS avg_col
FROM cte;

In [None]:
-- match_list: the matches with 10 or more total goals.
WITH match_list AS (
    SELECT
        country_id,
        id
    FROM match
    WHERE (home_goal + away_goal) >= 10
)
-- Number of such matches per league. With the LEFT JOIN, a league without
-- any still appears, with a count of 0.
SELECT
    l.name AS league,
    COUNT(match_list.id) AS matches
FROM league AS l
LEFT JOIN match_list
    ON l.id = match_list.country_id
GROUP BY l.name;

In [None]:
-- match_list: every match with its league name and total goals.
WITH match_list AS (
    SELECT
        l.name AS league,
        m.date,
        m.home_goal,
        m.away_goal,
        (m.home_goal + m.away_goal) AS total_goals
    FROM match AS m
    LEFT JOIN league AS l
        ON m.country_id = l.id
)
-- Keep the matches with 10 or more goals; total_goals is only used as the filter.
SELECT
    league,
    date,
    home_goal,
    away_goal
FROM match_list
WHERE total_goals >= 10;

In [None]:
-- CTE with a nested subquery.
-- match_list: total goals of the matches played in August of 2013/2014;
-- the nested subquery supplies the ids of those matches.
WITH match_list AS (
    SELECT
        country_id,
        (home_goal + away_goal) AS goals
    FROM match
    WHERE id IN (
        SELECT id
        FROM match
        WHERE season = '2013/2014'
            AND EXTRACT(MONTH FROM date) = 08
    )
)
-- Average goals per league over the matches in the CTE.
SELECT
    l.name,
    AVG(match_list.goals)
FROM league AS l
LEFT JOIN match_list
    ON l.id = match_list.country_id
GROUP BY l.name;

# Differentiating techniques
Joins
- combine 2+ tables
- simple operations/aggregations

Correlated subquery - great for matching columns in multiple tables
- match subqueries and tables
- avoid limits of join (you can't join 2 separate columns in 1 table to a single column in another table)
- BUT take a long time to process

Multiple/Nested subquery - requires multiple steps to transform
- multi-step transformations
- improve accuracy and reproducibility

CTE
- organize subqueries sequentially
- can reference other CTEs

Which technique to use?
- depends on database and question
- use and reuse queries
- clear and accurate results


Following examples show 3 techniques for same query result


In [None]:
-- Get team names with subqueries in FROM.
-- match holds two team ids per row, so the team table is joined in twice:
-- once through a "home" subquery and once through an "away" subquery.
SELECT
    m.date,
    home.hometeam,
    away.awayteam,
    m.home_goal,
    m.away_goal
FROM match AS m
-- home team name for each match id
LEFT JOIN (
    SELECT
        match.id,
        team.team_long_name AS hometeam
    FROM match
    LEFT JOIN team
        ON match.hometeam_id = team.team_api_id
) AS home
    ON home.id = m.id
-- away team name for each match id
LEFT JOIN (
    SELECT
        match.id,
        team.team_long_name AS awayteam
    FROM match
    LEFT JOIN team
        ON match.awayteam_id = team.team_api_id
) AS away
    ON away.id = m.id;

In [None]:
-- Get team names with correlated subqueries in SELECT.
-- Each subquery looks up one team name using an id from the current match row.
SELECT
    m.date,
    (
        SELECT team_long_name
        FROM team AS t
        WHERE t.team_api_id = m.hometeam_id
    ) AS hometeam,
    (
        SELECT team_long_name
        FROM team AS t
        WHERE t.team_api_id = m.awayteam_id
    ) AS awayteam,
    home_goal,
    away_goal
FROM match AS m;

In [None]:
-- Get team names with CTEs.
-- home: each match with its home team name and home goals.
WITH home AS (
    SELECT
        m.id,
        m.date,
        t.team_long_name AS hometeam,
        m.home_goal
    FROM match AS m
    LEFT JOIN team AS t
        ON m.hometeam_id = t.team_api_id
),
-- away: each match with its away team name and away goals.
away AS (
    SELECT
        m.id,
        m.date,
        t.team_long_name AS awayteam,
        m.away_goal
    FROM match AS m
    LEFT JOIN team AS t
        ON m.awayteam_id = t.team_api_id
)
-- Put the two halves of each match back together on the match id.
SELECT
    home.date,
    home.hometeam,
    away.awayteam,
    home.home_goal,
    away.away_goal
FROM home
INNER JOIN away
    ON home.id = away.id;

# Window Functions - `OVER` clause
- how to pass aggregate functions along a dataset
    - similar to subquery in SELECT
- calculate running totals and partitioned averages
- window = perform calculations on an already generated result set
- Aggregate calculations - ie. running totals, rankings, moving averages
- Unlike a subquery in SELECT, your window function will apply the filter that you include in your WHERE clause.

Order of process
- window function processed after every part of query except `ORDER BY`
    - so it uses the result set (instead of database)
- available in: PostgreSQL, Oracle, MySQL, SQL Server...
    - BUT NOT SQLite

## `OVER` clause

In [None]:
-- Each 2011/2012 match's goals next to the average over the result set.
-- An empty OVER() makes the window the whole result set, and the window
-- is computed after WHERE, so the average covers 2011/2012 only.
SELECT
    date,
    (home_goal + away_goal) AS goals,
    AVG(home_goal + away_goal) OVER() AS overall_avg
FROM match
WHERE season = '2011/2012';

In [None]:
-- Every match with its country name and, repeated on each row, the
-- average total goals across all rows of the result.
SELECT
    m.id,
    c.name AS country,
    m.season,
    m.home_goal,
    m.away_goal,
    -- window over the full result set: no grouping, rows are not collapsed
    AVG(m.home_goal + m.away_goal) OVER() AS overall_avg
FROM match AS m
LEFT JOIN country AS c
    ON m.country_id = c.id;

## Generate a `RANK` - `RANK() OVER(ORDER BY...) AS...`
- default order is smallest to largest value

In [None]:
-- Descending rank: the highest-scoring 2011/2012 match gets rank 1.
-- RANK() gives tied rows the same rank and skips the following numbers.
SELECT
    date,
    (home_goal + away_goal) AS goals,
    RANK() OVER(ORDER BY home_goal + away_goal DESC) AS goals_rank
FROM match
WHERE season = '2011/2012';

In [None]:
-- Ascending rank: the league with the lowest 2011/2012 average gets rank 1.
SELECT
    l.name AS league,
    AVG(m.home_goal + m.away_goal) AS avg_goals,
    -- the window runs after GROUP BY, so it ranks the per-league averages
    RANK() OVER(ORDER BY AVG(m.home_goal + m.away_goal)) AS league_rank
FROM league AS l
LEFT JOIN match AS m
    ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
ORDER BY league_rank;

In [None]:
-- Descending rank: the league with the highest 2011/2012 average gets rank 1.
SELECT
    l.name AS league,
    AVG(m.home_goal + m.away_goal) AS avg_goals,
    -- DESC inside OVER() reverses the ranking, not the output order
    RANK() OVER(ORDER BY AVG(m.home_goal + m.away_goal) DESC) AS league_rank
FROM league AS l
LEFT JOIN match AS m
    ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
-- the final ORDER BY is what sorts the rows by rank
ORDER BY league_rank;

# Window Partitions - `OVER(PARTITION BY column_to_apply)`
- calculate separate values for different categories
- calculate different calculations in the same column

Considerations
- can partition data by 1 or more columns
- can partition aggregate calculations, ranks, etc

PARTITION BY multiple columns
- You can even calculate the information you want to use to partition your data! For example, you can calculate average goals scored by season and by country, or by the calendar year (taken from the date column).

In [None]:
-- Compare each match's total goals with the average total for its season.
-- PARTITION BY season restarts the average for every distinct season value.
SELECT
    date,
    (home_goal + away_goal) AS goals,
    -- Average the same quantity shown in `goals` (home + away) so the two
    -- columns are directly comparable
    AVG(home_goal + away_goal) OVER(PARTITION BY season) AS season_avg
FROM match;

In [None]:
-- Legia Warszawa (team id 8673) matches, highest-scoring first, with the
-- per-season average home and away goals of those matches on every row.
SELECT
    date,
    season,
    home_goal,
    away_goal,
    -- Every row kept by the filter has 8673 on one side, so any row where
    -- it is not the home team is labelled 'away'
    CASE hometeam_id
        WHEN 8673 THEN 'home'
        ELSE 'away'
    END AS warsaw_location,
    -- Averages restart for each season
    AVG(home_goal) OVER (PARTITION BY season) AS season_homeavg,
    AVG(away_goal) OVER (PARTITION BY season) AS season_awayavg
FROM match
-- Keep only matches where 8673 played at home or away
WHERE 8673 IN (hometeam_id, awayteam_id)
ORDER BY (home_goal + away_goal) DESC;

## Partition by Multiple Columns

In [None]:
-- Show each match's total goals next to the average total for the same
-- season AND country (the window is partitioned by both columns).
SELECT
    c.name,
    m.season,
    (m.home_goal + m.away_goal) AS goals,
    AVG(m.home_goal + m.away_goal)
        OVER(PARTITION BY m.season, c.name) AS season_ctry_avg
FROM country AS c
LEFT JOIN match AS m
    ON c.id = m.country_id;

In [None]:
-- Matches involving team 8673, highest-scoring first, with average home and
-- away goals computed per (season, calendar month) group.
SELECT
    date,
    season,
    home_goal,
    away_goal,
    CASE
        WHEN hometeam_id = 8673 THEN 'home'
        ELSE 'away'
    END AS warsaw_location,
    -- Both averages share the named window defined below
    AVG(home_goal) OVER season_month AS season_mo_home,
    AVG(away_goal) OVER season_month AS season_mo_away
FROM match
WHERE hometeam_id = 8673
   OR awayteam_id = 8673
-- One partition per season and month number extracted from the match date
WINDOW season_month AS (
    PARTITION BY season, EXTRACT(MONTH FROM date)
)
ORDER BY (home_goal + away_goal) DESC;

# Sliding Windows - think running totals
- calculate changes with each row in the data set
- sliding windows = perform calculations relative to the current row
- ie. can be used to calculate running totals, sums, avg, etc
- can be partitioned by one or more columns

Syntax
- some calculation like
    - `SUM(data) OVER(ORDER BY col ROWS ...) AS alias`
- `ROWS BETWEEN <start> AND <finish>` # slice of rows
- keywords for start and finish parameters
    - `PRECEDING` # rows before current row
        - ie. `1 PRECEDING` # 1 row before
    - `FOLLOWING` # rows after current row
    - `UNBOUNDED PRECEDING` # every row since the beginning
    - `UNBOUNDED FOLLOWING`
    - `CURRENT ROW` # stop at current row


In [None]:
/*
Complete the window function by:
- Assessing the running total of home goals scored by FC Utrecht.
- Assessing the running average of home goals scored.
- Ordering both the running average and running total by date.
*/
SELECT
    date,
    home_goal,
    away_goal,
    -- Create a running total and running average of home goals: each frame
    -- runs from the first row (earliest date) up to the current row
    SUM(home_goal) OVER(ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total,
    AVG(home_goal) OVER(ORDER BY date
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_avg
FROM match
WHERE
    hometeam_id = 9908
    AND season = '2011/2012';

In [None]:
/*
Modify the query from the previous exercise by sorting
the data set in reverse order and calculating a
backward running total from the CURRENT ROW to the
end of the data set (earliest record).
*/
SELECT
    -- Select the date, home goal, and away goals
    date,
    home_goal,
    away_goal,
    -- Create a running total and running average of home goals: the window
    -- is ordered newest-first, so CURRENT ROW .. UNBOUNDED FOLLOWING covers
    -- the current match and every earlier one
    SUM(home_goal) OVER(ORDER BY date DESC
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total,
    AVG(home_goal) OVER(ORDER BY date DESC
        ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg
FROM match
WHERE
    -- Matches where team 9908 was the away side
    awayteam_id = 9908
    AND season = '2011/2012';

# Final exercise
Steps to construct the query
- get team names with CTEs
- get match outcome with `CASE` statements
- determine how badly they lost with a window function

Data source - European Soccer Database

## Identify home team in a match for CTE
- first task is to create the first query that filters for matches where Manchester United played as the home team. This will become a common table expression in a later exercise.

In [None]:
-- Manchester United home matches in 2014/2015, each labelled with the
-- result from Manchester United's point of view.
SELECT
    m.id,
    t.team_long_name,
    -- The joined team is the home side, so more home goals means a win
    CASE
        WHEN m.home_goal > m.away_goal THEN 'MU Win'
        WHEN m.home_goal < m.away_goal THEN 'MU Loss'
        ELSE 'Tie'
    END AS outcome
FROM match AS m
-- Resolve the home team's name through its API id
LEFT JOIN team AS t
    ON t.team_api_id = m.hometeam_id
WHERE t.team_long_name = 'Manchester United'
  AND m.season = '2014/2015';

## Identify away team CTE
- use reverse logic
- primary difference in this query is that you will be joining the tables on awayteam_id, and reversing the match outcomes in the CASE statement.
- When altering CASE statement logic in your own work, you can reverse either the logical condition (i.e., home_goal > away_goal) or the outcome in THEN -- just make sure you only reverse one of the two!

In [None]:
-- Manchester United away matches in 2014/2015, each labelled with the
-- result from Manchester United's point of view.
SELECT
    m.id,
    t.team_long_name,
    -- The joined team is the away side, so the labels are reversed:
    -- more home goals means Manchester United lost
    CASE
        WHEN m.home_goal > m.away_goal THEN 'MU Loss'
        WHEN m.home_goal < m.away_goal THEN 'MU Win'
        ELSE 'Tie'
    END AS outcome
FROM match AS m
-- Resolve the away team's name through its API id
LEFT JOIN team AS t
    ON t.team_api_id = m.awayteam_id
WHERE t.team_long_name = 'Manchester United'
  AND m.season = '2014/2015';

## Putting 2 CTEs together
Now that you've created the two subqueries identifying the home and away team opponents, it's time to rearrange your query with the `home` and `away` subqueries as Common Table Expressions (CTEs). You'll notice that the main query includes the phrase, `SELECT DISTINCT`. Without identifying only `DISTINCT` matches, you will return a duplicate record for each game played.

Continue building the query to extract all matches played by Manchester United in the 2014/2015 season.

In [None]:
-- All 2014/2015 matches played by Manchester United, home or away, with
-- both team names, the date and the score.

-- Set up the home team CTE: one row per match with the home team's name
-- and the outcome seen from the home side
WITH home AS (
    SELECT
        m.id,
        t.team_long_name,
        CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
             WHEN m.home_goal < m.away_goal THEN 'MU Loss'
             ELSE 'Tie' END AS outcome
    FROM match AS m
    LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id
),
-- Set up the away team CTE: same shape, but the labels are reversed because
-- the joined team is the away side (more home goals means it lost)
away AS (
    SELECT
        m.id,
        t.team_long_name,
        CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
             WHEN m.home_goal < m.away_goal THEN 'MU Win'
             ELSE 'Tie' END AS outcome
    FROM match AS m
    LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id
)
-- Select team names, the date and goals
SELECT DISTINCT
    m.date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal,
    m.away_goal
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND (home.team_long_name = 'Manchester United'
           OR away.team_long_name = 'Manchester United');

## Add a window function
- one final component of the question left -- how badly did Manchester United lose in each match?
- In order to determine this, let's add a window function to the main query that ranks matches by the absolute value of the difference between home_goal and away_goal. This allows us to directly compare the difference in scores without having to consider whether Manchester United played as the home or away team!

In [None]:
-- Manchester United's 2014/2015 losses, ranked by how large the goal
-- difference was (rank 1 = widest margin).

-- Set up the home team CTE: outcome seen from the home side
WITH home AS (
    SELECT
        m.id,
        t.team_long_name,
        CASE WHEN m.home_goal > m.away_goal THEN 'MU Win'
             WHEN m.home_goal < m.away_goal THEN 'MU Loss'
             ELSE 'Tie' END AS outcome
    FROM match AS m
    LEFT JOIN team AS t ON m.hometeam_id = t.team_api_id
),
-- Set up the away team CTE: labels reversed because the joined team is the
-- away side
away AS (
    SELECT
        m.id,
        t.team_long_name,
        CASE WHEN m.home_goal > m.away_goal THEN 'MU Loss'
             WHEN m.home_goal < m.away_goal THEN 'MU Win'
             ELSE 'Tie' END AS outcome
    FROM match AS m
    LEFT JOIN team AS t ON m.awayteam_id = t.team_api_id
)
-- Select columns and rank the matches by absolute goal difference
SELECT DISTINCT
    m.date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_goal,
    m.away_goal,
    -- ABS() makes the margin comparable whether the loss was at home or away
    RANK() OVER(ORDER BY ABS(m.home_goal - m.away_goal) DESC) AS match_rank
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      -- Keep only matches Manchester United lost, on either side
      AND ((home.team_long_name = 'Manchester United'
            AND home.outcome = 'MU Loss')
      OR (away.team_long_name = 'Manchester United'
          AND away.outcome = 'MU Loss'));