# SQL notes

# SELECT * and LIMIT

In [None]:
-- Preview every column of people, capped at 10 rows.
-- There is no ORDER BY, so which 10 rows come back is unspecified.
SELECT
    *
FROM people
LIMIT 10;

In [None]:
-- Name the wanted columns explicitly instead of using SELECT *.
SELECT
    title,
    release_year,
    country
FROM films;

# SELECT COUNT(DISTINCT... 
- `COUNT(column)` will not include NULL values; `COUNT(*)` counts every row

In [None]:
-- SELECT DISTINCT removes duplicate values from the result.
SELECT DISTINCT country
FROM films;

In [None]:
-- SELECT COUNT

-- COUNT(*) returns the number of rows, including rows that contain NULLs.
SELECT COUNT(*)
FROM people;

-- COUNT(column) counts only the rows where that column is not NULL.
SELECT COUNT(birthdate)
FROM people;

-- COUNT(DISTINCT column) counts the distinct non-NULL values of the column.
SELECT COUNT(DISTINCT birthdate)
FROM people;



# Filtering with WHERE
- after the FROM statement
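- Important: in PostgreSQL (the version of SQL we're using), text values compared in a WHERE clause must be wrapped in single quotes (double quotes are reserved for identifiers such as column names).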

WHERE keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:

'=' equal

'<>' not equal - Note that in this course we will use <> and not != for the not equal operator, as per the SQL standard.

'<' less than

'>' greater than

'<=' less than or equal to

'>=' greater than or equal to

In [None]:
-- The text value being compared is written in single quotes.
SELECT
    title
FROM films
WHERE title = 'Metropolis';

In [None]:
-- Exact-match filter on a text column using the = operator.
SELECT
    title
FROM films
WHERE title = 'Metropolis';

# WHERE AND

In [None]:
-- AND keeps a row only when both conditions hold
-- (release_year strictly between 1994 and 2000).
SELECT
    title
FROM films
WHERE release_year > 1994
  AND release_year < 2000;

# WHERE OR, WHERE (x OR y) AND (a OR b)

In [None]:
-- OR keeps a row when at least one of the conditions holds.
SELECT
    title
FROM films
WHERE release_year = 1994
   OR release_year = 2000;

In [None]:
-- When mixing AND and OR, wrap each OR group in parentheses so that the
-- grouping is explicit (AND binds more tightly than OR).
SELECT title
FROM films
WHERE (release_year = 1994 OR release_year = 1995)
  AND (certification = 'PG' OR certification = 'R');

In [None]:
-- Three conditions combined with AND: a release_year range, a language
-- choice (grouped with OR inside parentheses) and a minimum gross.
SELECT
    title,
    release_year
FROM films
WHERE (release_year > 1989 AND release_year < 2000)
  AND (language = 'French' OR language = 'Spanish')
  AND gross > 2000000;

# BETWEEN
- inclusive
- Similar to the WHERE clause, the BETWEEN clause can be used with multiple AND and OR operators.

In [None]:
-- BETWEEN is inclusive: both 1994 and 2000 themselves match.
SELECT
    title
FROM films
WHERE release_year BETWEEN 1994 AND 2000;

# WHERE IN
- The IN operator allows you to specify multiple values in a WHERE clause, making it easier and quicker to specify multiple OR conditions!

In [None]:
-- IN matches any value in the list; shorthand for a chain of OR comparisons.
SELECT
    name
FROM kids
WHERE age IN (2, 4, 6, 8, 10);

In [None]:
-- IN lists exact values: only 1990 and 2000 match, not the years in between.
SELECT
    title,
    release_year
FROM films
WHERE release_year IN (1990, 2000)
  AND duration > 120;

# NULL and IS NULL, IS NOT NULL

In [None]:
-- Count the rows whose birthdate is missing.
-- NULL must be tested with IS NULL; "= NULL" never evaluates to true.
SELECT COUNT(*)
FROM people
WHERE birthdate IS NULL;

In [None]:
-- Keep only the rows that do have a birthdate recorded.
SELECT
    name
FROM people
WHERE birthdate IS NOT NULL;

# LIKE and NOT LIKE

Sometimes you'll want to search for a pattern rather than a specific text string.

In SQL, the LIKE operator can be used in a WHERE clause to search for a pattern in a column. To accomplish this, you use something called a wildcard as a placeholder for some other values. There are two wildcards you can use with LIKE:

The `%` wildcard will match zero, one, or many characters in text. For example, the following query matches companies like 'Data', 'DataC', 'DataCamp', 'DataMind', and so on:

SELECT name
FROM companies
WHERE name LIKE 'Data%';

The `_` wildcard will match a single character. For example, the following query matches companies like 'DataCamp', 'DataComp', and so on:

SELECT name
FROM companies
WHERE name LIKE 'DataC_mp';
You can also use the NOT LIKE operator to find records that don't match the pattern you specify.

In [None]:
-- 'B%' matches every name that begins with the letter B.
SELECT
    name
FROM people
WHERE name LIKE 'B%';

# Aggregate functions

In [None]:
-- Each aggregate collapses the whole budget column into a single value.

-- mean
SELECT AVG(budget) FROM films;

-- largest value
SELECT MAX(budget) FROM films;

-- total
SELECT SUM(budget) FROM films;

-- smallest value
SELECT MIN(budget) FROM films;

## Combine aggregate functions with WHERE

In [None]:
-- WHERE filters the rows first; SUM then totals only the remaining rows.
SELECT
    SUM(budget)
FROM films
WHERE release_year >= 2010;

In [None]:
-- Average gross of the films whose title starts with 'A'.
SELECT AVG(gross)
FROM films
WHERE title LIKE 'A%';

In [None]:
-- Highest gross among films released from 2000 to 2012 (both ends inclusive).
SELECT
    MAX(gross)
FROM films
WHERE release_year BETWEEN 2000 AND 2012;

## Arithmetic
- `+`, `-`, `*`, and `/`

In [None]:
SELECT (4 * 3);
-- 12

-- Dividing an integer by an integer gives an integer: the fractional part
-- is dropped.
SELECT (4 / 3);
-- 1

-- With non-integer operands the fractional part is kept.
SELECT (4.0 / 3.0) AS result;
-- 1.333...

### Make sure when dividing you use at least 1 float type

In [None]:
-- Multiplying by 100.0 first makes the dividend non-integer, so the
-- division that follows is not integer division.
SELECT (45 * 100.0) / 10;

# Aliasing with AS

In [None]:
-- AS gives each result column a readable name.
SELECT
    MAX(budget) AS max_budget,
    MAX(duration) AS max_duration
FROM films;

-- An alias can also name a computed expression.
SELECT
    title,
    gross - budget AS net_profit
FROM films;

-- Dividing by 60.0 (not 60) keeps the fractional part of the result.
SELECT
    AVG(duration) / 60.0 AS avg_duration_hours
FROM films;

In [None]:
-- COUNT(deathdate) skips NULLs while COUNT(*) counts every row, so the ratio
-- is the share of people with a deathdate recorded. Multiplying by 100.0
-- before dividing avoids integer division.
SELECT COUNT(deathdate) * 100.0 / COUNT(*) AS percentage_dead
FROM people;

SELECT MAX(release_year) - MIN(release_year) AS difference
FROM films;

-- Parenthesise the numerator so the subtraction happens before the division.
SELECT (MAX(release_year) - MIN(release_year)) / 10.0 AS number_of_decades
FROM films;

# Commenting with --

-- get the count(deathdate) and multiply by 100.0

-- then divide by count(*)

# ORDER BY
- default ascending order
- `DESC` for descending order
- NOTE - ORDER BY sorts text column alphabetically A to Z by default

In [None]:
-- DESC example: highest release_year first.
SELECT title
FROM films
ORDER BY release_year DESC;

-- Text columns sort alphabetically (ascending) by default.
SELECT name
FROM people
ORDER BY name;

SELECT birthdate, name
FROM people
ORDER BY birthdate;

-- ORDER BY comes after WHERE.
SELECT title
FROM films
WHERE release_year IN (2000, 2012)
ORDER BY release_year;

SELECT *
FROM films
WHERE release_year <> 2015
ORDER BY duration;

SELECT title, gross
FROM films
WHERE title LIKE 'M%'
ORDER BY title;

## ORDER BY for multiple columns

In [None]:
-- Sort by birthdate first; name only breaks ties between equal birthdates.
SELECT
    birthdate,
    name
FROM people
ORDER BY
    birthdate,
    name;


# GROUP BY
- Commonly, `GROUP BY` is used with aggregate functions like `COUNT()` or `MAX()`. Note that `GROUP BY` always goes after the `FROM` clause!
- Note also that `ORDER BY` always goes after `GROUP BY`. Let's try some exercises!

In [None]:
-- One output row per distinct sex value, with the number of rows in each group.
SELECT
    sex,
    COUNT(*)
FROM employees
GROUP BY sex;

In [None]:
/*
A word of warning: SQL will return an error if you try to SELECT
a field that is not in your GROUP BY clause without using it to
calculate some kind of value about the entire group.

Note that you can combine GROUP BY with ORDER BY to group your
results, calculate something about them, and then order your results.
For example,
*/

-- Order by the aggregate itself rather than by the implicit column name
-- "count" that PostgreSQL assigns to an unaliased COUNT(*).
SELECT sex, COUNT(*)
FROM employees
GROUP BY sex
ORDER BY COUNT(*) DESC;

SELECT release_year, MIN(gross)
FROM films
GROUP BY release_year;

SELECT language, SUM(gross)
FROM films
GROUP BY language;

-- Grouping by two columns gives one row per (release_year, country) pair.
SELECT release_year, country, MAX(budget)
FROM films
GROUP BY release_year, country
ORDER BY release_year, country;


# HAVING
- In SQL, aggregate functions can't be used in WHERE clauses. To filter on the result of an aggregate function, use a HAVING clause after GROUP BY instead.


In [None]:
-- Shows only those years in which more than 10 films were released.
-- HAVING filters whole groups after GROUP BY has formed them.
SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 10;

In [None]:
-- For each release_year group with more than 200 rows, return the group size.
SELECT
    COUNT(release_year)
FROM films
GROUP BY release_year
HAVING COUNT(release_year) > 200;

In [None]:
-- In how many different years were more than 200 movies released?
-- The query lists those years; the number of rows returned is the answer.
SELECT release_year
FROM films
GROUP BY release_year
HAVING COUNT(title) > 200;

# Example - ORDER BY, GROUP BY, HAVING

In [None]:
-- Clause order: WHERE filters rows, GROUP BY forms the groups, HAVING filters
-- the groups, ORDER BY sorts what is left.
SELECT
    release_year,
    AVG(budget) AS avg_budget,
    AVG(gross) AS avg_gross
FROM films
WHERE release_year > 1990
GROUP BY release_year
HAVING AVG(budget) > 60000000
ORDER BY avg_gross DESC;

In [None]:
-- select country, average budget,
SELECT
    country,
    AVG(budget) AS avg_budget,
    -- and average gross
    AVG(gross) AS avg_gross
-- from the films table
FROM films
-- group by country
GROUP BY country
-- where the country has more than 10 titles
-- (count title, not country, so the filter matches this description)
HAVING COUNT(title) > 10
-- order by country
ORDER BY country
-- limit to only show 5 results
LIMIT 5;

# JOIN
- note jupyter notebook on SQL_joins

In [None]:
-- Match each film to its review rows through films.id = reviews.film_id,
-- then keep only the rows for one title.
SELECT
    title,
    imdb_score
FROM films
INNER JOIN reviews
    ON films.id = reviews.film_id
WHERE title = 'To Kill a Mockingbird';

In [None]:
-- Join on two conditions at once: a pair of rows matches only when both
-- the ids and the dance values agree.
SELECT
    name
FROM hit_tracks AS t
INNER JOIN features AS f
    ON t.id = f.song_id
    AND t.dance = f.dance_level
ORDER BY name
LIMIT 5;

# Intermediate SQL skills...

# `CASE WHEN` statement
Applications
- create categorical variables
- aggregate data into a single column with multiple filtering conditions
- calculate counts and %

`CASE` statements - create categories in a field
- `WHEN`, `THEN`, `ELSE`, `END AS` alias

In [None]:
-- First step towards identifying matches played between FC Schalke 04 and
-- FC Bayern Munich: look up the two teams' names and API ids.
SELECT
    -- Select the team long name and team API id
    team_long_name,
    team_api_id
FROM teams_germany
-- Only include FC Schalke 04 and FC Bayern Munich
WHERE team_long_name IN ('FC Schalke 04', 'FC Bayern Munich');



In [None]:
-- Create a CASE statement that identifies whether a match in
-- Germany included FC Bayern Munich, FC Schalke 04,
-- or neither as the home team
-- Identify the home team as Bayern Munich, Schalke 04, or neither
SELECT
    CASE WHEN hometeam_id = 10189 THEN 'FC Schalke 04'
         WHEN hometeam_id = 9823 THEN 'FC Bayern Munich'
         ELSE 'Other' END AS home_team,
    COUNT(id) AS total_matches
FROM matches_germany
-- Group by the CASE statement alias
GROUP BY home_team;

# Subqueries
- also in SQL_joins notebook
- subqueries are most commonly found in `WHERE`, then `SELECT`, then `FROM`

# Correlated Queries, Nested Queries, and Common Table Expressions

# Window Functions
- how to pass aggregate functions along a dataset
- calculate running totals and partitioned averages