# SQL JOINs
- types: INNER JOIN, LEFT JOIN


# INNER JOIN

In [None]:
-- Template: keep only the rows whose id is present in both tables.
SELECT *
FROM left_table
    INNER JOIN right_table
        ON right_table.id = left_table.id;


## Aliasing with JOIN
- Sometimes it's easier to write SQL code out of order: you write the `SELECT` statement after you've done the `JOIN`.
- alias each table using the first letter of its name (e.g. countries AS c)! It is standard practice to alias in this way

In [None]:
-- City name, the region of its country, and the country name.
-- Both tables have a "name" column, so each one gets a field alias.
SELECT
    cities.name AS city,
    countries.region,
    countries.name AS country
FROM cities
INNER JOIN countries
    ON countries.code = cities.country_code;

-- Same join written with table aliases; returns city and country only.
SELECT
    ci.name AS city,
    co.name AS country
FROM cities AS ci
INNER JOIN countries AS co
    ON co.code = ci.country_code;

In [None]:
-- Country attributes together with each yearly population figure.
-- NOTE(review): the original selected p.populations, but no such column is
-- referenced anywhere else in this file; the population count is read as
-- "size" in the other queries, so p.size is used here -- confirm against
-- the populations schema.
SELECT
    c.code,
    c.name,
    c.region,
    p.year,
    p.size
FROM countries AS c
INNER JOIN populations AS p
    -- Match on country code
    ON c.code = p.country_code;

## Multiple JOINs

In [None]:
-- Template: chain two inner joins, both keyed on left_table.id.
SELECT *
FROM left_table
INNER JOIN right_table
    ON right_table.id = left_table.id
INNER JOIN another_table
    ON another_table.id = left_table.id;


In [None]:
-- Country attributes with fertility and unemployment rates, matched per
-- country and per year across populations and economies.
SELECT
    c.code,
    name,
    region,
    e.year,
    fertility_rate,
    unemployment_rate
FROM countries AS c
-- Join to populations (as p) on country code
INNER JOIN populations AS p
    ON c.code = p.country_code
-- Join to economies (as e) on country code and year
INNER JOIN economies AS e
    ON c.code = e.code
    -- Note the additional join condition: without the year match, every
    -- populations row would pair with every economies row of the same country.
    AND e.year = p.year;

## INNER JOIN via USING (col_name)
- `USING (col_name)` clause when column name is same in both tables
- don't forget the parentheses `( )`

In [None]:
-- Country/language pairs. The join key is called "code" in both tables,
-- so USING (code) stands in for an explicit ON condition.
SELECT
    c.name AS country,
    c.continent,
    l.name AS language,
    l.official
FROM countries AS c
INNER JOIN languages AS l
    USING (code);

## Self-ish joins, just in CASE
- A self-join joins a table with itself, using two different aliases for the same table
- `CASE` is a separate tool: it slices a numerical field into categories (see the `CASE` section below)
- Self-joins are used to compare values in a field to other values of the same field from within the same table

In [None]:
-- Pair up countries that share a continent by joining prime_ministers
-- to itself under two aliases.
SELECT
    p1.country AS country1,
    p2.country AS country2,
    p1.continent
FROM prime_ministers AS p1
INNER JOIN prime_ministers AS p2
    ON p1.continent = p2.continent
    -- Drop the rows that would pair a country with itself.
    -- (The original compared against p2.country1, which is only an output
    -- alias of this query, not a column of prime_ministers.)
    AND p1.country <> p2.country
LIMIT 13;

In [None]:
-- Percentage population growth per country between a year and the year
-- five years later (output columns are labelled 2010 and 2015).
SELECT
    earlier.country_code,
    earlier.size AS size2010,
    later.size AS size2015,
    ((later.size - earlier.size) / earlier.size * 100.0) AS growth_perc
FROM populations AS earlier
INNER JOIN populations AS later
    ON later.country_code = earlier.country_code
    -- Pair each row with the row of the same country five years later.
    AND earlier.year = later.year - 5;

## `CASE` with `WHEN`, `THEN`, `ELSE`, and `END`
- simple way to do multiple IF-THEN-ELSE statements

In [None]:
-- Bucket every country by surface_area into large / medium / small.
-- WHEN branches are tested top-down, so 'medium' only sees areas <= 2000000.
SELECT
    name,
    continent,
    code,
    surface_area,
    CASE
        WHEN surface_area > 2000000 THEN 'large'
        WHEN surface_area > 350000 THEN 'medium'
        ELSE 'small'
    END AS geosize_group
FROM countries;

## `INTO`
- create a table in the query

In [None]:
-- Same size bucketing as a plain SELECT, but INTO writes the result set
-- to a new table named countries_plus.
SELECT
    name,
    continent,
    code,
    surface_area,
    CASE
        WHEN surface_area > 2000000 THEN 'large'
        WHEN surface_area > 350000 THEN 'medium'
        ELSE 'small'
    END AS geosize_group
INTO countries_plus
FROM countries;


In [None]:
-- Bucket the 2015 population sizes and store the result as pop_plus.
SELECT
    country_code,
    size,
    CASE
        WHEN size > 50000000 THEN 'large'
        WHEN size > 1000000 THEN 'medium'
        ELSE 'small'
    END AS popsize_group
-- Into table
INTO pop_plus
FROM populations
WHERE year = 2015;

-- Display the contents of the new table
-- (a '#' line is not a SQL comment; '--' is).
SELECT *
FROM pop_plus;

In [None]:
-- Step 1: materialise the 2015 population size buckets as pop_plus.
SELECT
    country_code,
    size,
    CASE
        WHEN size > 50000000 THEN 'large'
        WHEN size > 1000000 THEN 'medium'
        ELSE 'small'
    END AS popsize_group
INTO pop_plus
FROM populations
WHERE year = 2015;

-- Step 2: put the geographic and population size groups side by side,
-- one row per country code present in both helper tables.
SELECT
    name,
    continent,
    geosize_group,
    popsize_group
FROM countries_plus AS c
INNER JOIN pop_plus AS p
    ON c.code = p.country_code
ORDER BY geosize_group;

# Outer joins - `LEFT JOIN`, `RIGHT JOIN`, `FULL JOIN`

## LEFT JOIN

In [None]:
-- Every city is kept; country columns are NULL when no country code matches.
SELECT
    ci.name AS city,
    code,
    co.name AS country,
    region,
    city_proper_pop
FROM cities AS ci
LEFT JOIN countries AS co
    ON co.code = ci.country_code
-- Highest country code first
ORDER BY code DESC;

In [None]:
-- Select fields
SELECT region, AVG(gdp_percapita) AS avg_gdp
-- From countries (alias as c)
FROM countries AS c
  -- Left join with economies (alias as e)
  LEFT JOIN economies AS e
    -- Match on code fields
    ON c.code = e.code
-- Focus on 2010
WHERE year = 2010
-- Group by region
GROUP BY region
-- Order by descending avg_gdp
ORDER BY avg_gdp DESC;

## RIGHT JOIN
- note position of left and right tables in ON clause
- Right joins aren't as common as left joins. One reason why is that you can always write a right join as a left join.

In [None]:
-- Exercise: rewrite the LEFT JOIN query below using RIGHT JOINs only.
/*
SELECT
    cities.name AS city,
    urbanarea_pop,
    countries.name AS country,
    indep_year,
    languages.name AS language,
    percent
FROM cities
LEFT JOIN countries
    ON cities.country_code = countries.code
LEFT JOIN languages
    ON countries.code = languages.code
ORDER BY city, language;
*/

-- RIGHT JOIN version: the table order is reversed so that cities, whose
-- rows must all be kept, is the right-most table.
SELECT
    cities.name AS city,
    urbanarea_pop,
    countries.name AS country,
    indep_year,
    languages.name AS language,
    percent
FROM languages
RIGHT JOIN countries
    ON languages.code = countries.code
RIGHT JOIN cities
    ON countries.code = cities.country_code
ORDER BY
    city,
    language;

## FULL JOIN

In [None]:
-- FULL JOIN keeps unmatched rows from both countries and currencies.
SELECT
    name AS country,
    code,
    region,
    basic_unit
FROM countries
FULL JOIN currencies
    USING (code)
-- North American rows, plus rows with no region at all
WHERE region IS NULL
    OR region = 'North America'
ORDER BY region;

In [None]:
-- Two consecutive full joins: countries -> languages -> currencies,
-- each matched on code.
SELECT
    c1.name AS country,
    region,
    l.name AS language,
    basic_unit,
    frac_unit
FROM countries AS c1
FULL JOIN languages AS l
    USING (code)
FULL JOIN currencies AS c2
    USING (code)
-- The pattern 'M%esia' matches both Melanesia and Micronesia
WHERE region LIKE 'M%esia';

# CROSS JOIN
- yields all combinations

In [None]:
-- Every combination of a city whose name starts with 'Hyder' and a
-- row of languages.
SELECT
    ci.name AS city,
    lang.name AS language
FROM cities AS ci
CROSS JOIN languages AS lang
WHERE ci.name LIKE 'Hyder%';

# Set theory clauses (like UNION)
- types: 
    - Union
    - Union All
    - Intersect
    - Except
- requirements
    - fields included in the operation must be same data type since the result is 1 field
    - UNION and UNION ALL clauses do not do the lookup step like JOIN

In [None]:
-- Stack the 2010 and 2015 economy tables; UNION drops exact duplicate rows.
SELECT *
FROM economies2010
UNION
SELECT *
FROM economies2015
-- The ORDER BY applies to the combined result
ORDER BY
    code,
    year;

## UNION
- use to determine all occurrences of field in multiple tables
- removes duplicates

In [None]:
-- Distinct country codes that occur in cities or in currencies.
SELECT country_code
FROM cities
UNION
-- currencies calls the field "code"; alias it to line up with the first SELECT
SELECT code AS country_code
FROM currencies
ORDER BY country_code;

## UNION ALL - include duplicates

In [None]:
-- All (code, year) pairs from economies and populations.
-- UNION ALL keeps duplicates.
SELECT
    code,
    year
FROM economies
UNION ALL
SELECT
    country_code AS code,
    year
FROM populations
ORDER BY
    code,
    year;

## INTERSECT
- Note: INTERSECT on 2 fields is like matching with both fields
- Looks for records in common

In [None]:
-- (code, year) pairs present in both economies and populations.
-- Both fields must match for a row to be returned.
SELECT
    code,
    year
FROM economies
INTERSECT
SELECT
    country_code AS code,
    year
FROM populations
ORDER BY
    code,
    year;

### Which countries have a city with the same name as country name?

In [None]:
-- Names that are used both by a country and by a city.
SELECT name
FROM countries
INTERSECT
SELECT name
FROM cities;

## EXCEPT
- returns only the records found in the 1st table but not in the 2nd

In [None]:
-- City names that are not listed as the capital of any country.
SELECT name
FROM cities
EXCEPT
SELECT capital
FROM countries
ORDER BY name;

In [None]:
-- Reverse of the previous example: capitals that do not appear as a
-- city name in cities.
SELECT capital
FROM countries
EXCEPT
SELECT name
FROM cities
-- Ascending by capital
ORDER BY capital;

# Semi JOIN and Anti JOIN - examples of subqueries
- use similar to a WHERE clause dependent on 2nd table values

- Semi-join - chooses records in 1st table where condition IS met in 2nd table

- Anti-join - chooses records in 1st table where condition IS NOT met in 2nd table

## semi-join

In [None]:
-- Step 1: codes of the countries located in the Middle East.
SELECT code
FROM countries
WHERE region = 'Middle East';

In [None]:
-- Step 1 query, kept for reference:
/*
SELECT code
FROM countries
WHERE region = 'Middle East';
*/

-- Step 2: every distinct language name, in alphabetical order.
SELECT DISTINCT name
FROM languages
ORDER BY name;

## `WHERE ____ IN` to connect subqueries

In [None]:
-- Semi-join to identify the languages spoken in the Middle East.
--
-- Combines the previous two queries into one by adding a WHERE ... IN
-- condition to the SELECT DISTINCT query.

-- Query from step 2
SELECT DISTINCT name
FROM languages
WHERE code IN (
    -- Query from step 1, used as the subquery
    SELECT code
    FROM countries
    WHERE region = 'Middle East'
)
ORDER BY name;

## Relating semi-join to a tweaked inner join
- Sometimes problems solved with semi-joins can also be solved using an inner join.

In [None]:
-- Inner-join formulation of the Middle East languages semi-join.
-- DISTINCT collapses the repeats produced by several countries sharing
-- a language name.
SELECT DISTINCT l.name AS language
FROM languages AS l
INNER JOIN countries AS c
    ON c.code = l.code
WHERE region = 'Middle East'
ORDER BY language;

## Diagnosing problems using anti-join - `NOT IN`
- useful in identifying which records are causing an incorrect number of records to appear in join queries

In [None]:
-- Number of Oceania rows in countries with a non-NULL name.
SELECT COUNT(name)
FROM countries
WHERE continent = 'Oceania';

In [None]:
-- Oceania countries that have a currencies entry, with the currency's
-- basic unit.
SELECT
    co.code,
    co.name,
    cur.basic_unit AS currency
FROM countries AS co
INNER JOIN currencies AS cur
    ON cur.code = co.code
WHERE continent = 'Oceania';

In [None]:
-- Anti-join (note the NOT IN): Oceania countries with no currencies entry.
SELECT
    c1.code,
    c1.name
FROM countries AS c1
WHERE c1.continent = 'Oceania'
    AND c1.code NOT IN (
        SELECT code
        FROM currencies
        -- A single NULL in the list makes NOT IN yield no rows at all,
        -- so NULL codes are excluded from the subquery.
        WHERE code IS NOT NULL
    );

# Set theory example
- task here will be to incorporate two of UNION/UNION ALL/INTERSECT/EXCEPT to solve a challenge involving three tables
- In addition, you will use a subquery

In [None]:
-- Names of cities whose country_code appears in economies or currencies
-- but not in populations.
SELECT name
FROM cities AS c1
WHERE country_code IN (
    SELECT e.code
    FROM economies AS e
    -- Add the (unique) codes from currencies
    UNION
    SELECT cur.code
    FROM currencies AS cur
    -- UNION and EXCEPT are evaluated left to right, so the populations
    -- codes are removed from the combined UNION result.
    EXCEPT
    SELECT p.country_code
    FROM populations AS p
);

# Subqueries inside WHERE and SELECT clauses
- subqueries are found most commonly in `WHERE`, then `SELECT`, then `FROM`

In [None]:
-- 2015 population rows whose life expectancy is more than 1.15 times
-- the 2015 average.
SELECT *
FROM populations
WHERE year = 2015
    AND life_expectancy > 1.15 * (
        -- Average life expectancy across all 2015 rows
        SELECT AVG(life_expectancy)
        FROM populations
        WHERE year = 2015
    );

In [None]:
-- Cities whose name matches some country's capital, largest urban area first.
SELECT
    name,
    country_code,
    urbanarea_pop
FROM cities
WHERE name IN (
    SELECT capital
    FROM countries
)
ORDER BY urbanarea_pop DESC;

In [None]:
-- Top 9 countries by number of cities listed, using a join plus GROUP BY.
SELECT
    countries.name AS country,
    COUNT(*) AS cities_num
FROM cities
INNER JOIN countries
    ON countries.code = cities.country_code
GROUP BY country
ORDER BY
    cities_num DESC,
    country
LIMIT 9;

-- Second query with the same result, using a subquery in SELECT.
SELECT
    countries.name AS country,
    -- The correlated subquery counts the cities of the current country
    (
        SELECT COUNT(*)
        FROM cities
        WHERE countries.code = cities.country_code
    ) AS cities_num
FROM countries
ORDER BY
    cities_num DESC,
    country
LIMIT 9;

# Subquery inside `FROM` clause

In [None]:
-- Number of languages rows recorded for each country code.
SELECT
    COUNT(*) AS lang_num,
    code
FROM languages
GROUP BY code;

In [None]:
-- Local name and language count per country, most languages first.
-- The per-code counts come from a subquery used as a table in FROM.
-- Equivalent to the comma-separated form
--   FROM countries, (...) AS subquery WHERE countries.code = subquery.code
-- but written as an explicit join so the match condition cannot be dropped.
SELECT
    local_name,
    countries.code,
    lang_num
FROM countries
INNER JOIN (
    SELECT
        code,
        COUNT(*) AS lang_num
    FROM languages
    GROUP BY code
) AS subquery
    -- Where codes match
    ON countries.code = subquery.code
ORDER BY lang_num DESC;

## Subquery examples

In [None]:
-- Nested subqueries, built up in three steps.

-- Step 1: name, continent and inflation rate for 2015.
SELECT
    c.name,
    c.continent,
    e.inflation_rate
FROM countries AS c
INNER JOIN economies AS e
    USING (code)
WHERE year = 2015;

-- Step 2: nest the query above in a FROM clause and take the maximum
-- inflation rate per continent.
SELECT MAX(inflation_rate) AS max_inf
FROM (
    SELECT
        name,
        continent,
        inflation_rate
    FROM countries
    INNER JOIN economies
        USING (code)
    WHERE year = 2015
) AS subquery
GROUP BY continent;

-- Step 3: append the second query to the first using AND and IN, to list
-- the 2015 rows whose inflation rate equals one of the continent maxima.
SELECT
    name,
    continent,
    inflation_rate
FROM countries
INNER JOIN economies
    ON countries.code = economies.code
WHERE year = 2015
    AND inflation_rate IN (
        SELECT MAX(inflation_rate) AS max_inf
        FROM (
            SELECT
                name,
                continent,
                inflation_rate
            FROM countries
            INNER JOIN economies
                ON countries.code = economies.code
            WHERE year = 2015
        ) AS subquery
        GROUP BY continent
    );

In [None]:
-- Another example: 2015 economy figures for codes that do NOT belong to a
-- constitutional monarchy or to any form of republic.
SELECT
    code,
    inflation_rate,
    unemployment_rate
FROM economies
WHERE year = 2015
    AND code NOT IN (
        SELECT code
        FROM countries
        WHERE (gov_form = 'Constitutional Monarchy'
               OR gov_form LIKE '%Republic%')
    )
ORDER BY inflation_rate;

# Final exercises

## Exercise 1
In this exercise, you'll need to get the country names and other 2015 data in the economies table and the countries table for Central American countries with an official language.
- use table aliasing, but not field alias

In [None]:
-- 2015 investment and import figures for Central American countries
-- that have an official language.
SELECT DISTINCT
    name,
    total_investment,
    imports
FROM economies AS e
LEFT JOIN countries AS c
    ON c.code = e.code
    -- Only match countries whose code has an official language
    AND e.code IN (
        SELECT code
        FROM languages AS l
        WHERE official = 'true'
    )
WHERE year = 2015
    AND region = 'Central America'
ORDER BY name;

## Exercise 2
calculate the average fertility rate for each region in 2015.

In [None]:
-- Average 2015 fertility rate for each (continent, region) pair,
-- lowest average first.
SELECT
    co.region,
    co.continent,
    AVG(fertility_rate) AS avg_fert_rate
FROM populations AS pop
INNER JOIN countries AS co
    ON co.code = pop.country_code
WHERE year = 2015
GROUP BY
    co.continent,
    co.region
ORDER BY avg_fert_rate;

## Exercise 3
- determine the top 10 capital cities in Europe and the Americas in terms of a calculated percentage using `city_proper_pop` and `metroarea_pop` in `cities`
- skip table aliasing

In [None]:
-- Top 10 capitals in Europe and the Americas by city-proper population
-- as a percentage of metro-area population.
SELECT
    name,
    country_code,
    city_proper_pop,
    metroarea_pop,
    city_proper_pop / metroarea_pop * 100.0 AS city_perc
FROM cities
WHERE name IN (
        -- Capitals of European countries and of any continent
        -- whose name contains 'America'
        SELECT capital
        FROM countries
        WHERE (continent = 'Europe'
               OR continent LIKE '%America%')
    )
    -- The percentage cannot be computed without a metro-area figure
    AND metroarea_pop IS NOT NULL
ORDER BY city_perc DESC
LIMIT 10;