# SQL: Aggregation Queries

## Setup

We are now installing the necessary packages to interact with the MySQL database and issue SQL queries using the notebook.

In [None]:
!sudo apt-get install python3-mysqldb
!sudo pip3 install -U sqlalchemy sql_magic

In [None]:
# Load (or reload, if it is already loaded) the sql_magic IPython extension.
%reload_ext sql_magic

In [None]:
from sqlalchemy import create_engine

# Connection URL for the MySQL server. No default schema is part of the URL;
# the SQL cells select one with a USE statement.
# NOTE(review): the credentials are hardcoded; presumably a shared read-only
# teaching account -- confirm before reusing this pattern elsewhere.
conn_string = 'mysql://{user}:{password}@{host}/?charset=utf8'.format(
    host='db.ipeirotis.org',
    user='student',
    password='dwdstudent2015',
)
# The engine object is what the SQL magic is later configured to use.
engine = create_engine(conn_string)

In [None]:
# Point the SQL magic at the notebook variable named "engine".
%config SQL.conn_name = 'engine'

## Basic aggregation functions


#### Switch to IMDb

In [None]:
%%read_sql
-- Make imdb the default schema for the queries that follow.
USE imdb;

### `COUNT(*)`

#### Find the number of movies in the database


In [None]:
%%read_sql
-- Counts every row of the movies table.
SELECT
    COUNT(*) AS num_movies
FROM
    movies

#### Find the number of actors in the database


In [None]:
%%read_sql
-- Counts every row of the actors table.
SELECT
    COUNT(*) AS num_actors
FROM
    actors

### `COUNT(attr)`


#### Find the number of movies with a rating



In [None]:
%%read_sql
-- COUNT(rank) skips rows whose rank is NULL, so only rated movies are
-- counted. COUNT(*) would count every movie, rated or not.
SELECT COUNT(rank) AS rated_movies
FROM movies

#### Find the number of roles where the role is not empty

In [None]:
%%read_sql
-- COUNT(role) ignores the rows where role is NULL.
SELECT
    COUNT(role) AS named_roles
FROM
    roles

In [None]:
%%read_sql
-- Counts the rows that remain after discarding those whose role is NULL.
SELECT
    COUNT(*) AS named_roles
FROM
    roles
WHERE
    role IS NOT NULL

### `COUNT(DISTINCT attr)`



#### Find the number of distinct genres in the database


In [None]:
%%read_sql
-- Each genre value is counted once, however many rows carry it.
SELECT
    COUNT(DISTINCT genre) AS num_genres
FROM
    movies_genres

#### Find the number of movies that have a genre associated with them

In [None]:
%%read_sql
-- Each movie_id is counted once, even when it appears with several genres.
SELECT
    COUNT(DISTINCT movie_id) AS num_movies
FROM
    movies_genres

Compare the query above with the (incorrect!) query below, which omits the `DISTINCT`. Without `DISTINCT`, the same `movie_id` is counted once for every genre row it appears in. Notice that the count returned by the query below is larger than the actual number of movies in the database.

In [None]:
%%read_sql
-- Without DISTINCT every non-NULL movie_id row is counted, repeats included.
SELECT
    COUNT(movie_id)
FROM
    movies_genres

### `MIN(attr)`, `MAX(attr)`, `AVG(attr)`, `STDDEV(attr)`, `SUM(attr)`



#### Find the earliest release year and the latest release year for movies


In [None]:
%%read_sql
-- Largest and smallest value of year across the movies table.
SELECT
    MAX(year) AS max_year,
    MIN(year) AS min_year
FROM
    movies

#### Find the average rating of the movies and the standard deviation

In [None]:
%%read_sql
-- Summary statistics of rank over the whole movies table.
SELECT
    MAX(rank) AS max_rank,
    MIN(rank) AS min_rank,
    AVG(rank) AS avg_rank,
    STDDEV(rank) AS stdev_rank
FROM
    movies

## `GROUP BY`, Examples on IMDb

#### Switch to IMDb

In [None]:
%%read_sql
-- Make imdb the default schema for the GROUP BY examples.
USE imdb;

#### Count the number of movies that were released in each year

In [None]:
%%read_sql
-- One output row per distinct year, with the number of movies in it.
SELECT
    year,
    COUNT(*) AS num_movies
FROM
    movies
GROUP BY
    year

#### Compute the average rank for the movies released in each year



In [None]:
%%read_sql
-- The column holds an average rating, so it is named avg_rank
-- (not avg_movies), matching the other rating queries in this notebook.
SELECT year, AVG(rank) AS avg_rank
FROM movies
GROUP BY year

#### Compute the min, max, average, and standard deviation of the movie ratings in each year


In [None]:
%%read_sql
-- Rating statistics computed separately for each year.
SELECT
    year,
    MAX(rank) AS max_rank,
    MIN(rank) AS min_rank,
    AVG(rank) AS avg_rank,
    STDDEV(rank) AS stdev_rank
FROM
    movies
GROUP BY
    year

#### Examine the difference between `COUNT(*)` and `COUNT(rank)` when reporting movies per year

In [None]:
%%read_sql
-- num_movies counts all rows of the year, rated_movies only those
-- with a non-NULL rank.
SELECT
    year,
    COUNT(*) AS num_movies,
    COUNT(rank) AS rated_movies,
    MAX(rank) AS max_rank,
    MIN(rank) AS min_rank,
    AVG(rank) AS avg_rank,
    STDDEV(rank) AS stdev_rank
FROM
    movies
GROUP BY
    year

In [None]:
%%read_sql
-- Per-year counts and rating statistics, with the average and the
-- standard deviation rounded to two decimals.
SELECT
    year,
    COUNT(*) AS num_movies,
    COUNT(rank) AS rated_movies,
    MAX(rank) AS max_rank,
    MIN(rank) AS min_rank,
    ROUND(AVG(rank), 2) AS avg_rank,
    ROUND(STDDEV(rank), 2) AS stdev_rank
FROM
    movies
GROUP BY
    year

#### Compute the number of movies per director ID. 
Rank first the directors with the most movies




In [None]:
%%read_sql
-- Rows of movies_directors per director, largest counts first.
SELECT
    director_id,
    COUNT(*) AS num_movies
FROM
    movies_directors
GROUP BY
    director_id
ORDER BY
    num_movies DESC

#### Compute the number of movies per actor ID, 
Rank first the actors with the most movies

In [None]:
%%read_sql
-- roles can hold several rows for the same actor and movie (one per role
-- played), so distinct movie_id values are counted rather than rows.
SELECT actor_id,
    COUNT(DISTINCT movie_id) AS num_movies
FROM roles
GROUP BY actor_id
ORDER BY num_movies DESC

#### Compute the number of actors per movie ID
Rank first the movies with the most actors

In [None]:
%%read_sql
-- num_roles counts rows of roles, num_actors counts each actor_id once
-- per movie. Movies with the most actors come first.
SELECT
    movie_id,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT actor_id) AS num_actors
FROM
    roles
GROUP BY
    movie_id
ORDER BY
    num_actors DESC

#### Count the number of male actors and the number of female actors

In [None]:
%%read_sql
-- One row per gender value with the number of actors in that group.
SELECT
    gender,
    COUNT(*)
FROM
    actors
GROUP BY
    gender

#### Compute the number of movies for each genre



In [None]:
%%read_sql
-- Shows both the distinct and the plain count of movie_id for each genre.
SELECT
    genre,
    COUNT(DISTINCT movie_id),
    COUNT(movie_id)
FROM
    movies_genres
GROUP BY
    genre

## `GROUP BY`, Examples on Facebook

#### Switch to Facebook

In [None]:
%%read_sql
-- Make facebook the default schema for the queries that follow.
USE facebook;

#### List the number of males and females


In [None]:
%%read_sql
-- Number of profiles for each value of Sex.
SELECT
    Sex,
    COUNT(*) AS cnt
FROM
    Profiles
GROUP BY
    Sex

#### List the number of students for each political view

In [None]:
%%read_sql
-- Number of profiles for each value of PoliticalViews.
SELECT
    PoliticalViews,
    COUNT(*) AS cnt
FROM
    Profiles
GROUP BY
    PoliticalViews

#### List the number of male and female students for each political view

In [None]:
%%read_sql
-- One row per combination of Sex and PoliticalViews.
SELECT
    Sex,
    PoliticalViews,
    COUNT(*) AS cnt
FROM
    Profiles
GROUP BY
    Sex,
    PoliticalViews

In [None]:
%%read_sql
-- Rows with a NULL in either grouping column are removed before grouping.
SELECT
    Sex,
    PoliticalViews,
    COUNT(*) AS cnt
FROM
    Profiles
WHERE
    Sex IS NOT NULL
    AND PoliticalViews IS NOT NULL
GROUP BY
    Sex,
    PoliticalViews

#### Find the most popular TV Shows and Books

In [None]:
%%read_sql
-- The 25 Book values with the most rows in FavoriteBooks.
SELECT
    Book,
    COUNT(*) AS cnt
FROM
    FavoriteBooks
GROUP BY
    Book
ORDER BY
    cnt DESC
LIMIT 25

In [None]:
%%read_sql
-- The 25 TVShow values with the most rows in FavoriteTVShows.
SELECT
    TVShow,
    COUNT(*) AS cnt
FROM
    FavoriteTVShows
GROUP BY
    TVShow
ORDER BY
    cnt DESC
LIMIT 25

#### Find the number of students in various relationship statuses

In [None]:
%%read_sql
-- Number of rows in Relationship for each Status value.
SELECT
    Status,
    COUNT(*) AS cnt
FROM
    Relationship
GROUP BY
    Status

#### Find the most popular majors (concentration)

In [None]:
%%read_sql
-- Concentration values ordered from most to least frequent.
SELECT
    Concentration,
    COUNT(*) AS cnt
FROM
    Concentration
GROUP BY
    Concentration
ORDER BY
    cnt DESC

#### List the number of students for each birth year 
Use the `YEAR(date)` function to get the year value from a datetime column. Then (try to) list only the years that have at least 10 students.

In [None]:
%%read_sql
-- Profiles without a Birthday are dropped, the rest are grouped by the
-- year part of Birthday. No minimum group size is applied here.
SELECT
    YEAR(Birthday) AS YoB,
    COUNT(*) AS cnt
FROM
    Profiles
WHERE
    Birthday IS NOT NULL
GROUP BY
    YoB
ORDER BY
    cnt DESC

## `HAVING`

#### Switch to IMDb

In [None]:
%%read_sql
-- Make imdb the default schema for the HAVING examples.
USE imdb;

#### Find the movies (just movie IDs) with more than 100 actors



In [None]:
%%read_sql
-- The HAVING filter here is on num_roles (rows in roles), not on the
-- number of distinct actors.
SELECT
    movie_id,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT actor_id) AS num_actors
FROM
    roles
GROUP BY
    movie_id
HAVING
    num_roles > 100
ORDER BY
    num_actors DESC

In [None]:
%%read_sql
-- Keeps only the movies whose distinct actor count exceeds 100.
SELECT
    movie_id,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT actor_id) AS num_actors
FROM
    roles
GROUP BY
    movie_id
HAVING
    num_actors > 100
ORDER BY
    num_actors DESC

#### Find the first names of actors that appear more than 1000 times

In [None]:
%%read_sql
-- First names shared by more than 1000 rows of actors.
SELECT
    first_name,
    COUNT(*) AS cnt
FROM
    actors
GROUP BY
    first_name
HAVING
    cnt > 1000

#### Find all the movie ids for movies that have more roles than actors (i.e., the same actor plays multiple roles in the movie)

In [None]:
%%read_sql
-- A movie is kept when its row count in roles differs from its number
-- of distinct actors.
SELECT
    movie_id,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT actor_id) AS num_actors
FROM
    roles
GROUP BY
    movie_id
HAVING
    num_roles <> num_actors
ORDER BY
    num_actors DESC

#### Find all the actor ids for actors that have more roles than movies (i.e., the actor plays multiple roles in the same movie)

In [None]:
%%read_sql
-- An actor is kept when their row count in roles differs from their
-- number of distinct movies.
SELECT
    actor_id,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT movie_id) AS num_movies
FROM
    roles
GROUP BY
    actor_id
HAVING
    num_roles <> num_movies
ORDER BY
    num_movies DESC

#### Find data quality issues: In the movies_genres table, the same movie id may be associated multiple times with the same genre. Identify these cases.

In [None]:
%%read_sql
-- Pairs of movie_id and genre that occur in more than one row.
SELECT
    movie_id,
    genre,
    COUNT(*) AS cnt
FROM
    movies_genres
GROUP BY
    movie_id,
    genre
HAVING
    cnt > 1
ORDER BY
    cnt DESC

### Compare `WHERE` and `HAVING`


In [None]:
%%read_sql
-- Row count next to the count of non-NULL rank values, no filtering.
SELECT
    COUNT(*),
    COUNT(rank)
FROM
    movies


In [None]:
%%read_sql
-- The WHERE clause removes unrated rows before the aggregates run.
SELECT
    COUNT(*),
    COUNT(rank)
FROM
    movies
WHERE
    rank IS NOT NULL


## `JOIN` and `GROUP BY` together

#### For each movie genre, list the average rating of the movies from year 2000. 

Also list:
* the maximum and minimum ratings
* the standard deviation of the ratings
* the number of rated movies and the total number of movies




In [None]:
%%read_sql
-- Movies of the year 2000 joined to their genres, then summarised per
-- genre. num_movies counts joined rows, rated_movies only those with a
-- non-NULL rank.
SELECT
    gen.genre,
    MAX(mov.rank) AS max_rating,
    MIN(mov.rank) AS min_rating,
    ROUND(AVG(mov.rank), 2) AS avg_rating,
    ROUND(STDDEV(mov.rank), 2) AS std_rating,
    COUNT(*) AS num_movies,
    COUNT(mov.rank) AS rated_movies
FROM
    movies mov
    INNER JOIN movies_genres gen
        ON mov.id = gen.movie_id
WHERE
    mov.year = 2000
GROUP BY
    gen.genre
ORDER BY
    avg_rating DESC

#### For each director, compute:
* The number of rated and total number of movies
* The average, min, max, and standard deviation of the movie ratings
* Limit the results to directors who directed at least 40 movies, with at least 30 rated movies




In [None]:
%%read_sql
-- Per-director movie counts and rating statistics.
-- The thresholds use >= because the task asks for at least 40 movies and
-- at least 30 rated movies, so the boundary values must be included.
SELECT D.*,
    COUNT(*) AS num_movies,
    COUNT(M.rank) AS rated_movies,
    MAX(M.rank) AS max_rating,
    MIN(M.rank) AS min_rating,
    ROUND(AVG(M.rank),2) AS avg_rating,
    ROUND(STDDEV(M.rank),2) AS std_rating
FROM directors D
    JOIN movies_directors MD ON D.id = MD.director_id
    JOIN movies M ON M.id = MD.movie_id
GROUP BY
    D.id
HAVING
    num_movies>=40
    AND rated_movies>=30
ORDER BY
    avg_rating DESC

#### What roles have the best movie ratings? 
* Do not include movies without ratings in the calculations for number of movies
* Limit to only roles that appear in at least 10 distinct movies
* Limit only to roles played by at least 10 distinct  actors

In [None]:
%%read_sql
-- Roles joined to their movies, unrated movies removed, then summarised
-- per role name. Only roles seen in at least 10 distinct movies and
-- played by at least 10 distinct actors are kept, best average first.
SELECT
    rl.role,
    COUNT(*) AS num_roles,
    COUNT(DISTINCT rl.movie_id) AS num_movies,
    COUNT(DISTINCT rl.actor_id) AS num_actors,
    MAX(mov.rank) AS max_rating,
    MIN(mov.rank) AS min_rating,
    ROUND(AVG(mov.rank), 2) AS avg_rating,
    ROUND(STDDEV(mov.rank), 2) AS std_rating
FROM
    roles rl
    INNER JOIN movies mov
        ON mov.id = rl.movie_id
WHERE
    mov.rank IS NOT NULL
GROUP BY
    rl.role
HAVING
    num_movies >= 10
    AND num_actors >= 10
ORDER BY
    avg_rating DESC
LIMIT 50