## SQL Queries

To be able to execute SQL queries from within a Jupyter notebook, we will use the `sql_magic` extension (https://github.com/pivotal/sql_magic)

    sudo pip3 install -U sql_magic

In [1]:
from sqlalchemy import create_engine

In [2]:
conn_string = 'mysql://{user}:{password}@{host}/?charset=utf8'.format(
    host = 'db.ipeirotis.org', 
    user = 'student',
    password = 'dwdstudent2015',
    encoding = 'utf-8')
engine = create_engine(conn_string)

In [3]:
%reload_ext sql_magic

In [4]:
%config SQL.conn_name = 'engine'

### Aggregation Queries (IMDB Database)

In [33]:
%%read_sql
USE imdb;

Query started at 08:10:04 PM UTC; Query executed in 0.00 m

<sql_magic.exceptions.EmptyResult at 0x7f610325f4a8>

#### Number of movies for each director

In [34]:
%%read_sql
SELECT director_id, count(*) AS NumberOfMovies
FROM movies_directors 
GROUP BY director_id
LIMIT 100;

Query started at 08:10:04 PM UTC; Query executed in 0.00 m

Unnamed: 0,director_id,NumberOfMovies
0,1,1
1,2,1
2,3,2
3,4,1
4,5,1
...,...,...
95,96,1
96,97,5
97,98,2
98,99,4


#### Rank directors by the number of movies they directed

In [35]:
%%read_sql
SELECT director_id, count(*) AS NumberOfMovies
FROM movies_directors 
GROUP BY director_id
ORDER BY count(*) desc

Query started at 08:10:04 PM UTC; Query executed in 0.01 m

Unnamed: 0,director_id,NumberOfMovies
0,25116,619
1,56530,562
2,30570,536
3,9277,370
4,1958,360
...,...,...
88599,8401,1
88600,511,1
88601,68245,1
88602,57729,1


#### Find the number of actors in each movie

In [36]:
%%read_sql
SELECT movie_id, count(*) 
FROM roles 
GROUP BY movie_id

Query started at 08:10:05 PM UTC; Query executed in 0.10 m

Unnamed: 0,movie_id,count(*)
0,0,2
1,2,20
2,3,4
3,4,4
4,5,1
...,...,...
300247,412315,1
300248,412316,15
300249,412317,11
300250,412318,9


#### Find the movies with more than 100 actors

In [37]:
%%read_sql
SELECT movie_id, count(*) 
FROM roles 
GROUP BY movie_id
HAVING count(*) > 100;

Query started at 08:10:11 PM UTC; Query executed in 0.10 m

Unnamed: 0,movie_id,count(*)
0,687,153
1,846,174
2,1674,164
3,1703,114
4,2250,112
...,...,...
544,411545,188
545,411546,205
546,411585,108
547,411802,533


#### Find the most popular genres (basd on the number of movies)

In [38]:
%%read_sql
SELECT genre, count(*) 
FROM movies_genres
GROUP BY genre
ORDER BY count(*) desc

Query started at 08:10:17 PM UTC; Query executed in 0.01 m

Unnamed: 0,genre,count(*)
0,Short,82597
1,Drama,74615
2,Comedy,57860
3,Documentary,42320
4,Adult,20667
5,Animation,17888
6,Action,14885
7,Romance,13873
8,Crime,12966
9,Family,11232


#### Find the average rank of the movies in the database, per year of release

In [39]:
%%read_sql
SELECT year, avg(rank) 
FROM movies
GROUP BY year

Query started at 08:10:17 PM UTC; Query executed in 0.01 m

Unnamed: 0,year,avg(rank)
0,1888,
1,1890,7.300000
2,1891,3.683333
3,1892,2.866667
4,1893,6.800000
...,...,...
115,2004,6.217399
116,2005,
117,2006,
118,2007,


### Aggregation Queries (Facebook database)

In [40]:
%%read_sql
USE facebook

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

<sql_magic.exceptions.EmptyResult at 0x7f610325f4e0>

#### List the number of males and females

In [41]:
%%read_sql
SELECT sex, count(*) 
FROM Profiles 
GROUP BY sex

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

Unnamed: 0,sex,count(*)
0,,4498
1,Female,12311
2,Male,8975


#### List the number of students for each political view

In [42]:
%%read_sql
SELECT PoliticalViews, count(*) 
FROM Profiles 
GROUP BY PoliticalViews

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

Unnamed: 0,PoliticalViews,count(*)
0,,11091
1,Apathetic,805
2,Conservative,936
3,Liberal,6461
4,Libertarian,325
5,Moderate,2898
6,Other,824
7,Very Conservative,167
8,Very Liberal,2277


#### List the number of males and female students for each political view

In [43]:
%%read_sql
SELECT Sex, PoliticalViews, count(*) 
FROM Profiles 
WHERE Sex is not null and PoliticalViews is not null
GROUP BY Sex, PoliticalViews

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

Unnamed: 0,Sex,PoliticalViews,count(*)
0,Female,Apathetic,309
1,Female,Conservative,428
2,Female,Liberal,4054
3,Female,Libertarian,113
4,Female,Moderate,1444
5,Female,Other,280
6,Female,Very Conservative,38
7,Female,Very Liberal,1362
8,Male,Apathetic,462
9,Male,Conservative,474


#### List the number of students per each birth year: Use the Year(date) function to get the year value FROM a dateime column List only years that have at least 10 students

In [44]:
%%read_sql
SELECT YEAR(birthday), count(*) 
FROM Profiles 
WHERE birthday is not null
GROUP BY YEAR(birthday)
HAVING count(*) > 10

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

Unnamed: 0,YEAR(birthday),count(*)
0,1905,34
1,1974,29
2,1975,28
3,1976,35
4,1977,61
5,1978,90
6,1979,159
7,1980,309
8,1981,585
9,1982,1387


#### Find the most popular tv shows and books

In [45]:
%%read_sql
SELECT TVShow, count(*) 
FROM FavoriteTVShows 
WHERE TVShow is not null 
GROUP BY TVShow
ORDER BY count(*) desc

Query started at 08:10:18 PM UTC; Query executed in 0.00 m

Unnamed: 0,TVShow,count(*)
0,Family Guy,1146
1,Sex And The City,649
2,Lost,640
3,Arrested Development,610
4,Grey s Anatomy,575
...,...,...
6482,Laverne And Shirley,1
6483,Taradise Athf Chapelle Arrested Development Si...,1
6484,Ellen Degeneres Show,1
6485,Hilarious Reality Shows,1


In [46]:
%%read_sql
SELECT Book, count(*) 
FROM FavoriteBooks
WHERE Book is not null 
GROUP BY Book
ORDER BY count(*) desc

Query started at 08:10:18 PM UTC; Query executed in 0.04 m

Unnamed: 0,Book,count(*)
0,Harry Potter,1320
1,Catcher In The Rye,1079
2,The Great Gatsby,963
3,1984,725
4,Pride And Prejudice,602
...,...,...
28665,Fantasy Stuff Like Narnia And Madeline L Engle,1
28666,Fantasy Stuff Sword Of Truth And Exotic Car Ma...,1
28667,Far Away,1
28668,Far From The Maddening Croud,1


#### Find the number of students in various relationship statuses

In [47]:
%%read_sql
SELECT Status, count(*) 
FROM Relationship
GROUP BY Status

Query started at 08:10:21 PM UTC; Query executed in 0.00 m

Unnamed: 0,Status,count(*)
0,Engaged,1
1,In a Relationship,4851
2,In an Open Relationship,565
3,It's complicated,17
4,Married,2337
5,Single,7872


#### Find the most popular majors (concentrations)

In [48]:
%%read_sql
SELECT Concentration,count(*) FROM Concentration
WHERE Concentration is not null
GROUP BY Concentration
ORDER BY count(*) desc;

Query started at 08:10:21 PM UTC; Query executed in 0.00 m

Unnamed: 0,Concentration,count(*)
0,Finance,1810
1,Psychology,1571
2,Economics,1533
3,Journalism and Mass Communication,1267
4,Politics,1196
...,...,...
134,Education (minor only; through School of Educa...,1
135,Slavic Studies,1
136,German and Linguistics (major only),1
137,Ancient Studies (minor only),1


### JOINS

In [49]:
%%read_sql
USE imdb

Query started at 08:10:21 PM UTC; Query executed in 0.00 m

<sql_magic.exceptions.EmptyResult at 0x7f61031d4ef0>

In [50]:
%%read_sql
# List all the movies from year 2000
SELECT *
FROM movies M
WHERE M.year = 2000;

Query started at 08:10:21 PM UTC; Query executed in 0.01 m

Unnamed: 0,id,name,year,rank
0,1,"#7 Train: An Immigrant Journey, The",2000,
1,31,$pent,2000,4.3
2,34,& frres,2000,
3,46,"'70s: The Decade That Changed Television, The",2000,
4,176,'N Sync & Britney Spears: Your #1 Video Reques...,2000,
...,...,...,...,...
11638,412155,"""Quin dijo miedo?""",2000,
11639,412218,"""ndernes magt""",2000,
11640,412221,"""pent hus""",2000,
11641,412232,"""a s'en va et a revient""",2000,


#### List all the movies from year 2000 and their genres

In [51]:
%%read_sql
SELECT *
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000;

Query started at 08:10:22 PM UTC; Query executed in 0.01 m

Unnamed: 0,id,name,year,rank,movie_id,genre
0,1,"#7 Train: An Immigrant Journey, The",2000,,1,Short
1,1,"#7 Train: An Immigrant Journey, The",2000,,1,Documentary
2,31,$pent,2000,4.3,31,Drama
3,34,& frres,2000,,34,Short
4,46,"'70s: The Decade That Changed Television, The",2000,,46,Documentary
...,...,...,...,...,...,...
12138,378537,ber die Montage,2000,,378537,Short
12139,378550,"berfall, Der",2000,7.0,378550,Comedy
12140,378550,"berfall, Der",2000,7.0,378550,Crime
12141,378550,"berfall, Der",2000,7.0,378550,Drama


#### List all the Drama movies from year 2000

In [52]:
%%read_sql

SELECT *
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000 AND MG.genre = 'Drama';

Query started at 08:10:22 PM UTC; Query executed in 0.01 m

Unnamed: 0,id,name,year,rank,movie_id,genre
0,31,$pent,2000,4.3,31,Drama
1,428,...Or Forever Hold Your Peace,2000,6.9,428,Drama
2,598,1. de ei,2000,,598,Drama
3,992,12 Rounds,2000,,992,Drama
4,1023,12x12,2000,,1023,Drama
...,...,...,...,...,...,...
1864,377490,"a, c'est vraiment toi",2000,,377490,Drama
1865,378026,"tre chair, L'",2000,,378026,Drama
1866,378121,skabrn jarinnar,2000,4.2,378121,Drama
1867,378308,tztalmann und seine Welt. Das Jahr bevor er sc...,2000,6.3,378308,Drama


#### List all the movies from year 2000 and their average rating broken down by genre

In [53]:
%%read_sql
SELECT AVG(M.rank), MG.genre
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000
GROUP BY MG.genre;

Query started at 08:10:23 PM UTC; Query executed in 0.01 m

Unnamed: 0,AVG(M.rank),genre
0,5.454639,Action
1,,Adult
2,5.782759,Adventure
3,6.505882,Animation
4,5.899568,Comedy
5,5.908228,Crime
6,6.913761,Documentary
7,6.087035,Drama
8,5.787879,Family
9,5.938158,Fantasy


In [54]:
%%read_sql
# List the average ratings for the movies broken down by genre
SELECT AVG(M.rank) AS avg_rating, MG.genre
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
GROUP BY MG.genre
ORDER BY avg_rating DESC

Query started at 08:10:23 PM UTC; Query executed in 0.02 m

Unnamed: 0,avg_rating,genre
0,6.701768,Film-Noir
1,6.557896,Animation
2,6.496829,Documentary
3,6.428049,Adult
4,6.415912,Music
5,6.394257,Short
6,6.377797,War
7,6.31513,Family
8,6.156967,Romance
9,6.137848,Drama


In [55]:
%%read_sql
SELECT distinct a.first_name, a.last_name 
FROM actors a INNER JOIN roles r ON a.id = r.actor_id
    INNER JOIN movies_directors md ON r.movie_id = md.movie_id
    INNER JOIN directors d ON d.id = md.director_id
WHERE d.first_name = 'Steven' and d.last_name='Spielberg'; 

Query started at 08:10:24 PM UTC; Query executed in 0.04 m

Unnamed: 0,first_name,last_name
0,Dan,Aykroyd
1,E. Hampton,Beagle
2,Ned,Beatty
3,John,Belushi
4,Jordan,Brian
...,...,...
1635,David,Soul
1636,Joan,Darling
1637,Christine,Matchett
1638,Luther,Adler


#### List all the actors that worked with Steven Spielberg

#### Compute the average rank for the movies direcred by steven spielberg

In [56]:
%%read_sql
SELECT avg(rank) 
FROM movies m 
    INNER JOIN movies_directors md ON m.id = md.movie_id
    INNER JOIN directors d ON d.id = md.director_id
WHERE d.first_name = 'Steven' and d.last_name='Spielberg'; 


Query started at 08:10:27 PM UTC; Query executed in 0.00 m

Unnamed: 0,avg(rank)
0,7.546429


#### List the movies of Brad Pitt - exclude those that he plays himself

In [57]:
%%read_sql
SELECT m.name 
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
    INNER JOIN actors a on r.actor_id = a.id
WHERE r.role not like '%%Himself%%' and a.first_name = 'Brad' and a.last_name = 'Pitt' 

Query started at 08:10:27 PM UTC; Query executed in 0.00 m

Unnamed: 0,name
0,Across the Tracks
1,Confessions of a Dangerous Mind
2,Contact
3,Cool World
4,"Dark Side of the Sun, The"
5,"Devil's Own, The"
6,"Favor, The"
7,Fight Club
8,Growing Up Grizzly
9,Happy Together (1989/I)


#### Average rank of Brad Pitt's movies

In [58]:
%%read_sql
SELECT avg(rank) 
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
    INNER JOIN actors a on r.actor_id = a.id
WHERE r.role not like '%%Himself%%' and a.first_name = 'Brad' and a.last_name = 'Pitt' 



Query started at 08:10:27 PM UTC; Query executed in 0.00 m

Unnamed: 0,avg(rank)
0,6.52069


#### List the genre of the movies where Sean Connery appears, and rank them in descending order by count

In [59]:
%%read_sql
SELECT mg.genre, count(*) 
FROM movies_genres mg 
    INNER JOIN movies m ON m.id =mg.movie_id
    INNER JOIN roles r ON m.id = r.movie_id 
    INNER JOIN actors a ON r.actor_id = a.id
WHERE r.role not like '%%Himself%%' and a.first_name = 'Sean' and a.last_name = 'Connery'
GROUP BY genre
ORDER BY count(*) desc



Query started at 08:10:27 PM UTC; Query executed in 0.00 m

Unnamed: 0,genre,count(*)
0,Drama,32
1,Action,28
2,Adventure,22
3,Thriller,21
4,Crime,15
5,Sci-Fi,8
6,Comedy,8
7,Romance,7
8,Fantasy,6
9,Mystery,6


#### Compute the average rank for the movies of each actor and rank the actors in descending order based on that rank. List only actors with at least 20 rated movies.

In [60]:
%%read_sql
SELECT a.*, avg(rank) as avg_rating, count(m.id) as num_movies, count(m.rank) as rated_movies
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
    INNER JOIN actors a ON a.id = r.actor_id
GROUP BY a.id
HAVING count(m.rank)>=20
ORDER BY avg(rank) desc

Query started at 08:10:27 PM UTC; Query executed in 3.57 m

Unnamed: 0,id,first_name,last_name,gender,avg_rating,num_movies,rated_movies
0,234674,Michel,Journiac,M,8.195000,21,20
1,40834,Ion,Besoiu,M,8.181250,79,32
2,391209,Colea,Rautu,M,8.161905,56,21
3,123218,Gheorghe,Dinica,M,8.135484,59,31
4,724198,Leela,Mishra,F,8.122727,130,22
...,...,...,...,...,...,...,...
9500,225254,Jon (II),Jacobs,M,3.290476,30,21
9501,727436,Melissa,Moore,F,3.286364,28,22
9502,606609,Griffin,Drew,F,3.247619,36,21
9503,497017,Lars,von Trier,M,3.172727,32,22


### Subqueries

#### List all the actors that have not worked with Francis Ford Coppola. 

In [61]:
%%read_sql 
SELECT * 
FROM actors 
WHERE id NOT IN (
    SELECT DISTINCT R.actor_id
    FROM roles R 
        INNER JOIN movies_directors MD ON R.movie_id=MD.movie_id
        INNER JOIN directors D ON MD.director_id=D.id 
    WHERE D.first_name = 'Francis Ford' and D.last_name = 'Coppola'
)

Query started at 08:14:02 PM UTC; Query executed in 0.05 m

Unnamed: 0,id,first_name,last_name,gender
0,2,Michael,'babeepower' Viera,M
1,3,Eloy,'Chincheta',M
2,4,Dieguito,'El Cigala',M
3,5,Antonio,'El de Chipiona',M
4,6,José,'El Francés',M
...,...,...,...,...
816728,845461,Herdís,Þorvaldsdóttir,F
816729,845462,Katla Margrét,Þorvaldsdóttir,F
816730,845463,Lilja Nótt,Þórarinsdóttir,F
816731,845464,Hólmfríður,Þórhallsdóttir,F


#### Find the average number of movies directed by each director 

In [62]:
%%read_sql
SELECT avg(noMovies)
FROM 
    (
    SELECT director_id, count(*) as noMovies 
    FROM movies_directors dm 
    GROUP BY director_id
    ) r;

Query started at 08:14:05 PM UTC; Query executed in 0.01 m

Unnamed: 0,avg(noMovies)
0,4.5931


#### Report how many directors have directed one movie, two movies, three movies, etc. (in other words, create a the distribution for the variable "number of movies directed by a given director")

In [63]:
%%read_sql
SELECT NumMovies, COUNT(director_id) as NumDirectors
FROM 
    (
    SELECT director_id, count(*) as NumMovies 
    FROM movies_directors dm 
    GROUP BY director_id
    ) r
GROUP BY NumMovies
ORDER BY NumMovies;

Query started at 08:14:05 PM UTC; Query executed in 0.00 m

Unnamed: 0,NumMovies,NumDirectors
0,1,47339
1,2,13678
2,3,6495
3,4,3982
4,5,2702
...,...,...
196,360,1
197,370,1
198,536,1
199,562,1


#### Find the average number of movies played by each actor

In [64]:
%%read_sql
SELECT avg(noMovies) 
FROM
(
SELECT actor_id, count(distinct movie_id) as noMovies 
FROM roles 
GROUP BY actor_id
) r;

Query started at 08:14:05 PM UTC; Query executed in 2.00 m

Unnamed: 0,avg(noMovies)
0,4.1964


In [65]:
%%read_sql
SELECT NumMovies, COUNT(actor_id) as NumActors
FROM 
    (
    SELECT actor_id, count(*) as NumMovies 
    FROM roles r 
    GROUP BY actor_id
    ) r
GROUP BY NumMovies
ORDER BY NumMovies;

Query started at 08:16:05 PM UTC; Query executed in 0.16 m

Unnamed: 0,NumMovies,NumActors
0,1,508544
1,2,103966
2,3,47369
3,4,28173
4,5,19103
...,...,...
348,543,1
349,544,3
350,549,1
351,672,1


#### Find the maximum number of genres associated with a movie

In [66]:
%%read_sql
SELECT max(noGenres) FROM (
SELECT count(*) as noGenres
FROM movies_genres
GROUP BY movie_id) r;

Query started at 08:16:15 PM UTC; Query executed in 0.02 m

Unnamed: 0,max(noGenres)
0,12


In [67]:
%%read_sql
SELECT max(noMovies)FROM (
SELECT count(*) as noMovies, director_id as director
FROM movies_directors
GROUP BY director_id
) r;

Query started at 08:16:16 PM UTC; Query executed in 0.01 m

Unnamed: 0,max(noMovies)
0,619


In [68]:
%%read_sql
SELECT count(*) as noMovies, director_id as director
FROM movies_directors
GROUP BY director_id
;

Query started at 08:16:17 PM UTC; Query executed in 0.01 m

Unnamed: 0,noMovies,director
0,1,1
1,1,2
2,2,3
3,1,4
4,1,5
...,...,...
88599,1,88797
88600,1,88798
88601,1,88799
88602,1,88800


#### Compare the favorite books of liberal and conservative students

In [69]:
%%read_sql

USE facebook;

SELECT BC.Book, 
    BC.C AS cons,
    BL.L AS libs,
    BC.C/C.NC AS perc_cons, 
    BL.L/L.NL AS perc_libs,
    (BC.C/C.NC)/(BL.L/L.NL) AS lift_cons,
    (BL.L/L.NL)/(BC.C/C.NC) AS lift_libs
FROM 
    (
    SELECT B.Book, COUNT(P.ProfileID) AS C
    FROM Profiles AS P 
        INNER JOIN FavoriteBooks AS B ON P.ProfileID = B.ProfileId
    WHERE PoliticalViews = 'Conservative'
    GROUP BY B.Book
    ) BC 
JOIN 
    (
    SELECT B.Book, COUNT(P.ProfileID) AS L
    FROM Profiles AS P 
        INNER JOIN FavoriteBooks AS B ON P.ProfileID = B.ProfileId
    WHERE PoliticalViews = 'Liberal'
    GROUP BY B.Book
    ) BL ON BC.Book = BL.Book 
JOIN
    (
    SELECT COUNT(*) AS NC 
    FROM Profiles 
    WHERE PoliticalViews = 'Conservative'
    ) C 
JOIN 
    (
    SELECT COUNT(*) AS NL
    FROM Profiles 
    WHERE PoliticalViews = 'Liberal'
    ) L
WHERE BC.C > 5 AND BL.L > 5
ORDER BY lift_cons DESC



Query started at 08:16:18 PM UTC; Query executed in 0.01 m

Unnamed: 0,Book,cons,libs,perc_cons,perc_libs,lift_cons,lift_libs
0,Bible,10,7,0.0107,0.0011,9.861116,0.101408
1,Monkey Business,8,7,0.0085,0.0011,7.888893,0.126760
2,The Bible,21,25,0.0224,0.0039,5.798333,0.172463
3,Moneyball,9,15,0.0096,0.0023,4.141666,0.241449
4,The Prince,14,25,0.0150,0.0039,3.865555,0.258695
...,...,...,...,...,...,...,...
69,The Alchemist,7,91,0.0075,0.0141,0.530983,1.883300
70,Catcher In The Rye,32,452,0.0342,0.0700,0.488692,2.046278
71,Lolita,10,144,0.0107,0.0223,0.479360,2.086117
72,The Lovely Bones,6,93,0.0064,0.0144,0.445340,2.245473
