## SQL Queries

In [12]:
%reload_ext sql

To connect to our database, we issue the following command:

`%sql mysql://username:password@hostname:port/database_name`

For example, to connect to your IMDB database:

In [13]:
%sql mysql://root:dwdstudent2015@localhost:3306/imdb?charset=utf8

'Connected: root@imdb'

To enter a multiline SQL statement using the iPython extension, we use the `%%sql` command at the beginning of the cell.

### SELECT Queries (IMDB database)

Let's connect to the IMDB database first.

In [14]:
%sql mysql://root:dwdstudent2015@localhost:3306/imdb?charset=utf8

'Connected: root@imdb'

In [15]:
%sql USE imdb;

0 rows affected.


[]

#### Find all movie titles with id less than 100.

In [18]:
%%sql 
SELECT * 
FROM movies 
WHERE id<100

96 rows affected.


id,name,year,rank
0,#28,2002,
1,"#7 Train: An Immigrant Journey, The",2000,
2,$,1971,6.4
3,"$1,000 Reward",1913,
4,"$1,000 Reward",1915,
5,"$1,000 Reward",1923,
6,"$1,000,000 Duck",1971,5.0
7,"$1,000,000 Reward, The",1920,
8,"$10,000 Under a Pillow",1921,
9,"$100,000",1915,


#### Find all information about movies that were released before 1895 (excl)

In [None]:
%%sql
SELECT *
FROM movies 
WHERE year<1895

#### Find all information about movies that were released before 1895 and after 2006 (inclusive)

In [None]:
%%sql
SELECT *
FROM movies 
WHERE year<=1895 OR year>=2006

#### Find all information about movies released between 1895 and 1898 (excl)

In [None]:
%%sql
SELECT *
FROM movies 
WHERE year>1895 AND year<1898

#### Find all information about *actresses* who are have first name Skyler

In [None]:
%%sql
SELECT *
FROM actors
WHERE first_name = 'Skyler' AND gender = 'F';

#### Find the director ID of Steven Spielberg

In [None]:
%%sql
SELECT id
FROM directors
WHERE first_name = 'Steven' AND last_name = 'Spielberg';

#### Find the director IDs and the first and last names of directors with the last name Spielberg and Hitchcock

In [None]:
%%sql
SELECT id, first_name, last_name
FROM directors
WHERE last_name = 'Spielberg' OR last_name = 'Hitchcock';

#### Find all genres of films and the corresponding probabilities for the director ID that corresponds to Steven Spielberg. Sort the results by probability.

In [None]:
%%sql
SELECT genre, prob
FROM directors_genres
WHERE director_id = '75380' # the id of Spielberg FROM above
ORDER BY prob DESC

#### Find the id of the movie Schindler's List.

In [None]:
%%sql
SELECT *
FROM movies
WHERE name = "Schindler's List"

#### List all the roles for the movie with id 290070. Sort them alphabetically

In [None]:
%%sql
SELECT *
FROM roles
WHERE movie_id = '290070'

### SELECT Queries (Facebook database)

In [None]:
%sql mysql://root:dwdstudent2015@localhost:3306/facebook?charset=utf8
 
%sql use facebook;

#### Find all names of students FROM the Profiles table

In [None]:
%%sql
SELECT name
FROM Profiles

#### Get the names and sex of all liberal students (Need to use “backticks” (\`) for attribute names with space in them.)

In [None]:
%%sql
SELECT name, sex
FROM Profiles
WHERE `Political Views`='Liberal'

#### Get the High Schools of the students in the database

In [None]:
%%sql
SELECT `High School`
FROM Profiles

#### Find all the possible political views, eliminating duplicate entries

In [None]:
%%sql
SELECT DISTINCT `Political Views`
FROM Profiles

#### Find all possible relationship statuses

In [None]:
%%sql
SELECT DISTINCT RelationshipStatus
FROM RelationshipStatus

#### Find all possible values for the “status” attribute in Profiles 

In [None]:
%%sql
SELECT DISTINCT Status
FROM Profiles

#### Find all possible values for the “Residence” attribute in Profiles, eliminating duplicates

In [None]:
%%sql
SELECT DISTINCT Residence
FROM Profiles

#### Find all students living in Palladium

In [None]:
%%sql
SELECT *
FROM Profiles
WHERE Residence LIKE 'Palladium%'

#### Find all students who attended Stuyvesant

In [None]:
%%sql
SELECT *
FROM Profiles
WHERE `High School` LIKE 'Stuyvesant%'

#### Find all names that contain a digit

In [None]:
%%sql
SELECT * 
FROM Profiles 
WHERE name REGEXP '[0-9]+'

#### Find all students with a name that contains a non-letter character, other than \- and \.

In [None]:
%%sql
SELECT * 
FROM Profiles 
WHERE name NOT REGEXP '^[A-z \-\.]+$'

### Aggregation Queries (IMDB Database)

#### Number of movies for each director

In [None]:
%%sql
SELECT director_id, count(*) AS NumberOfMovies
FROM movies_directors 
GROUP BY director_id;

#### Rank directors by the number of movies they directed

In [None]:
%%sql
SELECT director_id, count(*) AS NumberOfMovies
FROM movies_directors 
GROUP BY director_id
ORDER BY count(*) desc;

#### Find the number of actors in each movie

In [None]:
%%sql
SELECT movie_id, count(*) 
FROMroles 
GROUP BY movie_id;

#### Find the movies with more than 100 actors

In [None]:
%%sql
SELECT movie_id, count(*) 
FROM roles 
GROUP BY movie_id
HAVING count(*) > 100;

#### Find the most popular genres (basd on the number of movies)

In [None]:
%%sql
SELECT genre, count(*) 
FROM movies_genres
GROUP BY genre
ORDER BY count(*) desc

#### Find the average rank of the movies in the database, per year of release

In [None]:
%%sql
SELECT year, avg(rank) 
FROM movies
GROUP BY year

### Aggregation Queries (Facebook database)

#### List the number of males and females

In [None]:
%%sql
SELECT sex, count(*) 
FROM profiles 
GROUP BY sex

#### List the number of students for each political view

In [None]:
%%sql
SELECT `Political Views`, count(*) 
FROM profiles 
GROUP BY `Political Views`

#### List the number of males and female students for each political view

In [None]:
%%sql
SELECT sex,`Political Views`, count(*) 
FROM profiles 
WHERE sex is not null and `Political Views` is not null
GROUP BY sex, `Political Views`

#### List the number of students per each birth year: Use the Year(date) function to get the year value FROM a dateime column List only years that have at least 10 students

In [None]:
%%sql
SELECT YEAR(birthday), count(*) 
FROM profiles 
WHERE birthday is not null
GROUP BY YEAR(birthday)
HAVING count(*) > 10

#### Find the most popular tv shows and books

In [None]:
%%sql
SELECT FavoriteTVShow, count(*) 
FROM FavoriteTVShows 
WHERE FavoriteTVShow is not null 
GROUP BY FavoriteTVShow
ORDER BY count(*) desc
#Same for books

#### Find the number of students in various relationship statuses

In [None]:
%%sql
SELECT RelationshipStatus,count(*) 
FROM RelationshipStatus
GROUP BY RelationshipStatus

#### Find the most popular majors (concentrations)

In [None]:
%%sql
SELECT Concentration,count(*) FROM Concentration
WHERE Concentration is not null
GROUP BY Concentration
ORDER BY count(*) desc;

### JOINS

In [None]:
%%sql
# List all the movies from year 2000
SELECT *
FROM movies M
WHERE M.year = 2000;

In [11]:
%%sql
# List all the movies from year 2000 and their genres
SELECT *
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000;
LIMIT 100l

12143 rows affected.


id,name,year,rank,movie_id,genre
1,"#7 Train: An Immigrant Journey, The",2000,,1,Short
1,"#7 Train: An Immigrant Journey, The",2000,,1,Documentary
31,$pent,2000,4.3,31,Drama
34,& frres,2000,,34,Short
46,"'70s: The Decade That Changed Television, The",2000,,46,Documentary
176,'N Sync & Britney Spears: Your #1 Video Requests... And More!,2000,,176,Short
176,'N Sync & Britney Spears: Your #1 Video Requests... And More!,2000,,176,Music
178,'N Sync: 'Ntimate Holiday Special,2000,,178,Family
178,'N Sync: 'Ntimate Holiday Special,2000,,178,Music
180,'N Sync: Making the Tour,2000,,180,Documentary


In [None]:
%%sql
# List all the Drama movies from year 2000
SELECT *
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000 AND MG.genre = 'Drama';


In [None]:
%%sql
# List all the movies from year 2000 and their average rating
# broken down by genre
SELECT AVG(M.rank), MG.genre
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
WHERE M.year = 2000
GROUP BY MG.genre;

In [None]:
%%sql
# List the average ratings for the movies broken down by genre
SELECT AVG(M.rank) AS avg_rating, MG.genre
FROM movies M INNER JOIN movies_genres MG ON M.id=MG.movie_id
GROUP BY MG.genre
ORDER BY avg_rating DESC

In [None]:
%%sql
SELECT distinct a.first_name, a.last_name 
FROM actors a INNER JOIN roles r ON a.id = r.actor_id
    INNER JOIN movies_directors md ON r.movie_id = md.movie_id
    INNER JOIN directors d ON d.id = md.director_id
WHERE d.first_name = 'Steven' and d.last_name='Spielberg'; 

#### List all the actors that worked with Steven Spielberg

#### Compute the average rank for the movies direcred by steven spielberg

In [None]:
%%sql
SELECT avg(rank) 
FROM movies m 
    INNER JOIN movies_directors md ON m.id = md.movie_id
    INNER JOIN directors d ON d.id = md.director_id
WHERE d.first_name = 'Steven' and d.last_name='Spielberg'; 


In [None]:
%%sql
SELECT avg(R.ACT) 
FROM movies m 
    INNER JOIN roles md ON m.id = md.movie_id
WHERE d.first_name = 'Steven' and d.last_name='Spielberg'; 

#### List the movies of Brad Pitt - exclude those that he plays himself

In [None]:
%%sql
SELECT m.name 
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
    INNER JOIN actors a on r.actor_id = a.id
WHERE r.role not like '%Himself%' and a.first_name = 'Brad' and a.last_name = 'Pitt' 

#### Average rank of Brad Pitt's movies

In [None]:
%%sql
SELECT avg(rank) 
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
    INNER JOIN actors a on r.actor_id = a.id
WHERE r.role not like '%H%' and a.first_name = 'Brad' and a.last_name = 'Pitt' 



#### List the genre of the movies where Sean Connery appears, and rank them in descending order by count

In [None]:
%%sql
SELECT mg.genre, count(*) 
FROM movies_genres mg 
    INNER JOIN movies m ON m.id  =mg.movies_id
    INNER JOIN roles r ON m.id = r.movie_id 
    INNER JOIN actors a ON r.actor_id = a.id
WHERE r.role not like '%Himself%' and a.first_name = 'Sean' and a.last_name = 'Connery'
GROUP BY genre
ORDER BY count(*) desc



#### Compute the average rank for the movies of each actor and rank the actors in descending order based on that rank

In [None]:
%%sql
SELECT r.actor_id, avg(rank) 
FROM movies m 
    INNER JOIN roles r on m.id = r.movie_id 
GROUP BY r.actor_id
ORDER BY avg(rank) desc

### Subqueries

#### List all the actors that have not worked with Francis Ford Coppola. 

In [None]:
%%sql 
SELECT id FROM actors WHERE id NOT IN (
SELECT DISTINCT R.actor_id
FROM roles R 
    INNER JOIN movies_directors MD ON R.movie_id=MD.movie_id
    INNER JOIN directors D ON MD.director_id=D.id 
WHERE D.first_name = 'Francis Ford' and D.last_name = 'Coppola')

#### Find the average number of movies directed by each director 

In [None]:
%%sql
SELECT avg(noMovies)
FROM 
    (
    SELECT director_id, count(*) as noMovies 
    FROM movies_directors dm 
    GROUP BY director_id
    ) r;

#### Report how many directors have directed one movie, two movies, three movies, etc. (in other words, create a the distribution for the variable "number of movies directed by a given director")

In [None]:
%%sql
SELECT NumMovies, COUNT(director_id) as NumDirectors
FROM 
    (
    SELECT director_id, count(*) as NumMovies 
    FROM movies_directors dm 
    GROUP BY director_id
    ) r
GROUP BY NumMovies
ORDER BY NumMovies;

#### Find the average number of movies played by each actor

In [None]:
%%sql
SELECT avg(noMovies) 
FROM
(
SELECT actor_id, count(distinct movie_id) as noMovies 
FROM roles 
GROUP BY actor_id
) r;

In [None]:
%%sql
SELECT NumMovies, COUNT(actor_id) as NumActors
FROM 
    (
    SELECT actor_id, count(*) as NumMovies 
    FROM roles r 
    GROUP BY actor_id
    ) r
GROUP BY NumMovies
ORDER BY NumMovies;

In [None]:
%%sql
SELECT actor_id, count(*) As NumberOfMovies
FROM roles 
GROUP BY movie_id;

#### Find the maximum number of genres associated with a movie

In [None]:
%%sql
SELECT max(noGenres) FROM (
SELECT count(*) as noGenres
FROM movies_genres
GROUP BY movie_id) r;

In [None]:
%%sql
SELECT max(noMovies)FROM (
SELECT count(*) as noMovies, director_id as director
FROM movies_directors
GROUP BY director_id
) ;

In [None]:
%%sql
SELECT count(*) as noMovies, director_id as director
FROM movies_directors
GROUP BY director_id
;

#### Compare the favorite books of liberal and conservative students

In [None]:
%%sql

USE facebook;

DROP VIEW IF EXISTS BooksLiberals;
CREATE VIEW BooksLiberals AS
SELECT B.FavoriteBook, COUNT(P.ProfileID) AS NumLiberalStudents
FROM Profiles AS P 
    INNER JOIN FavoriteBooks AS B ON P.ProfileID = B.ProfileId
WHERE `Political Views` = 'Liberal'
GROUP BY B.FavoriteBook
ORDER BY COUNT(P.ProfileID) DESC;

DROP VIEW IF EXISTS BooksConservatives;
CREATE VIEW BooksConservatives AS
SELECT B.FavoriteBook, COUNT(P.ProfileID) AS NumConservativeStudents
FROM Profiles AS P 
    INNER JOIN FavoriteBooks AS B ON P.ProfileID = B.ProfileId
WHERE `Political Views` = 'Conservative'
GROUP BY B.FavoriteBook
ORDER BY COUNT(P.ProfileID) DESC;

DROP VIEW IF EXISTS Conservatives;
CREATE VIEW Conservatives AS
SELECT COUNT(*) AS NumCons 
FROM Profiles 
WHERE `Political Views` = 'Conservative';

DROP VIEW IF EXISTS Liberals;
CREATE VIEW Liberals AS
SELECT COUNT(*) AS NumLib
FROM Profiles 
WHERE `Political Views` = 'Liberal';

SELECT BC.FavoriteBook, 
    BC.NumConservativeStudents AS Conservatives,
    BL.NumLiberalStudents AS Liberals,
    BC.NumConservativeStudents/C.NumCons AS PercentageConservatives, 
    BL.NumLiberalStudents/L.NumLib AS PercentageLiberals,
    (BC.NumConservativeStudents/C.NumCons)/(BL.NumLiberalStudents/L.NumLib) AS OddsBeingConservative
FROM BooksConservatives AS BC 
    INNER JOIN BooksLiberals BL ON BC.FavoriteBook = BL.FavoriteBook, 
    Conservatives C, Liberals L
WHERE BC.NumConservativeStudents > 5 AND BL.NumLiberalStudents > 5
ORDER BY OddsBeingConservative DESC

