In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pymysql
import getpass

%matplotlib inline

Based on the IMDb dataset (exported January 2020), find the "best director" (highest average rating score) in the database.

In [2]:
# Connect to the local MySQL server and select the `movies` database.
# The password is prompted interactively so it is never stored in the notebook.
# `password` / `database` are used instead of `passwd` / `db`, which PyMySQL
# documents as deprecated aliases.
conn = pymysql.connect(host="localhost",
                       port=3306,
                       user="root",
                       password=getpass.getpass(),
                       database="movies")

········


In [3]:
# Per-director aggregate over rated movies: number of joined rows, average
# rating and total votes. Sorting uses the `rating` alias rather than a column
# position, so the ordering stays correct if the SELECT list is ever changed.
q = """
SELECT
    tp.nconst,
    count(*),
    AVG(averageRating) as rating,
    SUM(numVotes) as votes
FROM
    title_principals tp
JOIN
    title_basics tb
    ON tp.tconst = tb.tconst
JOIN
    title_ratings tr
    ON tp.tconst = tr.tconst
WHERE
    tp.category = "director"
    AND tb.titleType = "movie"
    AND tb.startYear BETWEEN 2000 AND 2019
GROUP BY
    tp.nconst
HAVING
    SUM(numVotes) > 100000
ORDER BY
    rating DESC
"""
# nconst is the unique identifier that IMDb uses to identify people
# tconst is the unique identifier that IMDb uses to identify movies
# only movies with a start year between 2000 and 2019 are considered
# only principals whose category is "director" are kept
# minimum popularity required: more than 100.000 votes, summed over all of the
# director's movies in the timeframe

In [4]:
%time
directors = pd.read_sql(q,conn)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [5]:
# Summary statistics of the query result, transposed so that each column
# of `directors` is shown as one row.
directors.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count(*),890.0,5.338202,3.927521,1.0,3.0,5.0,7.0,51.0
rating,890.0,6.515766,0.728162,3.075,6.1,6.566667,6.995,8.4
votes,890.0,532562.130337,754621.77993,100005.0,151483.0,269894.5,642321.75,11109933.0


- 890 directors have received (cumulatively) more than 100.000 votes on IMDb for their movies
- the average number of votes is 532.562, but the most popular director has received over 11 million votes
- the max avg rating is 8.4 and the min is 3.075 in the dataset
- at least half of the directors have directed 5 movies or more (the median is 5). The max is 51(!) movies in 20 years

In [6]:
# Preview the first rows; the query already sorted them by average rating,
# highest first.
directors.head(20)

Unnamed: 0,nconst,count(*),rating,votes
0,nm2937122,1,8.4,321912.0
1,nm2130108,1,8.4,297925.0
2,nm0745247,1,8.4,297925.0
3,nm0634240,9,8.333333,11109933.0
4,nm0881279,4,8.2,2716628.0
5,nm0677037,1,8.2,883846.0
6,nm0215455,1,8.2,568233.0
7,nm0386246,5,8.1,616893.0
8,nm0254178,1,8.1,156318.0
9,nm0594503,4,8.075,1062877.0


Challenges in the result:
- we can't recognise the directors by name (only the nmXXXXXX identifier is available)
- several of the top directors have directed only 1 movie (in 20 years); how do we make a fair comparison?

In [7]:
# to retrieve the names, we will scrape the IMDb web pages
import requests
import bs4

---
Take one director as an example

In [8]:
# URL prefix to which an nconst is appended to build a person-page URL.
base = "https://www.imdb.com/name/"

# Example nconst taken from the query result (row 3 of the preview).
code = "nm0634240"

In [9]:
# Download the example person page and parse it.
url = base + code
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get(url, timeout=10)
r.raise_for_status()
soup = bs4.BeautifulSoup(r.content, "html.parser")

In [10]:
# Abandoned earlier attempt, kept for reference only (not executed):
# movies = soup.find("div", class_="a-section imdb-scroll-table-inner").find_all("tr")
# movies[1].find("a", class_="a-link-normal").contents[0]

# First child of the page <title>: the person's name followed by " - IMDb".
soup.title.contents[0]

'Christopher Nolan - IMDb'

In [11]:
# Drill down widget -> section -> name span and take the span's first child.
# NOTE(review): each step presumably yields None when the element is missing,
# which would make the next call fail — confirm the page still has this markup.
name = (soup.find("div", class_="name-overview-widget")
            .find("td", class_="name-overview-widget__section")
            .find("span", class_="itemprop")
            .contents[0])

In [12]:
# Display the name extracted from the overview widget.
name

'Christopher Nolan'

---
create a function to retrieve the names

Initially I created the function below; it works for an individual director, but not for retrieving the names of a whole series of directors

```python

def get_name(name_code):
    """Return the name shown in the overview widget of an IMDb person page.

    `name_code` is the person identifier that is appended to the
    "https://www.imdb.com/name/" prefix to build the page URL.
    """
    # fetch and parse the person page
    page = requests.get("https://www.imdb.com/name/" + name_code)
    soup = bs4.BeautifulSoup(page.content, "html.parser")

    # walk down: overview widget -> section cell -> name span
    widget = soup.find("div", class_="name-overview-widget")
    section = widget.find("td", class_="name-overview-widget__section")
    name_span = section.find("span", class_="itemprop")

    # the first child of the span is the name
    return name_span.contents[0]
```


alternative solution

In [13]:
def get_name_title(name_code, timeout=10):
    """Return a person's name scraped from the <title> of their IMDb page.

    Parameters
    ----------
    name_code : str
        IMDb person identifier (nconst), e.g. "nm0634240".
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    str
        The page title with a trailing " - IMDb" removed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page has no usable <title> element.
    """
    suffix = " - IMDb"

    # define url
    url = "https://www.imdb.com/name/" + name_code

    # get the soup; fail loudly rather than scraping an error page
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    if soup.title is None or not soup.title.contents:
        raise ValueError("no <title> found for {!r}".format(name_code))

    # get name: strip the site suffix only when it is really there, instead of
    # blindly cutting the last 7 characters off whatever the title happens to be
    title = str(soup.title.contents[0]).strip()
    if title.endswith(suffix):
        title = title[:-len(suffix)]
    return title

In [14]:
# Person identifiers as a plain list, in the same row order as `directors`.
nconst_list = directors["nconst"].tolist()

# Smoke test on the first identifier.
get_name_title(nconst_list[0])

'Adrian Molina'

In [15]:
# One page download per identifier, issued sequentially in list order.
list_of_names = list(map(get_name_title, nconst_list))

In [16]:
# Inspect the scraped names (one entry per identifier, in the same order).
list_of_names

['Adrian Molina',
 'Bob Persichetti',
 'Rodney Rothman',
 'Christopher Nolan',
 'Lee Unkrich',
 'Bob Peterson',
 'Ronnie Del Carmen',
 'Rajkumar Hirani',
 'Adam Elliot',
 'Hayao Miyazaki',
 'Jan Pinkava',
 'Loveleen Tandan',
 'Stephen Chbosky',
 'Jared Bush',
 'Quentin Tarantino',
 'Peter Jackson',
 'Dan Scanlon',
 'Peter Ramsey',
 'Nuri Bilge Ceylan',
 'Dean DeBlois',
 'Satoshi Kon',
 'Josh Cooley',
 'Martin McDonagh',
 'Alfonso Cuarón',
 'Mike Johnson',
 'Jaco Van Dormael',
 'Nathan Greno',
 'Oriol Paulo',
 'Greta Gerwig',
 'Chris McKay',
 'Martin Scorsese',
 'Wes Anderson',
 'Mel Gibson',
 'David Silverman',
 'Chris Sanders',
 'Nitesh Tiwari',
 'Pete Docter',
 'David Fincher',
 'Asghar Farhadi',
 'Denis Villeneuve',
 'Spike Jonze',
 'Andrey Zvyagintsev',
 'Alejandro G. Iñárritu',
 'Andrew Stanton',
 'Richard Curtis',
 'Ted Demme',
 'Eric Bress',
 'J. Mackye Gruber',
 'Alan Parker',
 'Ben Affleck',
 'Michael Gracey',
 'Aneesh Chaganty',
 'Rich Moore',
 'Farhan Akhtar',
 'Hong-jin Na'

In [20]:
# Attach the scraped names as a `name` column. This relies on `list_of_names`
# being in the same row order as `directors` (it was built from directors.nconst).
directors = directors.assign(name=list_of_names)

In [26]:
# Keep two decimals of the average rating for display.
directors["rating"] = directors["rating"].round(2)

In [30]:
# Vote totals are whole numbers; store them as integers.
directors["votes"] = directors["votes"].astype(int)

In [31]:
# Preview again, now with the `name` column, rounded ratings and integer votes.
directors.head(20)

Unnamed: 0,nconst,count(*),rating,votes,name
0,nm2937122,1,8.4,321912,Adrian Molina
1,nm2130108,1,8.4,297925,Bob Persichetti
2,nm0745247,1,8.4,297925,Rodney Rothman
3,nm0634240,9,8.33,11109933,Christopher Nolan
4,nm0881279,4,8.2,2716628,Lee Unkrich
5,nm0677037,1,8.2,883846,Bob Peterson
6,nm0215455,1,8.2,568233,Ronnie Del Carmen
7,nm0386246,5,8.1,616893,Rajkumar Hirani
8,nm0254178,1,8.1,156318,Adam Elliot
9,nm0594503,4,8.07,1062877,Hayao Miyazaki


The list is not so obvious: the first positions are occupied by directors of animation movies (actually, positions 2 and 3 belong to the same movie, which was co-directed).

We also find some of the more iconic directors: Christopher Nolan (4), Quentin Tarantino (15) and Peter Jackson (16).



In [33]:
# Export step, currently disabled: uncomment to write the table to a CSV file.
#directors.to_csv("../data/top_directors_imdb.csv", index=False)

to be continued