## Load and Filter Data

Load movie data from the supplied json files. Filter out duplicates and movies with missing information.

In [371]:
import json

In [372]:
movie_data = []

with open('download/movies.json') as f:
    movie_data.extend(json.load(f)['movies'])
with open('download/more_movies.json') as f:
    movie_data.extend(json.load(f)['movies'])
    
len(movie_data)

213

In [373]:
sorted(movie_data[1].keys())

['abridged_cast',
 'alternate_ids',
 'id',
 'links',
 'mpaa_rating',
 'posters',
 'ratings',
 'release_dates',
 'runtime',
 'synopsis',
 'title',
 'year']

In [374]:
requiredFields = set([
    'id',
    'links',
    'mpaa_rating',
    'posters',
    'ratings',
    'runtime',
    'synopsis',
    'title',
    'year'
])

Filter out movies that don't have enough data

In [375]:
def allFilled(movie):
    for k in movie:
        if not movie.get(k) and k in requiredFields:
            return False
    return True

filtered_data = [m for m in movie_data if allFilled(m)]
len(filtered_data)

192

In [376]:
ids = set()
dupes = set()

for m in filtered_data:
    if m['id'] in ids:
        dupes.add(m['id'])
    ids.add(m['id'])
    
print("uniques:\t", len(ids))
print("duplicates:\t", len(dupes))

uniques:	 78
duplicates:	 46


In [377]:
movie_dict = {m['id']: m for m in filtered_data}

## Download Hi-Res Posters

Many of the rottentomatoes movies don't have high-resolution data, but we can find it.

As a last resort, downloads photos from [TMDB](https://www.themoviedb.org/documentation/api). Set your API key as environment variable `TMDB_KEY`.

In [283]:
import os
import re
import requests
import time

In [284]:
def posterPath(rt_id):
    return os.path.join('download', 'posters', rt_id + '.jpg')

In [378]:
def getURLFromTMDB(imdb_id):
    KEY = os.environ['TMDB_KEY']
    IMG_PATTERN = 'http://api.themoviedb.org/3/movie/{imdbid}/images?api_key={key}' 
    r = requests.get(IMG_PATTERN.format(imdbid=imdb_id, key=KEY))
    api_response = r.json()
    print (api_response)
    if not api_response.get('posters'):
        return None
    for p in api_response['posters']:
        if p['iso_639_1'] == 'en':
            return 'http://image.tmdb.org/t/p/original' + p['file_path']
    return None


def getPosterURL(movie):
    link = m['posters']['original']
    # links with 'resizing' in them only return thumbnails
    if 'resizing' in link:
        # we can find the flixter link
        if 'movie' in link:
            print("> extracted: {} ({})".format(movie['title'], movie['id']))
            path = re.match('^.*?(/movie.*)$', link).group(1)
            return "http://content9.flixster.com" + path
        elif m.get('alternate_ids') and m['alternate_ids'].get('imdb'):
            print("> tmdb: {} ({})".format(movie['title'], movie['id']))
            imdb_id = m['alternate_ids']['imdb']
            return getURLFromTMDB(imdb_id)
        else:
            return None
    print("> direct: {} ({})".format(movie['title'], movie['id']))
    return link

In [286]:
notFound = []

for m in movie_dict.values():
    print(m['id'], url)
    if os.path.exists(posterPath(m['id'])):
        continue
    print (posterPath(m['id']))
    url = getPosterURL(m)    
    if url:
        response = requests.get(url)
        if response.status_code == 200:
            with open(posterPath(m['id']), 'wb') as f:
                f.write(response.content)
        elif response.status_code == 404:
            print(">> Not found:", m['id'], url)
            notFound.append(m['id'])
        else:
            print(">> Error:", response.status_code, m['id'], url)
        time.sleep(0.1)  # respect our API limits!

771416410 http://images.rottentomatoescdn.com/images/redesign/poster_default.gif
download/posters/771416410.jpg
> tmdb: Risen (771416410)
{'status_message': 'The resource you requested could not be found.', 'status_code': 34}
771254328 None
download/posters/771254328.jpg
> extracted: Point Break (771254328)
771402390 http://content9.flixster.com/movie/11/19/73/11197391_ori.jpg
download/posters/771402390.jpg
> extracted: The Lady In The Van (771402390)
770785949 http://content9.flixster.com/movie/11/27/63/11276341_ori.jpg
download/posters/770785949.jpg
> extracted: Hail, Caesar! (770785949)
771420359 http://content9.flixster.com/movie/11/43/44/11434453_ori.jpg
download/posters/771420359.jpg
> extracted: Spotlight (771420359)
771385342 http://content9.flixster.com/movie/11/20/15/11201558_ori.jpg
download/posters/771385342.jpg
> direct: Jungle Shuffle (771385342)
771306118 http://d3biamo577v4eu.cloudfront.net/static/images/redesign/poster_default_thumb.gif
download/posters/771306118.jpg
>

Filter out movies with no poster:

In [379]:
print("Before:", len(movie_dict))
for rt_id in set(movie_dict.keys()):
    if not os.path.exists(posterPath(rt_id)):
        del movie_dict[rt_id]
print("After:", len(movie_dict))

Before: 78
After: 66


## Generate Thumbnail Sprites

In [380]:
import PIL
import thumbnails

In [381]:
THUMB_SIZE = (200, 300)
MAX_SPRITE_SIZE = 5
SPRITE_SIZE = (THUMB_SIZE[0] * MAX_SPRITE_SIZE, THUMB_SIZE[1] * MAX_SPRITE_SIZE)

In [382]:
thumb_indices = {}  # Each index is a 3-tuple: file_number, row, column
sprites = []

for i, movie in enumerate(movie_dict.values()):
    f = int(i / MAX_SPRITE_SIZE ** 2)
    j = int(i % MAX_SPRITE_SIZE ** 2)
    row = int(j / MAX_SPRITE_SIZE)
    col = int(j % MAX_SPRITE_SIZE)
    
    if len(sprites) == f:
        sprites.append(PIL.Image.new(mode='RGB', size=SPRITE_SIZE))
    
    path = os.path.abspath(posterPath(movie['id']))
    thumb = thumbnails.get_thumbnail(path, "{}x{}".format(*THUMB_SIZE), crop='center')
    sprites[-1].paste(thumb.image, (col * THUMB_SIZE[0], row * THUMB_SIZE[1]))
    
    thumb_indices[movie['id']] = (f, row, col)

In [387]:
SPRITE_DIR = 'thumb_sprites'

for i, s in enumerate(sprites):
    s.save(os.path.join(SPRITE_DIR, "sprite-{}.jpg".format(i)))
    
spriteInfo = {
    'thumb_size': THUMB_SIZE,
    'thumb_indices': thumb_indices,
    'n_sheets': len(sprites)
}

with open(os.path.join(SPRITE_DIR, "sprites.json"), 'w') as f:
    json.dump(spriteInfo, f)