# Download poster URLs for the top 1000 movies of each year, 2005-2010 (6 years)

- Should yield 6000 posters in total (1000 per year x 6 years)
- Also download each movie's genre IDs

In [4]:
# Code to download the most popular English-language movies released in 2005-2010
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import seaborn as sns # Seaborn visualization package
import pandas as pd
import time
import re # Regex
import collections

# File reading things
import pickle
import json
import csv
import datetime # For handling dates

# The "requests" library makes working with HTTP requests easier
import requests
import os
from bs4 import BeautifulSoup
from IPython.display import Image, display # Display image from URL
from IPython.core.display import HTML # Display image from URL

# TMDB API wrapper
import tmdbsimple as tmdb

# IMDB access
import imdb

# pandas display settings: wider console output, more visible columns,
# and HTML rendering of DataFrames in the notebook.
for option_name, option_value in (
    ('display.width', 500),
    ('display.max_columns', 100),
    ('display.notebook_repr_html', True),
):
    pd.set_option(option_name, option_value)
%matplotlib inline

In [6]:
# Release years to query: 2005 through 2010 inclusive (range's stop is exclusive).
release_year_range = range(2005, 2011)

# Get 20 results per page, and want 1000 movies => 50 pages to download per year.
# Floor division keeps this an int on both Python 2 and Python 3; a float here
# would make range(1, n_pages_per_year + 1) raise a TypeError.
n_pages_per_year = 1000 // 20

In [20]:
### Viewing metadata and downloading poster URLs
# TMDB API key (Lexi). Prefer supplying it through the TMDB_API_KEY environment
# variable; the literal is kept only as a fallback so the notebook still runs
# unchanged.
# NOTE(review): a key committed in a notebook should be treated as leaked --
# consider rotating it and dropping the fallback.
tmdb.API_KEY = os.environ.get('TMDB_API_KEY', 'ee0df3ce88063f7f6cd466ff61266a55')

discover_url = 'https://api.themoviedb.org/3/discover/movie'

# One parsed JSON page per entry, ordered by year and then by page number,
# giving n_pages_per_year * len(release_year_range) entries in total.
movies_list = []

# For each year
for release_year in release_year_range:
    # Get first n_pages_per_year pages of movie data (page numbers start at 1).
    # int() keeps range() working even if n_pages_per_year was computed as a float.
    for page_n in range(1, int(n_pages_per_year) + 1):
        # Query values are sent as strings so 'false' is not turned into 'False'.
        query = {
            'api_key': tmdb.API_KEY,
            'language': 'en-US',
            'sort_by': 'popularity.desc',
            'include_adult': 'false',
            'include_video': 'false',
            'page': page_n,
            'primary_release_year': release_year,
            'with_original_language': 'en',
        }
        response_page = requests.get(discover_url, params=query, timeout=30)
        # Fail here on an HTTP error instead of storing an error payload that
        # has no 'results' key and only breaks later when the CSV is written.
        response_page.raise_for_status()
        movies_list.append(response_page.json())
        # pause for a bit between requests
        time.sleep(0.5)

In [21]:
# Sanity check: one entry per downloaded page, i.e. n_pages_per_year * number of years.
len(movies_list)

300

In [22]:
# Destination CSV for the poster metadata.
# NOTE(review): the name says 2005-2016 while release_year_range covers
# 2005-2010 -- confirm before renaming, other notebooks may read this file.
file_name = 'poster_urls_Top1000_each_2005-2016.csv'

### Flatten the downloaded JSON pages into one CSV row per movie
# Fields copied from every entry of a page's 'results' list; they also serve
# as the header row.
columns = ['poster_path', 'release_date', 'id', 'genre_ids']

with open(file_name, "w") as csv_handle:
    writer = csv.writer(csv_handle)
    writer.writerow(columns)

    # Walk the pages by index, then the movies inside each page.
    # i remains bound to the last page index once the loop finishes.
    for i, page in enumerate(movies_list):
        for item in page['results']:
            writer.writerow([item[column] for column in columns])

In [23]:
# Spot-check the last downloaded page (final page of the final year).
# Indexing with -1 avoids depending on a loop variable left over from another cell.
movies_list[-1]

{u'page': 50,
 u'results': [{u'adult': False,
   u'backdrop_path': u'/enOlEH5j9yu8lfT7DHarU3Xy2hz.jpg',
   u'genre_ids': [27],
   u'id': 137618,
   u'original_language': u'en',
   u'original_title': u'\u0e43\u0e04\u0e23...\u0e43\u0e19\u0e2b\u0e49\u0e2d\u0e07',
   u'overview': u'Nida (Sinjai Plengpanich) is a single mother who takes care of her anti-social son who has locked himself in his room for five years. The only way to communicate with her son is to write on a piece of paper and slip it under the door. But, when outsiders start to get curious about what is going on behind the door of her son\u2019s room, a series of terrible events starts to happen.',
   u'popularity': 1.036701,
   u'poster_path': u'/nbIKkOlqjoqdqgbB2dRV2pkvXVf.jpg',
   u'release_date': u'2010-09-03',
   u'title': u'Who Are You?',
   u'video': False,
   u'vote_average': 6.2,
   u'vote_count': 5},
  {u'adult': False,
   u'backdrop_path': u'/zwauZI4GANu0CcfM1HDmJsZUEzg.jpg',
   u'genre_ids': [18, 10749],
   u'id': 

In [26]:
# Check that the CSV is readable: print its shape plus the first and last rows.
# Single-argument print(...) calls produce the same output under the Python 2
# print statement and the Python 3 print function.
Posters_df = pd.read_csv(file_name)
print(Posters_df.shape)
print(Posters_df.head(n=3))
print(Posters_df.tail(n=3))



(6000, 4)
                        poster_path release_date   id        genre_ids
0  /6sASqcdrEHXxUhA3nFpjrRecPD2.jpg   2005-11-05  674  [12, 14, 10751]
1  /dr6x4GyyegBWtinPBzipY02J2lV.jpg   2005-06-14  272     [28, 80, 18]
2  /AoGpqw4S4ZGgwhlM3FgzFVwyIGl.jpg   2005-12-11  752     [28, 53, 14]
                           poster_path release_date      id genre_ids
5997  /koA9VgXTKl7NqnybxYjyy3AODtL.jpg   2010-07-15   51597      [18]
5998  /fMKkLEp278E8qx1W5gN8XLjZp3I.jpg   2010-01-01   28596  [35, 27]
5999  /cAp7kWDqAGB3UePxEo2HFh4vsXf.jpg   2010-08-13  148800      [18]


In [31]:
# Count the rows whose poster_path is null in the re-read CSV.
n_missing_posters = Posters_df['poster_path'].isnull().sum()
# One pre-formatted string keeps the output identical on Python 2 and Python 3.
print('N posters missing URL: {0}'.format(n_missing_posters))


N posters missing URL: 37
