# Download Scripts for Poster Images from Movielens 100k Dataset

Scripts are based on https://github.com/babu-thomas/movielens-posters

In [30]:
import csv
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook as tqdm, trange

In [21]:
#
# Scrap IMDB URLs
#
items = pd.read_csv('u.item.zip', compression='zip', sep='|', usecols=[0, 1],
                    encoding = "ISO-8859-1", names=['movie_id', 'movie_title'])
for _, row in tqdm(items.iterrows()): 
    domain = 'http://www.imdb.com'
    search_url = domain + '/find?q=' + urllib.parse.quote_plus(row.movie_title)
    with urllib.request.urlopen(search_url) as response:
        html = response.read()
        soup = BeautifulSoup(html, 'html.parser')

        try:
            title = soup.find('table', class_='findList').tr.a['href']
            movie_url = domain + title
            with open('movie_url.csv', 'a', newline='') as out_csv:
                writer = csv.writer(out_csv, delimiter=',')
                writer.writerow([row.movie_id, movie_url])
        except AttributeError:
            pass

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
#
# Scrap poster images and URLs
#
urls = pd.read_csv('movie_url.csv', names=['movie_id', 'movie_url'])
for _, row in tqdm(urls.iterrows()):
        domain = 'http://www.imdb.com'
        with urllib.request.urlopen(row.movie_url) as response:
            html = response.read()
            soup = BeautifulSoup(html, 'html.parser')

            try:
                image_url = soup.find('div', class_='poster').a.img['src']
                extension = '.jpg'
                image_url = ''.join(image_url.partition('_')[0]) + extension
                filename = 'imdb-posters/' + row.movie_id + extension
                with urllib.request.urlopen(image_url) as response:
                    with open(filename, 'wb') as out_image:
                        out_image.write(response.read())
                    with open('movie_poster.csv', 'a', newline='') as out_csv:
                        writer = csv.writer(out_csv, delimiter=',')
                        writer.writerow([row.movie_id, row.image_url])
            except AttributeError:
                pass

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [33]:
urls = pd.read_csv('movie_url.csv', names=['movie_id', 'movie_url'])

thumbnails_per_movie = 20
with open('movie_thumbnail.csv', 'a', newline='') as out_csv:
    writer = csv.writer(out_csv, delimiter=',')
    
    for _, row in urls.iterrows():
        print(f'Loading thumbnails for movie {row.movie_id}')
        
        domain = 'http://www.imdb.com'
        media_url = row.movie_url + 'mediaindex'
        with urllib.request.urlopen(media_url) as response:
            html = response.read()
            soup = BeautifulSoup(html, 'html.parser')

            try:
                grid = soup.find('div', id='media_index_thumbnail_grid')
                thumbnails = grid.find_all('img')
                indexes = np.random.choice(np.arange(len(thumbnails)), min(thumbnails_per_movie, len(thumbnails)), replace=False)
                for i in indexes:
                    filename = f'{row["movie_id"]}_{i}.jpg'
                    image_url = thumbnails[i]['src']
                    writer.writerow([row.movie_id, image_url])
                    with urllib.request.urlopen(image_url) as response:
                        with open(f'imdb-thumbnails-{thumbnails_per_movie}/' + filename, 'wb') as out_image:
                            out_image.write(response.read())
            except AttributeError as e:
                print(e)

Loading thumbnails for movie 1
Loading thumbnails for movie 2
Loading thumbnails for movie 3
Loading thumbnails for movie 4
Loading thumbnails for movie 5
Loading thumbnails for movie 7
'NoneType' object has no attribute 'find_all'
Loading thumbnails for movie 8
Loading thumbnails for movie 9
Loading thumbnails for movie 10
Loading thumbnails for movie 11
Loading thumbnails for movie 12
'NoneType' object has no attribute 'find_all'
Loading thumbnails for movie 13
Loading thumbnails for movie 14
Loading thumbnails for movie 15
Loading thumbnails for movie 16
Loading thumbnails for movie 17
Loading thumbnails for movie 18
Loading thumbnails for movie 19
Loading thumbnails for movie 20
Loading thumbnails for movie 21
Loading thumbnails for movie 22
Loading thumbnails for movie 23
Loading thumbnails for movie 24
Loading thumbnails for movie 25
Loading thumbnails for movie 26
'NoneType' object has no attribute 'find_all'
Loading thumbnails for movie 27
Loading thumbnails for movie 28
Loadin