# Keras Transfer Learning with Movie Posters

by Parwesh Rallapalli

To interface with the Kaggle dataset:
https://www.kaggle.com/sohier/film-recommendation-engine-converted-to-use-tmdb

Using Keras with CSV: http://nghiaho.com/?p=2333

Transfer Learning with Keras: https://www.kaggle.com/abnera/transfer-learning-keras-xception-cnn

MovieLens: https://grouplens.org/datasets/movielens/

AlexNet in Keras: https://engmrk.com/alexnet-implementation-using-keras/

In [1]:
import numpy as np
import tensorflow as tf

print(tf.__version__)

  from ._conv import register_converters as _register_converters


1.9.0


https://github.com/kuk/log-progress

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

__Load `movies_metadata.csv` file and download movie posters__

The Movies Dataset: https://www.kaggle.com/rounakbanik/the-movies-dataset
>These files contain metadata for all 45,000 movies listed in the Full MovieLens Dataset. The dataset consists of movies released on or before July 2017. Data points include cast, crew, plot keywords, budget, revenue, posters, release dates, languages, production companies, countries, TMDB vote counts and vote averages.

>This dataset also has files containing 26 million ratings from 270,000 users for all 45,000 movies. Ratings are on a scale of 1-5 and have been obtained from the official GroupLens website.

Extract poster image paths (500 x 750 px)

In [3]:
import os
import sys
import csv
import urllib.request

def download_poster_images(path, num):
    with open('data/movies_metadata.csv') as f:
        reader = csv.DictReader(f)
        i = 0
        count = 0
        for row in log_progress(reader, every=1, size=num, name='Images Downloaded'):
            if count = 5:
                image_path = 'https://image.tmdb.org/t/p/w500'+str(row['poster_path'])
                title = str(row['original_title'])
                fullfilename = os.path.join(path, title+'.jpg') 
                print("{:20s} {:s}".format(title, image_path))
                #sys.stdout.flush()
                urllib.request.urlretrieve(image_path, fullfilename)
                count = 0
            count += 1
            i += 1
            if(i==num):
                break
    

def delete_poster_images(path):
    filelist = [f for f in os.listdir(path) if f.endswith('.jpg')]
    for f in log_progress(filelist, every=1, name="Images Deleted"):
        os.remove(os.path.join(path, f))

In [4]:
import json
import pandas as pd

def load_tmdb_movies(path):
    df = pd.read_csv(path)
    df['release_date'] = pd.to_datetime(df['release_date']).apply(lambda x: x.date())
    json_columns = ['genres', 'keywords', 'production_countries', 'production_companies', 'spoken_languages']
    for column in json_columns:
        df[column] = df[column].apply(json.loads)
    return df

In [5]:
#from IPython.lib import backgroundjobs as bg
#jobs = bg.BackgroundJobManager()
#jobs.new(fetch_urls, log_progress)
poster_images_folder_path = os.path.join(os.getcwd(), 'poster_images')
download_poster_images(poster_images_folder_path, 100)

VBox(children=(HTML(value=''), IntProgress(value=0)))

Toy Story            https://image.tmdb.org/t/p/w500/rhIRbceoE9lR4veEXuwCC2wARtG.jpg
Jumanji              https://image.tmdb.org/t/p/w500/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg
Grumpier Old Men     https://image.tmdb.org/t/p/w500/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg
Waiting to Exhale    https://image.tmdb.org/t/p/w500/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg
Father of the Bride Part II https://image.tmdb.org/t/p/w500/e64sOI48hQXyru7naBFyssKFxVd.jpg
Heat                 https://image.tmdb.org/t/p/w500/zMyfPUelumio3tiDKPffaUpsQTD.jpg
Sabrina              https://image.tmdb.org/t/p/w500/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg
Tom and Huck         https://image.tmdb.org/t/p/w500/sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg
Sudden Death         https://image.tmdb.org/t/p/w500/eoWvKD60lT95Ss1MYNgVExpo5iU.jpg
GoldenEye            https://image.tmdb.org/t/p/w500/5c0ovjT41KnYIHYuF4AWsTe3sKh.jpg
The American President https://image.tmdb.org/t/p/w500/lymPNGLZgPHuqM29rKMGV46ANij.jpg
Dracula: Dead and Loving It https://image.tmdb.org/t/p/w

Shopping             https://image.tmdb.org/t/p/w500/jdYL6f0KXnYBcnWnwGtnanJtKlE.jpg
Heidi Fleiss: Hollywood Madam https://image.tmdb.org/t/p/w500/j9k3UTx7OZ0xoCws1oACYQZFF5N.jpg
City Hall            https://image.tmdb.org/t/p/w500/ttdjD9NpxXsTJidPT3o1Gk0oTkG.jpg
Bottle Rocket        https://image.tmdb.org/t/p/w500/iuO10cRKrMfql5yc8YTgdfHt7gR.jpg


100 poster images ==> 100 training examples

**Genres included in the MovieLens dataset:**

    Action
    Adventure
    Animation
    Children's
    Comedy
    Crime
    Documentary
    Drama
    Fantasy
    Film-Noir
    Horror
    Musical
    Mystery
    Romance
    Sci-Fi
    Thriller
    War
    Western
    (no genres listed)
   
 19 total genres ==> 19 classes
 
 https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py
 
 https://keras.io/preprocessing/image/
 
__Example of using .flow_from_directory(directory):__

`train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)`

`test_datagen = ImageDataGenerator(rescale=1./255)`

`train_generator = train_datagen.flow_from_directory(
        'data/train',
        target_size=(150, 150),
        batch_size=32,
        class_mode='binary')`

`validation_generator = test_datagen.flow_from_directory(
        'data/validation',
        target_size=(150, 150),
        batch_size=32,
        class_mode='binary')`

`model.fit_generator(
        train_generator,
        steps_per_epoch=2000,
        epochs=50,
        validation_data=validation_generator,
        validation_steps=800)`

In [None]:
batch_size = 3
num_classes = 19  #19 genres
epochs = 10

In [6]:
import keras.preprocessing.image
from keras.models import Sequential
from keras.layers import Dense, Activation

img_rows, img_cols = 500, 750 #images are 500x750 px
#filelist = [f for f in poster_images_folder_path if f.endswith('.jpg')]

train_generator = train_datagen.flow_from_directory(
    poster_images_folder_path,
    target_size=(img_rows, img_cols),
    batch_size=batch_size,
    class_mode=binary)

Using TensorFlow backend.


NameError: name 'train_datagen' is not defined

In [None]:
delete_poster_images(poster_images_folder_path)