# Extract Poster Feature Vectors

This notebook details the steps needed to extract pretrained ResNet50 (V2) feature vectors for the movie poster thumbnails stored in the `../data/posters/thumbnails` directory. That directory is expected to contain only the thumbnail image files, with no subfolders or other files.

In [1]:
import sys
sys.path.append("..")
import os
import pandas as pd
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
tf.compat.v1.enable_eager_execution()

## Import movieids from thumbnail folder

In [2]:
# Every entry of the thumbnails folder is taken as a poster file name; the file
# name (e.g. "tt1467265.jpg") is what the rest of the notebook calls `movieid`.
thumbnail_names = os.listdir("../data/posters/thumbnails")
df_posters = pd.DataFrame({"movieid": thumbnail_names})
df_posters.head()

Unnamed: 0,movieid
0,tt1467265.jpg
1,tt6396074.jpg
2,tt2013243.jpg
3,tt0119103.jpg
4,tt7218564.jpg


## Downselect to movieids with valid poster images

In [3]:
# Load the list of posters to keep; its `movieid` column holds poster file names.
valid_posters_csv = '../data/posters/ValidPosters.csv'
df_valid_posters = pd.read_csv(valid_posters_csv)
df_valid_posters.head()

Unnamed: 0,movieid
0,tt1062961.jpg
1,tt3885736.jpg
2,tt0027902.jpg
3,tt5066056.jpg
4,tt0052306.jpg


In [4]:
# Keep only the valid posters whose thumbnail file is actually present on disk.
# An inner join is needed for that: a left join returns every row of
# df_valid_posters regardless of df_posters (which adds no extra columns), so
# posters missing from the thumbnails folder would survive and only fail later,
# when their file is read.
df_process = df_valid_posters.merge(df_posters, on='movieid', how='inner')
df_process.head()

Unnamed: 0,movieid
0,tt1062961.jpg
1,tt3885736.jpg
2,tt0027902.jpg
3,tt5066056.jpg
4,tt0052306.jpg


In [7]:
# Write the current contents of df_process to a CSV in the working directory
# (default to_csv settings, so the index is written as the first column).
df_process.to_csv(path_or_buf='Posters_movie_ids.csv')

In [5]:
# Build the path of each thumbnail by prefixing its file name with the folder.
thumbnail_dir = '../data/posters/thumbnails/'
df_process['file_loc'] = thumbnail_dir + df_process['movieid']
df_process.head()

Unnamed: 0,movieid,file_loc
0,tt1062961.jpg,/Users/krsrik/Documents/Projects/W266/w266-Kar...
1,tt3885736.jpg,/Users/krsrik/Documents/Projects/W266/w266-Kar...
2,tt0027902.jpg,/Users/krsrik/Documents/Projects/W266/w266-Kar...
3,tt5066056.jpg,/Users/krsrik/Documents/Projects/W266/w266-Kar...
4,tt0052306.jpg,/Users/krsrik/Documents/Projects/W266/w266-Kar...


## Setup pretrained ResNet Model

In [7]:
# Wrap the TF Hub ResNet-50 v2 feature-vector module in a Keras model that takes
# a batch of 224x224 RGB images and outputs one feature vector per image.
# The hub layer is frozen (trainable=False): it is used as a fixed extractor.
resnet_layer = hub.KerasLayer(
    "https://tfhub.dev/google/imagenet/resnet_v2_50/feature_vector/4",
    trainable=False,
)
inputs = tf.keras.layers.Input(shape=(224, 224, 3))
feature_vector = resnet_layer(inputs)

model = tf.keras.Model(inputs=inputs, outputs=feature_vector)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
keras_layer (KerasLayer)     (None, 2048)              23564800  
Total params: 23,564,800
Trainable params: 0
Non-trainable params: 23,564,800
_________________________________________________________________


## Resize and Normalize Images

In [8]:
IMG_SIZE = 224 # Specify height and width of image to match the input format of the model
CHANNELS = 3 # Keep RGB color channels to match the input format of the model
def parse_function(filename, label):
    """Load one JPEG file and turn it into a fixed-size, normalized image.

    Args:
        filename: string (or scalar string tensor) with the path to the image
        label: value associated with the image; it is returned unchanged
            (in this notebook the poster file name is passed as the label)

    Returns:
        Tuple ``(image, label)``. ``image`` is the decoded picture resized to
        IMG_SIZE x IMG_SIZE with CHANNELS channels and divided by 255, or
        ``None`` if a TensorFlow op failed while reading/decoding the file.
    """
    try:
        # Read the raw bytes of the file
        image_string = tf.io.read_file(filename)
        # Decode the JPEG bytes into a dense tensor with CHANNELS channels
        image_decoded = tf.image.decode_jpeg(image_string, channels=CHANNELS)
        # Resize to the fixed shape expected by the model
        image_resized = tf.image.resize(image_decoded, [IMG_SIZE, IMG_SIZE])
        # Normalize from [0, 255] to [0.0, 1.0]
        image_normalized = image_resized / 255.0
    except tf.errors.OpError:
        # Only TensorFlow op failures (e.g. missing file, undecodable JPEG) are
        # mapped to "no image". A bare `except:` would also swallow
        # KeyboardInterrupt and genuine programming errors.
        # NOTE(review): when this function is handed to Dataset.map it is traced
        # into a graph, so read/decode failures surface while the dataset is
        # iterated rather than here -- confirm whether unreadable files need to
        # be filtered out before the dataset is built.
        image_normalized = None
    return image_normalized, label

## Batch Images for TensorFlow

In [9]:
# Number of images grouped into each batch of the dataset.
BATCH_SIZE = 1024
# Number of elements held in the buffer when the dataset is shuffled.
SHUFFLE_BUFFER_SIZE = 1024
# Let tf.data tune preprocessing parallelism and prefetching dynamically.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [10]:
def create_dataset(filenames, labels, is_training=True):
    """Build a batched, prefetched tf.data pipeline of (image, label) pairs.

    Args:
        filenames: list of image paths
        labels: sequence with one entry per filename; each entry is handed to
            ``parse_function`` together with its image and returned unchanged
        is_training: when truthy, the parsed images are cached in memory and
            shuffled with a buffer of SHUFFLE_BUFFER_SIZE elements, so the
            output order no longer follows the order of ``filenames``

    Returns:
        A ``tf.data.Dataset`` yielding batches of up to BATCH_SIZE
        ``(image, label)`` elements.
    """

    # Pair every file path with its label
    dataset = tf.data.Dataset.from_tensor_slices((filenames, labels))
    # Parse and preprocess observations in parallel
    dataset = dataset.map(parse_function, num_parallel_calls=AUTOTUNE)

    # Plain truthiness test instead of `== True`, so any truthy flag enables
    # the training branch.
    if is_training:
        # Keep the parsed images in memory after the first pass. This holds
        # every decoded image, so it is only suitable for small datasets.
        dataset = dataset.cache()
        # Shuffle the data within a sliding buffer
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE)

    # Group the elements into batches
    dataset = dataset.batch(BATCH_SIZE)
    # Prepare upcoming batches in the background while the current one is used
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)

    return dataset

In [11]:
# Feature extraction is a single inference pass, so the pipeline is built with
# is_training=False. That skips the shuffle, so the feature rows come out in
# df_process order, and it skips the in-memory cache of every decoded image,
# which is far too large for the full poster set.
filenames = df_process.file_loc.tolist()
labels = df_process.movieid.tolist()
dataset = create_dataset(filenames, labels, is_training=False)

## Generate Feature Vectors for each batch

In [38]:
import time

# Push every batch through the model, collecting the feature vectors and the
# labels that travelled with them.
NUM_BATCHES = int(np.ceil(df_process.shape[0]/BATCH_SIZE))
print(NUM_BATCHES)
batch_features = []  # one 2-D array per batch; joined once after the loop
label_vector = []
for ctr, (f, l) in enumerate(dataset.take(NUM_BATCHES), start=1):
    start_time = time.perf_counter()
    batch_features.append(model(f).numpy())
    # Labels come from the same dataset element as the images, so
    # label_vector[i] describes row i of f_vectors whatever the dataset order.
    label_vector.extend(l.numpy())
    duration = time.perf_counter() - start_time
    print("Batch #" + str(ctr) + " completed in " + str(duration) + " seconds...")

# A single concatenate replaces np.append inside the loop, which re-copied the
# whole accumulated array on every batch. An empty dataset still yields the
# empty array the original code started from.
f_vectors = np.concatenate(batch_features, axis=0) if batch_features else np.array([])


80
Batch #1 completed in 99.71660876274109 seconds...
Batch #2 completed in 93.30944585800171 seconds...
Batch #3 completed in 93.17532110214233 seconds...
Batch #4 completed in 95.90331816673279 seconds...
Batch #5 completed in 87.30925512313843 seconds...
Batch #6 completed in 98.59444499015808 seconds...
Batch #7 completed in 107.37575387954712 seconds...
Batch #8 completed in 99.47844314575195 seconds...
Batch #9 completed in 97.47757983207703 seconds...
Batch #10 completed in 105.36129307746887 seconds...
Batch #11 completed in 110.07495784759521 seconds...
Batch #12 completed in 107.26291298866272 seconds...
Batch #13 completed in 99.03047275543213 seconds...
Batch #14 completed in 112.88965392112732 seconds...
Batch #15 completed in 122.93072271347046 seconds...
Batch #16 completed in 104.8819739818573 seconds...
Batch #17 completed in 100.13288593292236 seconds...
Batch #18 completed in 109.38337516784668 seconds...
Batch #19 completed in 121.76791787147522 seconds...
Batch #20

## Save feature vectors to file

In [39]:
print(f_vectors.shape)

# The ids must be written in the order the feature rows were produced.
# label_vector was filled batch by batch alongside f_vectors, so it is the only
# list guaranteed to line up row for row; df_process.movieid is in the original
# (pre-shuffle) order and does not match when the dataset is shuffled.
tracking_ids = [
    lbl.decode('utf-8') if isinstance(lbl, bytes) else str(lbl)
    for lbl in label_vector
]
assert len(tracking_ids) == f_vectors.shape[0], \
    "number of ids does not match number of feature vectors"

# NOTE: np.save appends ".npy" when the file name does not already end with it,
# so the array lands in '../data/image_feature_vectors.data.npy'.
np.save('../data/image_feature_vectors.data', f_vectors)
pd.Series(tracking_ids).to_csv('../data/image_tracking_id.data', header=False, index=False)

(81124, 2048)
