In [1]:
import tensorflow as tf
import torch
import numpy as np
import pandas as pd
import glob
import sys
import os
import joblib

np.random.seed(42)

# Where to get the data
PATH = os.path.join(os.getcwd(), "data_dir")

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

## 1. Get the data

In [2]:
def fetch_movies(path=PATH):
    """Load .pkl movie files"""
    filenames = glob.glob(os.path.join(PATH, "tt*.pkl"))
    movies = []
    for fn in filenames:
        movie = joblib.load(fn)
        movies.append(movie)
    return movies

In [3]:
def split_train_test(data, train_size=60):
    """Split data into train and test sets"""
    # For stable output across runs
    np.random.seed(42)
    # Shuffle indices
    shuffled_indices = np.random.permutation(len(data))
    train_indices = shuffled_indices[:train_size]
    test_indices = shuffled_indices[train_size:]
    train_set = [data[i] for i in train_indices]
    test_set = [data[i] for i in test_indices]
    return train_set, test_set

In [28]:
# Get the train and test data sets
movies = fetch_movies()

In [5]:
# Split movies into training and validation sets
movies_train, movies_val = split_train_test(movies)

In [6]:
# Longest movie length
max_movie_length = max([movie['place'].shape[0] for movie in movies])
max_movie_length

3096

## 2. Data processing

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def transform_movies(movies, features=['place', 'cast', 'action', 'audio'], pad_len=4000):
    """
    Unroll the given features by column and separate features from labels.
    Then pad the sequences in each movie to the length of the longest movie.
    """
    X, Y = [], []
    # Unroll the features
    for movie in movies: 
        row = torch.cat([movie[feat] for feat in features], dim=1)
        X.append(row.numpy())
        # Pre-pad the label since its length is N-1
        labels = movie['scene_transition_boundary_ground_truth']
        labels = torch.cat([torch.tensor([False]), labels])
        Y.append(labels.numpy())
    # Pad the sequences
    X_padded = pad_sequences(X, maxlen=pad_len, padding='post', dtype='float32')
    Y_padded = pad_sequences(Y, value=False, maxlen=pad_len, padding='post')
    return X_padded, Y_padded

In [8]:
# Transform training and validation sets
X_train, y_train = transform_movies(movies_train)
X_val, y_val = transform_movies(movies_val)

## 3. Build and train models

In [9]:
ACCURACY_THRESHOLD = 0.95

class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('accuracy') > ACCURACY_THRESHOLD):
            print("\nReached 95% accuracy, so cancelling training!")
            self.model.stop_training = True

In [11]:
INPUT_DIM = 2048 + 512 + 512 + 512
NUM_EPOCHS = 1

### 3.1 LSTM model

In [None]:
lstm_model = tf.keras.Sequential([
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=256,
                                                       input_shape=[INPUT_DIM],
                                                       return_sequences=True)),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='sigmoid'))
])

In [None]:
lstm_model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.RMSprop(lr=2e-5),
              metrics=['accuracy'])

In [None]:
%%time
lstm_model.fit(X_train, 
          y_train, 
          epochs=NUM_EPOCHS, 
          callbacks=[myCallback()],
          validation_data=(X_val, y_val),
          verbose=2)

### 3.2 WaveNet model

In [12]:
wave_model = tf.keras.models.Sequential()
wave_model.add(tf.keras.layers.InputLayer(input_shape=[None, INPUT_DIM]))
for rate in (1, 2, 4, 8) * 2:
    wave_model.add(tf.keras.layers.Conv1D(filters=20, kernel_size=2, padding="causal",
                                     activation="relu", dilation_rate=rate))
wave_model.add(tf.keras.layers.Conv1D(filters=10, kernel_size=1))
wave_model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

wave_model.compile(loss='binary_crossentropy',
              optimizer=tf.keras.optimizers.RMSprop(lr=2e-5),
              metrics=['accuracy'])

In [13]:
%%time
wave_model.fit(X_train, 
          y_train, 
          epochs=NUM_EPOCHS, 
          callbacks=[myCallback()],
          validation_data=(X_val, y_val),
          verbose=2)

2/2 - 38s - loss: 0.6719 - accuracy: 0.9674 - val_loss: 0.6718 - val_accuracy: 0.9629

Reached 95% accuracy, so cancelling training!
CPU times: user 35.4 s, sys: 22.9 s, total: 58.3 s
Wall time: 38.2 s


<tensorflow.python.keras.callbacks.History at 0x7fc4cbc675b0>

## 4. Model selection

### 4.1 Retrain WaveNet on entire dataset

In [14]:
# First, transform the entire dataset 
X, y = transform_movies(movies)

In [15]:
%%time
# Retrain on entire dataset
wave_model.fit(X, y, 
               epochs=NUM_EPOCHS, 
               callbacks=[myCallback()],
               validation_data=(X_val, y_val),
               verbose=2)

2/2 - 35s - loss: 0.6654 - accuracy: 0.9675 - val_loss: 0.6676 - val_accuracy: 0.9629

Reached 95% accuracy, so cancelling training!
CPU times: user 36.1 s, sys: 26.2 s, total: 1min 2s
Wall time: 35 s


<tensorflow.python.keras.callbacks.History at 0x7fc290e35be0>

## 4. Train and predict

In [16]:
def unpad_predictions(movies, yhat_probs):
    """Truncate the padded predictions to movie's original length"""
    imdb_lengths = [(movie['imdb_id'], movie['place'].shape[0]) for movie in movies]
    yhat_dict = dict()
    for (imdb, length), yhat in zip(imdb_lengths, yhat_probs):
        yhat = yhat[1:length]
        yhat_dict[imdb] = yhat
    return yhat_dict

In [17]:
# Predict on the entire dataset
yhat_probs = wave_model.predict(X)

In [18]:
yhat_unpadded_dict = unpad_predictions(movies, yhat_probs)

In [21]:
def write_predictions(yhat_unpadded_dict, path=PATH):
    for imdb in yhat_unpadded_dict.keys():
        # Load existing pkl movie file
        filename = os.path.join(PATH, imdb + ".pkl")
        x = joblib.load(filename)
        # Update scen transition boundary prediction
        x['scene_transition_boundary_prediction'] = yhat_unpadded_dict[imdb].flatten()
        # Write pkl file
        joblib.dump(x, filename)

In [22]:
write_predictions(yhat_unpadded_dict)

In [24]:
filenames = glob.glob(os.path.join(PATH, "tt*.pkl"))