In [1]:
import tensorflow as tf
from tensorflow import keras
import torch
import numpy as np
import pandas as pd
import glob
import sys
import os
import joblib
import pickle

np.random.seed(42)

# Where to get the data
PATH = os.path.join(os.getcwd(), "data_dir")

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

## 1. Get the data

In [2]:
def fetch_movies(path=PATH):
    """
    Load the pickled movie files found in a directory.

    Argument:
    ---------
    path -- string representing the directory that holds the "tt*.pkl" files

    Return:
    -------
    movies -- a list with one unpickled object per readable file, in sorted
              filename order
    """
    # Use the `path` argument (not the global PATH) and sort the matches so
    # the movie order -- and anything seeded on top of it -- is reproducible.
    filenames = sorted(glob.glob(os.path.join(path, "tt*.pkl")))
    movies = []
    for fn in filenames:
        try:
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(fn, 'rb') as fin:
                movies.append(pickle.load(fin))
        except EOFError:
            # A truncated/empty file must not hide the remaining movies:
            # report it and keep loading instead of aborting the whole loop.
            print("Skipping truncated pickle file: {}".format(fn), file=sys.stderr)
            continue
    return movies

In [3]:
movies = fetch_movies()

In [4]:
def split_train_test(data, train_size=60):
    """
    Shuffle the movies deterministically and cut them into two lists.

    Argument:
    --------
    data -- a list of dictionaries each containing a movie information
    train_size -- integer representing the number of movies used for training

    Return:
    -------
    A (train_set, test_set) tuple of lists; the first `train_size` shuffled
    movies go to the train set and the remainder to the test set.
    """
    # Re-seed the global generator so every call yields the same shuffle
    np.random.seed(42)
    order = np.random.permutation(len(data))
    train_set = [data[idx] for idx in order[:train_size]]
    test_set = [data[idx] for idx in order[train_size:]]
    return train_set, test_set

In [5]:
# Load all movies (no train/test split is performed in this cell)
movies = fetch_movies()

In [None]:
# Movie length: number of rows of each movie's 'place' tensor.
# Printed to sanity-check the range of lengths across the loaded movies.
movie_lengths = [movie['place'].shape[0] for movie in movies]
print("Max movie length: {}".format(max(movie_lengths)))
print("Min movie length: {}".format(min(movie_lengths)))

In [6]:
# Number of features per time step fed to the models (used as the last
# dimension of the model input shape).
# Presumably place=2048 and cast/action/audio=512 each -- TODO confirm.
FEATURES_DIM = 2048 + 512 + 512 + 512
# Default `pad_len` of transform_movies: time steps every movie is padded to
MAX_MOVIE_LENGTH = 4000
# Epoch budget passed to the hyperparameter search fit
NUM_EPOCHS = 50

In [None]:
# Split movies into training and validation sets
# movies_train, movies_val = split_train_test(movies)

## 2. Data processing

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def transform_movies(movies, features=['place', 'cast', 'action', 'audio'], pad_len=MAX_MOVIE_LENGTH):
    """
    Concatenate the requested feature tensors of every movie column-wise and
    separate features from labels. Every movie is then post-padded (or, if it
    is longer than `pad_len`, cut at the end) to exactly `pad_len` time steps.

    Arguments:
    ----------
    movies -- a list of dictionaries each containing a movie information
    features -- list of strings naming the feature tensors to concatenate
    pad_len -- integer, number of time steps every movie is padded/truncated to

    Return:
    -------
    X_padded -- a float32 numpy array with one padded feature matrix per movie
                (movies x pad_len x concatenated feature width)
    Y_padded -- a numpy array with one padded label row per movie
                (movies x pad_len), in pad_sequences' default dtype
    """
    X, Y = [], []
    # Unroll the features
    for movie in movies:
        row = torch.cat([movie[feat] for feat in features], dim=1)
        X.append(row.numpy())
        # Pre-pad the label since its length is N-1
        labels = movie['scene_transition_boundary_ground_truth']
        labels = torch.cat([torch.tensor([False]), labels])
        Y.append(labels.numpy())
    # Pad the sequences.
    # pad_sequences truncates from the FRONT by default (truncating='pre'),
    # which would drop the first shots of an over-long movie and shift every
    # remaining time step. unpad_predictions slices from index 1, so keep the
    # beginning of the movie and cut the tail instead.
    X_padded = pad_sequences(X, maxlen=pad_len, padding='post',
                             truncating='post', dtype='float32')
    Y_padded = pad_sequences(Y, value=False, maxlen=pad_len, padding='post',
                             truncating='post')
    return X_padded, Y_padded

In [None]:
# # Transform training and validation sets
# X_train, y_train = transform_movies(movies_train)
# X_val, y_val = transform_movies(movies_val)

## 3. Build and train models

In [8]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training early once the epoch's 'accuracy' log exceeds 95%."""
    def on_epoch_end(self, epoch, logs=None):
        """
        Request a training stop when the logged accuracy is above 0.95.

        Arguments:
        ----------
        epoch -- integer index of the epoch that just finished (unused)
        logs -- dictionary of metric results for the epoch, or None
        """
        # `logs` may be None, and 'accuracy' is absent when the model is
        # compiled without that metric; comparing None > 0.95 raises TypeError.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > 0.95:
            print("\nReached 95% accuracy, so cancelling training!")
            self.model.stop_training = True

In [9]:
def unpad_predictions(movies, yhat_probs):
    """
    Cut each padded prediction row back to the span of the movie's labels.

    Arguments:
    ----------
    movies -- a list of dictionaries containing movies information
    yhat_probs -- a numpy array holding one padded prediction row per movie,
                  in the same order as `movies`

    Return:
    -------
    yhat_dict -- a dictionary with each movie imdb_id as key and the
                 prediction probabilities for that movie as value
    """
    yhat_dict = {}
    for movie, padded_row in zip(movies, yhat_probs):
        n_steps = movie['place'].shape[0]
        # Skip index 0 and stop at the movie's own length, dropping the padding
        yhat_dict[movie['imdb_id']] = padded_row[1:n_steps]
    return yhat_dict

In [10]:
def write_predictions(yhat_unpadded_dict, path=PATH):
    """
    Store each movie's predictions inside its existing pickle file.

    For every imdb id the file "<path>/<imdb id>.pkl" is loaded, the flattened
    predictions are stored under 'scene_transition_boundary_prediction', and
    the file is written back. A movie whose file cannot be updated is reported
    on stderr and skipped; the remaining movies are still processed.

    Arguments:
    ----------
    yhat_unpadded_dict -- a dictionary of prediction consistent with the length of the ground-truth label
    path -- a string representing the files path
    """
    for imdb, yhat in yhat_unpadded_dict.items():
        # Use the `path` argument (not the global PATH) to locate the movie file
        filename = os.path.join(path, imdb + ".pkl")
        try:
            # NOTE: pickle.load can execute arbitrary code; only load trusted files.
            with open(filename, "rb") as fin:
                movie = pickle.load(fin)
            # Build the new entry before reopening the file for writing, so a
            # failure here cannot leave a truncated pickle behind.
            movie['scene_transition_boundary_prediction'] = yhat.flatten()
            with open(filename, "wb") as fout:
                pickle.dump(movie, fout)
        except Exception as err:
            # Stay best-effort, but report the failure and keep going rather
            # than silently abandoning every remaining movie.
            print("Could not write predictions for {}: {}".format(imdb, err),
                  file=sys.stderr)

In [11]:
# First, transform the entire dataset 
X, y = transform_movies(movies)

### 3.1 LSTM model

In [None]:
# def build_lstm(n_neurons=32, input_shape=[FEATURES_DIM]):
#     model = tf.keras.Sequential([
#             tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=n_neurons,
#                                                                input_shape=input_shape,
#                                                                return_sequences=True)),
#             tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='sigmoid'))])
#     model.compile(loss='binary_crossentropy',
#               optimizer=tf.keras.optimizers.RMSprop(lr=2e-5),
#               metrics=['accuracy'])
#     return model

In [None]:
# # Fit classifier
# lstm_clf = build_lstm()
# lstm_clf.fit(X, y, epochs=NUM_EPOCHS, callbacks=[myCallback()], verbose=2)

In [None]:
# %%time
# # Predict
# lstm_yhat = lstm_clf.predict(X)
# lstm_yhat_unpadded = unpad_predictions(movies, lstm_yhat)

In [None]:
# # Pickle predictions
# write_predictions(lstm_yhat_unpadded)

### 3.2 WaveNet model

In [44]:
def build_wave_net(input_shape=[None, FEATURES_DIM], num_blocks=2, num_layers=3,
                   filters1=20, filters2=10, kern1=2, kern2=1, padding='same'):
    """
    Build and compile a WaveNet-style stack of dilated 1D convolutions.

    Arguments:
    ----------
    input_shape -- shape of one sample passed to the input layer
    num_blocks -- number of times the dilation schedule is repeated
    num_layers -- number of dilated layers per block; the dilation rates are
                  1, 2, 4, ..., 2**(num_layers - 1)
    filters1 -- number of filters of every dilated convolution
    filters2 -- number of filters of the final convolution
    kern1 -- kernel size of the dilated convolutions
    kern2 -- kernel size of the final convolution
    padding -- padding mode of the dilated convolutions

    Return:
    -------
    wave_model -- a Sequential model ending in a Dense(1, sigmoid) layer,
                  compiled with binary cross-entropy, RMSprop and accuracy
    """
    rates = [2**i for i in range(num_layers)]
    wave_model = tf.keras.models.Sequential()
    wave_model.add(tf.keras.layers.InputLayer(input_shape=input_shape))
    for rate in rates * num_blocks:
        # Pass the caller's `padding` through; it used to be hard-coded to
        # 'same', so the argument was silently ignored.
        wave_model.add(tf.keras.layers.Conv1D(filters=filters1,
                                              kernel_size=kern1,
                                              padding=padding,
                                              activation='relu', dilation_rate=rate))
    wave_model.add(tf.keras.layers.Conv1D(filters=filters2, kernel_size=kern2))
    wave_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    # `lr` is the deprecated spelling of this optimizer argument
    wave_model.compile(loss='binary_crossentropy',
                  optimizer=tf.keras.optimizers.RMSprop(learning_rate=2e-5),
                  metrics=['accuracy'])
    return wave_model

In [45]:
wave_clf = build_wave_net()

In [46]:
%%time
wave_clf.fit(X, y, epochs=5, verbose=2)

Epoch 1/5
2/2 - 46s - loss: 0.6757 - accuracy: 0.8971
Epoch 2/5
2/2 - 39s - loss: 0.6616 - accuracy: 0.9399
Epoch 3/5
2/2 - 55s - loss: 0.6483 - accuracy: 0.9569
Epoch 4/5
2/2 - 23s - loss: 0.6362 - accuracy: 0.9638
Epoch 5/5
2/2 - 18s - loss: 0.6258 - accuracy: 0.9665
CPU times: user 2min 39s, sys: 1min 43s, total: 4min 23s
Wall time: 3min 2s


<tensorflow.python.keras.callbacks.History at 0x7fd9e7ac7070>

In [41]:
wave_hat = wave_clf.predict(X)

In [42]:
wave_hat_unpadded = unpad_predictions(movies, wave_hat)

In [43]:
write_predictions(wave_hat_unpadded)

## 4. Hyperparameter tuning

In [53]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import RandomizedSearchCV

# Wrap the model builder so it can be used as a scikit-learn estimator
keras_clf = KerasClassifier(build_wave_net)

# Candidate values for the search; the keys are keyword arguments of
# build_wave_net.
params_distribs = {
    "num_blocks": [2, 3],
    "num_layers": np.arange(4, 11),
    "filters1": np.arange(10, 21),
    "filters2": np.arange(2, 11)
}

# 5 sampled parameter combinations, each evaluated with 3-fold cross-validation.
# NOTE(review): no random_state is passed, so the sampled combinations are not
# pinned by this cell -- confirm whether reproducibility matters here.
rnd_search_cv = RandomizedSearchCV(keras_clf, params_distribs, n_iter=5, cv=3)

In [None]:
%%time
# Run the randomized search; each fit stops early through myCallback
rnd_search_cv.fit(X, y, epochs=NUM_EPOCHS, callbacks=[myCallback()])
# The fitted search object is `rnd_search_cv`; `rnd_search` was never defined
# and raised NameError after the (long) search had finished.
print("Best score: {}".format(rnd_search_cv.best_score_))
print("Parameters:")
for param, value in rnd_search_cv.best_params_.items():
    print("\t{}: {}".format(param, value))

Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50
Epoch 3/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50

Reached 95% accuracy, so cancelling training!


Epoch 1/50
Epoch 2/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50

Reached 95% accuracy, so cancelling training!
Epoch 1/50
Epoch 2/50
Epoch 3/50


## 5. Model selection and performance