<a href="https://colab.research.google.com/github/howarudo/movie_genre_prediction/blob/master/notebooks/howard_stacking_models_acc_0.47.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up and Load Raw Data

In [None]:
!pip install -q scikit-multilearn transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np

## Load Zipped Raw Data and Preprocessed Image Data From Google Drive

In [None]:
# only run this once!
# from google.colab import drive

# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import zipfile
import io

# Relative path to the data archive inside the mounted Google Drive folder.
ZIP_FILE_PATH = "drive/MyDrive/personal_projects/movie_genre_prediction/clean_data.zip"

# The context manager closes the archive handle as soon as extraction finishes
# (the handle was previously left open for the lifetime of the kernel).
# extractall() with no arguments unpacks into the current working directory.
with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zf:
    zf.extractall()

In [None]:
# Read the cleaned metadata table and discard its 'Unnamed: 0' column.
big_data_df = pd.read_csv('raw_data/clean_data/clean_big_data.csv').drop('Unnamed: 0', axis='columns')
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [None]:
# Load the image array stored next to the CSV; the cell displays its shape.
image_array = np.load('raw_data/clean_data/clean_image_array.npy')
image_array.shape

(23140, 256, 256, 3)

In [None]:
# Sanity check: the metadata table and the image array must have one entry per movie each
assert len(big_data_df) == len(image_array)

## Preprocess Genres and Split into Train, Test, and Validation Datasets

In [None]:
def preprocess_genre(df: pd.DataFrame):
    """Turn the string-encoded ``genre`` column into lists of stripped genre names.

    Each cell is expected to hold the textual form of a Python list, e.g.
    ``"['Adventure', 'Comedy']"``. Cells that are not strings (for instance
    because this function already ran on the frame) are treated as already
    parsed and are only stripped again, so re-running the cell is safe.

    Args:
        df: frame with a ``genre`` column. The column is overwritten in place.

    Returns:
        The same ``df`` object, with ``genre`` holding ``list[str]`` values.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    def _parse_genres(raw):
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code (unlike the built-in eval).
        genres = ast.literal_eval(raw) if isinstance(raw, str) else raw
        return [genre.strip() for genre in genres]

    df["genre"] = df["genre"].apply(_parse_genres)
    return df

In [None]:
# preprocess_genre writes the parsed lists back onto the "genre" column of the
# frame it is given and returns that frame.
big_data_df = preprocess_genre(big_data_df)
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the genre vocabulary from the per-movie genre lists and encode every
# movie as a row of 0/1 indicators in a single step.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(big_data_df['genre'])
genre_names = multilabel_binarizer.classes_

# Mirror each indicator column onto the dataframe under its genre name
for column_position, genre_name in enumerate(genre_names):
    big_data_df[f"{genre_name}"] = y[:, column_position]

print(y.shape, big_data_df.shape)

(23140, 22) (23140, 25)


In [None]:
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,...,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df: pd.DataFrame, labels: np.ndarray, image_array: np.ndarray, test_size=0.5):
    """Split a dataframe, its label matrix and its image array with the same row partition.

    The partition is computed by ``iterative_train_test_split`` on a column of
    row positions together with ``labels``, so the label distribution is taken
    into account. The resulting positions are then used to slice ``df`` and
    ``image_array`` consistently.

    Args:
        df: one row per sample.
        labels: label matrix with one row per sample.
        image_array: array whose first axis indexes the samples.
        test_size: forwarded to ``iterative_train_test_split``.

    Returns:
        ``(train_df, train_image_array, y_train, test_df, test_image_array, y_test)``.

    Raises:
        ValueError: if ``df``, ``labels`` and ``image_array`` do not all have
            the same number of rows.
    """
    n_rows = len(df)
    # Rows are matched purely by position, so a length mismatch would silently
    # pair samples with the wrong labels/images (or fail deep inside np.take).
    if len(labels) != n_rows or len(image_array) != n_rows:
        raise ValueError(
            "df, labels and image_array must have the same number of rows, got "
            f"{n_rows}, {len(labels)} and {len(image_array)}"
        )

    # The splitter expects a 2-D feature matrix; a single column of row
    # positions lets us recover which rows ended up on which side.
    index_array = np.expand_dims(np.arange(n_rows), axis=1)
    train_index, y_train, test_index, y_test = iterative_train_test_split(index_array, labels, test_size)

    train_positions = train_index.ravel()
    test_positions = test_index.ravel()
    train_image_array = np.take(image_array, train_positions, axis=0)
    test_image_array = np.take(image_array, test_positions, axis=0)
    return df.iloc[train_positions], train_image_array, y_train, df.iloc[test_positions], test_image_array, y_test

In [None]:
# First cut: keep a training set and hold out a combined test+validation pool
# (test_size=0.3); the pool is split once more in a later cell.
(
    train_df,
    train_image_array,
    y_train,
    test_val_df,
    test_val_image_array,
    y_test_val,
) = balanced_split(big_data_df, y, image_array, 0.3)

# Print the shape of every piece on a single line
print(*(
    part.shape
    for part in (train_df, y_train, train_image_array, test_val_df, test_val_image_array, y_test_val)
))

(16275, 25) (16275, 22) (16275, 256, 256, 3) (6865, 25) (6865, 256, 256, 3) (6865, 22)


In [None]:
# Second cut: divide the held-out pool into test and validation sets using
# balanced_split's default test_size.
(
    test_df,
    test_image_array,
    y_test,
    val_df,
    val_image_array,
    y_val,
) = balanced_split(test_val_df, y_test_val, test_val_image_array)

# Print the shape of every piece on a single line
print(*(
    part.shape
    for part in (test_df, test_image_array, y_test, val_df, val_image_array, y_val)
))

(3407, 25) (3407, 256, 256, 3) (3407, 22) (3458, 25) (3458, 256, 256, 3) (3458, 22)


**RECAP**

Let's recap what we have up to this point!
We now have balanced train, test, and validation datasets, each consisting of:
1. Dataframes with imdb_id and plot (train_df, test_df, val_df)
2. Preprocessed image_arrays (train_image_array, test_image_array and val_image_array)
3. Labels! (y_train, y_test, y_val)

# Text Preprocessing

In [None]:
from transformers import AutoTokenizer

def tokenize_encode_multimodal(df, tokenizer=None, max_length=128, model_name="bert-base-uncased"):
    """Tokenize the ``plot`` column of ``df`` and return the token-id matrix.

    Args:
        df: dataframe with a ``plot`` column of text.
        tokenizer: optional, already-loaded tokenizer (any callable with the
            Hugging Face tokenizer call signature). Passing one avoids
            reloading the tokenizer for every dataset split. When ``None``,
            the tokenizer for ``model_name`` is loaded.
        max_length: maximum number of tokens per text; longer texts are truncated.
        model_name: checkpoint to load the tokenizer from when ``tokenizer`` is ``None``.

    Returns:
        The ``input_ids`` entry of the tokenizer output, requested as NumPy
        (``return_tensors="np"``). Only the ids are returned; any other
        entries of the tokenizer output (e.g. an attention mask) are dropped.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    text = df['plot'].to_list()
    # padding=True pads within this call only, so separate splits are not
    # guaranteed to share the same sequence length.
    encodings = tokenizer(text, truncation=True, padding=True, max_length=max_length, return_tensors="np")
    return encodings['input_ids']

In [None]:
# Token-id matrices for the train, test and validation plots (in that order)
X_train_text, X_test_text, X_val_text = (
    tokenize_encode_multimodal(split_df) for split_df in (train_df, test_df, val_df)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Image inputs under the X_* naming used for the text inputs (same arrays, no copy)
X_train_img, X_test_img, X_val_img = train_image_array, test_image_array, val_image_array

# RESNET 50

In [None]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

In [None]:
def load_model():
    """Return ResNet50 with ImageNet weights, no classification top, for 256x256x3 inputs."""
    return ResNet50(include_top=False, weights="imagenet", input_shape=(256, 256, 3))


In [None]:
def set_nontrainable_layers(model):
    """Switch ``model.trainable`` off and hand the very same object back."""
    setattr(model, "trainable", False)
    return model

In [None]:
def add_last_layers(model):
    """Freeze a pre-trained base model and stack trainable feature layers on top.

    Args:
        model: the pre-trained base model to build on. It is marked as
            non-trainable here. If ``None`` is passed, a fresh base is
            obtained from ``load_model()``.

    Returns:
        A ``Sequential`` model: base -> Flatten -> Dense(500, relu) ->
        Dense(512, relu). The output is a 512-dimensional feature vector,
        not class predictions.
    """
    # Use the model the caller supplied instead of loading a second copy of
    # the pre-trained weights (the argument used to be ignored).
    base_model = model if model is not None else load_model()
    base_model = set_nontrainable_layers(base_model)

    return Sequential([
        base_model,
        Flatten(),
        Dense(500, activation='relu'),
        Dense(512, activation='relu'),
    ])

# BERT Model

In [None]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input

In [None]:
# BERT-based feature extractor for the text branch
def build_text_model():
    """Wrap pre-trained ``bert-base-uncased`` as a Keras model mapping token ids to features.

    The model takes a variable-length int32 sequence named ``input_word_ids``
    and outputs element 1 of the BERT call result.
    """
    encoder = TFBertModel.from_pretrained('bert-base-uncased')
    token_ids = Input(shape=(None,), dtype=tf.int32, name='input_word_ids')
    encoder_outputs = encoder(token_ids)
    return Model(inputs=token_ids, outputs=encoder_outputs[1])

# Multimodal

In [None]:
from tensorflow.keras.layers import concatenate
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Two-branch (image + text) multi-label classifier
def build_multimodal_model(num_classes):
    """Assemble the image and text branches into one classification model.

    Args:
        num_classes: number of sigmoid output units.

    Returns:
        A Keras ``Model`` with inputs ``[img_input, text_input]`` whose output
        has ``num_classes`` sigmoid activations.
    """
    # Image branch: frozen ResNet50 handed to add_last_layers
    frozen_resnet = set_nontrainable_layers(load_model())
    img_model = add_last_layers(frozen_resnet)

    # Text branch
    text_model = build_text_model()

    img_input = Input(shape=(256, 256, 3), name='img_input')
    text_input = Input(shape=(None,), dtype=tf.int32, name='text_input')

    # Fuse both feature vectors, then classify
    fused = concatenate([img_model(img_input), text_model(text_input)])
    hidden = Dense(512, activation='relu')(fused)
    probabilities = Dense(num_classes, activation='sigmoid')(hidden)

    return Model(inputs=[img_input, text_input], outputs=probabilities)

In [None]:
# One output unit per label column of y_train.
num_classes = y_train.shape[1]
multimodal_model = build_multimodal_model(num_classes)
multimodal_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 img_input (InputLayer)         [(None, 256, 256, 3  0           []                               
                                )]                                                                
                                                                                                  
 text_input (InputLayer)        [(None, None)]       0           []                               
                                                                                                  
 sequential (Sequential)        (None, 512)          89380724    ['img_input[0][0]']              
                                                                                                  
 model (Functional)             (None, 768)          109482240   ['text_input[0][0]']       

In [None]:
# Adam from the legacy optimizer namespace, constructed with its default hyperparameters.
# NOTE(review): build_text_model does not freeze the BERT weights, so the default
# learning rate also applies to them — confirm that is intended.
legacy_adam = tf.keras.optimizers.legacy.Adam()

In [None]:
# Training configuration
EPOCHS = 5
BATCH_SIZE = 16
# Patience must be smaller than EPOCHS, otherwise early stopping can never
# trigger (it was 10 with only 5 epochs).
EARLY_STOPPING_PATIENCE = 2

# compile the model and train on the train set
multimodal_model.compile(
    optimizer=legacy_adam,
    loss='binary_crossentropy',
    metrics=['accuracy', 'AUC', 'binary_accuracy', 'categorical_accuracy'],
)

# Stop once val_accuracy has not improved for EARLY_STOPPING_PATIENCE epochs
# and roll back to the best weights seen.
es = EarlyStopping(monitor='val_accuracy',
                   mode='max',
                   patience=EARLY_STOPPING_PATIENCE,
                   verbose=1,
                   restore_best_weights=True)

# Inputs are passed as a flat list in the order of the model's inputs
# (img_input, text_input).
history = multimodal_model.fit(
    [X_train_img, X_train_text],
    tf.convert_to_tensor(y_train),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=([X_val_img, X_val_text], tf.convert_to_tensor(y_val)),
    callbacks=[es],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
