# Set Up and Load Raw Data

In [1]:
!pip install -q scikit-multilearn transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np

## Load Zipped Raw Data and Preprocessed Image Data From Google Drive

In [3]:
# only run this once!
# from google.colab import drive

# drive.mount('/content/drive')

In [4]:
import zipfile
import io

ZIP_FILE_PATH = "drive/MyDrive/personal_projects/movie_genre_prediction/clean_data.zip"
# Extract the archive into the current working directory. The context manager
# closes the archive handle even if extraction fails part-way.
with zipfile.ZipFile(ZIP_FILE_PATH, "r") as zf:
    zf.extractall()

In [5]:
# Load the cleaned metadata table and drop the 'Unnamed: 0' column
# (presumably an index saved into the CSV — confirm), then preview one row.
big_data_df = pd.read_csv('raw_data/clean_data/clean_big_data.csv').drop(columns=['Unnamed: 0'])
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [6]:
# Load the preprocessed image array from the extracted archive and show its shape.
image_array = np.load('raw_data/clean_data/clean_image_array.npy')
image_array.shape

(23140, 256, 256, 3)

In [7]:
# Double check that image_array and big_data_df have the same number of rows:
# the two are later sliced with the same row positions, so they must line up.
assert big_data_df.shape[0] == image_array.shape[0], (
    f"Row count mismatch: {big_data_df.shape[0]} metadata rows vs "
    f"{image_array.shape[0]} images"
)

## Preprocess Genres and Split to Train, Test, Validation Datasets

In [8]:
def preprocess_genre(df: pd.DataFrame):
    """Parse the stringified genre lists in ``df["genre"]`` into real lists.

    Each string cell must hold the text of a Python list literal such as
    "['Adventure', 'Comedy']"; every genre name is stripped of surrounding
    whitespace. Cells that are not strings (e.g. already-parsed lists) are
    only re-stripped, so running this twice on the same frame is harmless.

    The column is replaced in place on ``df`` and the same frame is returned.

    Raises:
        ValueError or SyntaxError: if a string cell is not a valid Python literal.
    """
    import ast  # local import keeps this cell self-contained

    def _parse(raw):
        # literal_eval only accepts literals, so code embedded in the CSV text
        # cannot be executed (which eval would allow).
        items = ast.literal_eval(raw) if isinstance(raw, str) else raw
        return [genre.strip() for genre in items]

    df["genre"] = df["genre"].apply(_parse)
    return df

In [9]:
# Convert the genre column from strings to lists of genre names, then preview one row.
big_data_df = preprocess_genre(big_data_df)
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of genres from the per-movie genre lists.
multilabel_binarizer = MultiLabelBinarizer().fit(big_data_df['genre'])

# Encode every movie as a 0/1 row with one column per genre.
y = multilabel_binarizer.transform(big_data_df['genre'])
genre_names = multilabel_binarizer.classes_

# Attach one indicator column per genre, named after the genre.
for col_idx, genre_name in enumerate(genre_names):
    big_data_df[f"{genre_name}"] = y[:, col_idx]

print(y.shape, big_data_df.shape)

(23140, 22) (23140, 25)


In [11]:
# Preview one row to confirm the per-genre indicator columns were added.
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,...,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df: pd.DataFrame, labels: np.array, image_array: np.array, test_size=0.5):
    """
    Split a dataframe, its label matrix and its image array into two aligned parts.

    The row positions 0..len(df)-1 are passed to iterative_train_test_split in
    place of features (a splitter that considers the distribution of labels),
    and the positions it returns for each side are used to slice both df and
    image_array, keeping all three in step.

    test_size is forwarded unchanged to iterative_train_test_split.

    Returns, in order: first-part df, first-part images, first-part labels,
    second-part df, second-part images, second-part labels.
    """
    row_positions = np.arange(len(df))[:, np.newaxis]
    first_pos, first_labels, second_pos, second_labels = iterative_train_test_split(row_positions, labels, test_size)
    first_rows = first_pos[:, 0]
    second_rows = second_pos[:, 0]
    first_images = np.take(image_array, first_rows, axis=0)
    second_images = np.take(image_array, second_rows, axis=0)
    return df.iloc[first_rows], first_images, first_labels, df.iloc[second_rows], second_images, second_labels

In [13]:
# Split dataset to train and test_val (will split test and val again!)
(
    train_df, train_image_array, y_train,
    test_val_df, test_val_image_array, y_test_val,
) = balanced_split(big_data_df, y, image_array, 0.3)
print(
    train_df.shape, y_train.shape, train_image_array.shape,
    test_val_df.shape, test_val_image_array.shape, y_test_val.shape,
)

(16280, 25) (16280, 22) (16280, 256, 256, 3) (6860, 25) (6860, 256, 256, 3) (6860, 22)


In [14]:
# Split test_val further to test and val datasets!
(
    test_df, test_image_array, y_test,
    val_df, val_image_array, y_val,
) = balanced_split(test_val_df, y_test_val, test_val_image_array)
print(
    test_df.shape, test_image_array.shape, y_test.shape,
    val_df.shape, val_image_array.shape, y_val.shape,
)

(3423, 25) (3423, 256, 256, 3) (3423, 22) (3437, 25) (3437, 256, 256, 3) (3437, 22)


**RECAP**

Let's recap what we have up to this point!
Balanced train, test, validation datasets consisting of:
1. Dataframes with imdb_id and plot (train_df, test_df, val_df)
2. Preprocessed image_arrays (train_image_array, test_image_array and val_image_array)
3. Labels! (y_train, y_test, y_val)

# Text Preprocessing

In [15]:
from transformers import AutoTokenizer

def tokenize_encode_multimodal(df, max_length=128, model_name="bert-base-uncased"):
    """Tokenize the ``plot`` column of ``df`` and return the token-id matrix.

    Args:
        df: DataFrame with a ``plot`` column of strings.
        max_length: Upper bound on tokens per plot; longer plots are truncated.
        model_name: Hugging Face checkpoint whose tokenizer is loaded.

    Returns:
        NumPy array of ``input_ids``, one row per plot. Rows are padded to the
        longest sequence in this call (at most ``max_length``), so the width
        can differ between calls.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    plots = df['plot'].to_list()
    encodings = tokenizer(
        plots,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="np",
    )
    # NOTE(review): only input_ids are returned; the attention mask is dropped,
    # so a downstream model cannot tell padding from text — confirm this is intended.
    return encodings['input_ids']

In [16]:
# Tokenize the plots of each split (train, test, val) with the same helper.
X_train_text, X_test_text, X_val_text = (
    tokenize_encode_multimodal(split_df) for split_df in (train_df, test_df, val_df)
)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
# The image inputs are the split image arrays, used as-is under X_* names.
X_train_img, X_test_img, X_val_img = train_image_array, test_image_array, val_image_array

# RESNET 50

In [18]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense

In [19]:
def load_model():
    """Return ResNet50 with ImageNet weights and no top layers, for 256x256x3 inputs."""
    return ResNet50(weights="imagenet", include_top=False, input_shape=(256, 256, 3))


In [20]:
def set_nontrainable_layers(model):
    """Freeze ``model`` by setting its ``trainable`` flag to False.

    The object is modified in place and the same object is returned, so the
    call can be chained.
    """
    model.trainable = False
    return model

In [21]:
def add_last_layers(model):
    '''Freeze a pre-trained base model and stack new trainable layers on top.

    Args:
        model: The pre-trained base network to build on. If ``None``, a fresh
            base is obtained from ``load_model()``.

    Returns:
        A Sequential model: frozen base -> Flatten -> Dense(500, relu)
        -> Dense(512, relu). The last layer emits a 512-wide feature vector,
        not class predictions.
    '''
    # The argument used to be ignored and a second copy of the base network was
    # always loaded; reuse the caller's model when one is supplied.
    base_model = load_model() if model is None else model
    base_model = set_nontrainable_layers(base_model)

    return Sequential([
        base_model,
        Flatten(),
        Dense(500, activation='relu'),
        Dense(512, activation='relu'),  # image feature vector
    ])

# BERT Model

In [22]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input

In [23]:
# define the BERT-based text feature extractor
def build_text_model():
    """Wrap pretrained ``bert-base-uncased`` as a Keras model over token ids.

    The model takes a variable-length int32 sequence of token ids and returns
    element 1 of the BERT outputs.
    """
    bert = TFBertModel.from_pretrained('bert-base-uncased')
    token_ids = Input(shape=(None,), dtype=tf.int32, name='input_word_ids')
    # Element 1 of the BERT outputs — presumably the pooled representation; confirm.
    text_features = bert(token_ids)[1]
    return Model(inputs=token_ids, outputs=text_features)

# Multimodal

In [24]:
from tensorflow.keras.layers import concatenate
from tensorflow.keras.callbacks import EarlyStopping

In [25]:
# define the multimodal document classification model
def build_multimodal_model(num_classes):
    """Build the two-branch (image + text) multi-label classifier.

    Image features from the ResNet50-based branch and text features from the
    BERT branch are concatenated, passed through Dense(512, relu), and mapped
    to ``num_classes`` sigmoid outputs.

    Returns a Keras Model with inputs ``[img_input, text_input]``.
    """
    # Image branch: frozen base with the extra dense layers on top.
    frozen_base = set_nontrainable_layers(load_model())
    img_model = add_last_layers(frozen_base)
    # Text branch.
    text_model = build_text_model()

    img_input = Input(shape=(256, 256, 3), name='img_input')
    text_input = Input(shape=(None,), dtype=tf.int32, name='text_input')

    fused = concatenate([img_model(img_input), text_model(text_input)])
    hidden = Dense(512, activation='relu')(fused)
    scores = Dense(num_classes, activation='sigmoid')(hidden)
    return Model(inputs=[img_input, text_input], outputs=scores)

In [26]:
# One output unit per label column of y_train.
num_classes = y_train.shape[1]
multimodal_model = build_multimodal_model(num_classes)
# Print the layer-by-layer summary of the combined model.
multimodal_model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 img_input (InputLayer)      [(None, 256, 256, 3)]        0         []                            
                                                                                                  
 text_input (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 sequential (Sequential)     (None, 512)                  8938072   ['img_input[0][0]']           
                                                          4                                       
                                                                                                  
 model (Functional)          (None, 768)                  1094822   ['text_input[0][0]']    

In [27]:
# Optimizer for training: the legacy Keras Adam implementation with default settings.
legacy_adam = tf.keras.optimizers.legacy.Adam()

In [28]:
# compile the model and train on the train set
EPOCHS = 10
BATCH_SIZE = 16
# Patience must be smaller than EPOCHS, otherwise early stopping can never fire
# (it was 10 with 10 epochs, which made the callback a no-op).
PATIENCE = 3

multimodal_model.compile(
    optimizer=legacy_adam,
    loss='binary_crossentropy',
    # Name the AUC metric explicitly so the 'val_auc' key monitored below exists
    # even if another AUC metric was created earlier in this session (Keras
    # would otherwise auto-name it 'auc_1', 'auc_2', ...).
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), 'binary_accuracy'],
)

es = EarlyStopping(monitor='val_auc',
                   mode='max',
                   patience=PATIENCE,
                   verbose=1,
                   restore_best_weights=True)

# Inputs are passed as a flat [image, text] list, matching the model's input
# order and the way predict() is called on the test set.
history = multimodal_model.fit(
    [X_train_img, X_train_text],
    tf.convert_to_tensor(y_train),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=([X_val_img, X_val_text], tf.convert_to_tensor(y_val)),
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Comparing predictions with y_test!

In [None]:
# Predicted scores for the test set, from the image and text inputs together.
y_pred = multimodal_model.predict([X_test_img, X_test_text])

In [None]:
def round_up_predictions(y_pred, threshold):
    """Binarize predicted scores: 1 where ``y_pred > threshold``, else 0.

    Args:
        y_pred: Array of scores, e.g. shape (n_samples, n_classes).
        threshold: Cut-off; only values strictly greater than it map to 1.

    Returns:
        ``np.int8`` array with the same shape as ``y_pred``.
    """
    # A single vectorized comparison replaces the per-element Python loop; this
    # function is called repeatedly during the threshold search.
    return (np.asarray(y_pred) > threshold).astype(np.int8)
# Binarize the test predictions at an initial threshold of 0.3.
y_pred_binary = round_up_predictions(y_pred, 0.3)

In [None]:
def all_correct(y_pred_binary, y_test):
    """Fraction of samples whose predicted label vector equals ``y_test`` exactly.

    Args:
        y_pred_binary: 0/1 predictions, one row per sample.
        y_test: Ground-truth 0/1 labels with the same shape.

    Returns:
        Mean over samples of "every label in the row matches" (a float).

    Raises:
        AssertionError: if the two arrays differ in shape.
    """
    assert y_pred_binary.shape == y_test.shape
    matches = np.asarray(y_pred_binary) == np.asarray(y_test)
    # Reduce over every axis except the first so each sample yields one boolean.
    per_sample = matches.all(axis=tuple(range(1, matches.ndim)))
    return np.mean(per_sample)
# Exact-match accuracy of the binarized test predictions.
print(all_correct(y_pred_binary, y_test))

In [None]:
def f_all_correct(threshold):
    """Objective for minimization: negated exact-match accuracy at ``threshold``.

    Reads the notebook-level ``y_pred`` and ``y_test``.
    """
    binarized = round_up_predictions(y_pred, threshold)
    return -1 * all_correct(binarized, y_test)

In [None]:
from scipy.optimize import minimize_scalar

In [None]:
# Search thresholds in [0, 1] for the best exact-match accuracy. The bounded
# method is requested explicitly rather than relying on the SciPy default.
# NOTE: the objective is a step function of the threshold, so the optimizer
# may settle on a local plateau; treat the result as approximate.
minimize_scalar(f_all_correct, bounds=(0, 1), method='bounded')

So, if we set the threshold to roughly 0.38, we get an "all_correct" accuracy of only about 0.027!

In [None]:
def one_correct(y_pred_binary, y_test):
    """Fraction of samples with at least one correctly predicted positive label.

    A sample counts as a hit when some column is 1 in both ``y_pred_binary``
    and ``y_test``.

    Args:
        y_pred_binary: 0/1 predictions of shape (n_samples, n_classes).
        y_test: Ground-truth 0/1 labels with the same shape.

    Returns:
        Mean over samples of the hit indicator (a float).

    Raises:
        AssertionError: if the two arrays differ in shape.
    """
    assert y_pred_binary.shape == y_test.shape
    # True where a label is both predicted and actually present.
    hits = (np.asarray(y_pred_binary) == 1) & (np.asarray(y_test) == 1)
    return np.mean(hits.any(axis=1))
# Share of test samples with at least one correctly predicted genre.
print(one_correct(y_pred_binary, y_test))

0.7908401400233372


In [None]:
def f_one_correct(threshold):
    """Objective for minimization: negated "one correct" score at ``threshold``.

    Reads the notebook-level ``y_pred`` and ``y_test``.
    """
    binarized = round_up_predictions(y_pred, threshold)
    return -1 * one_correct(binarized, y_test)

In [None]:
# Search thresholds in [0, 1] for the best "one correct" score. The bounded
# method is requested explicitly rather than relying on the SciPy default.
# NOTE: the objective is a step function of the threshold, so the optimizer
# may settle on a local plateau; treat the result as approximate.
minimize_scalar(f_one_correct, bounds=(0, 1), method='bounded')

 message: Solution found.
 success: True
  status: 0
     fun: -0.9988331388564761
       x: 0.0011907108069308574
     nit: 24
    nfev: 24

## Conclusions:
Okay, we need an actual metric, not just "one correct" or "all correct": adjusting the threshold lets us push one of these scores extremely high while sacrificing the other.

One correct: x = 0.001, p = 0.9988
All correct: x = 0.3789, p = 0.0268

# y_pred and y_test comparison with AUC-ROC

In [31]:
# Recompute the test-set predicted scores for the AUC analysis below.
y_pred = multimodal_model.predict([X_test_img, X_test_text])



In [32]:
from sklearn.metrics import roc_auc_score
# Micro-averaged ROC AUC: every (sample, genre) entry of the label matrix is
# pooled into a single score, so it does not depend on a decision threshold.
roc_auc_score_avg = roc_auc_score(y_test, y_pred, average='micro')
print(roc_auc_score_avg)

0.7645625429210074


In [33]:
# Number of positive test samples per genre, listed by label-column index.
for col_idx, genre_name in enumerate(genre_names):
    print(f"{col_idx}. {genre_name}: {y_test[:, col_idx].sum()}")

0. Action: 733
1. Adventure: 625
2. Animation: 295
3. Biography: 313
4. Comedy: 1115
5. Crime: 510
6. Drama: 1818
7. Family: 310
8. Fantasy: 293
9. Film-Noir: 10
10. History: 264
11. Horror: 436
12. Music: 297
13. Musical: 247
14. Mystery: 294
15. Reality-TV: 0
16. Romance: 578
17. Sci-Fi: 241
18. Sport: 209
19. Thriller: 382
20. War: 236
21. Western: 36


In [34]:
# Per-genre ROC AUC (average=None returns one score per label column).
# Column 15 (Reality-TV in the count listing) is removed from both arrays
# because it has no positive samples in y_test in the recorded run.
# NOTE(review): the index 15 is hard-coded and must match the cell that builds
# roc_auc_scores_df; re-check it if the split or genre set changes.
# NOTE(review): multi_class presumably has no effect on multilabel targets — confirm.
roc_auc_score_per_class = roc_auc_score(
    np.delete(y_test, [15], axis=1),
    np.delete(y_pred, [15], axis=1),
    average=None,
    multi_class='ovr',
    )

In [35]:
# Summary table with one row per genre, excluding index 15 everywhere so the
# columns line up with roc_auc_score_per_class:
#   - 'no. in y_test': count of positive test samples for the genre
#   - 'sum of prob. in y_pred': sum of predicted scores for the genre
roc_auc_scores_df = pd.DataFrame(data={
    'genre': np.delete(genre_names, [15], axis=0),
    'roc_auc': roc_auc_score_per_class,
    'no. in y_test': np.delete(y_test, [15], axis=1).sum(axis=0),
    'sum of prob. in y_pred': np.delete(y_pred, [15], axis=1).sum(axis=0)
})

In [36]:
# Display the per-genre ROC AUC table.
roc_auc_scores_df

Unnamed: 0,genre,roc_auc,no. in y_test,sum of prob. in y_pred
0,Action,0.664614,733,519.953369
1,Adventure,0.660203,625,549.637634
2,Animation,0.843848,295,336.871033
3,Biography,0.598243,313,181.020004
4,Comedy,0.718432,1115,1716.513794
5,Crime,0.631979,510,489.857971
6,Drama,0.63812,1818,1717.355347
7,Family,0.701297,310,261.933655
8,Fantasy,0.603369,293,256.635529
9,Film-Noir,0.684588,10,10.825526
