In [1]:
# Install Keras version 2.3.1
!pip install -q tensorflow scikit-multilearn

In [2]:
import pandas as pd
import numpy as np

In [3]:
# only run this once!
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import zipfile
import io

ZIP_FILE_PATH = "drive/MyDrive/personal_projects/movie_genre_prediction/clean_data.zip"
zf = zipfile.ZipFile(ZIP_FILE_PATH, "r")
zf.extractall()

In [5]:
big_data_df = pd.read_csv('raw_data/clean_data/clean_big_data.csv').drop(columns=['Unnamed: 0'])
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [6]:
image_array = np.load('raw_data/clean_data/clean_image_array.npy')
image_array.shape

(23140, 256, 256, 3)

In [7]:
# Double check if our image_array and big_data_df are the same size
assert big_data_df.shape[0] == image_array.shape[0]

In [8]:

def preprocess_genre(df: pd.DataFrame):
    df["genre"] = df["genre"].apply(eval).apply(lambda x: [genre.strip() for genre in x])
    return df

In [9]:
big_data_df = preprocess_genre(big_data_df)
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...


In [10]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multilabel_binarizer is fit to an array of list of labels
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(big_data_df['genre'])

#transform target variable
y = multilabel_binarizer.transform(big_data_df['genre'])
genre_names = multilabel_binarizer.classes_

# Adding the name of genres
for i in range(len(genre_names)):
    big_data_df[f"{genre_names[i]}"] = y[:,i]

print(y.shape, big_data_df.shape)

(23140, 22) (23140, 25)


In [11]:
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,...,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df: pd.DataFrame, labels: np.array, image_array: np.array, test_size=0.5):
    """
    index_array: an array of our current df index,
    iterative_train_test_split: a function that considers the distribution of possible labels when splitting
    """
    index_array = np.expand_dims(np.arange(len(df)), axis=1)
    train_index, y_train, test_index, y_test = iterative_train_test_split(index_array, labels, test_size)
    train_image_array, test_image_array = np.take(image_array, train_index.ravel(), axis=0), np.take(image_array, test_index.ravel(), axis=0)
    return df.iloc[train_index[:, 0]], train_image_array, y_train, df.iloc[test_index[:, 0]], test_image_array, y_test

In [13]:
# Split dataset to train and test_val (will split test and val again!)
train_df, train_image_array, y_train, test_val_df, test_val_image_array, y_test_val = balanced_split(big_data_df, y, image_array, 0.3)
print(
    train_df.shape,
    y_train.shape,
    train_image_array.shape,
    test_val_df.shape,
    test_val_image_array.shape,
    y_test_val.shape
    )

(16295, 25) (16295, 22) (16295, 256, 256, 3) (6845, 25) (6845, 256, 256, 3) (6845, 22)


In [14]:
# Split test_val further to test and val datasets!
test_df, test_image_array, y_test, val_df, val_image_array, y_val = balanced_split(test_val_df, y_test_val, test_val_image_array)
print(
    test_df.shape,
    test_image_array.shape,
    y_test.shape,
    val_df.shape,
    val_image_array.shape,
    y_val.shape
    )

(3411, 25) (3411, 256, 256, 3) (3411, 22) (3434, 25) (3434, 256, 256, 3) (3434, 22)


In [15]:
from tensorflow.keras.applications.resnet50 import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Dropout, AveragePooling2D
from tensorflow.keras.optimizers import Adam

In [33]:
def create_resnet_model(base_model, num_classes):
    x = base_model.output
    x = Flatten()(x)
    x = Dense(512, activation='relu')(x)
    x = Dense(256, activation='relu')(x)

    predictions = Dense(num_classes, activation='relu')(x)

    model = Model(inputs=base_model.input, outputs=predictions)
    return model

In [34]:
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(256, 256, 3))
base_model.trainable = False
resnet_model = create_resnet_model(base_model, y_train.shape[1])

In [35]:
resnet_model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy', 'binary_accuracy', 'AUC']
)

In [36]:
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
es = EarlyStopping(monitor = 'val_auc',
                   mode = 'max',
                   patience = 10,
                   verbose = 1,
                   restore_best_weights = True)
history = resnet_model.fit(train_image_array, tf.convert_to_tensor(y_train), epochs=10, batch_size=16, validation_data=(val_image_array, tf.convert_to_tensor(y_val)), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [37]:
y_pred = resnet_model.predict(test_image_array)



In [38]:
from sklearn.metrics import roc_auc_score
roc_auc_score_avg = roc_auc_score(y_test, y_pred, average='micro')
print(roc_auc_score_avg)

0.6413917705701441


In [39]:
for i in range(len(y[0])):
    print(f"{i}. {genre_names[i]}: {y_test[:, i].sum()}")

0. Action: 733
1. Adventure: 618
2. Animation: 295
3. Biography: 313
4. Comedy: 1112
5. Crime: 518
6. Drama: 1804
7. Family: 312
8. Fantasy: 291
9. Film-Noir: 9
10. History: 263
11. Horror: 436
12. Music: 297
13. Musical: 247
14. Mystery: 287
15. Reality-TV: 0
16. Romance: 569
17. Sci-Fi: 241
18. Sport: 222
19. Thriller: 382
20. War: 228
21. Western: 33


In [40]:
roc_auc_score_per_class = roc_auc_score(
    np.delete(y_test, [15], axis=1),
    np.delete(y_pred, [15], axis=1),
    average=None,
    multi_class='ovr',
    )

In [41]:
roc_auc_scores_df = pd.DataFrame(data={
    'genre': np.delete(genre_names, [15], axis=0),
    'roc_auc': roc_auc_score_per_class,
    'no. in y_test': np.delete(y_test, [15], axis=1).sum(axis=0),
    'sum of prob. in y_pred': np.delete(y_pred, [15], axis=1).sum(axis=0)
})

In [42]:
roc_auc_scores_df

Unnamed: 0,genre,roc_auc,no. in y_test,sum of prob. in y_pred
0,Action,0.5,733,0.0
1,Adventure,0.548973,618,1981586.5
2,Animation,0.609868,295,3651170.5
3,Biography,0.5,313,0.0
4,Comedy,0.513093,1112,5608818.5
5,Crime,0.5,518,0.0
6,Drama,0.492293,1804,5279602.5
7,Family,0.5,312,0.0
8,Fantasy,0.5,291,0.0
9,Film-Noir,0.5,9,0.0
