In [1]:
!pip install -q scikit-multilearn transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m116.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np

In [3]:
# only run this once!
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import zipfile
import io

ZIP_FILE_PATH = "drive/MyDrive/personal_projects/movie_genre_prediction/clean_data.zip"
zf = zipfile.ZipFile(ZIP_FILE_PATH, "r")
zf.extractall()

In [5]:
big_data_df = pd.read_csv('raw_data/clean_data/clean_big_data.csv').drop(columns=['Unnamed: 0'])
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"['Adventure', 'Comedy', 'Fantasy']",barbie suffers a crisis that leads her to ques...


In [6]:
def preprocess_genre(df: pd.DataFrame):
    df["genre"] = df["genre"].apply(eval).apply(lambda x: [genre.strip() for genre in x])
    return df

In [7]:
big_data_df = preprocess_genre(big_data_df)
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multilabel_binarizer is fit to an array of list of labels
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(big_data_df['genre'])

#transform target variable
y = multilabel_binarizer.transform(big_data_df['genre'])
genre_names = multilabel_binarizer.classes_

# Adding the name of genres
for i in range(len(genre_names)):
    big_data_df[f"{genre_names[i]}"] = y[:,i]

print(y.shape, big_data_df.shape)

(23140, 22) (23140, 25)


In [9]:
big_data_df.head(1)

Unnamed: 0,imdb_id,genre,plot,Action,Adventure,Animation,Biography,Comedy,Crime,Drama,...,Music,Musical,Mystery,Reality-TV,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt1517268,"[Adventure, Comedy, Fantasy]",barbie suffers a crisis that leads her to ques...,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
from skmultilearn.model_selection import iterative_train_test_split

def balanced_split(df: pd.DataFrame, labels: np.array, test_size=0.5):
    """
    index_array: an array of our current df index,
    iterative_train_test_split: a function that considers the distribution of possible labels when splitting
    """
    index_array = np.expand_dims(np.arange(len(df)), axis=1)
    train_index, y_train, test_index, y_test = iterative_train_test_split(index_array, labels, test_size)
    return df.iloc[train_index[:, 0]], y_train, df.iloc[test_index[:, 0]], y_test

In [11]:
# Split dataset to train and test_val (will split test and val again!)
train_df, y_train, test_val_df, y_test_val = balanced_split(big_data_df, y, 0.3)
print(
    train_df.shape,
    y_train.shape,
    test_val_df.shape,
    y_test_val.shape
    )

(16278, 25) (16278, 22) (6862, 25) (6862, 22)


In [12]:
test_df, y_test, val_df, y_val = balanced_split(test_val_df, y_test_val)
print(
    test_df.shape,
    y_test.shape,
    val_df.shape,
    y_val.shape
    )

(3414, 25) (3414, 22) (3448, 25) (3448, 22)


In [16]:
from transformers import AutoTokenizer

def tokenize_encode_multimodal(df):
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    text = df['plot'].to_list()
    encodings = tokenizer(text, truncation = True, padding = True, max_length = 128, return_tensors = "np")
    return encodings['input_ids'], encodings['attention_mask']

In [19]:
X_train_text, train_attention_mask = tokenize_encode_multimodal(train_df)
X_test_text, test_attention_mask = tokenize_encode_multimodal(test_df)
X_val_text, val_attention_mask = tokenize_encode_multimodal(val_df)
print(X_train_text.shape, train_attention_mask.shape)

(16278, 128) (16278, 128)


In [25]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense, Dropout

In [26]:
def create_bert_model(bert_model, num_classes):
    input_ids = Input(shape=(128,), dtype=tf.int32, name='input_word_ids')
    attention_masks = Input(shape=(128,), dtype=tf.int32, name='input_attention_masks')
    outputs = bert_model([input_ids, attention_masks])
    outputs = outputs[1]
    outputs = Dense(64, activation='relu')(outputs)
    outputs = Dropout(0.4)(outputs)
    outputs = Dense(num_classes, activation='sigmoid')(outputs)
    model = Model(inputs=[input_ids, attention_masks], outputs=outputs)
    return model

In [27]:
num_classes = y_train.shape[1]
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
text_model = create_bert_model(bert_model, num_classes)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [30]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
text_model.compile(Adam(), loss='binary_crossentropy', metrics=['accuracy', 'AUC', 'binary_accuracy'])
es = EarlyStopping(monitor = 'val_auc',
                   mode = 'max',
                   patience = 10,
                   verbose = 1,
                   restore_best_weights = True)
history = text_model.fit([X_train_text, train_attention_mask], tf.convert_to_tensor(y_train), epochs=10, batch_size=16, validation_data=([X_val_text, val_attention_mask], tf.convert_to_tensor(y_val)), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [34]:
y_pred = text_model.predict([X_test_text, test_attention_mask])



In [35]:
from sklearn.metrics import roc_auc_score
roc_auc_score_avg = roc_auc_score(y_test, y_pred, average='micro')
print(roc_auc_score_avg)

0.7446352927909147


In [36]:
for i in range(len(y[0])):
    print(f"{i}. {genre_names[i]}: {y_test[:, i].sum()}")

0. Action: 733
1. Adventure: 624
2. Animation: 296
3. Biography: 313
4. Comedy: 1118
5. Crime: 513
6. Drama: 1818
7. Family: 312
8. Fantasy: 293
9. Film-Noir: 12
10. History: 264
11. Horror: 436
12. Music: 296
13. Musical: 247
14. Mystery: 291
15. Reality-TV: 0
16. Romance: 568
17. Sci-Fi: 241
18. Sport: 211
19. Thriller: 381
20. War: 228
21. Western: 36


In [37]:
roc_auc_score_per_class = roc_auc_score(
    np.delete(y_test, [15], axis=1),
    np.delete(y_pred, [15], axis=1),
    average=None,
    multi_class='ovr',
    )

In [38]:
roc_auc_scores_df = pd.DataFrame(data={
    'genre': np.delete(genre_names, [15], axis=0),
    'roc_auc': roc_auc_score_per_class,
    'no. in y_test': np.delete(y_test, [15], axis=1).sum(axis=0),
    'sum of prob. in y_pred': np.delete(y_pred, [15], axis=1).sum(axis=0)
})

In [39]:
roc_auc_scores_df

Unnamed: 0,genre,roc_auc,no. in y_test,sum of prob. in y_pred
0,Action,0.504436,733,713.741455
1,Adventure,0.518223,624,641.713806
2,Animation,0.500246,296,265.309814
3,Biography,0.485645,313,289.466248
4,Comedy,0.493769,1118,1095.855469
5,Crime,0.500214,513,433.003052
6,Drama,0.5,1818,1831.716919
7,Family,0.496596,312,258.00824
8,Fantasy,0.498263,293,265.977661
9,Film-Noir,0.490741,12,20.14974
