In [1]:
import json
import os
import pickle as pkl
import re
from collections import Counter

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

import autokeras as ak # !pip install autokeras

from pandarallel import pandarallel
# Configure pandarallel with progress bars disabled.
# NOTE(review): no pandarallel call (e.g. parallel_apply) appears in the visible cells — confirm this is still needed.
pandarallel.initialize(progress_bar=False)

2023-01-24 06:35:47.153843: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# Random seed passed to train_test_split (random_state) and to the AutoKeras TextClassifier (seed).
SEED = 321 #434

In [3]:
# NOTE(review): use_cache is not read by any of the visible cells — confirm it is still needed.
use_cache = True

data_dir = 'data' # folder containing original datasets
tmp_dir = 'tmp' # temporary folder containing intermediate results of Jupyter notebooks
out_dir = 'out' # folder containing the model and other files needed for classification

In [4]:
# Make sure the output folder exists. exist_ok=True removes the check-then-create
# race of the exists()/mkdir() pair, and makedirs also creates missing parents.
os.makedirs(out_dir, exist_ok=True)

In [5]:
# Load the pickled movie dataframe from the temporary folder.
# The with-block closes the file handle, which a bare open() call leaves dangling.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open(f'{tmp_dir}/data.pkl', 'rb') as fh:
    df = pkl.load(fh)

In [6]:
def remove_labels(df, labels, in_col='genres', out_col='genres2'):
    """Write a copy of a list-valued label column with some labels removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``in_col`` column holds an iterable of labels per row.
        The frame is modified in place (``out_col`` is added or overwritten).
    labels : str or iterable of str
        Label(s) to drop. A single string is treated as one label.
    in_col : str
        Name of the column holding the original label lists.
    out_col : str
        Name of the column receiving the filtered, sorted, de-duplicated lists.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # isinstance (not an exact type comparison) also recognises str subclasses such
    # as numpy.str_; otherwise a single label would be split into its characters below.
    if isinstance(labels, str):
        labels = [labels]

    # Build the exclusion set once instead of once per row.
    excluded = set(labels)
    df[out_col] = [sorted(set(row) - excluded) for row in df[in_col]]

    return df

### Let us remove genres that have <1000 entries as well as 'Musical', 'Mystery', and 'Biography' (they were found to perform poorly in training, which makes sense).

In [7]:
# Count the occurrences of every genre across all movies.
labels, abund = np.unique(
    np.concatenate(df['genres'].tolist()),
    return_counts=True,
)

# Genres seen fewer than 1000 times, plus three hand-picked ones, are excluded.
to_remove = set(labels[abund < 1000]) | {'Musical', 'Mystery', 'Biography'}

to_remove

{'Biography', 'Film-Noir', 'History', 'Music', 'Musical', 'Mystery', 'Sport'}

In [8]:
# Drop the excluded genres; the filtered lists go into the 'genres2' column
# (remove_labels' default out_col) while 'genres' keeps the original lists.
# NOTE(review): movies whose genres were all removed end up with an empty list in
# 'genres2' (see the "Randy Parsons" row in the saved output); no visible cell
# filters them out before training — confirm that is intended.
df = remove_labels(df, to_remove)
df

Unnamed: 0_level_0,name,year,revenue,genres,synopsis,clean,genres2
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23890098,Taxi Blues,1990,,[Drama],"Shlykov, a hard-working taxi driver and Lyosha...","Shlykov, a hard-working taxi driver and Lyosha...",[Drama]
31186339,The Hunger Games,2012,686533290.0,"[Action, Adventure, Drama, Sci-Fi]",The nation of Panem consists of a wealthy Capi...,The nation of Panem consists of a wealthy Capi...,"[Action, Adventure, Drama, Sci-Fi]"
20663735,Narasimham,2000,,"[Action, Drama, Musical]",Poovalli Induchoodan is sentenced for six yea...,Poovalli Induchoodan is sentenced for six year...,"[Action, Drama]"
2231378,The Lemon Drop Kid,1951,2300000.0,[Comedy],"The Lemon Drop Kid , a New York City swindler,...","The Lemon Drop Kid, a New York City swindler, ...",[Comedy]
595909,A Cry in the Dark,1988,6908797.0,"[Crime, Drama]",Seventh-day Adventist Church pastor Michael Ch...,Seventh-day Adventist Church pastor Michael Ch...,"[Crime, Drama]"
...,...,...,...,...,...,...,...
2867597,Mr. Bill's Real Life Adventures,1986,,[Comedy],"An attempt to bring the famed ""Mr. Bill"" clay ...",An attempt to bring the famed clay characters ...,[Comedy]
1096473,The Last Command,1928,,"[Drama, War]","In 1928 Hollywood, director Leo Andreyev look...","In 1928 Hollywood, director Leo Andreyev looks...","[Drama, War]"
35102018,Randy Parsons: American Luthier,2011,,"[Biography, Music]",American Luthier focuses on Randy Parsons’ tra...,American Luthier focuses on Randy transformati...,[]
8628195,Kabuliwala,1961,,[Drama],"Abdur Rehman Khan , a middle-aged dry fruit se...","Abdur Rehman Khan, a middle-aged dry fruit sel...",[Drama]


In [9]:
# from collections import Counter
# Counter([len(_) for _ in df['genres2']])

### Binarizing labels for multi-label classification with Autokeras' TextClassifier model. It appears to be the simplest method that provided good f1 micro scores. Also, it does not require a heavy cleaning of the input such as stemming.

### Other models could include Bidirectional LSTM layers, or RoBERTa transformer model. Also one could fine-tune a pretrained model that calculates a semantic similarity between texts on pairs (synopsis, genres).

In [10]:
# Turn each movie's list of remaining genres into a multi-hot row.
mlb = MultiLabelBinarizer()
# Rebinds `labels`: earlier it held the genre names from np.unique, from here on the binarized matrix.
labels = mlb.fit_transform(df['genres2'])
# Class names as learned by the binarizer; reused later as target_names in the reports.
classes = mlb.classes_

classes, labels.shape

(array(['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Drama',
        'Family', 'Fantasy', 'Horror', 'Romance', 'Sci-Fi', 'Thriller',
        'War', 'Western'], dtype=object),
 (39086, 14))

In [11]:
# Model input: the 'clean' text column as a NumPy array (one synopsis per movie).
text_input = df['clean'].to_numpy()

# Hold out 20% of the rows for evaluation; random_state=SEED makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(text_input, labels, test_size=0.2, random_state=SEED)

In [12]:
# Metrics reported while training the classifier.
# The targets are multi-hot vectors (several genres per movie), so accuracy is measured
# per label with BinaryAccuracy. CategoricalAccuracy compares argmax positions and is
# only meaningful for one-hot targets. The metric keeps the name 'accuracy' so that
# history/log keys are unchanged.
METRICS = [
    keras.metrics.BinaryAccuracy(name='accuracy'),
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
]

2023-01-24 06:36:32.657235: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# AutoKeras text classifier for the multi-label genre task.
#   multi_label=True  -> a movie may carry several genres at once
#   overwrite=True    -> do not resume from a previous project with the same name
#   max_trials=1      -> only a single candidate model is tried
ak_model = ak.TextClassifier(
    multi_label=True,
    overwrite=True, 
    max_trials=1,
    project_name='ak_model_14_vanilla',
    metrics=METRICS,
    seed=SEED
) 

In [None]:
# Train on the training split for 10 epochs; %time reports how long the call takes.
# NOTE(review): no validation data or callbacks are passed here, and the EarlyStopping
# import is not used in the visible cells — confirm whether early stopping was intended.
%time history = ak_model.fit(x_train, y_train, epochs=10)

In [60]:
# Export the model found by AutoKeras as a Keras model and print its layer summary.
tf_model = ak_model.export_model()
tf_model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None,)]                 0         
                                                                 
 expand_last_dim (ExpandLast  (None, 1)                0         
 Dim)                                                            
                                                                 
 text_vectorization (TextVec  (None, 512)              0         
 torization)                                                     
                                                                 
 embedding (Embedding)       (None, 512, 64)           320064    
                                                                 
 dropout (Dropout)           (None, 512, 64)           0         
                                                                 
 conv1d (Conv1D)             (None, 508, 256)          82176 

In [82]:
# Scores for the held-out test synopses; presumably one column per entry of `classes`
# (the result is compared against y_test in the report below).
y_pred = tf_model.predict(x_test)



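### We normalize each movie's prediction vector by its maximum value and keep every genre whose normalized score exceeds 0.95. This always selects the top-scoring genre and allows multiple genres when their scores are close to the top one.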

In [83]:
# Divide every row by its own maximum score (kept as a column so it broadcasts
# row-wise), then flag the genres whose relative score exceeds 0.95.
new_y_pred = np.where(y_pred / y_pred.max(axis=1, keepdims=True) > 0.95, 1, 0)

In [84]:
# How many genres were predicted per test movie (Counter comes from the top import cell).
Counter(new_y_pred.sum(1))

Counter({1: 6702, 2: 1020, 3: 93, 4: 3})

### f1 micro avg is 0.47. 

In [85]:
# Per-genre precision/recall/f1 on the test split.
# zero_division=0 keeps the value sklearn already substitutes for undefined scores (0.0)
# but states it explicitly, so no UndefinedMetricWarning is emitted for samples
# without true or predicted labels.
print(classification_report(y_test,
                            new_y_pred,
                            output_dict=False,
                            target_names=classes,
                            zero_division=0))

              precision    recall  f1-score   support

      Action       0.56      0.43      0.49      1389
   Adventure       0.52      0.19      0.28      1245
   Animation       0.69      0.49      0.57       521
      Comedy       0.65      0.42      0.51      2514
       Crime       0.60      0.22      0.33      1022
       Drama       0.76      0.58      0.66      4000
      Family       0.66      0.14      0.24       795
     Fantasy       0.49      0.07      0.12       431
      Horror       0.74      0.57      0.65       846
     Romance       0.64      0.12      0.21      1467
      Sci-Fi       0.68      0.38      0.49       471
    Thriller       0.59      0.20      0.29      1417
         War       0.75      0.19      0.31       315
     Western       0.80      0.31      0.44       236

   micro avg       0.67      0.37      0.47     16669
   macro avg       0.65      0.31      0.40     16669
weighted avg       0.66      0.37      0.45     16669
 samples avg       0.69   

  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
# np.sum(y_test * new_y_pred)/len(y_test)

In [87]:
# new_y_pred.sum(0).astype(int), np.unique(new_y_pred.sum(1).astype(int))

In [62]:
# Persist the exported Keras model inside the output folder.
tf_model.save(f'{out_dir}/tf_model')



INFO:tensorflow:Assets written to: out/tf_model/assets


INFO:tensorflow:Assets written to: out/tf_model/assets


CPU times: user 739 ms, sys: 16.6 ms, total: 756 ms
Wall time: 751 ms


### In order to choose similar movies, we compute predictions for every movie and keep those with revenue > $10 million.

In [13]:
# Score every synopsis in the dataset (training and test rows alike); the revenue filter is applied afterwards.
text_input_pred = tf_model.predict(text_input)

2023-01-23 08:59:14.896054: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8401


  56/1222 [>.............................] - ETA: 3s

2023-01-23 08:59:15.956849: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.




In [15]:
# Boolean selector for movies with revenue above 10 million (rows with missing revenue compare as False).
mask = df['revenue'] > 1e7
# NOTE(review): len(mask) is the total number of rows (39086 in the saved output),
# not the number of selected movies; mask.sum() would give that count — confirm intent.
len(mask)

39086

In [16]:
# Keep only the high-revenue movies, both in the frame and in the prediction matrix.
df2 = df.loc[mask]
# Index the NumPy predictions with the plain boolean array behind the Series.
text_input_pred2 = text_input_pred[mask.to_numpy()]

In [17]:
# Column-oriented dict (one list per column) describing the selected movies.
out_data = df2[['name', 'year', 'genres']].to_dict(orient='list')
# Predictions filtered with the same mask, so row i matches entry i of the lists above.
out_data['pred'] = text_input_pred2

out_data.keys()

dict_keys(['name', 'year', 'genres', 'pred'])

In [18]:
# Store the class names and the per-movie lookup data in the output folder.
# The with-block guarantees the file is flushed and closed once the dump finishes;
# the bare open() call left that to garbage collection.
with open(f'{out_dir}/aux_data.pkl', 'wb') as fh:
    pkl.dump([classes, out_data], fh)

### Checking that the spoiler database also produces a good f1 micro average

In [89]:
# Load the pickled IMDB dataframe from the temporary folder; the with-block
# closes the file handle that a bare open() call would leave dangling.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
with open(f'{tmp_dir}/df_imdb.pkl', 'rb') as fh:
    df_imdb = pkl.load(fh)
df_imdb

Unnamed: 0,movie_id,plot_summary,duration,genre,rating,release_date,plot_synopsis,clean
0,tt0105112,"Former CIA analyst, Jack Ryan is in England wi...",1h 57min,"[Action, Thriller]",6.9,1992-06-05,"Jack Ryan (Ford) is on a ""working vacation"" in...",Jack Ryan is on a in London with his family. H...
1,tt1204975,"Billy (Michael Douglas), Paddy (Robert De Niro...",1h 45min,[Comedy],6.6,2013-11-01,Four boys around the age of 10 are friends in ...,Four boys around the age of 10 are friends in ...
2,tt0040897,"Fred C. Dobbs and Bob Curtin, both down on the...",2h 6min,"[Adventure, Drama, Western]",8.3,1948-01-24,Fred Dobbs (Humphrey Bogart) and Bob Curtin (T...,Fred Dobbs and Bob Curtin are down on their lu...
3,tt0126886,Tracy Flick is running unopposed for this year...,1h 43min,"[Comedy, Drama, Romance]",7.3,1999-05-07,Jim McAllister (Matthew Broderick) is a much-a...,Jim McAllister is a much-admired high school h...
4,tt0286716,"Bruce Banner, a brilliant scientist with a clo...",2h 18min,"[Action, Sci-Fi]",5.7,2003-06-20,Bruce Banner (Eric Bana) is a research scienti...,Bruce Banner is a research scientist at a Berk...
...,...,...,...,...,...,...,...,...
1334,tt0120655,An abortion clinic worker with a special herit...,2h 10min,"[Adventure, Comedy, Drama]",7.3,1999-11-12,The film opens with a homeless man (Bud Cort) ...,The film opens with a homeless man on a desert...
1335,tt0276751,Twelve year old Marcus Brewer lives with his c...,1h 41min,"[Comedy, Drama, Romance]",7.1,2002-05-17,Will Freeman (Hugh Grant) is a 38-year-old bac...,Will Freeman is a 38-year-old bachelor who pri...
1336,tt0289879,Evan Treborn grows up in a small town with his...,1h 53min,"[Sci-Fi, Thriller]",7.7,2004-01-23,"In the year 1998, Evan Treborn (Ashton Kutcher...","In the year 1998, Evan Treborn who suffered se..."
1337,tt1723811,Brandon is a 30-something man living in New Yo...,1h 41min,[Drama],7.2,2012-01-13,"Brandon (Michael Fassbender) is a successful, ...","Brandon is a successful, handsome thirty-somet..."


In [90]:
# Encode the IMDB genres with the binarizer fitted earlier, so the columns follow `classes`.
# NOTE(review): genres outside the fitted classes are presumably ignored by mlb.transform
# (with a warning) — confirm this is acceptable for the comparison.
imdb_labels = mlb.transform(df_imdb['genre'])



In [91]:
# Score the cleaned IMDB synopses with the same model.
# Note: this rebinds y_pred, which previously held the test-split scores.
y_pred = tf_model.predict(df_imdb['clean'].tolist())



In [93]:
# Relative cut per movie: scale each row by its top score (kept as a column for
# row-wise broadcasting) and flag the genres scoring above 0.95 of it.
new_y_pred = np.where(y_pred / y_pred.max(axis=1, keepdims=True) > 0.95, 1, 0)

In [94]:
# How many genres were predicted per movie in df_imdb.
Counter(new_y_pred.sum(1))

Counter({2: 243, 1: 1076, 3: 20})

In [95]:
# Per-genre precision/recall/f1 on the IMDB data.
# zero_division=0 keeps the value sklearn already substitutes for undefined scores (0.0)
# but states it explicitly, so no UndefinedMetricWarning is emitted for samples
# without true or predicted labels.
print(classification_report(imdb_labels,
                            new_y_pred,
                            output_dict=False,
                            target_names=classes,
                            zero_division=0))

              precision    recall  f1-score   support

      Action       0.66      0.38      0.49       394
   Adventure       0.68      0.34      0.45       387
   Animation       0.36      0.14      0.20        73
      Comedy       0.67      0.50      0.57       409
       Crime       0.82      0.22      0.35       252
       Drama       0.84      0.43      0.57       679
      Family       0.44      0.18      0.25       113
     Fantasy       0.33      0.04      0.08       161
      Horror       0.53      0.49      0.51       102
     Romance       0.74      0.15      0.25       194
      Sci-Fi       0.62      0.41      0.49       180
    Thriller       0.42      0.24      0.30       212
         War       0.64      0.30      0.41        23
     Western       0.50      0.20      0.29        10

   micro avg       0.67      0.34      0.45      3189
   macro avg       0.59      0.29      0.37      3189
weighted avg       0.67      0.34      0.44      3189
 samples avg       0.69   

  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# single_genre = lambda y_pred : np.floor((y_pred.T/y_pred.T.max(0)).T).astype(int)