In [1]:
import pandas as pd
import re 
import numpy as np
import spacy
import time

from numpy import asarray
from numpy import zeros
from nltk.corpus import stopwords
from nltk import word_tokenize

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.recurrent import SimpleRNN, LSTM
from keras.layers import Dense, Flatten, Embedding, Masking, Bidirectional
from keras_self_attention import SeqSelfAttention

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from gensim.models import Word2Vec



In [2]:
# Taken from Spring 2021 Class Notes
from typing import List

# Encodes documents into integers
def integer_encode_documents(docs: List[str], tokenizer: Tokenizer)-> List[List[int]]:
    documents = []
    for d in docs:
        doc_integers = []
        for i in text_to_word_sequence(d):
            doc_integers.append(tokenizer.word_index[i])
        documents.append(doc_integers)
    return documents

def integer_encode_documents(docs, tokenizer):
    return tokenizer.texts_to_sequences(docs)

In [3]:
# determines the maximum token length for a group of documents
def get_max_token_length_per_doc(docs: List[List[str]])-> int:
    return max(list(map(lambda x: len(x.split()), docs)))

In [4]:
# Professors Function: Concates the keys together
def concat_keys(x):
    first, second = x[0], x[1]
    if first <= second:
        return f"{first}{second}"
    else:
        return f"{second}{first}"

In [5]:
# Function that removes characters
def replace(replacements):
    for replace in replacements: 
        genres['overview'] = genres['overview'].str.replace(replace, '', case = False)

In [6]:
# Read in the dataset
reviews = pd.read_csv('movies_metadata.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
# Parses JSON to find the genres for the movie
regex = "\'name\': \'([A-z]{1,})"
reviews['genres_group'] = reviews['genres'].str.findall(regex)

# Parses JSON to find the production companies for a movie
regex = "\'name\': \'([A-z]{1,})"
reviews['production_companies'] = reviews['production_companies'].str.findall(regex)

In [8]:
# Exlodes genres column to have one title, genre and overview per line
# Note a movie can have multiple genres, in this dataset each observation for a movie represents 
reviews = reviews.explode('genres_group').reset_index()

In [9]:
# Groups genres into 7 different distinct categories
reviews['genres_group'] = np.where(reviews['genres_group'] == 'Drama', 'Drama',
                          np.where(reviews['genres_group'] == 'Comedy', 'Comedy', 
                          np.where(reviews['genres_group'].isin(['Thriller', 'Crime', 'Horror', 'Mystery']), 'Thriller/Horror',
                          np.where(reviews['genres_group'].isin(['Action', 'Adventure', 'Science', 'Fantasy']), 'Action/Adventure',
                          np.where(reviews['genres_group'].isin(['Animation', 'Family']), 'Animation/Family', 
                          np.where(reviews['genres_group'].isin(['History', 'War', 'Western']), 'History/War/Western', 
                          np.where(reviews['genres_group'] == 'Documentary', 'Documentary', 'Drop')))))))

In [10]:
# Drops genres that are too obsecure for the prediction purposes of this excercise
reviews = reviews.loc[reviews['genres_group'] != 'Drop']

In [11]:
genres = reviews[['original_title', 'overview', 'genres_group', 'production_companies', 'runtime', 'budget', 'revenue']]
genres = genres.drop_duplicates(subset = ['original_title', 'genres_group'])

In [12]:
# Counts of each genre
genres['genres_group'].value_counts()

Drama                  19515
Comedy                 12958
Thriller/Horror        12526
Action/Adventure       10844
Documentary             3929
Animation/Family        3737
History/War/Western     3290
Name: genres_group, dtype: int64

In [13]:
# Sets all words in review to lowercase
genres['overview'] = genres['overview'].str.lower()

In [14]:
# Searches for any malformed text that needs to be replaced
genres['overview'].str.extract(r'(&#[0-9]+)')[0].unique()

array([nan], dtype=object)

In [15]:
# removes any unneeded punctuation
replace(['/><br', '<br', "\"", "\'", "/", '=',
         '<', '>', ',', '_', '\n', '\.', '-', '\n', '\(', '\)', ':'])

In [16]:
# Drops any missing data
genres = genres.dropna()

In [17]:
potential_duplicates = 'abcdefghijklmnopqrstuvwxyz!?'

# Loops through the alphabet and replaces charaters that appear 3+ times in a row with one occurence
for i in potential_duplicates: 
    if i == '?':
        i = '\?'
    genres['overview'] = genres.overview.apply(lambda x: re.sub(i + i + i + '+', i, x))

In [18]:
# New Spacy model to include POS tagging and lemmatization in pipeline
nlp = spacy.load("en_core_web_sm", disable=['ner', 'parser'])

In [19]:
# Performs lemmatization on all genre summaries
genres['overview'] = genres['overview'].apply(lambda x: ' '.join([token.lemma_ for token in nlp(x)]))

In [20]:
# Gets list of stop words
# other stop words were selected based on word frequecnies
stop_words = list(stopwords.words('english')) + ['also', 'well', 'much', 'get', 'take', 'make', 'try',
                                                 'live', 'come', 'must', 'turn', 'film', 'movie', 'story',
                                                 'back', 'way', 'set', 'group']

In [21]:
# Removes stop words from data
genres['overview'] = genres['overview'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop_words]))

In [22]:
# Removes any overviews that are missing genres
genres = genres.loc[genres['overview'] != '']

### Baseline Model

In [23]:
encoder = LabelEncoder()

# Encodes categorical fields into a numerical array
baseline_labels = to_categorical(encoder.fit_transform(genres['genres_group']))
X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(genres['overview'], baseline_labels, test_size = 0.15)

In [24]:
# Collects document embeddings in train and test splits
embeddings_train = np.concatenate(X_train_base.reset_index()['overview'].apply(lambda x: nlp(x).vector)).reshape(len(X_train_base), 96)
embeddings_test = np.concatenate(X_test_base.reset_index()['overview'].apply(lambda x: nlp(x).vector)).reshape(len(X_test_base), 96)

In [25]:
# Converts array into a single value for genre label (between 0 and 6)
y_train_base_num = [i.tolist().index(1) for i in y_train_base]
y_test_base_num = [i.tolist().index(1) for i in y_test_base]

In [26]:
# Fits and trains a basic LR model
lr_w2v = LogisticRegression(random_state = 42)
lr_w2v.fit(embeddings_train, y_train_base_num)

# Gathers predictions based on Logisitc Regression model 
predictions_w2v = lr_w2v.predict(embeddings_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [27]:
# Prints evaluation metrics
print(np.mean(predictions_w2v == y_test_base_num) * 100)
print()
print(roc_auc_score(y_test_base_num, lr_w2v.predict_proba(embeddings_test), multi_class = 'ovo'))

confusion_matrix(y_test_base_num, predictions_w2v)

31.17474451077608

0.6412836093508879


array([[ 158,    3,  150,   28, 1123,    4,  169],
       [  60,    0,  100,   10,  380,    1,   35],
       [  97,    1,  335,   38, 1255,    0,  130],
       [  24,    0,   44,   82,  412,    1,   16],
       [ 117,    0,  280,   51, 2258,    4,  195],
       [  38,    1,   33,   10,  361,    2,   27],
       [ 123,    2,  169,   19, 1287,    4,  246]])

In [28]:
print(classification_report(y_test_base_num, predictions_w2v))

              precision    recall  f1-score   support

           0       0.26      0.10      0.14      1635
           1       0.00      0.00      0.00       586
           2       0.30      0.18      0.23      1856
           3       0.34      0.14      0.20       579
           4       0.32      0.78      0.45      2905
           5       0.12      0.00      0.01       472
           6       0.30      0.13      0.18      1850

    accuracy                           0.31      9883
   macro avg       0.24      0.19      0.17      9883
weighted avg       0.28      0.31      0.25      9883



### Primary Model

In [29]:
# Fits tokenizer on text
tokenizer = Tokenizer(num_words = 10000, oov_token = 'UNKNOWN_TOKEN')
tokenizer.fit_on_texts(genres['overview'])

In [30]:
# Creates custom word embeddings based on the overview text
docs = [word_tokenize(single) for single in genres['overview']]
model = Word2Vec(docs, vector_size=300, window=10, min_count=1, workers=4)
model.wv.save_word2vec_format('genre_overview.txt', binary=False)

In [31]:
# Integer encodes and pads text
docs = integer_encode_documents(genres['overview'], tokenizer)
max_length = get_max_token_length_per_doc(genres['overview'])
padded_docs = pad_sequences(docs, maxlen=max_length, padding='post')

In [32]:
VOCAB_SIZE = 10000

# Loads word embeddings
def load_glove_vectors():
    embeddings_index = {}
    with open('genre_overview.txt') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    print('Loaded %s word vectors.' % len(embeddings_index))
    return embeddings_index
embeddings_index = load_glove_vectors()

# Creates embedding matrix based on the vocab size
embedding_matrix_bbc = zeros((VOCAB_SIZE, 300))
for word, i in tokenizer.word_index.items():
    if i == 10000:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: # check that it is an actual word that we have embeddings for
        embedding_matrix_bbc[i] = embedding_vector

Loaded 73150 word vectors.


In [33]:
encoder = LabelEncoder()

# Encodes categorical variables 
labels = to_categorical(encoder.fit_transform(genres['genres_group']))
X_train, X_test, y_train, y_test = train_test_split(padded_docs, labels, test_size = 0.1)

In [34]:
# Shows which genres correspond to which numerical label
nums = [i.tolist().index(1) for i in labels]
genres_copy = genres.copy()
genres_copy['group_num'] = nums
genres_copy.groupby('genres_group')['group_num'].mean()

genres_group
Action/Adventure       0
Animation/Family       1
Comedy                 2
Documentary            3
Drama                  4
History/War/Western    5
Thriller/Horror        6
Name: group_num, dtype: int64

### LTSM

In [35]:
import tensorflow as tf
tf.random.set_seed(42)

In [36]:
# Complies a LSTM model, trying to predict 7 genre classes from text
final_model = Sequential()
final_model.add(Embedding(VOCAB_SIZE, 300, weights = [embedding_matrix_bbc], input_length = max_length, trainable = False))
final_model.add(Masking(mask_value = 0.0))
final_model.add(LSTM(units=32, input_shape = (1, max_length)))
final_model.add(Dense(16))
final_model.add(Dense(7, activation = 'softmax'))

final_model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
final_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 146, 300)          3000000   
_________________________________________________________________
masking (Masking)            (None, 146, 300)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                42624     
_________________________________________________________________
dense (Dense)                (None, 16)                528       
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 119       
Total params: 3,043,271
Trainable params: 43,271
Non-trainable params: 3,000,000
_________________________________________________________________


In [37]:
# fit the model
final_model.fit(X_train, y_train, epochs=5, verbose=1)

# evaluate the model
loss, accuracy = final_model.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 46.346235


In [38]:
# Stores model predictions as genre label (between 0 - 6)
predictions = np.argmax(final_model.predict(X_test), axis=-1)

In [39]:
y_test_num = [i.tolist().index(1) for i in y_test]

In [40]:
# Prints evaluation metrics
print('Accuracy: ', round(np.mean(predictions == y_test_num) * 100, 1))

print('ROC_AUC: ', round(roc_auc_score(y_test_num, final_model.predict(X_test), multi_class = 'ovo'), 3))

confusion_matrix(y_test_num, predictions)

Accuracy:  44.9
ROC_AUC:  0.82


array([[ 373,   23,  130,   24,  197,   31,  267],
       [ 113,   42,   70,   13,   74,    2,   38],
       [ 114,   33,  479,   18,  439,    9,  154],
       [  14,    2,   33,  244,   73,    1,   15],
       [ 133,   16,  278,   48, 1134,   24,  356],
       [  38,    2,   23,   25,  189,   35,   27],
       [ 186,    4,   88,   22,  281,    2,  653]])

In [41]:
print(classification_report(y_test_num, predictions))

              precision    recall  f1-score   support

           0       0.38      0.36      0.37      1045
           1       0.34      0.12      0.18       352
           2       0.44      0.38      0.41      1246
           3       0.62      0.64      0.63       382
           4       0.48      0.57      0.52      1989
           5       0.34      0.10      0.16       339
           6       0.43      0.53      0.48      1236

    accuracy                           0.45      6589
   macro avg       0.43      0.39      0.39      6589
weighted avg       0.44      0.45      0.44      6589



### Self-Attention

In [42]:
# Complies a Bi-directional LSTM model architecture
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=300, mask_zero=True, input_length = max_length))
model.add(Bidirectional(LSTM(units=300,return_sequences=True, input_shape = (1, max_length))))
model.add(SeqSelfAttention(attention_activation='softmax'))
model.add(LSTM(units=32, input_shape = (1, max_length)))
model.add(Dense(7, activation = 'softmax'))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 146, 300)          3000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 146, 600)          1442400   
_________________________________________________________________
seq_self_attention (SeqSelfA (None, 146, 600)          38465     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                81024     
_________________________________________________________________
dense_2 (Dense)              (None, 7)                 231       
Total params: 4,562,120
Trainable params: 4,562,120
Non-trainable params: 0
_________________________________________________________________


In [43]:
# fit the model
model.fit(X_train, y_train, epochs=5, verbose=1)

# evaluate the model
loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Accuracy: 55.298084


In [44]:
predictions = np.argmax(model.predict(X_test), axis=-1)
y_test_num = [i.tolist().index(1) for i in y_test]

In [45]:
# Prints evaluation metrics
print('Accuracy: ', round(np.mean(predictions == y_test_num) * 100, 3))

print('ROC_AUC: ', round(roc_auc_score(y_test_num, model.predict(X_test), multi_class = 'ovo'), 3))

confusion_matrix(y_test_num, predictions)

Accuracy:  41.235
ROC_AUC:  0.795


array([[297,  43, 143,  24, 201,  30, 307],
       [111,  47,  87,  17,  68,   0,  22],
       [116,  48, 491,  36, 397,   9, 149],
       [  6,   7,  29, 267,  52,   1,  20],
       [154,  19, 349,  76, 984,  30, 377],
       [ 29,   0,  25,  28, 188,  40,  29],
       [205,   1, 103,  28, 298,  10, 591]])

In [46]:
print(classification_report(y_test_num, predictions))

              precision    recall  f1-score   support

           0       0.32      0.28      0.30      1045
           1       0.28      0.13      0.18       352
           2       0.40      0.39      0.40      1246
           3       0.56      0.70      0.62       382
           4       0.45      0.49      0.47      1989
           5       0.33      0.12      0.17       339
           6       0.40      0.48      0.43      1236

    accuracy                           0.41      6589
   macro avg       0.39      0.37      0.37      6589
weighted avg       0.40      0.41      0.40      6589



### Model Output

In [47]:
# Times how long it takes to score all documents
start = time.time()
all_predictions = final_model.predict(padded_docs)
end = time.time()
print(end - start)

19.645273685455322


In [48]:
len(all_predictions)

65884

In [49]:
# Averages documents scored per second
len(all_predictions) / 22.32

2951.7921146953404

In [50]:
# Creates a pandas dataframe with all predictions
predictions_dataframe = pd.DataFrame(all_predictions, columns = ['Action/Adventure', 'Animation/Family', 'Comedy', 
                                             'Docuemntary', 'Drama', 'History/War/Western', 'Thriller/Horror'])

In [51]:
# Gathers all predictions
predicted_class = np.argmax(all_predictions, axis=-1)

In [52]:
# Renames predictions numbers to actual genre namme
predictions_dataframe['Prediction'] = predicted_class.tolist()
predictions_dataframe['Prediction'] = predictions_dataframe['Prediction'].map({0:'Action/Adventure', 
                                                                               1: 'Animation/Family', 
                                                                               2: 'Comedy', 
                                                                               3: 'Docuemntary', 
                                                                               4: 'Drama', 
                                                                               5: 'History/War/Western', 
                                                                               6: 'Thriller/Horror'})

In [53]:
# Concats predictions and original overview dataframe together
genre_overview_predictions = pd.concat([genres[['original_title', 'overview', 'genres_group']].reset_index(drop = True), predictions_dataframe], axis = 1)

In [54]:
correct = genre_overview_predictions[genre_overview_predictions['genres_group'] == genre_overview_predictions['Prediction']]
misclassified = genre_overview_predictions[genre_overview_predictions['genres_group'] != genre_overview_predictions['Prediction']]

In [55]:
reviews[reviews['original_title'] == 'Toy Story']['overview'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."

In [56]:
# View Toy Story Predictions
genre_overview_predictions[genre_overview_predictions['original_title'] == 'Toy Story']

Unnamed: 0,original_title,overview,genres_group,Action/Adventure,Animation/Family,Comedy,Docuemntary,Drama,History/War/Western,Thriller/Horror,Prediction
0,Toy Story,lead woody andys toy happily room andys birthd...,Animation/Family,0.118108,0.296782,0.431155,0.003588,0.130151,0.005305,0.014911,Comedy
1,Toy Story,lead woody andys toy happily room andys birthd...,Comedy,0.118108,0.296782,0.431155,0.003588,0.130151,0.005305,0.014911,Comedy


In [57]:
# View Jumani Predictions
round(genre_overview_predictions[genre_overview_predictions['original_title'] == 'Jumanji'], 3) * 100

Unnamed: 0,original_title,overview,genres_group,Action/Adventure,Animation/Family,Comedy,Docuemntary,Drama,History/War/Western,Thriller/Horror,Prediction
2,JumanjiJumanjiJumanjiJumanjiJumanjiJumanjiJuma...,sibling judy peter discover enchanted board ga...,Action/AdventureAction/AdventureAction/Adventu...,46.399998,11.200001,13.2,0.5,7.8,0.1,20.800001,Action/AdventureAction/AdventureAction/Adventu...
3,JumanjiJumanjiJumanjiJumanjiJumanjiJumanjiJuma...,sibling judy peter discover enchanted board ga...,Animation/FamilyAnimation/FamilyAnimation/Fami...,46.399998,11.200001,13.2,0.5,7.8,0.1,20.800001,Action/AdventureAction/AdventureAction/Adventu...


In [58]:
reviews[reviews['original_title'] == 'Jumanji']['overview'][3]

"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."