In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN, Dense, Dropout, Flatten, Bidirectional
from keras.layers import Input, concatenate, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.regularizers import l2

stop_words = set(stopwords.words('english'))

import time

Using TensorFlow backend.


read in the data

In [2]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)

In [3]:
#parse each row to get label vectors from json
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #number of genres we are looking at
        for i in range(numElems):
            genre_str = (json_genres[i]['name'])
            if genre_str in genre_map.keys():
                ret[genre_dict[genre_map[genre_str]]] = 1
        return ret
    except Exception as excep:
        print('Exception' + str(excep))
        return ''

Get dictionary for genre to its index in label vector

In [4]:
genre_dict = {}
genre_dict['Action-Adventure'] = 0
genre_dict['Romance'] = 1
genre_dict['Horror-Thriller'] = 2
genre_dict['Comedy'] = 3
genre_dict['Science Fiction'] = 4
#genre_dict['Drama'] = 5
genre_dict

{'Action-Adventure': 0,
 'Romance': 1,
 'Horror-Thriller': 2,
 'Comedy': 3,
 'Science Fiction': 4}

In [5]:
#map original labels to more coarse grained labels
genre_map = {}
genre_map['Adventure'] = 'Action-Adventure'
genre_map['Romance'] = 'Romance'
genre_map['Horror'] = 'Horror-Thriller'
genre_map['Thriller'] = 'Horror-Thriller'
genre_map['Comedy'] = 'Comedy'
#genre_map['War'] = 'Action-Adventure'#not sure about this
genre_map['Action'] = 'Action-Adventure'
genre_map['Science Fiction'] = 'Science Fiction'
#genre_map['Drama'] = 'Drama'

In [6]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret

In [7]:
getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [8]:
#put to lower case, remove punctation, remove stopwords
def cleanText(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    text = ' '.join(no_stopword_text)
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text

all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [9]:
all_data = all_data[all_data.genres_vect.map(sum) > 0] #drop rows that now have no labels 

In [10]:
#neural net data only needs a few cols
nn_data = all_data[['cleanOverview', 'genres_vect', 'overview']]

In [11]:
train, test = train_test_split(nn_data, test_size=0.2, random_state=42)

Extract actual features and labels from train and test set

In [12]:
#gettrian and test features for classification. Just need text and lables for this
x = train['cleanOverview'].values.tolist()
y = train['genres_vect']
x_test = test['cleanOverview'].values.tolist()
y_test = test['genres_vect']

In [13]:
#convert labels from array of lists to numpy array

y_train = y.tolist()
y_train = np.array(y_train)

y_test = y_test.tolist()
y_test = np.array(y_test)

Get initial word embedding vectors

In [14]:
tok = [word_tokenize(ov) for ov in x]

In [15]:
word_vec_len = 32
w2v = Word2Vec(tok, min_count = 2, size=word_vec_len)

In [16]:
num_words_kept = 100000 #using 100000 most popular words, use throughout

tokenizer = Tokenizer(num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

max_seq_len = 150 #larger than averaage but not too large

#get actual train features to feed into neural nets for training
x_train_seq = pad_sequences(sequences, maxlen=max_seq_len)

In [17]:
test_sequences = tokenizer.texts_to_sequences(x_test)
#get actual test features to feed into neural nets for testing
x_test_seq = pad_sequences(test_sequences, maxlen=max_seq_len)

Get word embeddings matrix for start input to neural net

In [18]:
#Citation: This technique to get word embeddings comes, with some minor changes, mostly from: 
#https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

embeddings_index = {}
for w in w2v.wv.vocab.keys():
    embeddings_index[w] = w2v.wv[w]


embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    if i >= num_words_kept:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Below we define evlaution metric functions

In [19]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    for genre in genre_dict.keys():
        index = genre_dict[genre]
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:,index]
        print("Accuruacy for " + genre + ": " + str(accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for " + genre + ": " + str(precision_score(real_labels_vect, prediction_vect)))
        print("Recall for " + genre + ": " + str(recall_score(real_labels_vect, prediction_vect)))
        print()

In [20]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    precision_sum = 0
    num_rows = 0
    for row in range(intersection.shape[0]):
        if sum(predictions_labels_matrix[row]) > 0: #if there is at least one prediction for this row
            num_rows += 1
            precision_sum += sum(intersection[row]) / sum(predictions_labels_matrix[row])
    if num_rows == 0:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return precision_sum / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    #sum(array.T) gets number of 1s in row
    row_wise_recall = sum(intersection.T) / sum(real_labels_matrix.T)
    return sum(row_wise_recall) / real_labels_matrix.shape[0]

#lower is better. Percent incorrectly chosen labels counting assignment and non-assignment equally
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    return (np.logical_xor(real_labels_matrix, predictions_labels_matrix)).sum()/(real_labels_matrix.shape[0] * real_labels_matrix.shape[1])


#K is what we imported keras backend as

#metric for keras for early stopping
#takes in raw labels from kerass (not yet converted to 0 and 1s)
#NOT the same as accuracy, this is total labels correctly identified divided by union of total labels
#this weights rows with more labels higher, where accruacy does not, but this is still a good metric for early stopping
def raw_multi_label_accuracy(y_true, y_pred):
    positives = K.greater_equal(y_pred, 0.5)
    positives = K.cast(positives, K.floatx())
    new_y_pred = positives #+ ((1-positives)*y_pred)
    intersection = y_true * new_y_pred
    union = 1 -((1-y_true)*(1-new_y_pred))
    accuracy = K.sum(intersection) / K.sum(union)
    return accuracy
    

In [21]:
def get_all_metrics(actual_labels, predictions):
    print('Getting evaluation metrics for each label:')
    get_per_label_metrics(actual_labels, predictions)
    print('Getting evaluations for multilabel problem')
    print('Multilabel accuracy: ' + str(multi_label_accuracy(actual_labels, predictions)))
    print('Multilabel precision: ' + str(multi_label_precision(actual_labels, predictions)))
    print('Multilabel recall: ' + str(multi_label_recall(actual_labels, predictions)))
    print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(actual_labels, predictions))))

In [22]:
#for early stopping only after certain number of epochs. wait until delay epochs until early stopping
#not same as patience. Want to not even start looking until delay is reached
class DelayedEarlyStopping(EarlyStopping):
    def __init__(self, monitor, min_delta=0, patience=0, verbose=0, mode='auto', delay = 100):
        super(DelayedEarlyStopping, self).__init__(monitor=monitor, min_delta=min_delta, patience=patience,verbose=verbose, mode=mode)
        self.delay = delay

    def on_epoch_end(self, epoch, logs=None):
        if epoch > self.delay:
            super().on_epoch_end(epoch, logs)

In [23]:
def nn_output_to_predictions(res):
    label_predictions = []
    for i in range(res.shape[0]):
        pred = [0]*len(genre_dict)
        for j in range(res.shape[1]):
            if res[i][j] >= .5:
                pred[j] = 1
        label_predictions.append(pred)
    return np.array(label_predictions)

Convolutional Neural Networks

In [24]:
model_cnn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
#e = Embedding(num_words_kept, word_vec_len, input_length=max_seq_len, trainable=True)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model_cnn.add(Dropout(.5))
model_cnn.add(Dense(len(genre_dict), activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
model_cnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 2s - loss: 0.7068 - raw_multi_label_accuracy: 0.0811 - val_loss: 0.6563 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 1s - loss: 0.6603 - raw_multi_label_accuracy: 0.0669 - val_loss: 0.6390 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/1000
 - 2s - loss: 0.6441 - raw_multi_label_accuracy: 0.0310 - val_loss: 0.6279 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/1000
 - 1s - loss: 0.6285 - raw_multi_label_accuracy: 0.0191 - val_loss: 0.6200 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/1000
 - 1s - loss: 0.6163 - raw_multi_label_accuracy: 0.0132 - val_loss: 0.6117 - val_raw_multi_label_accura

In [25]:
predictions = nn_output_to_predictions(model_cnn.predict(x_test_seq))

In [26]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6894409937888198
Precision for Action-Adventure: 0.635036496350365
Recall for Action-Adventure: 0.46524064171123

Accuruacy for Romance: 0.7805383022774327
Precision for Romance: 0.5
Recall for Romance: 0.25471698113207547

Accuruacy for Horror-Thriller: 0.639751552795031
Precision for Horror-Thriller: 0.5478723404255319
Recall for Horror-Thriller: 0.5364583333333334

Accuruacy for Comedy: 0.6376811594202898
Precision for Comedy: 0.6130952380952381
Recall for Comedy: 0.4835680751173709

Accuruacy for Science Fiction: 0.8819875776397516
Precision for Science Fiction: 0.4444444444444444
Recall for Science Fiction: 0.07142857142857142

Getting evaluations for multilabel problem
Multilabel accuracy: 0.3969979296066253
Multilabel precision: 0.5842956120092379
Multilabel recall: 0.4560041407867495
Percent of correctly decided label decisions: 72.58799171842651


CNN but with multiple filter sizes so we don't just filter on group of words at a time

In [27]:
model_input = Input(shape=(max_seq_len,), dtype='int32')
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)(model_input)
two_word_filter = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(e)
two_word_filter = GlobalMaxPooling1D()(two_word_filter)
three_word_filter = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(e)
three_word_filter = GlobalMaxPooling1D()(three_word_filter)
four_word_filter = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(e)
four_word_filter = GlobalMaxPooling1D()(four_word_filter)
merged = concatenate([two_word_filter, three_word_filter, four_word_filter], axis=1)

merged = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(merged)
merged = Dropout(0.5)(merged)
merged = Dense(len(genre_dict))(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[model_input], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 5s - loss: 2.8934 - raw_multi_label_accuracy: 0.0731 - val_loss: 2.3014 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 4s - loss: 1.9350 - raw_multi_label_accuracy: 0.0523 - val_loss: 1.5458 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/1000
 - 4s - loss: 1.3322 - raw_multi_label_accuracy: 0.0526 - val_loss: 1.1074 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/1000
 - 4s - loss: 0.9845 - raw_multi_label_accuracy: 0.0213 - val_loss: 0.8576 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/1000
 - 4s - loss: 0.7949 - raw_multi_label_accuracy: 0.0257 - val_loss: 0.7277 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 6/1000
 - 4s - loss: 0.6930 - raw_multi_label_accuracy: 0.0168 - val_loss: 0.6554 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 7/1000
 - 4s - loss: 0.6349 - raw_multi_label_accuracy: 0.0550 - val_loss: 0.6195 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 8/1000
 - 4s - loss: 0.5873 - 

In [28]:
predictions = nn_output_to_predictions(model.predict(x_test_seq))

In [29]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.7018633540372671
Precision for Action-Adventure: 0.7415730337078652
Recall for Action-Adventure: 0.35294117647058826

Accuruacy for Romance: 0.7639751552795031
Precision for Romance: 0.4090909090909091
Recall for Romance: 0.16981132075471697

Accuruacy for Horror-Thriller: 0.6107660455486542
Precision for Horror-Thriller: 0.5106382978723404
Recall for Horror-Thriller: 0.5

Accuruacy for Comedy: 0.6356107660455487
Precision for Comedy: 0.5911330049261084
Recall for Comedy: 0.5633802816901409

Accuruacy for Science Fiction: 0.8819875776397516
Precision for Science Fiction: 0.4
Recall for Science Fiction: 0.03571428571428571

Getting evaluations for multilabel problem
Multilabel accuracy: 0.3687025534851621
Multilabel precision: 0.5723270440251573
Multilabel recall: 0.41459627329192544
Percent of correctly decided label decisions: 71.8840579710145


Regular Neural Network

In [30]:
normal_nn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
normal_nn.add(e)
normal_nn.add(Flatten())
normal_nn.add(Dense(256, activation='relu'))
normal_nn.add(Dense(len(genre_dict), activation='sigmoid'))
normal_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
normal_nn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 2s - loss: 0.6184 - raw_multi_label_accuracy: 0.0674 - val_loss: 0.5949 - val_raw_multi_label_accuracy: 0.0233
Epoch 2/1000
 - 1s - loss: 0.5818 - raw_multi_label_accuracy: 0.0787 - val_loss: 0.5882 - val_raw_multi_label_accuracy: 0.0646
Epoch 3/1000
 - 1s - loss: 0.5621 - raw_multi_label_accuracy: 0.1364 - val_loss: 0.5856 - val_raw_multi_label_accuracy: 0.1140
Epoch 4/1000
 - 1s - loss: 0.5308 - raw_multi_label_accuracy: 0.2153 - val_loss: 0.5916 - val_raw_multi_label_accuracy: 0.1494
Epoch 5/1000
 - 1s - loss: 0.4795 - raw_multi_label_accuracy: 0.3448 - val_loss: 0.5865 - val_raw_multi_label_accuracy: 0.1931
Epoch 6/1000
 - 1s - loss: 0.3859 - raw_multi_label_accuracy: 0.5413 - val_loss: 0.5696 - val_raw_multi_label_accuracy: 0.2488
Epoch 7/1000
 - 1s - loss: 0.2840 - raw_multi_label_accuracy: 0.6966 - val_loss: 0.5551 - val_raw_multi_label_accuracy: 0.2434
Epoch 8/1000
 - 1s - loss: 0.1958 - raw_multi_label_accuracy: 0.

In [31]:
predictions = nn_output_to_predictions(normal_nn.predict(x_test_seq))

In [32]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6645962732919255
Precision for Action-Adventure: 0.5850340136054422
Recall for Action-Adventure: 0.45989304812834225

Accuruacy for Romance: 0.7743271221532091
Precision for Romance: 0.45454545454545453
Recall for Romance: 0.14150943396226415

Accuruacy for Horror-Thriller: 0.6356107660455487
Precision for Horror-Thriller: 0.5784313725490197
Recall for Horror-Thriller: 0.3072916666666667

Accuruacy for Comedy: 0.5734989648033126
Precision for Comedy: 0.5238095238095238
Recall for Comedy: 0.3615023474178404

Accuruacy for Science Fiction: 0.8923395445134575
Precision for Science Fiction: 0.8333333333333334
Recall for Science Fiction: 0.08928571428571429

Getting evaluations for multilabel problem
Multilabel accuracy: 0.2768115942028984
Multilabel precision: 0.5448343079922028
Multilabel recall: 0.31935817805383016
Percent of correctly decided label decisions: 70.80745341614907


LSTM

In [33]:
lstm_model = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
lstm_model.add(e)
lstm_model.add(LSTM(100, dropout=0.25, recurrent_dropout=0.25))
lstm_model.add(Dense(256, activation='relu'))
lstm_model.add(Dense(len(genre_dict), activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
lstm_model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 9s - loss: 0.6379 - raw_multi_label_accuracy: 0.0574 - val_loss: 0.5884 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 7s - loss: 0.5940 - raw_multi_label_accuracy: 0.0806 - val_loss: 0.5833 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/1000
 - 7s - loss: 0.5901 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5838 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/1000
 - 7s - loss: 0.5858 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5816 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/1000
 - 7s - loss: 0.5837 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5806 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 6/1000
 - 7s - loss: 0.5799 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5825 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 7/1000
 - 7s - loss: 0.5688 - raw_multi_label_accuracy: 0.0250 - val_loss: 0.5716 - val_raw_multi_label_accuracy: 0.0170
Epoch 8/1000
 - 7s - los

In [34]:
predictions = nn_output_to_predictions(lstm_model.predict(x_test_seq))

In [35]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6128364389233955
Precision for Action-Adventure: 0.5
Recall for Action-Adventure: 0.6524064171122995

Accuruacy for Romance: 0.7536231884057971
Precision for Romance: 0.42528735632183906
Recall for Romance: 0.3490566037735849

Accuruacy for Horror-Thriller: 0.6935817805383023
Precision for Horror-Thriller: 0.6341463414634146
Recall for Horror-Thriller: 0.5416666666666666

Accuruacy for Comedy: 0.6894409937888198
Precision for Comedy: 0.6082474226804123
Recall for Comedy: 0.8309859154929577

Accuruacy for Science Fiction: 0.8737060041407867
Precision for Science Fiction: 0.4074074074074074
Recall for Science Fiction: 0.19642857142857142

Getting evaluations for multilabel problem
Multilabel accuracy: 0.4752587991718428
Multilabel precision: 0.5819502074688798
Multilabel recall: 0.6280193236714974
Percent of correctly decided label decisions: 72.46376811594203


simple rnn

In [36]:
rnn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
rnn.add(e)
rnn.add(SimpleRNN(32, activation = 'relu'))
rnn.add(Dense(256, activation='relu'))
rnn.add(Dense(len(genre_dict), activation='sigmoid'))
rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
rnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 2s - loss: 0.6380 - raw_multi_label_accuracy: 0.0919 - val_loss: 0.5904 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 1s - loss: 0.5982 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5907 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/1000
 - 1s - loss: 0.5869 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5862 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/1000
 - 1s - loss: 0.5705 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5810 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/1000
 - 1s - loss: 0.5424 - raw_multi_label_accuracy: 0.0355 - val_loss: 0.5818 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 6/1000
 - 1s - loss: 0.4851 - raw_multi_label_accuracy: 0.2078 - val_loss: 0.5652 - val_raw_multi_label_accuracy: 0.1086
Epoch 7/1000
 - 1s - loss: 0.3807 - raw_multi_label_accuracy: 0.5266 - val_loss: 0.5684 - val_raw_multi_label_accuracy: 0.1891
Epoch 8/1000
 - 1s - loss: 0.273

In [37]:
predictions = nn_output_to_predictions(rnn.predict(x_test_seq))

In [38]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.5755693581780539
Precision for Action-Adventure: 0.449438202247191
Recall for Action-Adventure: 0.42780748663101603

Accuruacy for Romance: 0.7391304347826086
Precision for Romance: 0.3148148148148148
Recall for Romance: 0.16037735849056603

Accuruacy for Horror-Thriller: 0.5507246376811594
Precision for Horror-Thriller: 0.430939226519337
Recall for Horror-Thriller: 0.40625

Accuruacy for Comedy: 0.5652173913043478
Precision for Comedy: 0.5099337748344371
Recall for Comedy: 0.3615023474178404

Accuruacy for Science Fiction: 0.8881987577639752
Precision for Science Fiction: 0.6666666666666666
Recall for Science Fiction: 0.07142857142857142

Getting evaluations for multilabel problem
Multilabel accuracy: 0.279503105590062
Multilabel precision: 0.4498806682577566
Multilabel recall: 0.34247757073844026
Percent of correctly decided label decisions: 66.3768115942029


bidirectional-LSTM

In [39]:
bi_lstm = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
bi_lstm.add(e)
bi_lstm.add(Bidirectional(LSTM(100, dropout=0.25, recurrent_dropout=0.25)))
bi_lstm.add(Dense(256, activation='relu'))
bi_lstm.add(Dense(len(genre_dict), activation='sigmoid'))
bi_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
bi_lstm.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 1738 samples, validate on 194 samples
Epoch 1/1000
 - 17s - loss: 0.6350 - raw_multi_label_accuracy: 0.0454 - val_loss: 0.5895 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 14s - loss: 0.5946 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5831 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/1000
 - 14s - loss: 0.5894 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5841 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/1000
 - 14s - loss: 0.5858 - raw_multi_label_accuracy: 0.0000e+00 - val_loss: 0.5802 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/1000
 - 14s - loss: 0.5814 - raw_multi_label_accuracy: 3.6883e-04 - val_loss: 0.5784 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 6/1000
 - 14s - loss: 0.5649 - raw_multi_label_accuracy: 0.0442 - val_loss: 0.5572 - val_raw_multi_label_accuracy: 0.1319
Epoch 7/1000
 - 14s - loss: 0.5124 - raw_multi_label_accuracy: 0.3126 - val_loss: 0.5082 - val_raw_multi_label_accuracy: 0.2687
Epoch 8/1000
 - 14s -

In [40]:
predictions = nn_output_to_predictions(bi_lstm.predict(x_test_seq))

In [41]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.598343685300207
Precision for Action-Adventure: 0.48859934853420195
Recall for Action-Adventure: 0.8021390374331551

Accuruacy for Romance: 0.7598343685300207
Precision for Romance: 0.4418604651162791
Recall for Romance: 0.3584905660377358

Accuruacy for Horror-Thriller: 0.6832298136645962
Precision for Horror-Thriller: 0.5924170616113744
Recall for Horror-Thriller: 0.6510416666666666

Accuruacy for Comedy: 0.7267080745341615
Precision for Comedy: 0.6653061224489796
Recall for Comedy: 0.7652582159624414

Accuruacy for Science Fiction: 0.8819875776397516
Precision for Science Fiction: 0.48148148148148145
Recall for Science Fiction: 0.23214285714285715

Getting evaluations for multilabel problem
Multilabel accuracy: 0.4924775707384408
Multilabel precision: 0.5950655624568669
Multilabel recall: 0.6878881987577641
Percent of correctly decided label decisions: 73.00207039337474
