In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN, Dense, Dropout, Flatten, Bidirectional
from keras.layers import Input, concatenate, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.regularizers import l2

stop_words = set(stopwords.words('english'))

import time

Using TensorFlow backend.


read in the data

In [2]:
# Load the raw training table and keep only the rows that have both an overview
# (the text we use) and genres (the labels). Note this drops rows, not columns.
all_data = pd.read_csv('train.csv').dropna(subset=['overview', 'genres'])

In [3]:
#parse each row to get label vectors from the serialized genre list
def parse_genres_json(x):
    """Convert one raw ``genres`` cell into a multi-hot label vector.

    Parameters
    ----------
    x : str
        Serialized list of genre records, each a dict with a ``'name'`` key.

    Returns
    -------
    list of int
        Vector of length ``len(genre_dict)`` with a 1 at the index of every
        coarse-grained genre (looked up through ``genre_map``) present in the
        cell. Genres missing from ``genre_map`` are ignored.
    str
        The empty string ``''`` when the cell cannot be parsed; the error is
        printed. ``sum('')`` is 0, so such rows are removed by the later
        "at least one label" filter.
    """
    try:
        try:
            # Parse the cell as a Python literal first. Unlike a blanket
            # quote swap, this leaves apostrophes inside genre names intact.
            genre_records = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # Legacy path: turn single quotes into double quotes and parse as JSON.
            genre_records = json.loads(x.replace("'", '"'))
        ret = [0]*len(genre_dict) #number of genres we are looking at
        for record in genre_records:
            genre_str = record['name']
            if genre_str in genre_map:
                ret[genre_dict[genre_map[genre_str]]] = 1
        return ret
    except Exception as excep:
        # Best effort: report the problem and flag the row instead of aborting the apply().
        print('Exception' + str(excep))
        return ''

Build the dictionary that maps each coarse-grained genre to its index in the label vector

In [4]:
# Position of each coarse-grained genre inside the multi-hot label vector;
# the order of this list defines the column order of every label matrix.
genre_dict = {
    genre_name: position
    for position, genre_name in enumerate([
        'Action-Adventure',
        'Romance',
        'Horror-Thriller',
        'Comedy',
        'Science Fiction',
        'Drama',
    ])
}
genre_dict

{'Action-Adventure': 0,
 'Romance': 1,
 'Horror-Thriller': 2,
 'Comedy': 3,
 'Science Fiction': 4,
 'Drama': 5}

In [5]:
#map the original fine-grained genre names onto the coarser labels used in genre_dict;
#original genres that do not appear here are ignored when the label vectors are built
genre_map = {
    'Adventure': 'Action-Adventure',
    'Romance': 'Romance',
    'Horror': 'Horror-Thriller',
    'Thriller': 'Horror-Thriller',
    'Comedy': 'Comedy',
    'Fantasy': 'Science Fiction',
    'Action': 'Action-Adventure',
    'Science Fiction': 'Science Fiction',
    'Drama': 'Drama',
}

In [6]:
def getGenresVects():
    """Add a ``genres_vect`` column to the global ``all_data`` frame.

    Every entry of the ``genres`` column is passed through
    ``parse_genres_json`` and the results are stored in place in
    ``all_data['genres_vect']``. Nothing is returned.
    """
    all_data['genres_vect'] = all_data['genres'].apply(parse_genres_json)

In [7]:
getGenresVects() #adds the 'genres_vect' column to all_data: one label vector per row, indexed by the positions in genre_dict

In [8]:
#put to lower case, remove stopwords, remove punctuation
def cleanText(text, stopword_set=None):
    """Normalise one overview string for tokenisation.

    The text is lower-cased FIRST, so that stop words are matched regardless
    of capitalisation ("The" is dropped just like "the"). Whitespace-separated
    tokens found in the stop-word set are then removed, and finally every
    character other than ASCII letters, digits and spaces is deleted.

    Punctuation is deleted rather than replaced by a space, so "well-known"
    becomes "wellknown". A token with punctuation attached (e.g. "the,") is
    compared against the stop-word set as-is and is therefore kept.

    Parameters
    ----------
    text : str
        Raw overview text.
    stopword_set : collection of str, optional
        Lower-case stop words to remove. Defaults to the module-level
        ``stop_words``.

    Returns
    -------
    str
        The cleaned, lower-case text with kept tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stop_words
    text = text.lower()
    kept_words = [w for w in text.split() if w not in stopword_set]
    text = ' '.join(kept_words)
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punctuation between words here?
    return text

#store the cleaned text in a new column; the raw 'overview' column is left as it is
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [9]:
all_data = all_data[all_data.genres_vect.map(sum) > 0] #keep only rows with at least one mapped genre; rows that failed to parse hold '' (sum 0) and are dropped too

In [10]:
#neural net data only needs a few cols: the cleaned text, the label vectors and the raw overview
nn_data = all_data[['cleanOverview', 'genres_vect', 'overview']]

In [11]:
#hold out 20% of the rows as the test set; random_state is fixed so the split is repeatable
train, test = train_test_split(nn_data, test_size=0.2, random_state=42)

Extract actual features and labels from train and test set

In [12]:
#get train and test features for classification. Just need the cleaned text and the labels for this
x = train['cleanOverview'].values.tolist()
y = train['genres_vect']
x_test = test['cleanOverview'].values.tolist()
y_test = test['genres_vect']

In [13]:
#convert the labels from pandas Series of per-row lists into numpy arrays
y_train = np.array(y.tolist())

y_test = np.array(y_test.tolist())

Get initial word embedding vectors

In [14]:
#split every training overview into a list of word tokens (the input given to Word2Vec below)
tok = [word_tokenize(ov) for ov in x]

In [15]:
word_vec_len = 32 #length of each word vector; also the width of embedding_matrix and of every Embedding layer
#train word2vec on the tokenized training overviews only
#NOTE(review): newer gensim releases appear to have renamed `size` to `vector_size` - confirm against the installed version
w2v = Word2Vec(tok, min_count = 2, size=word_vec_len)

In [16]:
num_words_kept = 100000 #using 100000 most popular words, use throughout (tokenizer, embedding matrix and Embedding layers)

#fit the word -> integer index vocabulary on the training overviews only
tokenizer = Tokenizer(num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

max_seq_len = 150 #larger than average but not too large

#get actual train features to feed into neural nets for training
x_train_seq = pad_sequences(sequences, maxlen=max_seq_len)

In [17]:
#encode the test overviews with the tokenizer that was fitted on the training text
test_sequences = tokenizer.texts_to_sequences(x_test)
#get actual test features to feed into neural nets for testing
x_test_seq = pad_sequences(test_sequences, maxlen=max_seq_len)

Build the word-embedding matrix used as the initial weights of each neural net's embedding layer

In [18]:
#Citation: This technique to get word embeddings comes, with some minor changes, mostly from:
#https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

#word -> trained word2vec vector, for every word in the word2vec vocabulary
embeddings_index = {w: w2v.wv[w] for w in w2v.wv.vocab.keys()}

#row i holds the vector of the word with tokenizer index i;
#words with no word2vec vector keep an all-zero row
embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    if i < num_words_kept:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Below we define the evaluation metric functions

In [19]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    """Print accuracy, precision and recall separately for every genre.

    For each genre in ``genre_dict`` the matching column is taken from both
    matrices and scored as an independent binary problem. A blank line is
    printed after each genre.

    Parameters
    ----------
    real_labels_matrix : 2-D array
        True 0/1 labels, one column per genre in ``genre_dict`` order.
    predictions_labels_matrix : 2-D array
        Predicted 0/1 labels with the same layout.
    """
    scorers = (
        ("Accuruacy for ", accuracy_score),
        ("Precision for ", precision_score),
        ("Recall for ", recall_score),
    )
    for genre, index in genre_dict.items():
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:, index]
        for prefix, scorer in scorers:
            print(prefix + genre + ": " + str(scorer(real_labels_vect, prediction_vect)))
        print()

In [20]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row Jaccard score between true and predicted label sets.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array-like
        Binary matrices of identical shape (rows = datapoints, columns =
        labels). Any dtype is accepted; non-zero entries count as "label set".

    Returns
    -------
    float
        Average over rows of ``|real & predicted| / |real | predicted|``.
        A row in which both sets are empty is an exact match and scores 1.0
        instead of producing NaN. An input with no rows returns 0.0.
    """
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)
    num_rows = real.shape[0]
    if num_rows == 0:
        return 0.0
    #binary so set intersection is the and operator, set union is the or operator
    intersection_sizes = (real & predicted).sum(axis=1)
    union_sizes = (real | predicted).sum(axis=1)
    #start from 1.0 so rows with an empty union (nothing real, nothing predicted) count as correct
    row_wise_accuracy = np.ones(num_rows, dtype=float)
    has_labels = union_sizes > 0
    row_wise_accuracy[has_labels] = intersection_sizes[has_labels] / union_sizes[has_labels]
    return row_wise_accuracy.sum() / num_rows

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row precision of the predicted label sets.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array-like
        Binary matrices of identical shape (rows = datapoints, columns =
        labels). Any dtype is accepted; non-zero entries count as "label set".

    Returns
    -------
    float
        Average of ``|real & predicted| / |predicted|`` over the rows that
        have at least one predicted label. Rows without predictions are
        excluded from both the sum and the row count. Returns 0.0 when no
        row has any prediction, since precision is undefined there.
    """
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)
    predicted_sizes = predicted.sum(axis=1)
    has_prediction = predicted_sizes > 0
    num_rows = int(has_prediction.sum())
    if num_rows == 0:
        return 0.0 #no labels predicted at all will give us 0 precision as precision makes no sense here
    #binary so set intersection is the and operator
    hit_sizes = (real & predicted).sum(axis=1)
    row_wise_precision = hit_sizes[has_prediction] / predicted_sizes[has_prediction]
    return row_wise_precision.sum() / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    """Mean per-row recall of the predicted label sets.

    Parameters
    ----------
    real_labels_matrix, predictions_labels_matrix : 2-D array-like
        Binary matrices of identical shape (rows = datapoints, columns =
        labels). Any dtype is accepted; non-zero entries count as "label set".

    Returns
    -------
    float
        Average of ``|real & predicted| / |real|`` over the rows that have at
        least one real label. When every row is labelled (the case for this
        data set) this is the average over all rows. Rows with no real labels
        are skipped because recall is undefined for them; this mirrors how
        ``multi_label_precision`` skips rows without predictions and avoids a
        NaN result. Returns 0.0 when no row has a real label.
    """
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)
    real_sizes = real.sum(axis=1)
    has_real_label = real_sizes > 0
    num_rows = int(has_real_label.sum())
    if num_rows == 0:
        return 0.0
    #binary so set intersection is the and operator
    hit_sizes = (real & predicted).sum(axis=1)
    row_wise_recall = hit_sizes[has_real_label] / real_sizes[has_real_label]
    return row_wise_recall.sum() / num_rows

#lower is better. Fraction of label decisions that are wrong, counting assignment and non-assignment equally
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    """Return the share of (row, label) cells where prediction and truth differ.

    Both arguments are 2-D matrices of the same shape. The number of
    mismatching cells is divided by rows * columns of the real-label matrix.
    """
    mismatch_mask = np.logical_xor(real_labels_matrix, predictions_labels_matrix)
    num_decisions = real_labels_matrix.shape[0] * real_labels_matrix.shape[1]
    return mismatch_mask.sum() / num_decisions


#K is the keras backend module (imported as K)

#metric handed to keras and monitored for early stopping
#receives the raw keras outputs (not yet converted to 0s and 1s)
#NOT the same as multi_label_accuracy: this is the total number of correctly predicted labels divided by
#the total size of the label unions, pooled over all rows it is given
#so rows with more labels weigh more, but it is still a good metric for early stopping
def raw_multi_label_accuracy(y_true, y_pred):
    """Pooled intersection-over-union of true labels and thresholded predictions.

    ``y_pred`` is thresholded at 0.5 and cast to the backend float type. The
    result is sum(true AND predicted) / sum(true OR predicted), taken over
    every element of the tensors passed in.
    """
    hard_predictions = K.cast(K.greater_equal(y_pred, 0.5), K.floatx())
    shared_labels = y_true * hard_predictions
    #a label is in the union unless it is absent from both the truth and the prediction
    any_labels = 1 -((1-y_true)*(1-hard_predictions))
    return K.sum(shared_labels) / K.sum(any_labels)
    

In [21]:
def get_all_metrics(actual_labels, predictions):
    """Print the full evaluation report for one set of predictions.

    Prints the per-genre accuracy, precision and recall first, then the
    multilabel accuracy, precision and recall, and finally the percentage of
    individual label decisions that were correct (100 * (1 - hamming loss)).

    Parameters
    ----------
    actual_labels : 2-D array
        True 0/1 label matrix.
    predictions : 2-D array
        Predicted 0/1 label matrix of the same shape.
    """
    print('Getting evaluation metrics for each label:')
    get_per_label_metrics(actual_labels, predictions)
    print('Getting evaluations for multilabel problem')
    headline_metrics = (
        ('Multilabel accuracy: ', multi_label_accuracy),
        ('Multilabel precision: ', multi_label_precision),
        ('Multilabel recall: ', multi_label_recall),
    )
    for caption, metric_fn in headline_metrics:
        print(caption + str(metric_fn(actual_labels, predictions)))
    correct_fraction = 1 - hamming_loss(actual_labels, predictions)
    print("Percent of correctly decided label decisions: " + str(100 * correct_fraction))

In [22]:
#early stopping that only starts watching after a certain number of epochs
#not the same as patience: nothing is monitored at all until the delay has passed
class DelayedEarlyStopping(EarlyStopping):
    """EarlyStopping that ignores the first epochs of training.

    While ``epoch <= delay`` the parent's ``on_epoch_end`` is not called, so
    the monitored value is neither recorded as a best value nor counted
    against ``patience``. After that it behaves like ``EarlyStopping``.

    NOTE(review): Keras appears to pass ``epoch`` as a zero-based index, in
    which case ``delay=25`` makes the 27th epoch the first one checked -
    confirm this is the intended boundary.

    Parameters
    ----------
    monitor, min_delta, patience, verbose, mode
        Forwarded unchanged to ``EarlyStopping``.
    delay : int
        Epoch index that must be exceeded before monitoring starts.
    **kwargs
        Any further ``EarlyStopping`` options supported by the installed
        Keras version (for example ``baseline`` or ``restore_best_weights``),
        forwarded unchanged.
    """
    def __init__(self, monitor, min_delta=0, patience=0, verbose=0, mode='auto', delay = 100, **kwargs):
        super().__init__(monitor=monitor, min_delta=min_delta, patience=patience, verbose=verbose, mode=mode, **kwargs)
        self.delay = delay

    def on_epoch_end(self, epoch, logs=None):
        #skip the parent's bookkeeping entirely until the delay has been exceeded
        if epoch > self.delay:
            super().on_epoch_end(epoch, logs)

In [23]:
def nn_output_to_predictions(res, threshold=.5):
    """Turn raw network scores into a 0/1 label matrix.

    Parameters
    ----------
    res : 2-D array-like
        Per-label scores, one row per datapoint and one column per label.
    threshold : float, optional
        Scores greater than or equal to this value become 1, all others 0.
        Defaults to .5.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``res``. The width is taken from
        ``res`` itself rather than from the global ``genre_dict``; the two
        agree whenever the network has one output per genre. An input with no
        rows yields an empty array that still has the label columns.
    """
    scores = np.asarray(res)
    return (scores >= threshold).astype(int)

Convolutional Neural Networks

In [24]:
#CNN with a single filter size: embeddings -> Conv1D over pairs of words -> global max pool -> dense head
model_cnn = Sequential()
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
#alternative kept for reference: the same layer without the pretrained weights
#e = Embedding(num_words_kept, word_vec_len, input_length=max_seq_len, trainable=True)
model_cnn.add(e)
#50 filters, each looking at 2 consecutive word vectors
model_cnn.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model_cnn.add(Dropout(.5))
#one sigmoid output per genre, trained with binary cross-entropy (multi-label setup)
model_cnn.add(Dense(len(genre_dict), activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
model_cnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 4s - loss: 0.6853 - raw_multi_label_accuracy: 0.1930 - val_loss: 0.6676 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 2s - loss: 0.6498 - raw_multi_label_accuracy: 0.1897 - val_loss: 0.6484 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 2s - loss: 0.6348 - raw_multi_label_accuracy: 0.1925 - val_loss: 0.6359 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 1s - loss: 0.6199 - raw_multi_label_accuracy: 0.2039 - val_loss: 0.6287 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 1s - loss: 0.6077 - raw_multi_label_accuracy: 0.2110 - val_loss: 0.6227 - val_raw_multi_label_accuracy: 0.2144
Epoch

In [25]:
#score the test sequences with the CNN and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(model_cnn.predict(x_test_seq))

In [26]:
#print per-genre and multilabel metrics for the CNN predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6805555555555556
Precision for Action-Adventure: 0.5190839694656488
Recall for Action-Adventure: 0.35978835978835977

Accuruacy for Romance: 0.7899305555555556
Precision for Romance: 0.4
Recall for Romance: 0.10434782608695652

Accuruacy for Horror-Thriller: 0.6597222222222222
Precision for Horror-Thriller: 0.4968152866242038
Recall for Horror-Thriller: 0.4

Accuruacy for Comedy: 0.6319444444444444
Precision for Comedy: 0.43902439024390244
Recall for Comedy: 0.27411167512690354

Accuruacy for Science Fiction: 0.8333333333333334
Precision for Science Fiction: 0.59375
Recall for Science Fiction: 0.18627450980392157

Accuruacy for Drama: 0.5763888888888888
Precision for Drama: 0.5539568345323741
Recall for Drama: 0.7993079584775087

Getting evaluations for multilabel problem
Multilabel accuracy: 0.368055555555556
Multilabel precision: 0.5339892665474062
Multilabel recall: 0.46200810185185187
Percent of correctly 

CNN with multiple filter sizes, so we don't filter on just one size of word group at a time

In [27]:
#multi-filter CNN built with the functional API: three parallel Conv1D branches share one embedding
model_input = Input(shape=(max_seq_len,), dtype='int32')
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)(model_input)
#each branch: 100 filters over 2, 3 or 4 consecutive words, followed by a global max pool
two_word_filter = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(e)
two_word_filter = GlobalMaxPooling1D()(two_word_filter)
three_word_filter = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(e)
three_word_filter = GlobalMaxPooling1D()(three_word_filter)
four_word_filter = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(e)
four_word_filter = GlobalMaxPooling1D()(four_word_filter)
#join the three pooled branch outputs into one feature vector
merged = concatenate([two_word_filter, three_word_filter, four_word_filter], axis=1)

merged = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(merged)
merged = Dropout(0.5)(merged)
#one output per genre, passed through a sigmoid (multi-label setup)
merged = Dense(len(genre_dict))(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[model_input], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 5s - loss: 2.8157 - raw_multi_label_accuracy: 0.1935 - val_loss: 2.1944 - val_raw_multi_label_accuracy: 0.0405
Epoch 2/1000
 - 4s - loss: 1.7938 - raw_multi_label_accuracy: 0.1713 - val_loss: 1.4232 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 4s - loss: 1.1972 - raw_multi_label_accuracy: 0.1690 - val_loss: 1.0058 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 4s - loss: 0.8880 - raw_multi_label_accuracy: 0.1874 - val_loss: 0.7986 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 4s - loss: 0.7349 - raw_multi_label_accuracy: 0.1880 - val_loss: 0.6932 - val_raw_multi_label_accuracy: 0.2182
Epoch 6/1000
 - 4s - loss: 0.6597 - raw_multi_label_accuracy: 0.1731 - val_loss: 0.6457 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 4s - loss: 0.6257 - raw_multi_label_accuracy: 0.1867 - val_loss: 0.6234 - val_raw_multi_label_accuracy: 0.2182
Epoch 8/1000
 - 4s - loss: 0.6090 - raw_multi_label_accuracy: 0.

In [28]:
#score the test sequences with the multi-filter CNN and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(model.predict(x_test_seq))

In [29]:
#print per-genre and multilabel metrics for the multi-filter CNN predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6875
Precision for Action-Adventure: 0.5436893203883495
Recall for Action-Adventure: 0.2962962962962963

Accuruacy for Romance: 0.8020833333333334
Precision for Romance: 0.5161290322580645
Recall for Romance: 0.1391304347826087

Accuruacy for Horror-Thriller: 0.6440972222222222
Precision for Horror-Thriller: 0.4752475247524752
Recall for Horror-Thriller: 0.49230769230769234

Accuruacy for Comedy: 0.6423611111111112
Precision for Comedy: 0.46853146853146854
Recall for Comedy: 0.3401015228426396

Accuruacy for Science Fiction: 0.828125
Precision for Science Fiction: 0.5555555555555556
Recall for Science Fiction: 0.14705882352941177

Accuruacy for Drama: 0.5868055555555556
Precision for Drama: 0.5698630136986301
Recall for Drama: 0.7197231833910035

Getting evaluations for multilabel problem
Multilabel accuracy: 0.3711516203703706
Multilabel precision: 0.5382249560632688
Multilabel recall: 0.46229745370370356
Per

Regular Neural Network

In [30]:
#plain feed-forward network: embeddings are flattened into one long vector and fed to a dense layer
normal_nn = Sequential()
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
normal_nn.add(e)
normal_nn.add(Flatten())
#unlike the CNN cells, this model has no dropout and no weight regularizer
normal_nn.add(Dense(256, activation='relu'))
#one sigmoid output per genre, trained with binary cross-entropy (multi-label setup)
normal_nn.add(Dense(len(genre_dict), activation='sigmoid'))
normal_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
normal_nn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 3s - loss: 0.6162 - raw_multi_label_accuracy: 0.1691 - val_loss: 0.6187 - val_raw_multi_label_accuracy: 0.2204
Epoch 2/1000
 - 2s - loss: 0.5821 - raw_multi_label_accuracy: 0.2012 - val_loss: 0.6161 - val_raw_multi_label_accuracy: 0.1789
Epoch 3/1000
 - 2s - loss: 0.5636 - raw_multi_label_accuracy: 0.2134 - val_loss: 0.6139 - val_raw_multi_label_accuracy: 0.1867
Epoch 4/1000
 - 2s - loss: 0.5422 - raw_multi_label_accuracy: 0.2509 - val_loss: 0.6158 - val_raw_multi_label_accuracy: 0.1103
Epoch 5/1000
 - 3s - loss: 0.5058 - raw_multi_label_accuracy: 0.2913 - val_loss: 0.6169 - val_raw_multi_label_accuracy: 0.1916
Epoch 6/1000
 - 2s - loss: 0.4460 - raw_multi_label_accuracy: 0.4196 - val_loss: 0.6070 - val_raw_multi_label_accuracy: 0.2100
Epoch 7/1000
 - 2s - loss: 0.3570 - raw_multi_label_accuracy: 0.5989 - val_loss: 0.6101 - val_raw_multi_label_accuracy: 0.2275
Epoch 8/1000
 - 2s - loss: 0.2638 - raw_multi_label_accuracy: 0.

In [31]:
#score the test sequences with the feed-forward network and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(normal_nn.predict(x_test_seq))

In [32]:
#print per-genre and multilabel metrics for the feed-forward predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6875
Precision for Action-Adventure: 0.5463917525773195
Recall for Action-Adventure: 0.2804232804232804

Accuruacy for Romance: 0.796875
Precision for Romance: 0.4444444444444444
Recall for Romance: 0.06956521739130435

Accuruacy for Horror-Thriller: 0.6458333333333334
Precision for Horror-Thriller: 0.4444444444444444
Recall for Horror-Thriller: 0.18461538461538463

Accuruacy for Comedy: 0.6284722222222222
Precision for Comedy: 0.4044943820224719
Recall for Comedy: 0.18274111675126903

Accuruacy for Science Fiction: 0.8333333333333334
Precision for Science Fiction: 0.59375
Recall for Science Fiction: 0.18627450980392157

Accuruacy for Drama: 0.5729166666666666
Precision for Drama: 0.5609065155807366
Recall for Drama: 0.6851211072664359

Getting evaluations for multilabel problem
Multilabel accuracy: 0.2873842592592595
Multilabel precision: 0.533571915473756
Multilabel recall: 0.3373842592592594
Percent of corr

LSTM

In [33]:
#LSTM model: embeddings -> one LSTM layer of 100 units -> dense head
lstm_model = Sequential()
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
lstm_model.add(e)
#dropout of 0.25 on both the layer inputs and the recurrent connections
lstm_model.add(LSTM(100, dropout=0.25, recurrent_dropout=0.25))
lstm_model.add(Dense(256, activation='relu'))
#one sigmoid output per genre, trained with binary cross-entropy (multi-label setup)
lstm_model.add(Dense(len(genre_dict), activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
lstm_model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 11s - loss: 0.6269 - raw_multi_label_accuracy: 0.0951 - val_loss: 0.6041 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 9s - loss: 0.5917 - raw_multi_label_accuracy: 0.2306 - val_loss: 0.6009 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 9s - loss: 0.5887 - raw_multi_label_accuracy: 0.2305 - val_loss: 0.5992 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 9s - loss: 0.5873 - raw_multi_label_accuracy: 0.2282 - val_loss: 0.5994 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 8s - loss: 0.5869 - raw_multi_label_accuracy: 0.2306 - val_loss: 0.5994 - val_raw_multi_label_accuracy: 0.2182
Epoch 6/1000
 - 8s - loss: 0.5842 - raw_multi_label_accuracy: 0.2306 - val_loss: 0.5968 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 8s - loss: 0.5802 - raw_multi_label_accuracy: 0.2077 - val_loss: 0.5940 - val_raw_multi_label_accuracy: 0.2151
Epoch 8/1000
 - 8s - loss: 0.5611 - raw_multi_label_accuracy: 0

In [34]:
#score the test sequences with the LSTM and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(lstm_model.predict(x_test_seq))

In [35]:
#print per-genre and multilabel metrics for the LSTM predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6371527777777778
Precision for Action-Adventure: 0.45535714285714285
Recall for Action-Adventure: 0.5396825396825397

Accuruacy for Romance: 0.765625
Precision for Romance: 0.4019607843137255
Recall for Romance: 0.3565217391304348

Accuruacy for Horror-Thriller: 0.6440972222222222
Precision for Horror-Thriller: 0.4742268041237113
Recall for Horror-Thriller: 0.4717948717948718

Accuruacy for Comedy: 0.6232638888888888
Precision for Comedy: 0.45535714285714285
Recall for Comedy: 0.5177664974619289

Accuruacy for Science Fiction: 0.8263888888888888
Precision for Science Fiction: 0.525
Recall for Science Fiction: 0.20588235294117646

Accuruacy for Drama: 0.5711805555555556
Precision for Drama: 0.5570652173913043
Recall for Drama: 0.7093425605536332

Getting evaluations for multilabel problem
Multilabel accuracy: 0.37777777777777816
Multilabel precision: 0.5026041666666666
Multilabel recall: 0.5389467592592593
Perc

simple rnn

In [36]:
#simple RNN model: embeddings -> one SimpleRNN layer of 32 relu units -> dense head
rnn = Sequential()
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
rnn.add(e)
rnn.add(SimpleRNN(32, activation = 'relu'))
rnn.add(Dense(256, activation='relu'))
#one sigmoid output per genre, trained with binary cross-entropy (multi-label setup)
rnn.add(Dense(len(genre_dict), activation='sigmoid'))
rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
rnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 2s - loss: 0.6230 - raw_multi_label_accuracy: 0.1786 - val_loss: 0.6098 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 1s - loss: 0.5960 - raw_multi_label_accuracy: 0.2237 - val_loss: 0.6043 - val_raw_multi_label_accuracy: 0.2165
Epoch 3/1000
 - 1s - loss: 0.5844 - raw_multi_label_accuracy: 0.2115 - val_loss: 0.6006 - val_raw_multi_label_accuracy: 0.2165
Epoch 4/1000
 - 1s - loss: 0.5719 - raw_multi_label_accuracy: 0.2292 - val_loss: 0.6005 - val_raw_multi_label_accuracy: 0.2096
Epoch 5/1000
 - 1s - loss: 0.5525 - raw_multi_label_accuracy: 0.2313 - val_loss: 0.5925 - val_raw_multi_label_accuracy: 0.2136
Epoch 6/1000
 - 1s - loss: 0.5156 - raw_multi_label_accuracy: 0.2335 - val_loss: 0.6163 - val_raw_multi_label_accuracy: 0.2081
Epoch 7/1000
 - 1s - loss: 0.4563 - raw_multi_label_accuracy: 0.3149 - val_loss: 0.6233 - val_raw_multi_label_accuracy: 0.1663
Epoch 8/1000
 - 1s - loss: 0.3712 - raw_multi_label_accuracy: 0.

In [37]:
#score the test sequences with the simple RNN and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(rnn.predict(x_test_seq))

In [38]:
#print per-genre and multilabel metrics for the simple RNN predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.5347222222222222
Precision for Action-Adventure: 0.3034825870646766
Recall for Action-Adventure: 0.32275132275132273

Accuruacy for Romance: 0.7256944444444444
Precision for Romance: 0.1791044776119403
Recall for Romance: 0.10434782608695652

Accuruacy for Horror-Thriller: 0.5798611111111112
Precision for Horror-Thriller: 0.3592814371257485
Recall for Horror-Thriller: 0.3076923076923077

Accuruacy for Comedy: 0.578125
Precision for Comedy: 0.3776595744680851
Recall for Comedy: 0.3604060913705584

Accuruacy for Science Fiction: 0.8368055555555556
Precision for Science Fiction: 0.6
Recall for Science Fiction: 0.23529411764705882

Accuruacy for Drama: 0.5381944444444444
Precision for Drama: 0.5343283582089552
Recall for Drama: 0.6193771626297578

Getting evaluations for multilabel problem
Multilabel accuracy: 0.29646990740740764
Multilabel precision: 0.42806394316163443
Multilabel recall: 0.39782986111111135
Perc

bidirectional-LSTM

In [39]:
#bidirectional LSTM model: embeddings -> LSTM of 100 units wrapped in Bidirectional -> dense head
bi_lstm = Sequential()
#embedding layer starts from embedding_matrix and keeps training (trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
bi_lstm.add(e)
#same LSTM settings as the one-directional model, wrapped in Bidirectional
bi_lstm.add(Bidirectional(LSTM(100, dropout=0.25, recurrent_dropout=0.25)))
bi_lstm.add(Dense(256, activation='relu'))
#one sigmoid output per genre, trained with binary cross-entropy (multi-label setup)
bi_lstm.add(Dense(len(genre_dict), activation='sigmoid'))
bi_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
#10% of the training rows are held out for validation; early stopping watches the validation
#raw_multi_label_accuracy with patience 5, but only after the delay of 25 epochs
bi_lstm.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
#NOTE: the message says 'cross validation', but the fit above uses a single held-out validation split
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 19s - loss: 0.6259 - raw_multi_label_accuracy: 0.2322 - val_loss: 0.6059 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 16s - loss: 0.5924 - raw_multi_label_accuracy: 0.1620 - val_loss: 0.6024 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 16s - loss: 0.5883 - raw_multi_label_accuracy: 0.2300 - val_loss: 0.5995 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 16s - loss: 0.5876 - raw_multi_label_accuracy: 0.2304 - val_loss: 0.5995 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 16s - loss: 0.5864 - raw_multi_label_accuracy: 0.2276 - val_loss: 0.5996 - val_raw_multi_label_accuracy: 0.2182
Epoch 6/1000
 - 16s - loss: 0.5839 - raw_multi_label_accuracy: 0.2294 - val_loss: 0.5973 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 16s - loss: 0.5747 - raw_multi_label_accuracy: 0.2141 - val_loss: 0.5837 - val_raw_multi_label_accuracy: 0.1728
Epoch 8/1000
 - 16s - loss: 0.5362 - raw_multi_label_accu

In [40]:
#score the test sequences with the bidirectional LSTM and convert its outputs to 0/1 label vectors
predictions = nn_output_to_predictions(bi_lstm.predict(x_test_seq))

In [41]:
#print per-genre and multilabel metrics for the bidirectional LSTM predictions against the test labels
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6666666666666666
Precision for Action-Adventure: 0.4911242603550296
Recall for Action-Adventure: 0.43915343915343913

Accuruacy for Romance: 0.78125
Precision for Romance: 0.4266666666666667
Recall for Romance: 0.2782608695652174

Accuruacy for Horror-Thriller: 0.6493055555555556
Precision for Horror-Thriller: 0.48717948717948717
Recall for Horror-Thriller: 0.6820512820512821

Accuruacy for Comedy: 0.6336805555555556
Precision for Comedy: 0.46534653465346537
Recall for Comedy: 0.47715736040609136

Accuruacy for Science Fiction: 0.8246527777777778
Precision for Science Fiction: 0.5106382978723404
Recall for Science Fiction: 0.23529411764705882

Accuruacy for Drama: 0.5520833333333334
Precision for Drama: 0.547112462006079
Recall for Drama: 0.6228373702422145

Getting evaluations for multilabel problem
Multilabel accuracy: 0.3789351851851858
Multilabel precision: 0.5147569444444445
Multilabel recall: 0.525810185