In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn import utils
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.layers import Conv1D, GlobalMaxPooling1D, LSTM, SimpleRNN, Dense, Dropout, Flatten, Bidirectional
from keras.layers import Input, concatenate, Activation
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.regularizers import l2

stop_words = set(stopwords.words('english'))

import time

Using TensorFlow backend.


read in the data

In [2]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)

In [3]:
#parse each row to get label vectors from json
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #number of genres we are looking at
        for i in range(numElems):
            genre_str = (json_genres[i]['name'])
            if genre_str in genre_map.keys():
                ret[genre_dict[genre_map[genre_str]]] = 1
        return ret
    except Exception as excep:
        print('Exception' + str(excep))
        return ''

Get dictionary for genre to its index in label vector

In [4]:
genre_dict = {}
genre_dict['Action-Adventure'] = 0
genre_dict['Romance'] = 1
genre_dict['Horror-Thriller'] = 2
genre_dict['Comedy'] = 3
genre_dict['Science Fiction'] = 4
genre_dict['Drama'] = 5
genre_dict

{'Action-Adventure': 0,
 'Romance': 1,
 'Horror-Thriller': 2,
 'Comedy': 3,
 'Science Fiction': 4,
 'Drama': 5}

In [5]:
#map original labels to more coarse grained labels
genre_map = {}
genre_map['Adventure'] = 'Action-Adventure'
genre_map['Romance'] = 'Romance'
genre_map['Horror'] = 'Horror-Thriller'
genre_map['Thriller'] = 'Horror-Thriller'
genre_map['Comedy'] = 'Comedy'
genre_map['Fantasy'] = 'Science Fiction'
genre_map['Action'] = 'Action-Adventure'
genre_map['Science Fiction'] = 'Science Fiction'
genre_map['Drama'] = 'Drama'

In [6]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret

In [7]:
getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [8]:
#put to lower case, remove punctation, remove stopwords
def cleanText(text):
    no_stopword_text = [w for w in text.split() if not w in stop_words]
    text = ' '.join(no_stopword_text)
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text

all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [9]:
all_data = all_data[all_data.genres_vect.map(sum) > 0] #drop rows that now have no labels 

In [10]:
#neural net data only needs a few cols
nn_data = all_data[['cleanOverview', 'genres_vect', 'overview']]

In [11]:
train, test = train_test_split(nn_data, test_size=0.2, random_state=42)

Extract actual features and labels from train and test set

In [12]:
#gettrian and test features for classification. Just need text and lables for this
x = train['cleanOverview'].values.tolist()
y = train['genres_vect']
x_test = test['cleanOverview'].values.tolist()
y_test = test['genres_vect']

In [13]:
#convert labels from array of lists to numpy array

y_train = y.tolist()
y_train = np.array(y_train)

y_test = y_test.tolist()
y_test = np.array(y_test)

Get initial word embedding vectors

In [14]:
tok = [word_tokenize(ov) for ov in x]

In [15]:
word_vec_len = 32
w2v = Word2Vec(tok, min_count = 2, size=word_vec_len)

In [16]:
num_words_kept = 100000 #using 100000 most popular words, use throughout

tokenizer = Tokenizer(num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

max_seq_len = 150 #larger than averaage but not too large

#get actual train features to feed into neural nets for training
x_train_seq = pad_sequences(sequences, maxlen=max_seq_len)

In [17]:
test_sequences = tokenizer.texts_to_sequences(x_test)
#get actual test features to feed into neural nets for testing
x_test_seq = pad_sequences(test_sequences, maxlen=max_seq_len)

Get word embeddings matrix for start input to neural net

In [18]:
#Citation: This technique to get word embeddings comes, with some minor changes, mostly from: 
#https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

embeddings_index = {}
for w in w2v.wv.vocab.keys():
    embeddings_index[w] = w2v.wv[w]


embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    if i >= num_words_kept:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Below we define evlaution metric functions

In [19]:
def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    for genre in genre_dict.keys():
        index = genre_dict[genre]
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:,index]
        print("Accuruacy for " + genre + ": " + str(accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for " + genre + ": " + str(precision_score(real_labels_vect, prediction_vect)))
        print("Recall for " + genre + ": " + str(recall_score(real_labels_vect, prediction_vect)))
        print()

In [20]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    precision_sum = 0
    num_rows = 0
    for row in range(intersection.shape[0]):
        if sum(predictions_labels_matrix[row]) > 0: #if there is at least one prediction for this row
            num_rows += 1
            precision_sum += sum(intersection[row]) / sum(predictions_labels_matrix[row])
    if num_rows == 0:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return precision_sum / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    #sum(array.T) gets number of 1s in row
    row_wise_recall = sum(intersection.T) / sum(real_labels_matrix.T)
    return sum(row_wise_recall) / real_labels_matrix.shape[0]

#lower is better. Percent incorrectly chosen labels counting assignment and non-assignment equally
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    return (np.logical_xor(real_labels_matrix, predictions_labels_matrix)).sum()/(real_labels_matrix.shape[0] * real_labels_matrix.shape[1])


#K is what we imported keras backend as

#metric for keras for early stopping
#takes in raw labels from kerass (not yet converted to 0 and 1s)
#NOT the same as accuracy, this is total labels correctly identified divided by union of total labels
#this weights rows with more labels higher, where accruacy does not, but this is still a good metric for early stopping
def raw_multi_label_accuracy(y_true, y_pred):
    positives = K.greater_equal(y_pred, 0.5)
    positives = K.cast(positives, K.floatx())
    new_y_pred = positives #+ ((1-positives)*y_pred)
    intersection = y_true * new_y_pred
    union = 1 -((1-y_true)*(1-new_y_pred))
    accuracy = K.sum(intersection) / K.sum(union)
    return accuracy
    

In [21]:
def get_all_metrics(actual_labels, predictions):
    print('Getting evaluation metrics for each label:')
    get_per_label_metrics(actual_labels, predictions)
    print('Getting evaluations for multilabel problem')
    print('Multilabel accuracy: ' + str(multi_label_accuracy(actual_labels, predictions)))
    print('Multilabel precision: ' + str(multi_label_precision(actual_labels, predictions)))
    print('Multilabel recall: ' + str(multi_label_recall(actual_labels, predictions)))
    print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(actual_labels, predictions))))

In [22]:
#for early stopping only after certain number of epochs. wait until delay epochs until early stopping
#not same as patience. Want to not even start looking until delay is reached
class DelayedEarlyStopping(EarlyStopping):
    def __init__(self, monitor, min_delta=0, patience=0, verbose=0, mode='auto', delay = 100):
        super(DelayedEarlyStopping, self).__init__(monitor=monitor, min_delta=min_delta, patience=patience,verbose=verbose, mode=mode)
        self.delay = delay

    def on_epoch_end(self, epoch, logs=None):
        if epoch > self.delay:
            super().on_epoch_end(epoch, logs)

In [23]:
def nn_output_to_predictions(res):
    label_predictions = []
    for i in range(res.shape[0]):
        pred = [0]*len(genre_dict)
        for j in range(res.shape[1]):
            if res[i][j] >= .5:
                pred[j] = 1
        label_predictions.append(pred)
    return np.array(label_predictions)

Convolutional Neural Networks

In [24]:
model_cnn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
#e = Embedding(num_words_kept, word_vec_len, input_length=max_seq_len, trainable=True)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model_cnn.add(Dropout(.5))
model_cnn.add(Dense(len(genre_dict), activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
model_cnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 3s - loss: 0.6957 - raw_multi_label_accuracy: 0.1961 - val_loss: 0.6663 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 1s - loss: 0.6530 - raw_multi_label_accuracy: 0.1983 - val_loss: 0.6500 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 2s - loss: 0.6356 - raw_multi_label_accuracy: 0.1894 - val_loss: 0.6384 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 1s - loss: 0.6266 - raw_multi_label_accuracy: 0.1906 - val_loss: 0.6309 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 1s - loss: 0.6139 - raw_multi_label_accuracy: 0.2056 - val_loss: 0.6258 - val_raw_multi_label_accuracy: 0.2054
Epoch

In [25]:
predictions = nn_output_to_predictions(model_cnn.predict(x_test_seq))

In [26]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6822916666666666
Precision for Action-Adventure: 0.5211267605633803
Recall for Action-Adventure: 0.3915343915343915

Accuruacy for Romance: 0.8090277777777778
Precision for Romance: 0.5925925925925926
Recall for Romance: 0.1391304347826087

Accuruacy for Horror-Thriller: 0.6822916666666666
Precision for Horror-Thriller: 0.5337078651685393
Recall for Horror-Thriller: 0.48717948717948717

Accuruacy for Comedy: 0.6302083333333334
Precision for Comedy: 0.43846153846153846
Recall for Comedy: 0.2893401015228426

Accuruacy for Science Fiction: 0.796875
Precision for Science Fiction: 0.35294117647058826
Recall for Science Fiction: 0.17647058823529413

Accuruacy for Drama: 0.5347222222222222
Precision for Drama: 0.5287671232876713
Recall for Drama: 0.6678200692041523

Getting evaluations for multilabel problem
Multilabel accuracy: 0.35795717592592635
Multilabel precision: 0.5242805755395683
Multilabel recall: 0.4453703

CNN but with multiple filter sizes so we don't just filter on group of words at a time

In [27]:
model_input = Input(shape=(max_seq_len,), dtype='int32')
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)(model_input)
two_word_filter = Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1)(e)
two_word_filter = GlobalMaxPooling1D()(two_word_filter)
three_word_filter = Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)(e)
three_word_filter = GlobalMaxPooling1D()(three_word_filter)
four_word_filter = Conv1D(filters=100, kernel_size=4, padding='valid', activation='relu', strides=1)(e)
four_word_filter = GlobalMaxPooling1D()(four_word_filter)
merged = concatenate([two_word_filter, three_word_filter, four_word_filter], axis=1)

merged = Dense(256, activation='relu', kernel_regularizer=l2(0.01))(merged)
merged = Dropout(0.5)(merged)
merged = Dense(len(genre_dict))(merged)
output = Activation('sigmoid')(merged)
model = Model(inputs=[model_input], outputs=[output])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 5s - loss: 2.8385 - raw_multi_label_accuracy: 0.1833 - val_loss: 2.2222 - val_raw_multi_label_accuracy: 0.1882
Epoch 2/1000
 - 4s - loss: 1.8214 - raw_multi_label_accuracy: 0.1738 - val_loss: 1.4537 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 4s - loss: 1.2267 - raw_multi_label_accuracy: 0.1739 - val_loss: 1.0323 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 4s - loss: 0.9104 - raw_multi_label_accuracy: 0.1778 - val_loss: 0.8161 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 4s - loss: 0.7506 - raw_multi_label_accuracy: 0.1647 - val_loss: 0.7073 - val_raw_multi_label_accuracy: 0.2182
Epoch 6/1000
 - 4s - loss: 0.6687 - raw_multi_label_accuracy: 0.1931 - val_loss: 0.6565 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 4s - loss: 0.6304 - raw_multi_label_accuracy: 0.1788 - val_loss: 0.6329 - val_raw_multi_label_accuracy: 0.2182
Epoch 8/1000
 - 4s - loss: 0.6112 - raw_multi_label_accuracy: 0.

In [28]:
predictions = nn_output_to_predictions(model.predict(x_test_seq))

In [29]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6475694444444444
Precision for Action-Adventure: 0.4435483870967742
Recall for Action-Adventure: 0.291005291005291

Accuruacy for Romance: 0.78125
Precision for Romance: 0.37209302325581395
Recall for Romance: 0.1391304347826087

Accuruacy for Horror-Thriller: 0.6493055555555556
Precision for Horror-Thriller: 0.47586206896551725
Recall for Horror-Thriller: 0.35384615384615387

Accuruacy for Comedy: 0.6302083333333334
Precision for Comedy: 0.4411764705882353
Recall for Comedy: 0.30456852791878175

Accuruacy for Science Fiction: 0.8020833333333334
Precision for Science Fiction: 0.34210526315789475
Recall for Science Fiction: 0.12745098039215685

Accuruacy for Drama: 0.5972222222222222
Precision for Drama: 0.5780821917808219
Recall for Drama: 0.7301038062283737

Getting evaluations for multilabel problem
Multilabel accuracy: 0.33486689814814846
Multilabel precision: 0.5137614678899081
Multilabel recall: 0.4154803

Regular Neural Network

In [30]:
normal_nn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
normal_nn.add(e)
normal_nn.add(Flatten())
normal_nn.add(Dense(256, activation='relu'))
normal_nn.add(Dense(len(genre_dict), activation='sigmoid'))
normal_nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
normal_nn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 3s - loss: 0.6258 - raw_multi_label_accuracy: 0.1908 - val_loss: 0.6171 - val_raw_multi_label_accuracy: 0.1715
Epoch 2/1000
 - 3s - loss: 0.5864 - raw_multi_label_accuracy: 0.2006 - val_loss: 0.6180 - val_raw_multi_label_accuracy: 0.1390
Epoch 3/1000
 - 3s - loss: 0.5677 - raw_multi_label_accuracy: 0.2072 - val_loss: 0.6183 - val_raw_multi_label_accuracy: 0.1429
Epoch 4/1000
 - 2s - loss: 0.5468 - raw_multi_label_accuracy: 0.2407 - val_loss: 0.6151 - val_raw_multi_label_accuracy: 0.1447
Epoch 5/1000
 - 2s - loss: 0.5192 - raw_multi_label_accuracy: 0.2853 - val_loss: 0.6185 - val_raw_multi_label_accuracy: 0.1270
Epoch 6/1000
 - 2s - loss: 0.4734 - raw_multi_label_accuracy: 0.3786 - val_loss: 0.6192 - val_raw_multi_label_accuracy: 0.2058
Epoch 7/1000
 - 3s - loss: 0.3983 - raw_multi_label_accuracy: 0.5333 - val_loss: 0.6267 - val_raw_multi_label_accuracy: 0.1919
Epoch 8/1000
 - 3s - loss: 0.3036 - raw_multi_label_accuracy: 0.

In [31]:
predictions = nn_output_to_predictions(normal_nn.predict(x_test_seq))

In [32]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6805555555555556
Precision for Action-Adventure: 0.52
Recall for Action-Adventure: 0.3439153439153439

Accuruacy for Romance: 0.8055555555555556
Precision for Romance: 0.6
Recall for Romance: 0.0782608695652174

Accuruacy for Horror-Thriller: 0.6961805555555556
Precision for Horror-Thriller: 0.6428571428571429
Recall for Horror-Thriller: 0.23076923076923078

Accuruacy for Comedy: 0.6545138888888888
Precision for Comedy: 0.4880952380952381
Recall for Comedy: 0.20812182741116753

Accuruacy for Science Fiction: 0.8298611111111112
Precision for Science Fiction: 0.6
Recall for Science Fiction: 0.11764705882352941

Accuruacy for Drama: 0.578125
Precision for Drama: 0.5727848101265823
Recall for Drama: 0.6262975778546713

Getting evaluations for multilabel problem
Multilabel accuracy: 0.29800347222222245
Multilabel precision: 0.5701754385964913
Multilabel recall: 0.34366319444444465
Percent of correctly decided label

LSTM

In [33]:
lstm_model = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
lstm_model.add(e)
lstm_model.add(LSTM(100, dropout=0.25, recurrent_dropout=0.25))
lstm_model.add(Dense(256, activation='relu'))
lstm_model.add(Dense(len(genre_dict), activation='sigmoid'))
lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
lstm_model.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 12s - loss: 0.6253 - raw_multi_label_accuracy: 0.2010 - val_loss: 0.6029 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 10s - loss: 0.5916 - raw_multi_label_accuracy: 0.2310 - val_loss: 0.5991 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 10s - loss: 0.5886 - raw_multi_label_accuracy: 0.1698 - val_loss: 0.6006 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 8s - loss: 0.5873 - raw_multi_label_accuracy: 0.2301 - val_loss: 0.5994 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 10s - loss: 0.5856 - raw_multi_label_accuracy: 0.2289 - val_loss: 0.5970 - val_raw_multi_label_accuracy: 0.0972
Epoch 6/1000
 - 11s - loss: 0.5838 - raw_multi_label_accuracy: 0.1659 - val_loss: 0.5984 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 11s - loss: 0.5728 - raw_multi_label_accuracy: 0.2094 - val_loss: 0.5840 - val_raw_multi_label_accuracy: 0.1375
Epoch 8/1000
 - 10s - loss: 0.5341 - raw_multi_label_accur

In [34]:
predictions = nn_output_to_predictions(lstm_model.predict(x_test_seq))

In [35]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6840277777777778
Precision for Action-Adventure: 0.519774011299435
Recall for Action-Adventure: 0.48677248677248675

Accuruacy for Romance: 0.7673611111111112
Precision for Romance: 0.4020618556701031
Recall for Romance: 0.3391304347826087

Accuruacy for Horror-Thriller: 0.625
Precision for Horror-Thriller: 0.452914798206278
Recall for Horror-Thriller: 0.517948717948718

Accuruacy for Comedy: 0.5885416666666666
Precision for Comedy: 0.4090909090909091
Recall for Comedy: 0.45685279187817257

Accuruacy for Science Fiction: 0.8246527777777778
Precision for Science Fiction: 0.5102040816326531
Recall for Science Fiction: 0.24509803921568626

Accuruacy for Drama: 0.5555555555555556
Precision for Drama: 0.5435356200527705
Recall for Drama: 0.71280276816609

Getting evaluations for multilabel problem
Multilabel accuracy: 0.37702546296296346
Multilabel precision: 0.5014178240740741
Multilabel recall: 0.5313657407407409

simple rnn

In [36]:
rnn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
rnn.add(e)
rnn.add(SimpleRNN(32, activation = 'relu'))
rnn.add(Dense(256, activation='relu'))
rnn.add(Dense(len(genre_dict), activation='sigmoid'))
rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
rnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 3s - loss: 0.6468 - raw_multi_label_accuracy: 0.2246 - val_loss: 0.6168 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 2s - loss: 0.6027 - raw_multi_label_accuracy: 0.2304 - val_loss: 0.6056 - val_raw_multi_label_accuracy: 0.2182
Epoch 3/1000
 - 1s - loss: 0.5882 - raw_multi_label_accuracy: 0.2309 - val_loss: 0.5997 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 2s - loss: 0.5748 - raw_multi_label_accuracy: 0.2294 - val_loss: 0.6012 - val_raw_multi_label_accuracy: 0.2182
Epoch 5/1000
 - 1s - loss: 0.5562 - raw_multi_label_accuracy: 0.2317 - val_loss: 0.5998 - val_raw_multi_label_accuracy: 0.2191
Epoch 6/1000
 - 2s - loss: 0.5218 - raw_multi_label_accuracy: 0.2316 - val_loss: 0.5906 - val_raw_multi_label_accuracy: 0.1814
Epoch 7/1000
 - 2s - loss: 0.4772 - raw_multi_label_accuracy: 0.2936 - val_loss: 0.5967 - val_raw_multi_label_accuracy: 0.2027
Epoch 8/1000
 - 1s - loss: 0.4180 - raw_multi_label_accuracy: 0.

In [37]:
predictions = nn_output_to_predictions(rnn.predict(x_test_seq))

In [38]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.59375
Precision for Action-Adventure: 0.38219895287958117
Recall for Action-Adventure: 0.3862433862433862

Accuruacy for Romance: 0.7638888888888888
Precision for Romance: 0.26666666666666666
Recall for Romance: 0.10434782608695652

Accuruacy for Horror-Thriller: 0.6371527777777778
Precision for Horror-Thriller: 0.4533333333333333
Recall for Horror-Thriller: 0.3487179487179487

Accuruacy for Comedy: 0.5555555555555556
Precision for Comedy: 0.3254437869822485
Recall for Comedy: 0.27918781725888325

Accuruacy for Science Fiction: 0.8090277777777778
Precision for Science Fiction: 0.43333333333333335
Recall for Science Fiction: 0.2549019607843137

Accuruacy for Drama: 0.5590277777777778
Precision for Drama: 0.5609756097560976
Recall for Drama: 0.5570934256055363

Getting evaluations for multilabel problem
Multilabel accuracy: 0.2987268518518518
Multilabel precision: 0.4647077114427863
Multilabel recall: 0.39328703

bidirectional-LSTM

In [39]:
bi_lstm = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=max_seq_len, trainable=True)
bi_lstm.add(e)
bi_lstm.add(Bidirectional(LSTM(100, dropout=0.25, recurrent_dropout=0.25)))
bi_lstm.add(Dense(256, activation='relu'))
bi_lstm.add(Dense(len(genre_dict), activation='sigmoid'))
bi_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
start = time.time()
bi_lstm.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 5, delay=25)], epochs=1000, batch_size=100, verbose=2)
end = time.time()
print('Time to train with cross validation for early stopping: ' + str(end-start) + ' seconds')

Train on 2072 samples, validate on 231 samples
Epoch 1/1000
 - 23s - loss: 0.6218 - raw_multi_label_accuracy: 0.2025 - val_loss: 0.6042 - val_raw_multi_label_accuracy: 0.2182
Epoch 2/1000
 - 17s - loss: 0.5917 - raw_multi_label_accuracy: 0.2010 - val_loss: 0.5999 - val_raw_multi_label_accuracy: 0.2139
Epoch 3/1000
 - 17s - loss: 0.5893 - raw_multi_label_accuracy: 0.2286 - val_loss: 0.6006 - val_raw_multi_label_accuracy: 0.2182
Epoch 4/1000
 - 17s - loss: 0.5874 - raw_multi_label_accuracy: 0.2241 - val_loss: 0.5999 - val_raw_multi_label_accuracy: 0.1748
Epoch 5/1000
 - 19s - loss: 0.5871 - raw_multi_label_accuracy: 0.2224 - val_loss: 0.5988 - val_raw_multi_label_accuracy: 0.2182
Epoch 6/1000
 - 19s - loss: 0.5844 - raw_multi_label_accuracy: 0.2307 - val_loss: 0.5979 - val_raw_multi_label_accuracy: 0.2182
Epoch 7/1000
 - 19s - loss: 0.5771 - raw_multi_label_accuracy: 0.2150 - val_loss: 0.5919 - val_raw_multi_label_accuracy: 0.2167
Epoch 8/1000
 - 18s - loss: 0.5533 - raw_multi_label_accu

In [40]:
predictions = nn_output_to_predictions(bi_lstm.predict(x_test_seq))

In [41]:
get_all_metrics(y_test, predictions)

Getting evaluation metrics for each label:
Accuruacy for Action-Adventure: 0.6493055555555556
Precision for Action-Adventure: 0.47346938775510206
Recall for Action-Adventure: 0.6137566137566137

Accuruacy for Romance: 0.8107638888888888
Precision for Romance: 0.5789473684210527
Recall for Romance: 0.19130434782608696

Accuruacy for Horror-Thriller: 0.6736111111111112
Precision for Horror-Thriller: 0.5207100591715976
Recall for Horror-Thriller: 0.4512820512820513

Accuruacy for Comedy: 0.6215277777777778
Precision for Comedy: 0.46511627906976744
Recall for Comedy: 0.7106598984771574

Accuruacy for Science Fiction: 0.8038194444444444
Precision for Science Fiction: 0.4126984126984127
Recall for Science Fiction: 0.2549019607843137

Accuruacy for Drama: 0.5972222222222222
Precision for Drama: 0.5789473684210527
Recall for Drama: 0.7231833910034602

Getting evaluations for multilabel problem
Multilabel accuracy: 0.4102430555555559
Multilabel precision: 0.5370370370370371
Multilabel recall: 0