In [44]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

In [45]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)
genre_set = {'Comedy'}

In [46]:
def text_to_list(x):
    if pd.isna(x):
        return ''
    else:
        return ast.literal_eval(x)

def parse_json(x):
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except:
        return ''
    
def parse_all_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        for i in range(numElems):
            genre_set.add(json_genres[i]['name'])
    except:
        return ''
    
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #20 0s
        for i in range(numElems):
            ret[genre_dict[(json_genres[i]['name'])]] = 1
        return ret
    except:
        return ''
    

def get_labels_as_strs(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = []#20 0s
        for i in range(numElems):
            ret.append(json_genres[i]['name'])
        return ret
    except:
        return ''

In [47]:
 def getAllGenres():
    full_data = pd.read_csv('train.csv')

    y = full_data['genres']
    y.apply(parse_all_genres_json)

In [48]:
getAllGenres()#populate the genre set

In [49]:
#get set to dictionary for indexing of target vectors
genre_dict = {}
index = 0
for genre in genre_set:
    genre_dict[genre] = index
    index += 1

In [50]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret
    label_strs = y.apply(get_labels_as_strs)
    all_data['genres_labels'] = label_strs
    return ret

In [51]:
labels_vects = getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [52]:
#put to lower case, remove punctation
def cleanText(text):
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [53]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect', 'overview']]

In [54]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

CNN STUFF here

In [55]:
#get word embeddings
x = train['cleanOverview'].values.tolist()
y = train['genres_vect']

In [56]:
x_test = test['cleanOverview'].values.tolist()
y_test = test['genres_vect']

In [57]:
y_train = y.tolist()
y_train = np.array(y_train)

In [58]:
y_test = y_test.tolist()
y_test = np.array(y_test)

In [59]:
tok = [word_tokenize(sent) for sent in x]

In [60]:
word_vec_len = 32
model = Word2Vec(tok, min_count = 1, size=word_vec_len)

In [61]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_words_kept = 100000 #using 100000 most popular words, use throughout

tokenizer = Tokenizer(num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

x_train_seq = pad_sequences(sequences, maxlen=200)

In [62]:
test_sequences = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(test_sequences, maxlen=200)

In [63]:
embeddings_index = {}
for w in model.wv.vocab.keys():
    embeddings_index[w] = model.wv[w]


embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    if i >= num_words_kept:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [64]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def get_per_label_metrics(real_labels_matrix, predictions_labels_matrix):
    for genre in genre_dict.keys():
        index = genre_dict[genre]
        real_labels_vect = real_labels_matrix[:, index]
        prediction_vect = predictions_labels_matrix[:,index]
        print("Accuracy for " + genre + ": " + str(accuracy_score(real_labels_vect, prediction_vect)))
        print("Precision for " + genre + ": " + str(precision_score(real_labels_vect, prediction_vect)))
        print("Recall for " + genre + ": " + str(recall_score(real_labels_vect, prediction_vect)))
        print()

In [65]:
#size of intersection of predicted and actual labels divided by size of their union for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

#size of intersection of predicted and actual labels divided by size of predicted set for each datapoint tested on
#sum those and divide by number of datapoints
#if no predicted labels, don't count that row towards the precision as that would be undefined
def multi_label_precision(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    precision_sum = 0
    num_rows = 0
    for row in range(intersection.shape[0]):
        if sum(predictions_labels_matrix[row]) > 0: #if there is at least one prediction for this row
            num_rows += 1
            precision_sum += sum(intersection[row]) / sum(predictions_labels_matrix[row])
    if num_rows == 0:
        return 0#no labels predicted at all will give us 0 precision as precision makes no sense here
    return precision_sum / num_rows

#size of intersection of predicted and actual labels divided by size of real label set for each datapoint tested on
#sum those and divide by number of datapoints
#all datapoints should have at least 1 real label in this data set
#vectorized for speed
def multi_label_recall(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    #sum(array.T) gets number of 1s in row
    row_wise_recall = sum(intersection.T) / sum(real_labels_matrix.T)
    return sum(row_wise_recall) / real_labels_matrix.shape[0]

#lower is better
def hamming_loss(real_labels_matrix, predictions_labels_matrix):
    return (np.logical_xor(real_labels_matrix, predictions_labels_matrix)).sum()/(real_labels_matrix.shape[0] * real_labels_matrix.shape[1])

import keras.backend as K

#metric for keras for early stopping
#takes in raw labels from kerass (not yet converted to 0 and 1s)
#NOT the same as accuracy, this is total labels correctly identified divided by union of total labels
#this weights rows with more labels higher, where accruacy does not, but this is still a good metric for early stopping
def raw_multi_label_accuracy(y_true, y_pred):
    positives = K.greater_equal(y_pred, 0.5)
    positives = K.cast(positives, K.floatx())
    new_y_pred = positives #+ ((1-positives)*y_pred)
    intersection = y_true * new_y_pred
    union = 1 -((1-y_true)*(1-new_y_pred))
    accuracy = K.sum(intersection) / K.sum(union)
    return accuracy
    

In [66]:
from keras.callbacks import EarlyStopping
#for early stopping only after certain number of epochs. wait until delay epochs until early stopping
class DelayedEarlyStopping(EarlyStopping):
    def __init__(self, monitor, min_delta=0, patience=0, verbose=0, mode='auto', delay = 100):
        super(DelayedEarlyStopping, self).__init__()
        self.delay = delay

    def on_epoch_end(self, epoch, logs=None):
        if epoch > self.delay:
            super().on_epoch_end(epoch, logs)

In [67]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten, LSTM
from keras.layers.embeddings import Embedding

model_lstm = Sequential()
#e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=200, trainable=False)
#e = Embedding(num_words_kept, word_vec_len, input_length=200, trainable=True)
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=200, trainable=True)
model_lstm.add(e)
model_lstm.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_lstm.add(GlobalMaxPooling1D())
model_lstm.add(LSTM(100))
model_lstm.add(Dense(256, activation='relu'))
model_lstm.add(Dense(20, activation='sigmoid'))
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
print(model_lstm.summary())
#model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
model_lstm.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 10, delay=250)], epochs=1000, batch_size=100, verbose=2)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 200, 32)           3200000   
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 199, 100)          6500      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 99, 100)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_4 (Dense)              (None, 20)                2020      
Total params: 3,288,920
Trainable params: 3,288,920
Non-trainable params: 0
_________________________________________________________________
None
Train on 2149 samples, validate on 239 samples
Epoch 1/1000
 - 9s - loss: 0.4270 - raw_multi_label_accuracy: 0.0786 - val_loss:

Epoch 57/1000
 - 7s - loss: 0.0660 - raw_multi_label_accuracy: 0.8625 - val_loss: 0.5494 - val_raw_multi_label_accuracy: 0.1829
Epoch 58/1000
 - 7s - loss: 0.0641 - raw_multi_label_accuracy: 0.8644 - val_loss: 0.5589 - val_raw_multi_label_accuracy: 0.1826
Epoch 59/1000
 - 7s - loss: 0.0618 - raw_multi_label_accuracy: 0.8751 - val_loss: 0.5599 - val_raw_multi_label_accuracy: 0.1859
Epoch 60/1000
 - 7s - loss: 0.0602 - raw_multi_label_accuracy: 0.8817 - val_loss: 0.5686 - val_raw_multi_label_accuracy: 0.1851
Epoch 61/1000
 - 7s - loss: 0.0575 - raw_multi_label_accuracy: 0.8841 - val_loss: 0.5665 - val_raw_multi_label_accuracy: 0.1831
Epoch 62/1000
 - 7s - loss: 0.0557 - raw_multi_label_accuracy: 0.8937 - val_loss: 0.5759 - val_raw_multi_label_accuracy: 0.1680
Epoch 63/1000
 - 7s - loss: 0.0536 - raw_multi_label_accuracy: 0.9028 - val_loss: 0.5798 - val_raw_multi_label_accuracy: 0.1838
Epoch 64/1000
 - 7s - loss: 0.0516 - raw_multi_label_accuracy: 0.9086 - val_loss: 0.5832 - val_raw_multi

Epoch 121/1000
 - 7s - loss: 0.0096 - raw_multi_label_accuracy: 0.9942 - val_loss: 0.8430 - val_raw_multi_label_accuracy: 0.1856
Epoch 122/1000
 - 7s - loss: 0.0092 - raw_multi_label_accuracy: 0.9959 - val_loss: 0.8462 - val_raw_multi_label_accuracy: 0.1797
Epoch 123/1000
 - 7s - loss: 0.0090 - raw_multi_label_accuracy: 0.9957 - val_loss: 0.8473 - val_raw_multi_label_accuracy: 0.1845
Epoch 124/1000
 - 7s - loss: 0.0087 - raw_multi_label_accuracy: 0.9968 - val_loss: 0.8513 - val_raw_multi_label_accuracy: 0.1910
Epoch 125/1000
 - 7s - loss: 0.0086 - raw_multi_label_accuracy: 0.9967 - val_loss: 0.8495 - val_raw_multi_label_accuracy: 0.1916
Epoch 126/1000
 - 7s - loss: 0.0085 - raw_multi_label_accuracy: 0.9967 - val_loss: 0.8603 - val_raw_multi_label_accuracy: 0.1833
Epoch 127/1000
 - 7s - loss: 0.0083 - raw_multi_label_accuracy: 0.9957 - val_loss: 0.8543 - val_raw_multi_label_accuracy: 0.1822
Epoch 128/1000
 - 7s - loss: 0.0081 - raw_multi_label_accuracy: 0.9970 - val_loss: 0.8693 - val_r

Epoch 185/1000
 - 8s - loss: 0.0027 - raw_multi_label_accuracy: 0.9992 - val_loss: 1.0088 - val_raw_multi_label_accuracy: 0.1798
Epoch 186/1000
 - 8s - loss: 0.0028 - raw_multi_label_accuracy: 0.9983 - val_loss: 1.0117 - val_raw_multi_label_accuracy: 0.1800
Epoch 187/1000
 - 8s - loss: 0.0027 - raw_multi_label_accuracy: 0.9994 - val_loss: 1.0127 - val_raw_multi_label_accuracy: 0.1837
Epoch 188/1000
 - 8s - loss: 0.0026 - raw_multi_label_accuracy: 0.9992 - val_loss: 1.0183 - val_raw_multi_label_accuracy: 0.1828
Epoch 189/1000
 - 7s - loss: 0.0026 - raw_multi_label_accuracy: 0.9989 - val_loss: 1.0186 - val_raw_multi_label_accuracy: 0.1838
Epoch 190/1000
 - 14s - loss: 0.0025 - raw_multi_label_accuracy: 0.9992 - val_loss: 1.0165 - val_raw_multi_label_accuracy: 0.1824
Epoch 191/1000
 - 11s - loss: 0.0025 - raw_multi_label_accuracy: 0.9991 - val_loss: 1.0190 - val_raw_multi_label_accuracy: 0.1833
Epoch 192/1000
 - 12s - loss: 0.0024 - raw_multi_label_accuracy: 0.9994 - val_loss: 1.0239 - va

Epoch 249/1000
 - 9s - loss: 0.0018 - raw_multi_label_accuracy: 0.9990 - val_loss: 1.0940 - val_raw_multi_label_accuracy: 0.1825
Epoch 250/1000
 - 7s - loss: 0.0018 - raw_multi_label_accuracy: 0.9994 - val_loss: 1.0973 - val_raw_multi_label_accuracy: 0.1818
Epoch 251/1000
 - 7s - loss: 0.0017 - raw_multi_label_accuracy: 0.9992 - val_loss: 1.0978 - val_raw_multi_label_accuracy: 0.1838
Epoch 252/1000
 - 7s - loss: 0.0016 - raw_multi_label_accuracy: 0.9990 - val_loss: 1.1020 - val_raw_multi_label_accuracy: 0.1824
Epoch 253/1000
 - 6s - loss: 0.0016 - raw_multi_label_accuracy: 0.9991 - val_loss: 1.1038 - val_raw_multi_label_accuracy: 0.1838


<keras.callbacks.History at 0x122ad3f60>

In [68]:
def nn_output_to_predictions(res):
    label_predictions = []
    for i in range(res.shape[0]):
        pred = [0]*20
        for j in range(res.shape[1]):
            if res[i][j] >= .5:
                pred[j] = 1
        label_predictions.append(pred)
    return np.array(label_predictions)

In [69]:
predictions = nn_output_to_predictions(model_lstm.predict(x_test_seq))

In [70]:
y_test[:,2].sum()

27

In [71]:
predictions[:,2].sum()

11

In [72]:
multi_label_accuracy(y_test, predictions)

0.2355627343064528

In [73]:
multi_label_precision(y_test, predictions)

0.39967551622418895

In [74]:
multi_label_recall(y_test, predictions)

0.32300789662598717

In [75]:
print("Percent of correctly decided label decisions: " + str(100* (1-hamming_loss(y_test, predictions))))

Percent of correctly decided label decisions: 84.71524288107203


In [76]:
get_per_label_metrics(y_test, predictions)

Accuracy for Comedy: 0.6147403685092128
Precision for Comedy: 0.46153846153846156
Recall for Comedy: 0.3888888888888889

Accuracy for Western: 0.9782244556113903
Precision for Western: 0.0
Recall for Western: 0.0

Accuracy for Animation: 0.9430485762144054
Precision for Animation: 0.18181818181818182
Recall for Animation: 0.07407407407407407

Accuracy for TV Movie: 1.0
Precision for TV Movie: 0.0
Recall for TV Movie: 0.0

Accuracy for Foreign: 0.9849246231155779
Precision for Foreign: 0.0
Recall for Foreign: 0.0

Accuracy for Music: 0.9698492462311558
Precision for Music: 0.0
Recall for Music: 0.0

Accuracy for Adventure: 0.8157453936348409
Precision for Adventure: 0.3170731707317073
Recall for Adventure: 0.1368421052631579

Accuracy for Action: 0.6733668341708543
Precision for Action: 0.32727272727272727
Recall for Action: 0.22929936305732485

Accuracy for Fantasy: 0.8609715242881072
Precision for Fantasy: 0.04878048780487805
Recall for Fantasy: 0.043478260869565216

Accuracy for Thri

In [77]:
y_train.shape

(2388, 20)