In [1]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

In [2]:
all_data = pd.read_csv('train.csv')
all_data = all_data.dropna(subset=['overview', 'genres']) #drop cols without overview or genre (data we use or labels)
genre_set = {'Comedy'}

In [3]:
def text_to_list(x):
    if pd.isna(x):
        return ''
    else:
        return ast.literal_eval(x)

def parse_json(x):
    try:
        return json.loads(x.replace("'", '"'))[0]['name']
    except:
        return ''
    
def parse_all_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        for i in range(numElems):
            genre_set.add(json_genres[i]['name'])
    except:
        return ''
    
def parse_genres_json(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = [0]*len(genre_dict) #20 0s
        for i in range(numElems):
            ret[genre_dict[(json_genres[i]['name'])]] = 1
        return ret
    except:
        return ''
    

def get_labels_as_strs(x):
    try:
        json_genres = json.loads(x.replace("'", '"'))
        numElems = len(json_genres)
        ret = []#20 0s
        for i in range(numElems):
            ret.append(json_genres[i]['name'])
        return ret
    except:
        return ''

In [4]:
 def getAllGenres():
    full_data = pd.read_csv('train.csv')

    y = full_data['genres']
    y.apply(parse_all_genres_json)

In [5]:
getAllGenres()#populate the genre set

In [6]:
#get set to dictionary for indexing of target vectors
genre_dict = {}
index = 0
for genre in genre_set:
    genre_dict[genre] = index
    index += 1

In [7]:
def getGenresVects():
    y = all_data['genres']
    ret = y.apply(parse_genres_json)
    all_data['genres_vect'] = ret
    label_strs = y.apply(get_labels_as_strs)
    all_data['genres_labels'] = label_strs
    return ret

In [8]:
labels_vects = getGenresVects() #get label vectors for genres indexed by indexes in genre_dict

In [9]:
#put to lower case, remove punctation
def cleanText(text):
    text = re.sub(r'[^a-z A-Z0-9]', "", text) #maybe shouldn't remove punction between words here?
    text = text.lower()
    return text
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [10]:
#logistic regression data
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect', 'overview']]

In [11]:
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

CNN STUFF here

In [12]:
#get word embeddings
x = train['cleanOverview'].values.tolist()
y = train['genres_vect']

In [13]:
x_test = test['cleanOverview'].values.tolist()
y_test = test['genres_vect']

In [14]:
y_train = y.tolist()
y_train = np.array(y_train)

In [15]:
y_test = y_test.tolist()
y_test = np.array(y_test)

In [16]:
tok = [word_tokenize(sent) for sent in x]

In [17]:
word_vec_len = 32
model = Word2Vec(tok, min_count = 1, size=word_vec_len)

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

num_words_kept = 100000 #using 100000 most popular words, use throughout

tokenizer = Tokenizer(num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)

x_train_seq = pad_sequences(sequences, maxlen=200)

Using TensorFlow backend.


In [19]:
test_sequences = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(test_sequences, maxlen=200)

In [20]:
embeddings_index = {}
for w in model.wv.vocab.keys():
    embeddings_index[w] = model.wv[w]


embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    if i >= num_words_kept:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [21]:
#intersection of predicted and actual labels divided by uniont for each datapoint tested on
#sum those and then divide by number of datapoints
#vectorized for speed
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    #binary so set intersection is and operator
    intersection = real_labels_matrix & predictions_labels_matrix
    #set union for binary is same as or operator
    union = real_labels_matrix | predictions_labels_matrix
    #sum(array.T) gets number of 1s in row
    row_wise_accuracy = sum(intersection.T) / sum(union.T)
    return sum(row_wise_accuracy) / real_labels_matrix.shape[0]

import keras.backend as K

#metric for keras for early stopping
#takes in raw labels from kerass (not yet converted to 0 and 1s)
#NOT the same as accuracy, this is total labels correctly identified divided by union of total labels
#this weights rows with more labels higher, where accruacy does not, but this is still a good metric for early stopping
def raw_multi_label_accuracy(y_true, y_pred):
    positives = K.greater_equal(y_pred, 0.5)
    positives = K.cast(positives, K.floatx())
    new_y_pred = positives #+ ((1-positives)*y_pred)
    intersection = y_true * new_y_pred
    union = 1 -((1-y_true)*(1-new_y_pred))
    accuracy = K.sum(intersection) / K.sum(union)
    return accuracy
    

In [22]:
from keras.callbacks import EarlyStopping
#for early stopping only after certain number of epochs. wait until delay epochs until early stopping
class DelayedEarlyStopping(EarlyStopping):
    def __init__(self, monitor, min_delta=0, patience=0, verbose=0, mode='auto', delay = 100):
        super(DelayedEarlyStopping, self).__init__()
        self.delay = delay

    def on_epoch_end(self, epoch, logs=None):
        if epoch > self.delay:
            super().on_epoch_end(epoch, logs)

In [23]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

model_cnn = Sequential()
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix], input_length=200, trainable=False)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu'))
model_cnn.add(Dense(20, activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
#model_cnn_01.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
model_cnn.fit(x_train_seq, y_train, validation_split = .1, callbacks = [DelayedEarlyStopping(monitor = 'val_raw_multi_label_accuracy', patience = 10, delay=500)], epochs=1000, batch_size=100, verbose=2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 2149 samples, validate on 239 samples
Epoch 1/1000
 - 2s - loss: 0.4166 - raw_multi_label_accuracy: 0.0872 - val_loss: 0.3231 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 2/1000
 - 1s - loss: 0.3163 - raw_multi_label_accuracy: 0.0688 - val_loss: 0.3158 - val_raw_multi_label_accuracy: 0.1695
Epoch 3/1000
 - 1s - loss: 0.3136 - raw_multi_label_accuracy: 0.0937 - val_loss: 0.3159 - val_raw_multi_label_accuracy: 0.1695
Epoch 4/1000
 - 1s - loss: 0.3133 - raw_multi_label_accuracy: 0.1000 - val_loss: 0.3148 - val_raw_multi_label_accuracy: 0.1695
Epoch 5/1000
 - 1s - loss: 0.3132 - raw_multi_label_accuracy: 0.1023 - val_loss: 0.3160 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 6/1000
 - 1s - loss: 0.3134 - raw_multi_label_accuracy: 0.0754 - val_loss: 0.3150 - val_raw_multi_label_ac

Epoch 58/1000
 - 2s - loss: 0.3109 - raw_multi_label_accuracy: 0.0891 - val_loss: 0.3141 - val_raw_multi_label_accuracy: 0.0099
Epoch 59/1000
 - 2s - loss: 0.3113 - raw_multi_label_accuracy: 0.1052 - val_loss: 0.3142 - val_raw_multi_label_accuracy: 0.0513
Epoch 60/1000
 - 2s - loss: 0.3105 - raw_multi_label_accuracy: 0.1003 - val_loss: 0.3138 - val_raw_multi_label_accuracy: 0.1688
Epoch 61/1000
 - 1s - loss: 0.3102 - raw_multi_label_accuracy: 0.1326 - val_loss: 0.3138 - val_raw_multi_label_accuracy: 0.1030
Epoch 62/1000
 - 1s - loss: 0.3103 - raw_multi_label_accuracy: 0.1072 - val_loss: 0.3143 - val_raw_multi_label_accuracy: 0.0558
Epoch 63/1000
 - 1s - loss: 0.3103 - raw_multi_label_accuracy: 0.1304 - val_loss: 0.3153 - val_raw_multi_label_accuracy: 0.1648
Epoch 64/1000
 - 1s - loss: 0.3100 - raw_multi_label_accuracy: 0.1188 - val_loss: 0.3145 - val_raw_multi_label_accuracy: 0.1543
Epoch 65/1000
 - 1s - loss: 0.3101 - raw_multi_label_accuracy: 0.1264 - val_loss: 0.3139 - val_raw_multi

Epoch 122/1000
 - 2s - loss: 0.3071 - raw_multi_label_accuracy: 0.0815 - val_loss: 0.3114 - val_raw_multi_label_accuracy: 0.1385
Epoch 123/1000
 - 2s - loss: 0.3077 - raw_multi_label_accuracy: 0.1351 - val_loss: 0.3118 - val_raw_multi_label_accuracy: 0.0860
Epoch 124/1000
 - 2s - loss: 0.3078 - raw_multi_label_accuracy: 0.1038 - val_loss: 0.3124 - val_raw_multi_label_accuracy: 0.0246
Epoch 125/1000
 - 2s - loss: 0.3076 - raw_multi_label_accuracy: 0.0770 - val_loss: 0.3115 - val_raw_multi_label_accuracy: 0.1503
Epoch 126/1000
 - 2s - loss: 0.3066 - raw_multi_label_accuracy: 0.1269 - val_loss: 0.3119 - val_raw_multi_label_accuracy: 0.0878
Epoch 127/1000
 - 2s - loss: 0.3067 - raw_multi_label_accuracy: 0.1021 - val_loss: 0.3115 - val_raw_multi_label_accuracy: 0.1127
Epoch 128/1000
 - 1s - loss: 0.3067 - raw_multi_label_accuracy: 0.1097 - val_loss: 0.3136 - val_raw_multi_label_accuracy: 0.0147
Epoch 129/1000
 - 1s - loss: 0.3069 - raw_multi_label_accuracy: 0.0906 - val_loss: 0.3116 - val_r

Epoch 186/1000
 - 1s - loss: 0.3027 - raw_multi_label_accuracy: 0.1163 - val_loss: 0.3103 - val_raw_multi_label_accuracy: 0.1531
Epoch 187/1000
 - 1s - loss: 0.3019 - raw_multi_label_accuracy: 0.1177 - val_loss: 0.3118 - val_raw_multi_label_accuracy: 0.1502
Epoch 188/1000
 - 1s - loss: 0.3021 - raw_multi_label_accuracy: 0.1166 - val_loss: 0.3119 - val_raw_multi_label_accuracy: 0.0625
Epoch 189/1000
 - 1s - loss: 0.3020 - raw_multi_label_accuracy: 0.1184 - val_loss: 0.3099 - val_raw_multi_label_accuracy: 0.1291
Epoch 190/1000
 - 1s - loss: 0.3013 - raw_multi_label_accuracy: 0.1274 - val_loss: 0.3118 - val_raw_multi_label_accuracy: 0.0802
Epoch 191/1000
 - 2s - loss: 0.3016 - raw_multi_label_accuracy: 0.1210 - val_loss: 0.3128 - val_raw_multi_label_accuracy: 0.1667
Epoch 192/1000
 - 2s - loss: 0.3012 - raw_multi_label_accuracy: 0.1074 - val_loss: 0.3097 - val_raw_multi_label_accuracy: 0.1506
Epoch 193/1000
 - 2s - loss: 0.3010 - raw_multi_label_accuracy: 0.1185 - val_loss: 0.3109 - val_r

Epoch 250/1000
 - 2s - loss: 0.2958 - raw_multi_label_accuracy: 0.1193 - val_loss: 0.3125 - val_raw_multi_label_accuracy: 0.1674
Epoch 251/1000
 - 2s - loss: 0.2975 - raw_multi_label_accuracy: 0.1271 - val_loss: 0.3124 - val_raw_multi_label_accuracy: 0.1618
Epoch 252/1000
 - 2s - loss: 0.2966 - raw_multi_label_accuracy: 0.1287 - val_loss: 0.3129 - val_raw_multi_label_accuracy: 0.0990
Epoch 253/1000
 - 2s - loss: 0.2957 - raw_multi_label_accuracy: 0.1222 - val_loss: 0.3131 - val_raw_multi_label_accuracy: 0.1358
Epoch 254/1000
 - 2s - loss: 0.2960 - raw_multi_label_accuracy: 0.1309 - val_loss: 0.3112 - val_raw_multi_label_accuracy: 0.1261
Epoch 255/1000
 - 1s - loss: 0.2958 - raw_multi_label_accuracy: 0.1192 - val_loss: 0.3113 - val_raw_multi_label_accuracy: 0.0985
Epoch 256/1000
 - 1s - loss: 0.2957 - raw_multi_label_accuracy: 0.1279 - val_loss: 0.3116 - val_raw_multi_label_accuracy: 0.1231
Epoch 257/1000
 - 1s - loss: 0.2958 - raw_multi_label_accuracy: 0.1292 - val_loss: 0.3141 - val_r

Epoch 314/1000
 - 2s - loss: 0.2895 - raw_multi_label_accuracy: 0.1466 - val_loss: 0.3137 - val_raw_multi_label_accuracy: 0.1178
Epoch 315/1000
 - 2s - loss: 0.2887 - raw_multi_label_accuracy: 0.1447 - val_loss: 0.3150 - val_raw_multi_label_accuracy: 0.1367
Epoch 316/1000
 - 2s - loss: 0.2892 - raw_multi_label_accuracy: 0.1300 - val_loss: 0.3154 - val_raw_multi_label_accuracy: 0.1555
Epoch 317/1000
 - 2s - loss: 0.2893 - raw_multi_label_accuracy: 0.1422 - val_loss: 0.3142 - val_raw_multi_label_accuracy: 0.1412
Epoch 318/1000
 - 2s - loss: 0.2882 - raw_multi_label_accuracy: 0.1418 - val_loss: 0.3130 - val_raw_multi_label_accuracy: 0.1446
Epoch 319/1000
 - 2s - loss: 0.2895 - raw_multi_label_accuracy: 0.1420 - val_loss: 0.3148 - val_raw_multi_label_accuracy: 0.1039
Epoch 320/1000
 - 2s - loss: 0.2892 - raw_multi_label_accuracy: 0.1369 - val_loss: 0.3154 - val_raw_multi_label_accuracy: 0.1478
Epoch 321/1000
 - 1s - loss: 0.2885 - raw_multi_label_accuracy: 0.1438 - val_loss: 0.3148 - val_r

Epoch 378/1000
 - 3s - loss: 0.2821 - raw_multi_label_accuracy: 0.1605 - val_loss: 0.3205 - val_raw_multi_label_accuracy: 0.1358
Epoch 379/1000
 - 2s - loss: 0.2823 - raw_multi_label_accuracy: 0.1685 - val_loss: 0.3177 - val_raw_multi_label_accuracy: 0.1238
Epoch 380/1000
 - 2s - loss: 0.2820 - raw_multi_label_accuracy: 0.1551 - val_loss: 0.3188 - val_raw_multi_label_accuracy: 0.1169
Epoch 381/1000
 - 2s - loss: 0.2808 - raw_multi_label_accuracy: 0.1620 - val_loss: 0.3204 - val_raw_multi_label_accuracy: 0.1525
Epoch 382/1000
 - 2s - loss: 0.2809 - raw_multi_label_accuracy: 0.1609 - val_loss: 0.3176 - val_raw_multi_label_accuracy: 0.1096
Epoch 383/1000
 - 2s - loss: 0.2813 - raw_multi_label_accuracy: 0.1595 - val_loss: 0.3172 - val_raw_multi_label_accuracy: 0.1219
Epoch 384/1000
 - 2s - loss: 0.2814 - raw_multi_label_accuracy: 0.1669 - val_loss: 0.3176 - val_raw_multi_label_accuracy: 0.1492
Epoch 385/1000
 - 1s - loss: 0.2820 - raw_multi_label_accuracy: 0.1600 - val_loss: 0.3188 - val_r

Epoch 442/1000
 - 1s - loss: 0.2740 - raw_multi_label_accuracy: 0.1890 - val_loss: 0.3245 - val_raw_multi_label_accuracy: 0.1315
Epoch 443/1000
 - 1s - loss: 0.2724 - raw_multi_label_accuracy: 0.1835 - val_loss: 0.3274 - val_raw_multi_label_accuracy: 0.1358
Epoch 444/1000
 - 1s - loss: 0.2716 - raw_multi_label_accuracy: 0.1857 - val_loss: 0.3248 - val_raw_multi_label_accuracy: 0.1306
Epoch 445/1000
 - 1s - loss: 0.2707 - raw_multi_label_accuracy: 0.1922 - val_loss: 0.3273 - val_raw_multi_label_accuracy: 0.1251
Epoch 446/1000
 - 2s - loss: 0.2712 - raw_multi_label_accuracy: 0.1835 - val_loss: 0.3275 - val_raw_multi_label_accuracy: 0.1329
Epoch 447/1000
 - 1s - loss: 0.2734 - raw_multi_label_accuracy: 0.1875 - val_loss: 0.3290 - val_raw_multi_label_accuracy: 0.1350
Epoch 448/1000
 - 2s - loss: 0.2742 - raw_multi_label_accuracy: 0.1875 - val_loss: 0.3266 - val_raw_multi_label_accuracy: 0.1224
Epoch 449/1000
 - 1s - loss: 0.2725 - raw_multi_label_accuracy: 0.1838 - val_loss: 0.3298 - val_r

<keras.callbacks.History at 0x7f8a281d1048>

In [24]:
res = model_cnn.predict(x_test_seq)

In [25]:
label_predictions = []
for i in range(res.shape[0]):
    pred = [0]*20
    for j in range(res.shape[1]):
        if res[i][j] >= .5:
            pred[j] = 1
    label_predictions.append(pred)

In [26]:
y_test[:,2].sum()

319

In [27]:
label_predictions = np.array(label_predictions)
label_predictions[:,2].sum()

167

In [28]:
multi_label_accuracy(y_test, label_predictions)

0.12264497088617693