In [28]:
import numpy as np
import pandas as pd
#for reading in data properly
import ast
import json

import gensim
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn import utils

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stop-word set from NLTK, built once here (not referenced again in the cells shown).
stop_words = set(stopwords.words('english'))

In [29]:
# Load the training data and keep only the rows that have both an overview
# (the model input) and genres (the labels).
all_data = pd.read_csv('train.csv').dropna(subset=['overview', 'genres'])
# Seed of the genre vocabulary; parse_all_genres_json adds names to this set.
genre_set = {'Comedy'}

In [30]:
def text_to_list(x):
    """Evaluate ``x`` as a Python literal, mapping missing values to ``''``.

    Args:
        x: A string holding a Python literal (e.g. ``"[1, 2]"``) or a missing
            value such as ``None`` / ``NaN``.

    Returns:
        ``''`` when ``pd.isna(x)`` is true, otherwise ``ast.literal_eval(x)``.
    """
    return '' if pd.isna(x) else ast.literal_eval(x)

def parse_json(x):
    """Return the ``'name'`` of the first record encoded in ``x``.

    ``x`` is expected to be the text of a list of dicts, e.g.
    ``"[{'id': 35, 'name': 'Comedy'}]"``.  The text is parsed as a Python
    literal first, which copes with apostrophes inside values; swapping every
    ``'`` for ``"`` turns such input into invalid JSON and silently drops it.
    The quote-swapping JSON route is kept as a fallback for input that is not
    a Python literal (for example text containing ``null``).

    Args:
        x: The encoded list of records; non-string values (such as NaN) are
            tolerated.

    Returns:
        The first record's name, or ``''`` when ``x`` cannot be parsed, holds
        no records, or its first record has no ``'name'`` key.
    """
    try:
        records = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        try:
            records = json.loads(x.replace("'", '"'))
        except (AttributeError, TypeError, ValueError):
            # Not a string at all, or not parseable either way.
            return ''
    try:
        return records[0]['name']
    except (IndexError, KeyError, TypeError):
        return ''
    
def parse_all_genres_json(x):
    """Add every genre name found in ``x`` to the module-level ``genre_set``.

    ``x`` is expected to be the text of a list of dicts that each carry a
    ``'name'`` key.  It is parsed as a Python literal first (so names that
    contain an apostrophe survive); the quote-swapping JSON route is kept as
    a fallback for text that is not a Python literal.

    Args:
        x: The encoded list of genre records; non-string values (such as NaN)
            are tolerated.

    Returns:
        ``None`` when every record was processed, or ``''`` when ``x`` cannot
        be parsed or a record lacks a usable ``'name'``.  Names seen before
        the offending record remain in ``genre_set``.
    """
    try:
        json_genres = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        try:
            json_genres = json.loads(x.replace("'", '"'))
        except (AttributeError, TypeError, ValueError):
            # Not a string at all, or not parseable either way.
            return ''
    try:
        for record in json_genres:
            genre_set.add(record['name'])
    except (IndexError, KeyError, TypeError):
        return ''
    
def parse_genres_json(x):
    """Encode the genres listed in ``x`` as a multi-hot list.

    ``x`` is expected to be the text of a list of dicts that each carry a
    ``'name'`` key.  It is parsed as a Python literal first (so names that
    contain an apostrophe survive); the quote-swapping JSON route is kept as
    a fallback for text that is not a Python literal.

    Args:
        x: The encoded list of genre records; non-string values (such as NaN)
            are tolerated.

    Returns:
        A list of ``len(genre_dict)`` zeros with a 1 at ``genre_dict[name]``
        for every genre in ``x``, or ``''`` when ``x`` cannot be parsed or
        names a genre that is missing from ``genre_dict``.

    Note:
        Only data problems are mapped to ``''``.  If ``genre_dict`` has not
        been defined yet the resulting ``NameError`` now propagates instead of
        quietly yielding ``''`` for every row.
    """
    try:
        json_genres = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        try:
            json_genres = json.loads(x.replace("'", '"'))
        except (AttributeError, TypeError, ValueError):
            # Not a string at all, or not parseable either way.
            return ''
    ret = [0] * len(genre_dict)
    try:
        for record in json_genres:
            ret[genre_dict[record['name']]] = 1
    except (IndexError, KeyError, TypeError):
        return ''
    return ret
    

def get_labels_as_strs(x):
    """Return the genre names listed in ``x`` as a list of strings.

    ``x`` is expected to be the text of a list of dicts that each carry a
    ``'name'`` key.  It is parsed as a Python literal first (so names that
    contain an apostrophe survive); the quote-swapping JSON route is kept as
    a fallback for text that is not a Python literal.

    Args:
        x: The encoded list of genre records; non-string values (such as NaN)
            are tolerated.

    Returns:
        The names in their original order, or ``''`` when ``x`` cannot be
        parsed or a record has no ``'name'`` key.
    """
    try:
        json_genres = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError):
        try:
            json_genres = json.loads(x.replace("'", '"'))
        except (AttributeError, TypeError, ValueError):
            # Not a string at all, or not parseable either way.
            return ''
    try:
        return [record['name'] for record in json_genres]
    except (IndexError, KeyError, TypeError):
        return ''

In [31]:
 def getAllGenres():
    """Feed every ``genres`` cell of train.csv to ``parse_all_genres_json``.

    The CSV is read afresh rather than taken from ``all_data``, so rows that
    were filtered out of ``all_data`` for a missing overview are scanned too.
    Nothing is returned; the result is whatever ``parse_all_genres_json``
    records as a side effect.
    """
    genres_column = pd.read_csv('train.csv')['genres']
    genres_column.apply(parse_all_genres_json)

In [32]:
getAllGenres()  # populate genre_set with the genre names found in train.csv

In [33]:
# Map each genre name to a fixed column index for the multi-hot target vectors.
# A set of strings iterates in a hash-dependent order that can differ between
# interpreter sessions, so the names are sorted to keep the column layout
# identical on every run.
genre_dict = {genre: index for index, genre in enumerate(sorted(genre_set))}

In [34]:
def getGenresVects():
    """Attach encoded and plain-text genre labels to ``all_data``.

    Adds two columns to the module-level ``all_data`` frame, both derived from
    its ``genres`` column: ``genres_vect`` (via ``parse_genres_json``) and
    ``genres_labels`` (via ``get_labels_as_strs``).

    Returns:
        The Series that was stored in ``genres_vect``.
    """
    genres_column = all_data['genres']
    genre_vectors = genres_column.apply(parse_genres_json)
    all_data['genres_vect'] = genre_vectors
    all_data['genres_labels'] = genres_column.apply(get_labels_as_strs)
    return genre_vectors

In [35]:
labels_vects = getGenresVects()  # per-row genre label vectors, columns indexed by genre_dict

In [36]:
# Lower-case the text and strip punctuation.
def cleanText(text):
    """Return ``text`` lower-cased with everything but letters, digits and spaces removed.

    Newlines, tabs and other whitespace are first turned into single spaces.
    Without that step they would be deleted together with the punctuation,
    gluing the words on either side into one token ("foo\\nbar" -> "foobar").

    Punctuation is still removed without inserting a space, so "well-known"
    becomes "wellknown" and "don't" becomes "dont".

    Args:
        text: The raw string to normalise.

    Returns:
        The cleaned, lower-cased string.
    """
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z A-Z0-9]', "", text)
    return text.lower()
# Store the normalised overview text next to the original column.
all_data['cleanOverview'] = all_data['overview'].apply(cleanText)

In [37]:
# Modelling frame: cleaned text, label strings, label vectors and the raw overview.
# Named for logistic regression, but the train/test split of this frame also feeds the CNN cells below.
lr_data = all_data[['cleanOverview', 'genres_labels', 'genres_vect', 'overview']]

In [38]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
train, test = train_test_split(lr_data, test_size=0.2, random_state=42)

CNN model: word embeddings, training, and evaluation

In [39]:
# Training inputs (cleaned overview strings) and targets (genre label vectors).
x = train['cleanOverview'].tolist()
y = train['genres_vect']

In [40]:
# Held-out overview strings and their genre label vectors.
x_test = test['cleanOverview'].tolist()
y_test = test['genres_vect']

In [41]:
# Stack the per-row label lists into one 2-D array for Keras.
y_train = np.array(y.tolist())

In [42]:
# Stack the held-out label lists into one 2-D array.
y_test = np.array(y_test.tolist())

In [43]:
# Tokenize each training overview into a list of words (the Word2Vec training corpus).
tok = [word_tokenize(sent) for sent in x]

In [44]:
# Train Word2Vec on the tokenized training overviews; min_count=1 keeps every word.
# NOTE(review): `size=` looks like the pre-4.0 gensim keyword — confirm against the installed version.
word_vec_len = 32
model = Word2Vec(tok, min_count = 1, size=word_vec_len)

In [45]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap; reused when the embedding matrix is sized.
num_words_kept = 100000

# Fit the word index on the training overviews, then encode them as integer
# sequences padded/truncated to 200 tokens.
tokenizer = Tokenizer(num_words=num_words_kept)
tokenizer.fit_on_texts(x)
sequences = tokenizer.texts_to_sequences(x)
x_train_seq = pad_sequences(sequences, maxlen=200)

In [46]:
# Encode the held-out overviews with the tokenizer fitted on the training text, padded to the same length.
test_sequences = tokenizer.texts_to_sequences(x_test)
x_test_seq = pad_sequences(test_sequences, maxlen=200)

In [47]:
# Word -> vector lookup copied out of the trained Word2Vec model.
embeddings_index = {w: model.wv[w] for w in model.wv.vocab.keys()}

# Row i holds the vector of the word the Keras tokenizer maps to index i.
# Words beyond the vocabulary cap or unknown to Word2Vec keep an all-zero row.
embedding_matrix = np.zeros((num_words_kept, word_vec_len))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if i < num_words_kept and embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [48]:
# Mean per-row Jaccard score: for every sample, the number of labels that are
# both predicted and real divided by the number that are predicted or real,
# averaged over all samples. Vectorized with NumPy for speed.
def multi_label_accuracy(real_labels_matrix, predictions_labels_matrix):
    """Return the mean row-wise intersection-over-union of two label matrices.

    Args:
        real_labels_matrix: 2-D array-like of 0/1 (or bool) ground-truth
            indicators, one row per sample and one column per label.
        predictions_labels_matrix: 2-D array-like of the same shape holding
            the predicted 0/1 indicators.

    Returns:
        The average over rows of |real AND predicted| / |real OR predicted|.
        A row with no real and no predicted labels counts as 1.0 (the empty
        prediction matches the empty truth) instead of producing NaN.
    """
    # Cast to bool so float-typed 0/1 matrices work too; any non-zero entry
    # is treated as "label present".
    real = np.asarray(real_labels_matrix).astype(bool)
    predicted = np.asarray(predictions_labels_matrix).astype(bool)
    intersection_sizes = np.logical_and(real, predicted).sum(axis=1)
    union_sizes = np.logical_or(real, predicted).sum(axis=1)
    # Pre-fill with 1.0 and divide only where the union is non-empty, which
    # avoids the 0/0 -> NaN that would otherwise poison the whole average.
    row_wise_accuracy = np.ones(union_sizes.shape, dtype=float)
    np.divide(intersection_sizes, union_sizes, out=row_wise_accuracy, where=union_sizes > 0)
    return row_wise_accuracy.sum() / real.shape[0]

import keras.backend as K

# Keras metric used for early stopping.
# Takes the raw outputs from Keras (not yet converted to 0s and 1s).
# NOT the same as accuracy: this is the total number of correctly identified
# labels divided by the size of the union of all labels, which weights rows
# with more labels higher, where accuracy does not.
def raw_multi_label_accuracy(y_true, y_pred):
    """Pooled intersection-over-union between true and thresholded labels.

    Args:
        y_true: Tensor of 0/1 ground-truth indicators.
        y_pred: Tensor of raw model outputs; values >= 0.5 count as predicted.

    Returns:
        Scalar tensor: sum of the element-wise intersection divided by the
        sum of the element-wise union, taken over the whole input.
    """
    # Threshold the raw outputs into hard 0/1 predictions.
    hard_pred = K.cast(K.greater_equal(y_pred, 0.5), K.floatx())
    # For 0/1 values the product is AND and 1 - (1-a)(1-b) is OR.
    overlap = y_true * hard_pred
    either = 1 - ((1 - y_true) * (1 - hard_pred))
    return K.sum(overlap) / K.sum(either)
    

In [49]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping

# 1-D CNN over frozen Word2Vec embeddings with one sigmoid output per genre.
model_cnn = Sequential()
# The input length and the number of output units are read from the training
# arrays so the network always matches the padded sequences and label vectors.
e = Embedding(num_words_kept, word_vec_len, weights=[embedding_matrix],
              input_length=x_train_seq.shape[1], trainable=False)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu'))
model_cnn.add(Dense(y_train.shape[1], activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[raw_multi_label_accuracy])
# The monitored metric is a score to maximise; state the direction explicitly
# rather than relying on Keras inferring it from the metric's name.
model_cnn.fit(x_train_seq, y_train, validation_split=.1,
              callbacks=[EarlyStopping(monitor='val_raw_multi_label_accuracy', mode='max', patience=25)],
              epochs=500, batch_size=100, verbose=2)

Train on 2149 samples, validate on 239 samples
Epoch 1/500
 - 2s - loss: 0.4191 - raw_multi_label_accuracy: 0.0567 - val_loss: 0.3224 - val_raw_multi_label_accuracy: 0.1315
Epoch 2/500
 - 1s - loss: 0.3164 - raw_multi_label_accuracy: 0.1039 - val_loss: 0.3153 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 3/500
 - 1s - loss: 0.3136 - raw_multi_label_accuracy: 0.0737 - val_loss: 0.3156 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 4/500
 - 1s - loss: 0.3133 - raw_multi_label_accuracy: 0.0867 - val_loss: 0.3152 - val_raw_multi_label_accuracy: 0.0000e+00
Epoch 5/500
 - 1s - loss: 0.3140 - raw_multi_label_accuracy: 0.0996 - val_loss: 0.3139 - val_raw_multi_label_accuracy: 0.1682
Epoch 6/500
 - 2s - loss: 0.3137 - raw_multi_label_accuracy: 0.0838 - val_loss: 0.3154 - val_raw_multi_label_accuracy: 0.1695
Epoch 7/500
 - 1s - loss: 0.3135 - raw_multi_label_accuracy: 0.0903 - val_loss: 0.3148 - val_raw_multi_label_accuracy: 0.1695
Epoch 8/500
 - 1s - loss: 0.3139 - raw_multi_label_accuracy

<keras.callbacks.History at 0x7ffa2c1d8320>

In [50]:
# Raw sigmoid outputs for the held-out sequences, one column per output unit.
res = model_cnn.predict(x_test_seq)

In [51]:
# Threshold the predicted probabilities at 0.5 to get 0/1 label vectors.
# Done on the whole matrix at once so the number of labels follows
# res.shape[1] instead of a hard-coded 20; .tolist() keeps the original
# list-of-lists result type.
label_predictions = (res >= .5).astype(int).tolist()

In [52]:
# Number of held-out rows whose true labels include the genre stored in column 1.
y_test[:,1].sum()

95

In [53]:
# Convert the predictions to an array and count how often column 1 was predicted.
label_predictions = np.array(label_predictions)
label_predictions[:,1].sum()

0

In [54]:
# Mean per-row intersection-over-union between true and predicted labels on the held-out split.
multi_label_accuracy(y_test, label_predictions)

0.25447076653106804