In [1]:
import keras
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from keras.layers.advanced_activations import LeakyReLU
import os
from os import listdir
from os.path import isfile, join
import numpy as np
from numpy import array
from sklearn.model_selection import train_test_split
import csv
import re
import pandas
from pandas.core.series import Series

# Prerequisite: every poster in posters/ must already have a matching, pre-screened <uuid>.txt file in metadata/

def _read_genres(meta_path):
  """Return the non-empty genre names found on the ``genre`` line(s) of one metadata file."""
  genres = []
  with open(meta_path) as tsv:
    # NOTE(review): csv.reader uses its default comma delimiter here, so only the
    # text before the first comma of each line is inspected -- confirm that genre
    # lines never contain commas.
    for row in csv.reader(tsv):
      if not row:
        # csv.reader yields [] for blank lines; indexing row[0] would raise.
        continue
      res = re.search("^genre\t(.*)$", row[0])
      if res is not None:
        # "".split("\t") returns [''], so drop empty names explicitly.
        genres.extend(g for g in res.group(1).split("\t") if g)
  return genres


def get_data(input_shape):
  """Load poster images and their genre labels.

  Every ``*.jpg`` file in ``posters/`` is paired with ``metadata/<uuid>.txt``,
  where ``<uuid>`` is the part of the file name before the first dot. Posters
  whose metadata lists no genre are reported and left out.

  Parameters
  ----------
  input_shape : sequence of 3 ints
    Shape of one image in the returned array; the first two entries are
    passed to ``image.load_img`` as ``target_size``.

  Returns
  -------
  (x, y) : tuple
    ``x`` is a float64 array of shape ``(n_movies,) + input_shape`` with one
    row per kept poster, in sorted file-name order. ``y`` is a long-format
    DataFrame with columns ``movie``, ``genre`` and ``count`` (always 1), one
    row per (movie, genre) pair, with movies in the same order as ``x``.

  Raises
  ------
  OSError
    If a directory or a poster's metadata file cannot be opened.
  """
  directory = "posters"
  meta_directory = "metadata"
  images = []
  records = []
  # os.listdir order is arbitrary; sort so the row order is reproducible.
  for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".jpg"):
      continue
    uuid = filename.split('.')[0]
    genres = _read_genres(os.path.join(meta_directory, uuid + ".txt"))
    if not genres:
      print("No genre found for " + uuid + "! Omitting it...")
      continue
    img = image.load_img(path=os.path.join(directory, filename),
                         grayscale=False,
                         target_size=tuple(input_shape[:2]))
    # Images and label records are appended together, so x rows and y movies
    # cannot drift apart (no shared running index to keep in sync).
    images.append(image.img_to_array(img))
    records.extend({'movie': uuid, 'genre': g, 'count': 1} for g in genres)
  if images:
    x = np.stack(images).astype(np.float64)
  else:
    x = np.empty((0, input_shape[0], input_shape[1], input_shape[2]))
  # Build the frame once; growing a DataFrame row by row is quadratic and
  # DataFrame.append no longer exists in pandas 2.x.
  y = pandas.DataFrame(records, columns=['movie', 'genre', 'count'])
  return (x, y)

# Shape of one poster image; Keras' load_img reads the first two entries as
# (height, width). Must be defined here so the cell runs on a fresh kernel.
input_shape = (100, 150, 3)

(x, y) = get_data(input_shape)

# One row per movie, one 0/1 column per genre. pivot() returns the rows sorted
# by movie id, so put them back in the order the movies appear in the long-format
# y -- the order in which get_data collected them alongside the rows of x.
movie_order = pandas.Index(y['movie'].unique(), name='movie')
y = y.pivot(index='movie', columns='genre', values='count').reindex(movie_order).fillna(0)

y_arr = np.array(y)

print(x.shape)
print(y_arr.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


No genre found for 366ffaa0-4522-47c0-8ef5-a9d2725538e1! Omitting it...
No genre found for 41e95ff1-39a0-42bb-a7dc-15f96f01cc8a! Omitting it...
No genre found for ec371192-aa97-4f8c-8eeb-b3ddbac8817e! Omitting it...
(1029, 100, 150, 3)
(1029, 19)


In [33]:
# Poster-data split, currently disabled in favour of MNIST:
#idx = 1
#x_train, x_test, y_train, y_test = train_test_split(x, y_arr[:,idx], test_size=0.25, random_state=1)
#num_classes = 2

# MNIST
# source: https://towardsdatascience.com/a-simple-2d-cnn-for-mnist-digit-recognition-a998dbc1e79a
from keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Channels-last layout (TensorFlow backend): add a trailing single-channel axis.
img_rows = 28
img_cols = img_rows
input_shape = (img_rows, img_cols, 1)

# Reshape to (samples, rows, cols, 1) and scale the pixel values by 1/255.
x_train = x_train.reshape((x_train.shape[0],) + input_shape).astype('float32') / 255
x_test = x_test.reshape((x_test.shape[0],) + input_shape).astype('float32') / 255

# One-hot encode the digit labels.
num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

for arr in (x_train, y_train, x_test, y_test):
    print(arr.shape)
del arr  # keep the loop variable out of the notebook namespace

# Slope used by every LeakyReLU activation below.
relu_leak = 0.3

# Two conv -> LeakyReLU -> max-pool stages, then a dense head with softmax output.
model = Sequential([
    Conv2D(32, kernel_size=(5, 5), strides=(1, 1), input_shape=input_shape),
    LeakyReLU(alpha=relu_leak),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    Conv2D(32, (5, 5)),
    LeakyReLU(alpha=relu_leak),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(1000),
    LeakyReLU(alpha=relu_leak),
    Dense(num_classes, activation='softmax'),
])

(60000, 28, 28, 1)
(60000, 10)
(10000, 28, 28, 1)
(10000, 10)


In [34]:
# Training hyper-parameters, read by the later model.fit call.
batch_size = 20
epochs = 3

# Plain SGD on categorical cross-entropy, reporting accuracy during training.
model.compile(
    optimizer=keras.optimizers.SGD(lr=0.01),
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'],
)

In [87]:
# perturb input!

# from keras.preprocessing.image import ImageDataGenerator

# aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.1,
#   height_shift_range=0.1, shear_range=0.2, zoom_range=0.2,
#   horizontal_flip=True, fill_mode="nearest")

In [42]:
# H = model.fit_generator(
#   aug.flow(x_train, y_train, batch_size=batch_size),
#   validation_data=(x_test, y_test),
#   steps_per_epoch=len(x_train),
#   epochs=epochs, verbose=1)

Epoch 1/2
Epoch 2/2


In [35]:
from keras.models import model_from_json

# Train on the un-augmented data; the test split is passed as validation data.
H = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# Optional persistence (disabled): architecture as JSON, weights as HDF5.
#model_json = model.to_json()
#with open("model_action.json", "w") as json_file:
#    json_file.write(model_json)
#model.save_weights("model_action.h5")
#print("Saved model to disk")

Train on 60000 samples, validate on 10000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [9]:
# load json and create model
#json_file = open('model_action.json', 'r')
#loaded_model_json = json_file.read()
#json_file.close()
#loaded_model = model_from_json(loaded_model_json)
# load weights into new model
#loaded_model.load_weights("model_action.h5")
#print("Loaded model from disk")

Loaded model from disk


In [39]:
# Sequential.predict_classes is deprecated and absent from newer Keras releases.
# For a softmax output layer it is the argmax over the predicted class
# probabilities, which is computed here directly.
y_new = np.argmax(model.predict(x_test), axis=-1)

# y_test is one-hot encoded (see keras.utils.to_categorical above), while y_new
# holds integer class indices.
print(y_test)
print(y_new)

#for i in range(len(y_new)):
#    print("Predicted=%s" % y_new[i])

[[0. 0. 0. ... 1. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[7 2 1 ... 4 5 6]
