In [51]:
import keras
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten
from keras.layers.advanced_activations import LeakyReLU
import os
from os import listdir
from os.path import isfile, join
import numpy as np
from numpy import array
from sklearn.model_selection import train_test_split
import csv
import re
import pandas
from pandas.core.series import Series

def get_data(input_shape):
  directory = "posters"
  meta_directory = "metadata"
  no_files = len([f for f in listdir(directory) if isfile(join(directory, f))])    
  x = np.empty((no_files, input_shape[0], input_shape[1], input_shape[2]))
  y = pandas.DataFrame()
  idx = 0
  for filename in os.listdir(directory):
    if filename.endswith(".jpg"):
      uuid = filename.split('.')[0]
      filepath = os.path.join(directory, filename)
      img = image.load_img(path=filepath,grayscale=False,target_size=input_shape)
      img = image.img_to_array(img)
      x[idx,:,:,:] = img
      filepath = os.path.join(meta_directory, uuid + ".txt")
      with open(filepath) as tsv:
        for line in csv.reader(tsv):
          res = re.search("^genre\t(.*)$", line[0])
          if(res != None):
            genres = res.group(1).split("\t")
            for g in genres:
                y = y.append(Series({'movie':uuid, 'genre':g}), ignore_index=True)
    idx += 1
    y['count'] = 1
  return((x,y))

input_shape = (100, 150, 3)

# currently doing 50% train-test splits
(x, y) = get_data(input_shape)

y = y.pivot(index='movie', columns='genre', values='count').fillna(0)



genre                                 Action  Adventure  Animation  Comedy  \
movie                                                                        
02e9b007-a6fe-40fc-88ad-2e2174400c36     0.0        0.0        0.0     0.0   
03624796-53db-4437-b521-a9810d8d6614     0.0        1.0        0.0     0.0   
0370c9e0-b3d2-4658-a5cb-2412c2034fd6     0.0        0.0        0.0     0.0   
0498e52c-bb83-4afe-a789-7cdc1a1b1fd0     1.0        1.0        0.0     1.0   
0569e03f-975d-4781-a4d7-da5745ba48f4     0.0        0.0        0.0     0.0   
066ce6c8-f6ac-4f70-ab7a-3a7f1507e0bc     1.0        1.0        0.0     0.0   
06c5f8e8-cdbb-45e3-aafd-671b3cebe221     0.0        1.0        1.0     0.0   
08b905bb-5555-4387-9eb0-38d3a09e05be     1.0        1.0        0.0     0.0   
09c73a3c-6f7b-40ed-bf79-5853a39fc314     0.0        1.0        1.0     1.0   
0b89dcf5-43f3-4a6f-9f7d-91bb22bd58af     1.0        0.0        0.0     0.0   
0da90242-8a14-491a-af44-4e1c9286f26d     0.0        0.0        1

In [84]:
y_arr = np.array(y)

idx = 2

print(y_arr[:,idx])
print(sum(y_arr[:,idx]))

[0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]
23.0


In [88]:
x_train, x_test, y_train, y_test = train_test_split(x, y_arr[:,idx], test_size=0.25, random_state=1)

x_train = x_train.astype('float32') / 255.
x_test = x_test.astype('float32') / 255.

num_classes=2

relu_leak = 0.3

model = Sequential()
model.add(Conv2D(32, kernel_size=(5, 5), strides=(1, 1), input_shape=input_shape))
model.add(LeakyReLU(alpha=relu_leak))
model.add(MaxPooling2D(pool_size=(2, 2), strides=(2, 2)))
model.add(Conv2D(64, (5, 5)))
model.add(LeakyReLU(alpha=relu_leak))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(1000))
model.add(LeakyReLU(alpha=relu_leak))
model.add(Dense(num_classes, activation='softmax'))

In [89]:
model.compile(loss=keras.losses.sparse_categorical_crossentropy,
      optimizer=keras.optimizers.SGD(lr=0.01),
      metrics=['accuracy'])
batch_size = 20
epochs = 2

In [87]:
# perturb input!

# from keras.preprocessing.image import ImageDataGenerator

# aug = ImageDataGenerator(rotation_range=25, width_shift_range=0.1,
#   height_shift_range=0.1, shear_range=0.2, zoom_range=0.2,
#   horizontal_flip=True, fill_mode="nearest")

In [42]:
# H = model.fit_generator(
#   aug.flow(x_train, y_train, batch_size=batch_size),
#   validation_data=(x_test, y_test),
#   steps_per_epoch=len(x_train),
#   epochs=epochs, verbose=1)

Epoch 1/2
Epoch 2/2


In [90]:
print(y_train)

# without data augmentation
H = model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_data=(x_test, y_test))

[1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1.]
Train on 150 samples, validate on 50 samples
Epoch 1/2
Epoch 2/2


In [91]:
y_new = model.predict_classes(x_test)

print(y_test)
print(y_new)

#for i in range(len(y_new)):
#    print("Predicted=%s" % y_new[i])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0.]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0]
