In [37]:
# This version uses abstracted function to load X and Y training data into arrays of the right shape

# Here I will be building out the architecture of the first classification LSTM
# At each time step, this LSTM will take in a vector representing the extracted audio and visual features from Liris-ACCEDE
# Its goal is to output whether or not the movie induces fear at each time step

# First, import necessary libraries
import tensorflow as tf
import numpy as np

In [38]:
# setting up the keras stuff
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
# my custom data_utils file
from data_utils_local05 import *

In [72]:
# a function for iterating through all of the files in a folder and loading them into input_data
def load_Xinput(directory):
    X_input = np.zeros([212, 4096]) # MAGIC NUMBERS
    count = 0
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith(".txt"):
            input_data = np.loadtxt(os.path.join(directory, file), delimiter=',')
            X_input[count, :] = np.asarray(input_data)[:]
            #print(os.path.join(directory, filename))
            count = count + 1
            continue
        else:
            continue
    return X_input


In [74]:
# testing the function above
X_input = load_Xinput('visual_features_part01/07/fc6/')
print(X_input.shape)

(212, 4096)


In [69]:
# a function for finding the correct fc6 folder
def get_fc6_directory(movie_num):
    return os.path.join("visual_features_part01",
                       "{}".format(movie_num), 
                       "fc6")


In [70]:
# for testing the function above
print(get_fc6_directory("07"))

visual_features_part01/07/fc6


In [71]:
# testing the combination of the two functions
print(load_Xinput(get_fc6_directory("07")))

[[[ -4.9211 ]
  [  0.96259]
  [ -3.5258 ]
  ..., 
  [  0.40076]
  [ -1.2796 ]
  [ -0.35213]]

 [[ -5.0926 ]
  [  0.81427]
  [ -3.4552 ]
  ..., 
  [  0.23162]
  [ -1.2592 ]
  [ -0.3475 ]]

 [[ -5.4391 ]
  [ -8.3879 ]
  [ -7.4676 ]
  ..., 
  [ -9.9657 ]
  [-10.339  ]
  [  4.4491 ]]

 ..., 
 [[ -8.2536 ]
  [  2.4615 ]
  [ -0.51632]
  ..., 
  [  2.6969 ]
  [ -2.1534 ]
  [  6.2791 ]]

 [[ -8.3559 ]
  [  2.6918 ]
  [ -1.5177 ]
  ..., 
  [  3.5107 ]
  [ -4.4554 ]
  [  6.395  ]]

 [[ -4.7936 ]
  [  2.6032 ]
  [ -3.5129 ]
  ..., 
  [  1.774  ]
  [  0.9897 ]
  [  3.9134 ]]]


In [48]:
# loading the X input values 

# create array of shape (210, 4096, 1) that will hold each second's worth of VGG16 data
X_input = np.zeros([210, 4096, 1]) # MAGIC NUMBERS

# iterate through the first axis length of this array and insert each second's worth of fc6 feature data
# start counting seconds at 00001

# testing function with just one second of input data:
# I think this will fail because i need to treat the numbers as strings 

# uploading the X_values: fc6 feature data as input and figuring out its shape
input_data01 = np.loadtxt('visual_features_part01/07/fc6/MEDIAEVAL18_07-00001_fc6.txt', delimiter=',')
input_data02 = np.loadtxt('visual_features_part01/07/fc6/MEDIAEVAL18_07-00002_fc6.txt', delimiter=',')
print("input_data01 shape:")
print(np.asarray(input_data01)[:, np.newaxis].shape)
print("X_input.shape")
print(X_input[0,:,:].shape)
X_input[0, :, :] = np.asarray(input_data01)[:, np.newaxis]
print(X_input)


input_data01 shape:
(4096, 1)
X_input.shape
(4096, 1)
[[[-4.9211 ]
  [ 0.96259]
  [-3.5258 ]
  ..., 
  [ 0.40076]
  [-1.2796 ]
  [-0.35213]]

 [[ 0.     ]
  [ 0.     ]
  [ 0.     ]
  ..., 
  [ 0.     ]
  [ 0.     ]
  [ 0.     ]]

 [[ 0.     ]
  [ 0.     ]
  [ 0.     ]
  ..., 
  [ 0.     ]
  [ 0.     ]
  [ 0.     ]]

 ..., 
 [[ 0.     ]
  [ 0.     ]
  [ 0.     ]
  ..., 
  [ 0.     ]
  [ 0.     ]
  [ 0.     ]]

 [[ 0.     ]
  [ 0.     ]
  [ 0.     ]
  ..., 
  [ 0.     ]
  [ 0.     ]
  [ 0.     ]]

 [[ 0.     ]
  [ 0.     ]
  [ 0.     ]
  ..., 
  [ 0.     ]
  [ 0.     ]
  [ 0.     ]]]


In [None]:
# 

In [51]:
# uploading the Y_values
# the target data (y_values) will be a one-hot vector representing which seconds of movie induce fear

# convert into function that takes following input args: movie_length, fear_annot_path
# returns y_data_input

# first access start and stop times for fear-inducing sequences
y_data = np.loadtxt('fear_annotations_part01/MEDIAEVAL18_07_Fear.txt', skiprows=1)

# now treat these as pairs of indices --> we want all the indices between each pair of numbers
# create array of zeros --> the size will be the number of seconds in movie, in this case 210
movie_length = 212 #MAGIC NUMBER ALERT! --> length of movie
y_data_input = np.zeros((movie_length)) 

# for each element in first dimension of the y_data array
for i in range(y_data.shape[0]):
    # access the start time number and end time number
    start = int(y_data[i][0])
    end = int(y_data[i][1])
    # set the elements between these indices in the zeros array to one
    y_data_input[start] = 1 #maybe superfluous
    y_data_input[end] = 1
    y_data_input[start:end] = 1
print(y_data_input)

[[  64.  101.]
 [ 105.  109.]
 [ 129.  145.]]
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  0.  0.  0.  1.  1.  1.
  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]


In [52]:
# setting up some key values
timesteps = 212   # the number of seconds in movie 07 --> i will figure out how to programmatically get this value
data_dim = 4096    # the number of output values from VGG16 layer fc6 --> switch to programmatic later
# could data_dim be the number of features that have been extracted (for now visual features only) --> maybe too much

# I have yet to figure this out
X_train = input_data
Y_train = y_data_input
batch_size = 30 # very much arbitrary
num_epochs = 20 # very much arbitrary

In [53]:
# constructing a many-to-one LSTM model in keras --> inspiration: https://stackoverflow.com/questions/43034960/many-to-one-and-many-to-many-lstm-examples-in-keras
# i will start by training a model on only the VGG16 fc6 layer output (that's just one feature)
# should I eventually abstract this LSTM model? Create its own object file?
model = Sequential()
model.add(LSTM(1, input_shape=(timesteps, data_dim), return_sequences=True))
# going to add a softmax activation to this
model.add(Activation('softmax'))

In [54]:
# compiling LSTM model
# note that Ng used an Adam optimizer and categorical cross-entropy loss
# but this is a binary classification problem so I think the parameters below should suffice
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [55]:
# running the LSTM model
model.fit(X_train, Y_train, batch_size=batch_size, epochs=num_epochs)
print("finished training!")

ValueError: Error when checking input: expected lstm_2_input to have 3 dimensions, but got array with shape (4096, 1)