There were problems in the second LSTM. Now fixed (hopefully)
jazzsaxmafia committed Dec 13, 2015
1 parent c1e06f9 commit 42acaec
Showing 3 changed files with 93 additions and 40 deletions.
Binary file modified cnn_util.pyc
Binary file not shown.
84 changes: 55 additions & 29 deletions model.py
@@ -48,6 +48,8 @@ def build_model(self):
state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
padding = tf.zeros([self.batch_size, self.dim_hidden])

probs = []

loss = 0.0

for i in range(self.n_lstm_steps): ## Phase 1 => only read frames
@@ -58,7 +60,7 @@ def build_model(self):
output1, state1 = self.lstm1( image_emb[:,i,:], state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( padding, state2 )
output2, state2 = self.lstm2( tf.concat(1,[padding, output1]), state2 )

# Each video might have different length. Need to mask those.
# But how? Padding with 0 would be enough?
@@ -75,7 +77,7 @@ def build_model(self):
output1, state1 = self.lstm1( padding, state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( current_embed, state2 )
output2, state2 = self.lstm2( tf.concat(1,[current_embed, output1]), state2 )

labels = tf.expand_dims(caption[:,i], 1)
indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
@@ -86,11 +88,13 @@ def build_model(self):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
cross_entropy = cross_entropy * caption_mask[:,i]

probs.append(logit_words)

current_loss = tf.reduce_sum(cross_entropy)
loss += current_loss

loss = loss / tf.reduce_sum(caption_mask)
return loss, video, video_mask, caption, caption_mask
return loss, video, video_mask, caption, caption_mask, probs


def build_generator(self):
@@ -107,14 +111,17 @@ def build_generator(self):

generated_words = []

probs = []
embeds = []

for i in range(self.n_lstm_steps):
if i > 0: tf.get_variable_scope().reuse_variables()

with tf.variable_scope("LSTM1"):
output1, state1 = self.lstm1( image_emb[:,i,:], state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( padding, state2 )
output2, state2 = self.lstm2( tf.concat(1,[padding,output1]), state2 )

for i in range(self.n_lstm_steps):

@@ -127,23 +134,26 @@ def build_generator(self):
output1, state1 = self.lstm1( padding, state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( current_embed, state2 )
output2, state2 = self.lstm2( tf.concat(1,[current_embed,output1]), state2 )

logit_words = tf.nn.xw_plus_b( output2, self.embed_word_W, self.embed_word_b)
max_prob_index = tf.argmax(logit_words, 1)[0]
generated_words.append(max_prob_index)
probs.append(logit_words)

with tf.device("/cpu:0"):
current_embed = tf.nn.embedding_lookup(self.Wemb, max_prob_index)
current_embed = tf.expand_dims(current_embed, 0)

return video, video_mask, generated_words
embeds.append(current_embed)

return video, video_mask, generated_words, probs, embeds


############### Global Parameters ###############
video_path = '/media/storage3/Study/data/youtube_videos'
video_data_path='./data/video_corpus.csv'
video_save_path = '/media/storage3/Study/data/youtube_videos'
video_feat_path = '/media/storage3/Study/data/youtube_feats'

vgg16_path = '/home/taeksoo/Package/tensorflow_vgg16/vgg16.tfmodel'

@@ -153,15 +163,15 @@ def build_generator(self):
dim_hidden= 256
n_frame_step = 80
n_epochs = 1000
batch_size = 50
batch_size = 100
learning_rate = 0.001
##################################################

def get_video_data(video_data_path, video_save_path, train_ratio=0.9):
def get_video_data(video_data_path, video_feat_path, train_ratio=0.9):
video_data = pd.read_csv(video_data_path, sep=',')
video_data = video_data[video_data['Language'] == 'English']
video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(row['Start'])+'_'+str(row['End'])+'.avi.npy', axis=1)
video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_save_path, x))
video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

@@ -206,11 +216,11 @@ def preProBuildWordVocab(sentence_iterator, word_count_threshold=5): # borrowed
return wordtoix, ixtoword, bias_init_vector

def train():
train_data, _ = get_video_data(video_data_path, video_save_path, train_ratio=0.9)
train_data, _ = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
captions = train_data['Description'].values
captions = map(lambda x: x.replace('.', ''), captions)
captions = map(lambda x: x.replace(',', ''), captions)
wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)
wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=10)

np.save('./data/ixtoword', ixtoword)

@@ -222,7 +232,7 @@ def train():
n_lstm_steps=n_frame_step,
bias_init_vector=bias_init_vector)

tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
sess = tf.InteractiveSession()

saver = tf.train.Saver(max_to_keep=10)
@@ -234,11 +244,14 @@ def train():
np.random.shuffle(index)
train_data = train_data.ix[index]

current_train_data = train_data.groupby('video_path').apply(lambda x: x.irow(np.random.choice(len(x))))
current_train_data = current_train_data.reset_index(drop=True)

for start,end in zip(
range(0, len(train_data), batch_size),
range(batch_size, len(train_data), batch_size)):
range(0, len(current_train_data), batch_size),
range(batch_size, len(current_train_data), batch_size)):

current_batch = train_data[start:end]
current_batch = current_train_data[start:end]
current_videos = current_batch['video_path'].values

current_feats = np.zeros((batch_size, n_frame_step, dim_image))
@@ -261,6 +274,11 @@ def train():
for ind, row in enumerate(current_caption_masks):
row[:nonzeros[ind]] = 1

probs_val = sess.run(tf_probs, feed_dict={
tf_video:current_feats,
tf_caption: current_caption_matrix
})

_, loss_val = sess.run(
[train_op, tf_loss],
feed_dict={
@@ -271,11 +289,14 @@ def train():
})

print loss_val
print "Epoch ", epoch, " is done. Saving the model ..."
saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
if np.mod(epoch, 100) == 0:
print "Epoch ", epoch, " is done. Saving the model ..."
saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

def test(model_path='models/model-700', video_feat_path=video_feat_path):

def test(model_path='models/model-45', video_feat_path='/media/storage3/Study/data/youtube_videos/hxZ-5wELSJM_0_12.avi.npy'):
_, test_data = get_video_data(video_data_path, video_save_path, train_ratio=0.9)
train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
test_videos = test_data['video_path'].unique()
ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

model = Video_Caption_Generator(
@@ -286,22 +307,27 @@ def test(model_path='models/model-45', video_feat_path='/media/storage3/Study/da
n_lstm_steps=n_frame_step,
bias_init_vector=None)

video_tf, video_mask_tf, caption_tf = model.build_generator()
video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
sess = tf.InteractiveSession()

saver = tf.train.Saver()
saver.restore(sess, model_path)

video_feat = np.load(video_feat_path)[None,...]
video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
for video_feat_path in test_videos:
print video_feat_path
video_feat = np.load(video_feat_path)[None,...]
video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))

generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
generated_words = ixtoword[generated_word_index]
generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
probs_val = sess.run(probs_tf, feed_dict={video_tf:video_feat})
embed_val = sess.run(last_embed_tf, feed_dict={video_tf:video_feat})
generated_words = ixtoword[generated_word_index]

punctuation = np.argmax(np.array(generated_words) == '.')+1
generated_words = generated_words[:punctuation]
punctuation = np.argmax(np.array(generated_words) == '.')+1
generated_words = generated_words[:punctuation]

generated_sentence = ' '.join(generated_words)
print generated_sentence
generated_sentence = ' '.join(generated_words)
print generated_sentence
ipdb.set_trace()

ipdb.set_trace()
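The substance of this fix is the input wiring of the second LSTM. Before the commit, lstm2 received only the zero padding vector during the frame-reading phase and only the word embedding current_embed during the generation phase; afterwards it receives that vector concatenated with lstm1's output at the same time step (tf.concat(1, [...])), so the caption LSTM sees the video LSTM's output at every step, in the spirit of a stacked S2VT-style encoder-decoder. Because the concatenated input is wider, lstm2's internal weight matrix changes shape, so checkpoints saved before this commit will generally not restore into the new graph. The following is a framework-free sketch of the wiring change, with toy NumPy stand-ins for the LSTM cells and made-up sizes; it is an illustration, not the repository's code.

import numpy as np

batch_size, dim_hidden, dim_embed = 4, 8, 8   # toy sizes, not the repository's settings

def toy_lstm_step(x, h, w):
    # Stand-in for an LSTM cell: one affine map plus tanh. Only the shapes matter here.
    return np.tanh(np.dot(np.concatenate([x, h], axis=1), w))

w1     = 0.1 * np.random.randn(dim_hidden + dim_hidden, dim_hidden)               # lstm1: frame embedding + previous hidden state
w2_old = 0.1 * np.random.randn(dim_embed + dim_hidden, dim_hidden)                # lstm2 before the fix: word/padding vector only
w2_new = 0.1 * np.random.randn(dim_embed + dim_hidden + dim_hidden, dim_hidden)   # lstm2 after the fix: word/padding vector plus lstm1's output

h1 = np.zeros((batch_size, dim_hidden))
h2 = np.zeros((batch_size, dim_hidden))

frame_emb = np.random.randn(batch_size, dim_hidden)   # one encoded video frame
padding   = np.zeros((batch_size, dim_embed))         # decoder-side input while frames are being read

h1 = toy_lstm_step(frame_emb, h1, w1)

h2_before = toy_lstm_step(padding, h2, w2_old)                                  # old wiring: lstm2 never sees h1
h2_after  = toy_lstm_step(np.concatenate([padding, h1], axis=1), h2, w2_new)    # new wiring: mirrors tf.concat(1, [padding, output1])

print(h2_after.shape)   # (4, 8): the output size is unchanged, only lstm2's input width grows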
49 changes: 38 additions & 11 deletions preprocessing.py
@@ -12,27 +12,54 @@
import skimage
from cnn_util import *

def preprocess_frame(frame):
short_edge = min(frame.shape[:2])
yy = int((frame.shape[0] - short_edge) / 2)
xx = int((frame.shape[1] - short_edge) / 2)
crop_img = frame[yy : yy + short_edge, xx : xx + short_edge]
resized_img = skimage.transform.resize(crop_img, (224, 224))

return resized_img
def preprocess_frame(image, target_height=224, target_width=224):

if len(image.shape) == 2:
image = np.tile(image[:,:,None], 3)
elif len(image.shape) == 4:
image = image[:,:,:,0]

image = skimage.img_as_float(image).astype(np.float32)
height, width, rgb = image.shape
if width == height:
resized_image = cv2.resize(image, (target_height,target_width))

elif height < width:
resized_image = cv2.resize(image, (int(width * float(target_height)/height), target_width))
cropping_length = int((resized_image.shape[1] - target_height) / 2)
resized_image = resized_image[:,cropping_length:resized_image.shape[1] - cropping_length]

else:
resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width)))
cropping_length = int((resized_image.shape[0] - target_width) / 2)
resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length,:]

return cv2.resize(resized_image, (target_height, target_width))

def main():
num_frames = 80
vgg_model = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers.caffemodel'
vgg_deploy = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers_deploy.prototxt'
video_save_path = '/media/storage3/Study/data/youtube_videos'
videos = os.listdir(video_save_path)
video_path = '/media/storage3/Study/data/youtube_videos'
video_save_path = '/media/storage3/Study/data/youtube_feats'
videos = os.listdir(video_path)
videos = filter(lambda x: x.endswith('avi'), videos)

cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224)

for video in videos:
video_fullpath = os.path.join(video_save_path, video)
cap = cv2.VideoCapture( video_fullpath )
print video

if os.path.exists( os.path.join(video_save_path, video) ):
print "Already processed ... "
continue

video_fullpath = os.path.join(video_path, video)
try:
cap = cv2.VideoCapture( video_fullpath )
except:
pass

frame_count = 0
frame_list = []
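The rewritten preprocess_frame normalizes the input to a 3-channel float image, resizes so the shorter side reaches the target resolution, and center-crops the longer side, instead of cropping before resizing as the old version did. Below is a hypothetical sketch of how such a function might be driven from main(); the even frame subsampling and the cnn.get_features(...) call are placeholders and assumptions (the real loop continues past the end of this hunk), and preprocess_frame is assumed to be the function defined in the diff above.

# Hypothetical usage sketch, not the repository's exact code.
import cv2
import numpy as np

def load_and_crop_frames(video_fullpath, num_frames=80):
    cap = cv2.VideoCapture(video_fullpath)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()

    frames = np.array(frames)
    if len(frames) > num_frames:
        # Keep at most num_frames frames, sampled evenly across the clip.
        idx = np.linspace(0, len(frames) - 1, num_frames).astype(int)
        frames = frames[idx]

    # Resize/center-crop each frame to 224x224 before feature extraction.
    return np.array([preprocess_frame(f) for f in frames])

# cropped = load_and_crop_frames(video_fullpath)
# feats = cnn.get_features(cropped)            # placeholder for the repo's CNN wrapper
# np.save(os.path.join(video_save_path, video + '.npy'), feats)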
