There were problems in the second LSTM. Now fixed (hopefully)
jazzsaxmafia committed Dec 13, 2015
1 parent c1e06f9 commit 42acaec
Showing 3 changed files with 93 additions and 40 deletions.
Binary file modified cnn_util.pyc
Binary file not shown.
84 changes: 55 additions & 29 deletions model.py
@@ -48,6 +48,8 @@ def build_model(self):
state2 = tf.zeros([self.batch_size, self.lstm2.state_size])
padding = tf.zeros([self.batch_size, self.dim_hidden])

probs = []

loss = 0.0

for i in range(self.n_lstm_steps): ## Phase 1 => only read frames
@@ -58,7 +60,7 @@ def build_model(self):
output1, state1 = self.lstm1( image_emb[:,i,:], state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( padding, state2 )
output2, state2 = self.lstm2( tf.concat(1,[padding, output1]), state2 )

# Each video might have different length. Need to mask those.
# But how? Padding with 0 would be enough?
@@ -75,7 +77,7 @@ def build_model(self):
output1, state1 = self.lstm1( padding, state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( current_embed, state2 )
output2, state2 = self.lstm2( tf.concat(1,[current_embed, output1]), state2 )

labels = tf.expand_dims(caption[:,i], 1)
indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
@@ -86,11 +88,13 @@ def build_model(self):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words, onehot_labels)
cross_entropy = cross_entropy * caption_mask[:,i]

probs.append(logit_words)

current_loss = tf.reduce_sum(cross_entropy)
loss += current_loss

loss = loss / tf.reduce_sum(caption_mask)
return loss, video, video_mask, caption, caption_mask
return loss, video, video_mask, caption, caption_mask, probs


def build_generator(self):
@@ -107,14 +111,17 @@ def build_generator(self):

generated_words = []

probs = []
embeds = []

for i in range(self.n_lstm_steps):
if i > 0: tf.get_variable_scope().reuse_variables()

with tf.variable_scope("LSTM1"):
output1, state1 = self.lstm1( image_emb[:,i,:], state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( padding, state2 )
output2, state2 = self.lstm2( tf.concat(1,[padding,output1]), state2 )

for i in range(self.n_lstm_steps):

@@ -127,23 +134,26 @@ def build_generator(self):
output1, state1 = self.lstm1( padding, state1 )

with tf.variable_scope("LSTM2"):
output2, state2 = self.lstm2( current_embed, state2 )
output2, state2 = self.lstm2( tf.concat(1,[current_embed,output1]), state2 )

logit_words = tf.nn.xw_plus_b( output2, self.embed_word_W, self.embed_word_b)
max_prob_index = tf.argmax(logit_words, 1)[0]
generated_words.append(max_prob_index)
probs.append(logit_words)

with tf.device("/cpu:0"):
current_embed = tf.nn.embedding_lookup(self.Wemb, max_prob_index)
current_embed = tf.expand_dims(current_embed, 0)

return video, video_mask, generated_words
embeds.append(current_embed)

return video, video_mask, generated_words, probs, embeds


############### Global Parameters ###############
video_path = '/media/storage3/Study/data/youtube_videos'
video_data_path='./data/video_corpus.csv'
video_save_path = '/media/storage3/Study/data/youtube_videos'
video_feat_path = '/media/storage3/Study/data/youtube_feats'

vgg16_path = '/home/taeksoo/Package/tensorflow_vgg16/vgg16.tfmodel'

@@ -153,15 +163,15 @@ def build_generator(self):
dim_hidden= 256
n_frame_step = 80
n_epochs = 1000
batch_size = 50
batch_size = 100
learning_rate = 0.001
##################################################

def get_video_data(video_data_path, video_save_path, train_ratio=0.9):
def get_video_data(video_data_path, video_feat_path, train_ratio=0.9):
video_data = pd.read_csv(video_data_path, sep=',')
video_data = video_data[video_data['Language'] == 'English']
video_data['video_path'] = video_data.apply(lambda row: row['VideoID']+'_'+str(row['Start'])+'_'+str(row['End'])+'.avi.npy', axis=1)
video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_save_path, x))
video_data['video_path'] = video_data['video_path'].map(lambda x: os.path.join(video_feat_path, x))
video_data = video_data[video_data['video_path'].map(lambda x: os.path.exists( x ))]
video_data = video_data[video_data['Description'].map(lambda x: isinstance(x, str))]

@@ -206,11 +216,11 @@ def preProBuildWordVocab(sentence_iterator, word_count_threshold=5): # borrowed
return wordtoix, ixtoword, bias_init_vector

def train():
train_data, _ = get_video_data(video_data_path, video_save_path, train_ratio=0.9)
train_data, _ = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
captions = train_data['Description'].values
captions = map(lambda x: x.replace('.', ''), captions)
captions = map(lambda x: x.replace(',', ''), captions)
wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions)
wordtoix, ixtoword, bias_init_vector = preProBuildWordVocab(captions, word_count_threshold=10)

np.save('./data/ixtoword', ixtoword)

@@ -222,7 +232,7 @@ def train():
n_lstm_steps=n_frame_step,
bias_init_vector=bias_init_vector)

tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask = model.build_model()
tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()
sess = tf.InteractiveSession()

saver = tf.train.Saver(max_to_keep=10)
@@ -234,11 +244,14 @@ def train():
np.random.shuffle(index)
train_data = train_data.ix[index]

current_train_data = train_data.groupby('video_path').apply(lambda x: x.irow(np.random.choice(len(x))))
current_train_data = current_train_data.reset_index(drop=True)

for start,end in zip(
range(0, len(train_data), batch_size),
range(batch_size, len(train_data), batch_size)):
range(0, len(current_train_data), batch_size),
range(batch_size, len(current_train_data), batch_size)):

current_batch = train_data[start:end]
current_batch = current_train_data[start:end]
current_videos = current_batch['video_path'].values

current_feats = np.zeros((batch_size, n_frame_step, dim_image))
@@ -261,6 +274,11 @@ def train():
for ind, row in enumerate(current_caption_masks):
row[:nonzeros[ind]] = 1

probs_val = sess.run(tf_probs, feed_dict={
tf_video:current_feats,
tf_caption: current_caption_matrix
})

_, loss_val = sess.run(
[train_op, tf_loss],
feed_dict={
@@ -271,11 +289,14 @@ def train():
})

print loss_val
print "Epoch ", epoch, " is done. Saving the model ..."
saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
if np.mod(epoch, 100) == 0:
print "Epoch ", epoch, " is done. Saving the model ..."
saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)

def test(model_path='models/model-700', video_feat_path=video_feat_path):

def test(model_path='models/model-45', video_feat_path='/media/storage3/Study/data/youtube_videos/hxZ-5wELSJM_0_12.avi.npy'):
_, test_data = get_video_data(video_data_path, video_save_path, train_ratio=0.9)
train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.9)
test_videos = test_data['video_path'].unique()
ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

model = Video_Caption_Generator(
@@ -286,22 +307,27 @@ def test(model_path='models/model-45', video_feat_path='/media/storage3/Study/da
n_lstm_steps=n_frame_step,
bias_init_vector=None)

video_tf, video_mask_tf, caption_tf = model.build_generator()
video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
sess = tf.InteractiveSession()

saver = tf.train.Saver()
saver.restore(sess, model_path)

video_feat = np.load(video_feat_path)[None,...]
video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))
for video_feat_path in test_videos:
print video_feat_path
video_feat = np.load(video_feat_path)[None,...]
video_mask = np.ones((video_feat.shape[0], video_feat.shape[1]))

generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
generated_words = ixtoword[generated_word_index]
generated_word_index = sess.run(caption_tf, feed_dict={video_tf:video_feat, video_mask_tf:video_mask})
probs_val = sess.run(probs_tf, feed_dict={video_tf:video_feat})
embed_val = sess.run(last_embed_tf, feed_dict={video_tf:video_feat})
generated_words = ixtoword[generated_word_index]

punctuation = np.argmax(np.array(generated_words) == '.')+1
generated_words = generated_words[:punctuation]
punctuation = np.argmax(np.array(generated_words) == '.')+1
generated_words = generated_words[:punctuation]

generated_sentence = ' '.join(generated_words)
print generated_sentence
generated_sentence = ' '.join(generated_words)
print generated_sentence
ipdb.set_trace()

ipdb.set_trace()
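The substance of this fix is the input wiring of the second LSTM. Before the commit, lstm2 received only the zero padding vector during the frame-reading phase and only the word embedding current_embed during the generation phase; afterwards it receives that vector concatenated with lstm1's output at the same time step (tf.concat(1, [...])), so the caption LSTM sees the video LSTM's output at every step, in the spirit of a stacked S2VT-style encoder-decoder. Because the concatenated input is wider, lstm2's internal weight matrix changes shape, so checkpoints saved before this commit will generally not restore into the new graph. The following is a framework-free sketch of the wiring change, with toy NumPy stand-ins for the LSTM cells and made-up sizes; it is an illustration, not the repository's code.

import numpy as np

batch_size, dim_hidden, dim_embed = 4, 8, 8   # toy sizes, not the repository's settings

def toy_lstm_step(x, h, w):
    # Stand-in for an LSTM cell: one affine map plus tanh. Only the shapes matter here.
    return np.tanh(np.dot(np.concatenate([x, h], axis=1), w))

w1     = 0.1 * np.random.randn(dim_hidden + dim_hidden, dim_hidden)               # lstm1: frame embedding + previous hidden state
w2_old = 0.1 * np.random.randn(dim_embed + dim_hidden, dim_hidden)                # lstm2 before the fix: word/padding vector only
w2_new = 0.1 * np.random.randn(dim_embed + dim_hidden + dim_hidden, dim_hidden)   # lstm2 after the fix: word/padding vector plus lstm1's output

h1 = np.zeros((batch_size, dim_hidden))
h2 = np.zeros((batch_size, dim_hidden))

frame_emb = np.random.randn(batch_size, dim_hidden)   # one encoded video frame
padding   = np.zeros((batch_size, dim_embed))         # decoder-side input while frames are being read

h1 = toy_lstm_step(frame_emb, h1, w1)

h2_before = toy_lstm_step(padding, h2, w2_old)                                  # old wiring: lstm2 never sees h1
h2_after  = toy_lstm_step(np.concatenate([padding, h1], axis=1), h2, w2_new)    # new wiring: mirrors tf.concat(1, [padding, output1])

print(h2_after.shape)   # (4, 8): the output size is unchanged, only lstm2's input width grows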
49 changes: 38 additions & 11 deletions preprocessing.py
@@ -12,27 +12,54 @@
import skimage
from cnn_util import *

def preprocess_frame(frame):
short_edge = min(frame.shape[:2])
yy = int((frame.shape[0] - short_edge) / 2)
xx = int((frame.shape[1] - short_edge) / 2)
crop_img = frame[yy : yy + short_edge, xx : xx + short_edge]
resized_img = skimage.transform.resize(crop_img, (224, 224))

return resized_img
def preprocess_frame(image, target_height=224, target_width=224):

if len(image.shape) == 2:
image = np.tile(image[:,:,None], 3)
elif len(image.shape) == 4:
image = image[:,:,:,0]

image = skimage.img_as_float(image).astype(np.float32)
height, width, rgb = image.shape
if width == height:
resized_image = cv2.resize(image, (target_height,target_width))

elif height < width:
resized_image = cv2.resize(image, (int(width * float(target_height)/height), target_width))
cropping_length = int((resized_image.shape[1] - target_height) / 2)
resized_image = resized_image[:,cropping_length:resized_image.shape[1] - cropping_length]

else:
resized_image = cv2.resize(image, (target_height, int(height * float(target_width) / width)))
cropping_length = int((resized_image.shape[0] - target_width) / 2)
resized_image = resized_image[cropping_length:resized_image.shape[0] - cropping_length,:]

return cv2.resize(resized_image, (target_height, target_width))

def main():
num_frames = 80
vgg_model = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers.caffemodel'
vgg_deploy = '/home/taeksoo/Package/caffe/models/vgg/VGG_ILSVRC_19_layers_deploy.prototxt'
video_save_path = '/media/storage3/Study/data/youtube_videos'
videos = os.listdir(video_save_path)
video_path = '/media/storage3/Study/data/youtube_videos'
video_save_path = '/media/storage3/Study/data/youtube_feats'
videos = os.listdir(video_path)
videos = filter(lambda x: x.endswith('avi'), videos)

cnn = CNN(model=vgg_model, deploy=vgg_deploy, width=224, height=224)

for video in videos:
video_fullpath = os.path.join(video_save_path, video)
cap = cv2.VideoCapture( video_fullpath )
print video

if os.path.exists( os.path.join(video_save_path, video) ):
print "Already processed ... "
continue

video_fullpath = os.path.join(video_path, video)
try:
cap = cv2.VideoCapture( video_fullpath )
except:
pass

frame_count = 0
frame_list = []
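The rewritten preprocess_frame normalizes the input to a 3-channel float image, resizes so the shorter side reaches the target resolution, and center-crops the longer side, instead of cropping before resizing as the old version did. Below is a hypothetical sketch of how such a function might be driven from main(); the even frame subsampling and the cnn.get_features(...) call are placeholders and assumptions (the real loop continues past the end of this hunk), and preprocess_frame is assumed to be the function defined in the diff above.

# Hypothetical usage sketch, not the repository's exact code.
import cv2
import numpy as np

def load_and_crop_frames(video_fullpath, num_frames=80):
    cap = cv2.VideoCapture(video_fullpath)
    frames = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()

    frames = np.array(frames)
    if len(frames) > num_frames:
        # Keep at most num_frames frames, sampled evenly across the clip.
        idx = np.linspace(0, len(frames) - 1, num_frames).astype(int)
        frames = frames[idx]

    # Resize/center-crop each frame to 224x224 before feature extraction.
    return np.array([preprocess_frame(f) for f in frames])

# cropped = load_and_crop_frames(video_fullpath)
# feats = cnn.get_features(cropped)            # placeholder for the repo's CNN wrapper
# np.save(os.path.join(video_save_path, video + '.npy'), feats)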
