In [1]:
from easydict import EasyDict as edit
import torch
import numpy as np
import os
import re
import pickle
from torchvision import transforms
from build_vocab import Vocabulary
from model import EncoderCNN, DecoderRNN
from PIL import Image

In [2]:
# Runtime configuration shared by the cells below.
args = edit({
    # Path of a single image; placeholder value that a later cell reassigns.
    "image": "./",
    # First GPU when CUDA is available, otherwise the CPU.
    # The device string must not contain whitespace: torch.device accepts
    # 'cuda:0' but rejects 'cuda: 0' as an invalid device string.
    "device": torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    # Directory whose entries are listed as videos; each entry is itself
    # listed to obtain that video's frame files.
    "video_path": './2fps_sample_video',
    # Checkpoints read with torch.load and fed to load_state_dict.
    "encoder_path": './models/encoder-5-3000.ckpt',
    "decoder_path": './models/decoder-5-3000.ckpt',
    # Pickle file that holds the vocabulary object.
    "vocab_path": './data/vocab.pkl',
    # Constructor arguments of EncoderCNN / DecoderRNN.
    "embed_size": 256,
    "hidden_size": 512,
    "num_layers": 1,
})

In [3]:
# Preprocessing pipeline handed to load_image: convert the PIL image to a
# tensor, then normalise it with the mean / std constants below
# (presumably the usual ImageNet statistics -- confirm against training).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.485, 0.456, 0.406),
        std=(0.229, 0.224, 0.225),
    ),
])

In [4]:
# Restore the vocabulary object from its pickle file.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point args.vocab_path at files you trust.
with open(args.vocab_path, mode='rb') as vocab_file:
    vocab = pickle.load(vocab_file)

In [5]:
# Build both networks with the sizes from `args`.
# This notebook only runs inference, so both modules are switched to
# evaluation mode; the original left the decoder in training mode, which
# would keep any dropout / batch-norm layers in their training behaviour.
encoder = EncoderCNN(args.embed_size).eval()
decoder = DecoderRNN(args.embed_size, args.hidden_size, len(vocab), args.num_layers).eval()

# Move the modules to the configured device.
encoder = encoder.to(args.device)
decoder = decoder.to(args.device)

In [6]:
# Load the trained weights.
# map_location remaps the stored tensors onto args.device, so checkpoints
# that were saved on a GPU can still be loaded on a CPU-only machine.
encoder.load_state_dict(torch.load(args.encoder_path, map_location=args.device))
decoder.load_state_dict(torch.load(args.decoder_path, map_location=args.device))

In [7]:
# Point args.image at a sample image file.
# NOTE(review): no cell visible in this notebook reads args.image afterwards.
args.image = './png/1.jpg'

In [8]:
def load_image(image_path, transform=None):
    """Load an image as grayscale, resize it to 224x224 and give it three channels.

    Args:
        image_path: Path of the image file to open.
        transform: Optional callable applied to the resized single-channel
            PIL image. Its result must support ``unsqueeze(0)`` and
            ``torch.cat`` (i.e. be a tensor).

    Returns:
        With ``transform``: the transformed image with a leading dimension
        added, concatenated with itself three times along dimension 1.
        Without ``transform``: the resized image as an ``'RGB'`` PIL image,
        i.e. the grayscale values copied into three channels.
    """
    # The context manager releases the file handle as soon as the pixel
    # data has been converted.
    with Image.open(image_path) as source:
        image = source.convert('L')
    image = image.resize([224, 224], Image.LANCZOS)

    if transform is None:
        # torch.cat only accepts tensors, so the original crashed here when
        # called with the default transform=None. Return a PIL image instead.
        return image.convert('RGB')

    image = transform(image).unsqueeze(0)
    # Replicate the single channel three times along dimension 1.
    return torch.cat((image, image, image), 1)

In [21]:
# Caption every frame of the first video directory found under args.video_path.
for video_name in os.listdir(args.video_path):
    video_path = os.path.join(args.video_path, video_name)
    # Stray files next to the video folders (e.g. .DS_Store) cannot be
    # listed as a directory; skip them instead of crashing.
    if not os.path.isdir(video_path):
        continue
    print(video_name)

    # Order the frames by the integer prefix of their file name. Entries
    # without a numeric prefix are skipped (previously they raised
    # AttributeError on `re.match(...).group()`).
    numbered_frames = []
    for frame_name in os.listdir(video_path):
        prefix = re.match(r'(\d+)', frame_name)
        if prefix is not None:
            numbered_frames.append((int(prefix.group()), frame_name))
    frame_list = [frame_name for _, frame_name in sorted(numbered_frames, key=lambda pair: pair[0])]

    # One entry per frame: the second value returned by decoder.sample.
    out_list = []
    # Inference only: without no_grad every tensor kept in out_list would
    # also keep its autograd graph alive.
    with torch.no_grad():
        for i, frame in enumerate(frame_list):
            frame_file = os.path.join(video_path, frame)

            # Prepare an image
            image = load_image(frame_file, transform)
            image_tensor = image.to(args.device)

            # Generate a caption from the image
            feature = encoder(image_tensor)
            sampled_ids, out_feature_list = decoder.sample(feature)
            out_list.append(out_feature_list)
            sampled_ids = sampled_ids[0].cpu().numpy()

            # Convert word_ids to words, stopping after the '<end>' token
            sampled_caption = []
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            print("{}, {}".format(i, sentence))
    # Only the first video directory is processed.
    break

EYqVtI9YWJA
0, <start> a black and white photo of a pair of scissors . <end>
1, <start> a white bed with a white comforter and a black cat laying on it . <end>
2, <start> a couple of pictures of a building with a clock . <end>
3, <start> a sign that says <unk> <unk> <unk> <unk> <unk> . <end>
4, <start> a sign for a barber shop with a clock on it . <end>
5, <start> a sign for a barber shop with a clock on it . <end>
6, <start> a sign for a <unk> <unk> <unk> <unk> <unk> <unk> <unk> . <end>
7, <start> a sign that says <unk> street and a building in the background . <end>
8, <start> a sign for a barber shop with a sign on it . <end>
9, <start> a sign for a barber shop with a sign on it . <end>
10, <start> a sign for a barber shop with a sign on it . <end>
11, <start> a sign for a barber shop with a sign on it . <end>
12, <start> a sign for a barber shop with a sign on it . <end>
13, <start> a street sign with a street sign on top of it . <end>
14, <start> a street sign with a street sign o

121, <start> a group of people standing around a table with a cake . <end>
122, <start> a group of people standing around a table with a cake . <end>
123, <start> a group of people standing around a tennis court . <end>
124, <start> a group of people standing around a tennis court . <end>
125, <start> a group of people standing around a table with a cake . <end>
126, <start> a group of people standing around a horse drawn carriage . <end>
127, <start> a group of people standing around a table with a cake . <end>
128, <start> a man in a suit and tie with a woman in a suit and hat . <end>
129, <start> a man in a suit and tie is looking at a cell phone . <end>
130, <start> a man in a suit and tie is looking at a phone . <end>
131, <start> a man in a suit and tie is holding a glass of wine . <end>
132, <start> a man in a suit and tie is standing in front of a mirror . <end>
133, <start> a man and woman standing next to each other . <end>
134, <start> a man and woman standing next to each o

237, <start> a man in a suit and tie holding a glass of wine . <end>
238, <start> a man in a suit and tie with a microphone <end>
239, <start> a man in a suit and tie with a microphone <end>
240, <start> a man in a suit and tie holding a glass of wine . <end>
241, <start> a man in a suit and tie holding a glass of wine . <end>
242, <start> a man in a suit and tie holding a glass of wine . <end>
243, <start> a man holding a glass of wine in his hand . <end>
244, <start> a man in a suit and tie is looking at a cell phone . <end>
245, <start> a man in a suit and tie holding a glass of wine . <end>
246, <start> a man in a suit and tie with a tie . <end>
247, <start> a man in a tie and a woman in a white background . <end>
248, <start> a man in a suit and tie holding a glass of wine . <end>
249, <start> a man in a suit and tie holding a knife . <end>
250, <start> a man in a suit and tie holding a glass of wine . <end>
251, <start> a man in a suit and tie with a microphone <end>
252, <start>

359, <start> a man riding a skateboard down a street . <end>
360, <start> a group of people standing around a tennis court . <end>
361, <start> a group of people standing around a table with a cake . <end>
362, <start> a group of people standing around a table with a umbrella . <end>
363, <start> a group of people standing around a table with a horse and a basket . <end>
364, <start> a black and white photo of a man in a kitchen . <end>
365, <start> a man is standing on a skateboard in front of a crowd . <end>
366, <start> a man in a white shirt and black pants is on a skateboard . <end>
367, <start> a group of people standing around a table with a cake . <end>
368, <start> a man is standing in front of a large crowd . <end>
369, <start> a man in a suit and tie holding a microphone . <end>
370, <start> a man in a suit and tie holding a microphone . <end>
371, <start> a black and white photo of a man in a suit and tie . <end>
372, <start> a black and white photo of a man on a skateboard

In [21]:
# Stack the list that decoder.sample returned for the first frame into a
# single tensor on the CPU. `.detach()` replaces the legacy `.data`
# attribute; the resulting values are the same.
fea = torch.stack(out_list[0]).detach().cpu()

In [22]:
# Inspect the shape of the stacked tensor
# (the recorded run displayed torch.Size([20, 1, 9956])).
fea.shape

torch.Size([20, 1, 9956])

In [50]:
# The four lines below appear to be captions pasted from an earlier run,
# kept for reference only.
# 4, <start> a man holding a sign that says `` <unk> '' on the screen . <end>
# 5, <start> a man holding a sign that says `` <unk> '' on the screen . <end>
# 6, <start> a man and woman standing in front of a large screen . <end>
# 7, <start> a man holding a sign that says `` <unk> '' on the screen . <end>
# Call the vocabulary with a word; the recorded output was 7
# (presumably the word's index -- confirm in build_vocab).
vocab('and')

7