# Tutorial code for W2VV++ based sentence and video embedding

In [1]:
import os
import random
import torch
import numpy as np

from model import get_model
from bigfile import BigFile
from evaluation import compute_sim
from common import ROOT_PATH as rootpath

### Load checkpoint to initialize model

In [2]:
# Path of the pretrained W2VV++ checkpoint under the data root.
model_path = os.path.join(rootpath, 'w2vvpp', 'w2vvpp_resnext101_resnet152_subspace_v190916.pth.tar')

# Without CUDA, ask torch.load to remap the stored tensors to the CPU;
# with CUDA, keep torch.load's default placement (map_location=None).
map_location = None if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(model_path, map_location=map_location)

best_perf = checkpoint['best_perf']
config = checkpoint['config']

# If the saved config carries a word2vec text encoder entry, point it at the
# word2vec binary under the local data root.
if hasattr(config, 't2v_w2v'):
    w2v_feature_file = os.path.join(rootpath, 'word2vec', 'flickr', 'vec500flickr30m', 'feature.bin')
    config.t2v_w2v.w2v.binary_file = w2v_feature_file

# Build the network from the saved config, then restore the trained weights.
model_cls = get_model('w2vvpp')
model = model_cls(config)
model.load_state_dict(checkpoint['model'])

print("=> loaded checkpoint '{}' (best_perf {})".format(model_path, best_perf))

# Show the two sub-networks (visual side, textual side).
print(model.vis_net)
print(model.txt_net)

=> loaded checkpoint '/data/home/xcx/VisualSearch/w2vvpp/w2vvpp_resnext101_resnet152_subspace_v190916.pth.tar' (best_perf 0.55695818775)
VisTransformNet(
  (fc1): Linear(in_features=4096, out_features=2048, bias=True)
  (activation): Tanh()
  (dropout): Dropout(p=0.2)
)
MultiScaleTxtNet(
  (encoder): MultiScaleTxtEncoder(
    (rnn_encoder): GruTxtEncoder(
      (we): Embedding(11286, 500)
      (rnn): GRU(500, 1024, batch_first=True)
    )
    (w2v_encoder): W2VTxtEncoder()
    (bow_encoder): BoWTxtEncoder()
  )
  (transformer): TxtTransformNet(
    (fc1): Linear(in_features=12671, out_features=2048, bias=True)
    (activation): Tanh()
    (dropout): Dropout(p=0.2)
  )
)


### Embed video features

In [3]:
# Open the pre-extracted video features of the chosen collection.
video_collection = 'tv2016train'
feat_name = 'mean_resnext101_resnet152'
vid_feat_dir = os.path.join(rootpath, video_collection, 'FeatureData', feat_name)
vid_feat_file = BigFile(vid_feat_dir)

# Read the features of every video listed in the file. `renamed` is used
# later as the per-vector list of video names (paired with `vectors` by index).
videoset = vid_feat_file.names
renamed, vectors = vid_feat_file.read(videoset)
nr_videos = len(renamed)

# Embed the videos one at a time and stack the results into a single array.
vis_embs = []
for vec in vectors:
    vis_embs.append(model.embed_vis(vec)[0].numpy())
vis_vecs = np.array(vis_embs)

[BigFile] 200x4096 instances loaded from /data/home/xcx/VisualSearch/tv2016train/FeatureData/mean_resnext101_resnet152


### Embed a single sentence

In [4]:
# Free-text query to be matched against the video embeddings.
sent = 'a dog is playing with a cat'

# Embed the query with the text branch, then convert the result to numpy.
sent_emb = model.embed_txt(sent)
sent_vec = sent_emb.numpy()

### Compute text2video similarity

In [5]:
# Cosine similarity between the query and all video embeddings; the first
# entry of the result holds the scores used for ranking.
sims = compute_sim(sent_vec, vis_vecs, measure='cosine')[0]

# Pair every video name with its score and order from most to least similar.
ranklist = sorted(
    ((renamed[idx], score) for idx, score in enumerate(sims)),
    key=lambda pair: pair[1],
    reverse=True)

# Top-5 videos for the query.
print(ranklist[:5])

cosine_sim execution time: 0.036 seconds

[('tv2016train_video92', 0.43230182), ('tv2016train_video147', 0.38227573), ('tv2016train_video128', 0.3500569), ('tv2016train_video14', 0.28239024), ('tv2016train_video195', 0.27338427)]


### Embed sentences

In [6]:
# Caption file of the collection. On each line the first space-separated token
# (presumably a sentence id) is dropped and the remainder is kept as the caption.
caption_file = os.path.join(rootpath, video_collection, 'TextData', video_collection+'.caption.txt')

# Use a context manager so the file handle is closed deterministically, and
# skip whitespace-only lines, which would otherwise raise IndexError on `[1]`.
with open(caption_file) as fin:
    sentences = [line.strip().split(' ', 1)[1] for line in fin if line.strip()]

# Embed each caption individually and stack the embeddings into one array.
sent_vecs = np.array([model.embed_txt(sent)[0].numpy() for sent in sentences])

### Compute text2text similarity

In [7]:
# Pick one caption at random to act as the query (no seed is set, so the
# chosen query differs between runs).
qry_idx = random.randint(0, len(sentences)-1)
# Wrap the query embedding in a list so it is passed as a batch of one.
qry_sent_vec = np.array([sent_vecs[qry_idx]])

# Score the query against every caption embedding (itself included) and sort
# from most to least similar.
ranklist = [(sentences[i], sim) for i, sim in enumerate(compute_sim(qry_sent_vec, sent_vecs, measure='cosine')[0])]
ranklist.sort(key=lambda v:v[1], reverse=True)

# Parenthesised single-argument print works identically on Python 2 and 3,
# and matches the print(...) style of the earlier cells.
print('query: %s\n' % sentences[qry_idx])
print('\n'.join(['%s %f'%(x[0], x[1]) for x in ranklist[:5]]))

cosine_sim execution time: 0.013 seconds

query: a man lying on a bed

a man lying on a bed 1.000000
a person is lying on a bed 0.921146
A woman cries on a bed 0.626783
A man makes noises in front of a bed 0.584935
a man sitting in a car 0.417096
