In [1]:
import torch, tensorflow as tf


if torch.cuda.is_available():
    print("GPU is available (Torch).")
    _device = torch.device("cuda")
    print("Current GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU found. Using CPU.")
    _device = torch.device("cpu")


# Check if GPU is available
if tf.config.list_physical_devices('GPU'):
    print("GPU is available (Tensorflow).")
    print("GPU Devices:")
    for device in tf.config.list_physical_devices('GPU'):
        print(device)
else:
    print("No GPU found. Using CPU. (Tensorflow)")

GPU is available (Torch).
Current GPU: GeForce GTX 1080 Ti
No GPU found. Using CPU. (Tensorflow)


In [2]:
import os, sys
from pycocotools.coco import COCO
import urllib, zipfile
from nlp_utils import *

from models import EncoderCNN_Resnet50, EncoderCNN_VGG19, DecoderRNN
from Data_loader import *
from torchvision import transforms

from collections import defaultdict

import pickle5 as pickle
import matplotlib.pyplot as plt 
import skimage.io as io 
import numpy as np 
%matplotlib inline 
from Data_loader_val import get_loader as val_get_loader

## Load the saved checkpoint

In [3]:
embed_size=256
hidden_size = 512
batch_size=128
vocab_threshold=8
transform_train = transforms.Compose([ 
    transforms.Resize(256),                          # smaller edge of image resized to 256
    transforms.RandomCrop(224),                      # get 224x224 crop from random location
    transforms.RandomHorizontalFlip(),               # horizontally flip image with probability=0.5
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])
# Obtain the data loader.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_threshold=vocab_threshold,
                         vocab_from_file=False,
                         cocoapi_loc = 'opt')

loading annotations into memory...
Done (t=1.39s)
creating index...
index created!
[0/414113] Tokenizing captions...
[100000/414113] Tokenizing captions...
[200000/414113] Tokenizing captions...
[300000/414113] Tokenizing captions...
[400000/414113] Tokenizing captions...
loading annotations into memory...
Done (t=0.82s)
creating index...


  0%|          | 397/414113 [00:00<01:44, 3969.47it/s]

index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [01:36<00:00, 4281.73it/s]


## Create Dataloader for test data :

In [4]:
vocab_size = len(data_loader.dataset.vocab)

model_save_path = "checkpoint_resnet50"

# Define a transform to pre-process the training images.
transform_test = transforms.Compose([ 
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),                           # convert the PIL Image to a tensor
    transforms.Normalize((0.485, 0.456, 0.406),      # normalize image for pre-trained model
                         (0.229, 0.224, 0.225))])

# Obtain the data loader.
val_data_loader = val_get_loader(transform=transform_test, mode='valid', cocoapi_loc='opt')
encoder_file = "encoderdata_1.pkl"
decoder_file = "decoderdata_1.pkl"

encoder = EncoderCNN_Resnet50(embed_size)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

# Moving models to GPU if CUDA is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder.to(device)
decoder.to(device)


# Loading the trained weights
encoder.load_state_dict(torch.load(os.path.join(model_save_path, encoder_file)))
decoder.load_state_dict(torch.load(os.path.join(model_save_path, decoder_file)))

encoder.eval()
decoder.eval()

Vocabulary successfully loaded from vocab.pkl file!


DecoderRNN(
  (embed): Embedding(7072, 256)
  (lstm): LSTM(256, 512, batch_first=True)
  (linear): Linear(in_features=512, out_features=7072, bias=True)
)

In [5]:
# infer captions for all images
pred_result = defaultdict(list)
for img_id, img in tqdm(val_data_loader):
    img = img.to(device)
#     print(img.shape)
    
    with torch.no_grad():
        features = encoder(img).unsqueeze(1)
#         print(features.shape)
#         break
        output = decoder.sample(features)
    sentence = clean_sentence(output, val_data_loader.dataset.vocab.idx2word)
    pred_result[img_id.item()].append(sentence)

100%|██████████| 40504/40504 [57:16<00:00, 11.79it/s]  


In [6]:
with open(os.path.join('opt', "cocoapi", "annotations/captions_val2014.json"), "r") as f:
    caption = json.load(f)

valid_annot = caption["annotations"]
valid_result = defaultdict(list)
for i in valid_annot:
    valid_result[i["image_id"]].append(i["caption"].lower())

In [10]:
list(valid_result.values())[:4]

[['a bicycle replica with a clock as the front wheel.',
  'the bike has a clock as a tire.',
  'a black metal bicycle with a clock inside the front wheel.',
  'a bicycle figurine in which the front wheel is replaced with a clock\n',
  'a clock with the appearance of the wheel of a bicycle '],
 ['a black honda motorcycle parked in front of a garage.',
  'a honda motorcycle parked in a grass driveway',
  'a black honda motorcycle with a dark burgundy seat.',
  'ma motorcycle parked on the gravel in front of a garage',
  'a motorcycle with its brake extended standing outside'],
 ['a room with blue walls and a white sink and door.',
  'blue and white color scheme in a small bathroom.',
  'this is a blue and white bathroom with a wall sink and a lifesaver on the wall.',
  'a blue boat themed bathroom with a life preserver on the wall',
  'a bathroom with walls that are painted baby blue.'],
 ['a car that seems to be parked illegally behind a legally parked car',
  'two cars parked on the si

In [11]:
list(pred_result.values())[:4]

[[' a bench sitting on a park bench in a park.'],
 [' a giraffe standing in the middle of a field.'],
 [' a man in a suit and tie standing in a room.'],
 [' a man in a kitchen preparing food on a table.']]

In [9]:
bleu_score(true_sentences=valid_result, predicted_sentences=pred_result)

0.17755527547985703