# If using Colab

In [None]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')

# TODO: Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'CV_HW3/assignment3/'
FOLDERNAME = '/content/drive/MyDrive/CV HW3'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/{}'.format(FOLDERNAME))

# This downloads the COCO dataset to your Drive
# if it doesn't already exist.
%cd /content/drive/MyDrive/CV HW3/CV/datasets
!bash get_datasets.sh

%cd /content/drive/MyDrive/CV HW3

## Load data

In [1]:
# Setup cell.
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
# torch is used directly further down (seeding, device selection, torch.save),
# so import it explicitly instead of relying on the wildcard import below to
# bring it into scope.
import torch

from utils.transformer_layers import *
from utils.captioning_solver_transformer import CaptioningSolverTransformer
from utils.transformer import CaptioningTransformer
from utils.coco_utils import load_coco_data, decode_captions

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # Set default size of plots.
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between x and y.

    The error of each element is |x - y| / (|x| + |y|), with the denominator
    floored at 1e-8 so that a pair of zeros does not divide by zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

In [2]:
# Location of the COCO captioning data. Set the COCO_BASE_DIR environment
# variable to load it from somewhere else; the fallback is the machine-specific
# absolute path this notebook was originally run with.
COCO_BASE_DIR = os.environ.get(
    'COCO_BASE_DIR',
    '/Users/tt/Documents/Hanyang/3rd grade/Computer Vision/HW_3/NICE/datasets/coco_captioning',
)
data = load_coco_data(pca_features=False, base_dir=COCO_BASE_DIR)

base dir  /Users/tt/Documents/Hanyang/3rd grade/Computer Vision/HW_3/NICE/datasets/coco_captioning


In [3]:
from utils.coco_utils import sample_coco_minibatch

# Pull a single-example minibatch from the training split; the returned value
# unpacks into captions, features and urls.
minibatch = sample_coco_minibatch(data, split="train", batch_size=1)
captions, features, urls = minibatch

In [4]:
# Fix the torch and NumPy seeds before anything below draws random numbers.
torch.manual_seed(231)
np.random.seed(231)

# Location of the COCO captioning data. Set the COCO_BASE_DIR environment
# variable to load it from somewhere else; the fallback is the machine-specific
# absolute path this notebook was originally run with.
COCO_BASE_DIR = os.environ.get(
    'COCO_BASE_DIR',
    '/Users/tt/Documents/Hanyang/3rd grade/Computer Vision/HW_3/NICE/datasets/coco_captioning',
)
data = load_coco_data(pca_features=False, base_dir=COCO_BASE_DIR)

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
print(f"Now using {device} device")

transformer = CaptioningTransformer(
          word_to_idx=data['word_to_idx'],
          # Second dimension of the training feature array.
          input_dim=data['train_features'].shape[1],
          wordvec_dim=256,
          num_heads=4,
          num_layers=2,
          max_length=30
        ).to(device)


transformer_solver = CaptioningSolverTransformer(
           transformer, data, idx_to_word=data['idx_to_word'],
           num_epochs=30,
           batch_size=20,
           learning_rate=5e-4,
           verbose=True, print_every=1000,
         )



base dir  /Users/tt/Documents/Hanyang/3rd grade/Computer Vision/HW_3/NICE/datasets/coco_captioning


: 

In [None]:

# Run training with the solver configured above.
transformer_solver.train()

# Plot the training losses.
fig, ax = plt.subplots()
ax.plot(transformer_solver.loss_history)
ax.set(xlabel='Iteration', ylabel='Loss', title='Training loss history')
plt.show()

# Write the model's state dict to 'trained.pt' in the working directory.
torch.save(transformer.state_dict(), 'trained.pt')

## Make the prediction JSON file

In [None]:
import json
import os
from tqdm import tqdm

# google.colab can only be imported inside a Colab runtime. Guard the import so
# that a local kernel can still run the rest of this cell; `files` is None
# whenever the module is unavailable.
try:
    from google.colab import files
except ImportError:
    files = None

def generate_caption(feature):
    """Return the decoded caption for one feature input.

    Passes `feature` to the global `transformer`'s `sample` method, decodes the
    result with `decode_captions` using `data['idx_to_word']`, and returns the
    first decoded caption.
    """
    sampled = transformer.sample(feature)
    decoded = decode_captions(sampled, data['idx_to_word'])
    return decoded[0]

student_id = "2022094093"

# Insert a length-1 axis at position 1, so indexing a single entry below
# keeps a leading dimension of size 1.
nice_feat = np.expand_dims(data['nice_feature'], axis=1)

# One record per entry; image ids are 1-based.
pred = []
for idx in tqdm(range(len(nice_feat))):
    pred.append({
        'image_id': idx + 1,
        'caption': generate_caption(nice_feat[idx]),
    })

In [None]:
result = {"student_id" : student_id, "prediction" : pred}

# Write through a context manager so the file is flushed and closed before
# anything tries to read it back.
with open('prediction_baseline.json', 'w') as f:
    json.dump(result, f, indent='\t')

# The browser download helper only exists inside Colab. On any other kernel the
# JSON file is simply left in the current working directory.
try:
    from google.colab import files
except ImportError:
    print("Not running on Colab; 'prediction_baseline.json' was written to the current directory.")
else:
    files.download('prediction_baseline.json')