In [4]:
import os,sys,inspect
current_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

In [8]:
import random

# torch imports
import torch
import torch.nn as nn
import torch.optim as optim

# general imports
import json
import pickle
import time
import numpy as np

# custom imports
import agent
from toolbox import *

# SETTING CONSTANTS & INITIALIZATION

## Constants

In [30]:
uc = UniversalConstants()

seed = 123

random.seed(a=seed)

# setting torch seeds
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)

np.random.seed(seed)

# todo: mkdir all the needed paths
mkdir(os.path.join(current_dir, uc.paths['models']))

path_data = os.path.join(current_dir,'data/')

# Print after this many batches:
printerval = uc.printerval

## Interpret command line arguments

In [31]:
# batchsize, lr, setting = print_god_settings(sys.argv) todo: four next lines can be done just by this line as well
# print("\nSys.argv:", sys.argv)
batchsize = 40
lr = 0.001
setting = 'plasticity'

## Reading the Data

### Object vgg indices (object information)

In [32]:
path_ha_vggs_indices = os.path.join(path_data, "ha_vgg_indices.json")
ha_vggs_indices = load_data(path_ha_vggs_indices)

In [33]:
ha_vggs_indices['1295671216']

{'7639': [0, 1],
 '7640': [3, 4],
 '7641': [1, 2],
 '7642': [2, 3],
 '7643': [5, 11],
 '7644': [4, 5]}

### Regular data (dictionary with all images and their object ids, corresponding words)

In [34]:
path_dict_words_boxes = os.path.join(path_data, "dict_words_boxes.json")
dict_words_boxes = load_data(path_dict_words_boxes)

In [35]:
dict_words_boxes['4787655421']

{'202850': {'bnbox': [[270, 78, 375, 401]], 'word': 'horse'},
 '202851': {'bnbox': [[40, 104, 186, 215]], 'word': 'armor'},
 '202852': {'bnbox': [[155, 225, 319, 407]], 'word': 'shield'}}

### Splits

#### Train split, image ids

In [36]:
path_train_data = os.path.join(path_data, "train_data.txt")
train_data = load_data(path_train_data)

In [37]:
train_data

['4787655421',
 '5041370727',
 '2604462733',
 '3338289816',
 '3457784061',
 '231584770',
 '336551609',
 '3539612893',
 '3527572464',
 '8140285876',
 '4937203156',
 '3593538248',
 '182030893',
 '4479855590',
 '2226898412',
 '2863186021',
 '2604357479',
 '2978024878',
 '4530729176',
 '4478363092',
 '4486660330',
 '3124964754',
 '2146316159',
 '3376435746',
 '4917929879',
 '2489084679',
 '6521812555',
 '4789959232',
 '2190137367',
 '3290465391',
 '5738179350',
 '4588654711',
 '3176072448',
 '4818210621',
 '6187191468',
 '7802586228',
 '27683428',
 '1408958345',
 '4979032188',
 '2644916196',
 '4045265278',
 '259611216',
 '2402793046',
 '6201592880',
 '3914751903',
 '4179455963',
 '2602679255',
 '5633096756',
 '3304484212',
 '2667015110',
 '3774622914',
 '112264837',
 '85600252',
 '2931721215',
 '41253213',
 '2528552898',
 '522700240',
 '473251659',
 '132489044',
 '281754914',
 '4813609768',
 '519228804',
 '2710280476',
 '518602687',
 '4661178321',
 '2217244400',
 '4547439172',
 '4981216283

#### Validation split, image ids

In [38]:
path_validation_data = os.path.join(path_data, "validation_data.txt")
validation_data = load_data(path_validation_data)

#### Test split, image ids

In [39]:
path_test_data = os.path.join(path_data, "test_data.txt")
test_data = load_data(path_test_data)

# PREPROCESSING

In [40]:
vocab, freq = make_vocabulary(dict_words_boxes)  # Makes a vocabulary of the entire set of objects
word_to_ix = make_index_table(vocab)  # Gives an index number to every word in the vocabulary todo: change name to
# indexed_vocabulary

In [41]:
vocab[:5]

['beak', 'fish', 'boy', 'hat', 'hands']

In [42]:
freq

{'beak': 5,
 'fish': 52,
 'boy': 1477,
 'hat': 1333,
 'hands': 345,
 'ladder': 85,
 'boat': 292,
 'water': 986,
 'shore': 45,
 'man': 7891,
 'plastic': 16,
 'playpen': 4,
 'girl': 1428,
 'spectators': 21,
 'toothbrush': 17,
 'paste': 1,
 'table': 814,
 'target': 7,
 'baby': 286,
 'stuffed': 9,
 'blanket': 98,
 'grass': 511,
 'striped': 46,
 'lion': 6,
 'tree': 342,
 'dog': 1399,
 'glasses': 751,
 'meal': 49,
 'jacket': 949,
 'snowboarder': 69,
 'board': 70,
 'woman': 4378,
 'necklace': 42,
 'building': 688,
 'waves': 49,
 'shirt': 4536,
 'backpack': 278,
 'hand': 694,
 'hair': 390,
 'bushes': 22,
 'head': 606,
 'stick': 178,
 'surf': 30,
 'wall': 560,
 'camera': 245,
 'coat': 301,
 'tripod': 33,
 'pants': 750,
 'house': 118,
 'skateboard': 274,
 'tracks': 28,
 'kickflip': 1,
 'mat': 37,
 'pickup': 1,
 'shorts': 832,
 'street': 1162,
 'sign': 262,
 'box': 61,
 'other': 209,
 'pipe': 23,
 'tie': 107,
 'pole': 130,
 'wood': 59,
 'beach': 373,
 'sand': 238,
 'game': 48,
 'people': 313,
 's

In [43]:
word_to_ix

{'<UNK>': 0,
 'beak': 1,
 'fish': 2,
 'boy': 3,
 'hat': 4,
 'hands': 5,
 'ladder': 6,
 'boat': 7,
 'water': 8,
 'shore': 9,
 'man': 10,
 'plastic': 11,
 'playpen': 12,
 'girl': 13,
 'spectators': 14,
 'toothbrush': 15,
 'paste': 16,
 'table': 17,
 'target': 18,
 'baby': 19,
 'stuffed': 20,
 'blanket': 21,
 'grass': 22,
 'striped': 23,
 'lion': 24,
 'tree': 25,
 'dog': 26,
 'glasses': 27,
 'meal': 28,
 'jacket': 29,
 'snowboarder': 30,
 'board': 31,
 'woman': 32,
 'necklace': 33,
 'building': 34,
 'waves': 35,
 'shirt': 36,
 'backpack': 37,
 'hand': 38,
 'hair': 39,
 'bushes': 40,
 'head': 41,
 'stick': 42,
 'surf': 43,
 'wall': 44,
 'camera': 45,
 'coat': 46,
 'tripod': 47,
 'pants': 48,
 'house': 49,
 'skateboard': 50,
 'tracks': 51,
 'kickflip': 52,
 'mat': 53,
 'pickup': 54,
 'shorts': 55,
 'street': 56,
 'sign': 57,
 'box': 58,
 'other': 59,
 'pipe': 60,
 'tie': 61,
 'pole': 62,
 'wood': 63,
 'beach': 64,
 'sand': 65,
 'game': 66,
 'people': 67,
 'sun': 68,
 'hill': 69,
 'cellphone

#### Save `word-to-ix`

In [44]:
path_word_to_ix = os.path.join(current_dir, 'outfiles', 'models', 'word_to_ix/')
mkdir(path_word_to_ix)
with open(os.path.join(path_word_to_ix, 'word_to_ix_{}_{}_{}.json'.format(setting, seed, str(lr))), 'w') as wtx:
    json.dump(word_to_ix, wtx)

## Test, Validation set

### train set

In [45]:
no_objs = imgn_per_x_objn(dict_words_boxes, train_data)  # Returns a dictionary with the number of objects per image
batches = dict_to_batches(no_objs, batchsize)  # Returns a list of batch-size batches: A batch contains images with
# the same no. of objs

In [46]:
no_objs[2]

['336551609',
 '3593538248',
 '2863186021',
 '4486660330',
 '2489084679',
 '6521812555',
 '3176072448',
 '6187191468',
 '2402793046',
 '2931721215',
 '41253213',
 '522700240',
 '281754914',
 '4813609768',
 '4547439172',
 '1275832390',
 '150712506',
 '2375402652',
 '2917480555',
 '4844993012',
 '3331900249',
 '3608661756',
 '3406930103',
 '552498500',
 '4521860330',
 '3748769937',
 '2956413620',
 '1253095131',
 '8125199349',
 '3154693053',
 '3682038869',
 '12974441',
 '4638840250',
 '356478206',
 '2397886149',
 '3803116389',
 '3274879561',
 '6813886436',
 '23016091',
 '7463856746',
 '2745811124',
 '3370085095',
 '107368840',
 '2088120475',
 '2230482572',
 '3687215840',
 '97731718',
 '3216762979',
 '3154641421',
 '3382777250',
 '4301236235',
 '2701892321',
 '3372167201',
 '4681354399',
 '3677329561',
 '4785739399',
 '3124455694',
 '69397865',
 '6079294995',
 '1343426964',
 '4925906360',
 '2217258342',
 '4872513153',
 '4301874954',
 '4545180999',
 '4497207612',
 '4653642693',
 '2749641730

In [47]:
batches[573][1]

'1145755142'

### Validation set

In [48]:
no_objs_val = imgn_per_x_objn(dict_words_boxes, validation_data)
val_batchlist = dict_to_batches(no_objs_val, batchsize)

### Test set

In [49]:
no_objs_test = imgn_per_x_objn(dict_words_boxes, test_data)
test_batchlist = dict_to_batches(no_objs_test, batchsize)

In [50]:
ntokens = len(word_to_ix.keys())
print("ntokens:", ntokens)

ntokens: 4238


# SPECIFY MODEL

In [51]:
# these are the sizes Anna Rohrbach uses. she uses a batch size of 40.
# n_objects = 100
object_size = uc.object_size  # Length vgg vector?
att_hidden_size = uc.att_hidden_size  # Number of hidden nodes
wordemb_size = uc.wordemb_size  # Length word embedding
nonlin = uc.nonlin
print("hidden layer size:", att_hidden_size)

hidden layer size: 256


In [52]:
epochs = 5

device = torch.device('cuda')  # Device = GPU

## Listener

### Initialization

In [53]:
# Makes the listener part of the model:
listener = agent.Listener(object_size, ntokens, wordemb_size,
                          att_hidden_size, nonlinearity=nonlin).to(device)

Initializing word embeddings
Initializing attention MLP weights...
Initializing bias terms to all 0...
att_hidden.bias
attention.bias


### Save Listener

In [29]:
mkdir(parent_dir + '/' + uc.path_models + '/' + setting)  # todo: moving it to top, after removing function definitions
torch.save({
            'epoch': 0,
            'setting': setting,
            'seed': seed,
            'model_state_dict': listener.state_dict()
        }, (os.path.join(parent_dir, 'outfiles', 'models', setting, 'liModel_{}_{}_{}_ep0.pth'
                         .format(setting, str(lr), seed))))

## Speaker
### Initialization

In [54]:
# Makes the speaker part of the model:
speaker = agent.Speaker(object_size, ntokens, att_hidden_size, nonlinearity=nonlin).to(device)

Initializing mapping weights...
Initializing bias terms to all 0
hidden.bias
word_logits.bias


### Save Speaker

In [31]:
torch.save({
            'epoch': 0,
            'setting': setting,
            'seed': seed,
            'model_state_dict': speaker.state_dict()
        }, (os.path.join(parent_dir, 'outfiles', 'models', setting, 'spModel_{}_{}_{}_ep0.pth'
                         .format(setting, str(lr), seed))))  # todo: manage the path

## Loss Function

In [55]:
# Loss function: binary cross entropy
criterion = nn.CrossEntropyLoss(size_average=True)

# TRAIN

In [56]:
print("parameters of listener agent:")
for param in listener.parameters():
    print(type(param.data), param.size())
listener_optimizer = optim.Adam(listener.parameters(), lr=lr)

parameters of listener agent:
<class 'torch.Tensor'> torch.Size([4238, 256])
<class 'torch.Tensor'> torch.Size([256, 4352])
<class 'torch.Tensor'> torch.Size([256])
<class 'torch.Tensor'> torch.Size([1, 256])
<class 'torch.Tensor'> torch.Size([1])


In [57]:
print("parameters of speaker agent:")
for param in speaker.parameters():
    print(type(param.data), param.size())
speaker_optimizer = optim.Adam(speaker.parameters(), lr=lr)

parameters of speaker agent:
<class 'torch.Tensor'> torch.Size([256, 4096])
<class 'torch.Tensor'> torch.Size([256])
<class 'torch.Tensor'> torch.Size([4238, 256])
<class 'torch.Tensor'> torch.Size([4238])


In [58]:
# Creating numpy arrays to store loss and accuracy
# for train, validation, and test splits
listener_train_loss = \
    listener_train_acc = \
    speaker_train_loss = \
    speaker_train_acc = \
    listener_val_loss = \
    listener_val_acc = \
    speaker_val_loss = \
    speaker_val_acc = np.empty(epochs)

# Evaluation Function

In [59]:
# evaluation function
my_targets = []

In [62]:
def evaluate(epoch, split='val'):
    listener.eval()
    speaker.eval()
    if split == 'val':
        batchlist = val_batchlist
    elif split == 'test':
        batchlist = test_batchlist
    n_batches = len(batchlist)
    start_time = time.time()
    li_eval_loss = np.empty(n_batches)
    li_eval_acc = np.empty(n_batches)
    sp_eval_loss = np.empty(n_batches)
    sp_eval_acc = np.empty(n_batches)
    batch_size = np.empty(n_batches)

    batch = 0

    while batch < n_batches:
        language_input, visual_input, targets = load_val_batch(dict_words_boxes,
                                                               batchlist[batch],
                                                               word_to_ix,
                                                               device)

        obj_guesses = listener(language_input, visual_input)
        obj_guess_values = obj_guesses.detach()

        word_guesses = speaker(visual_input, obj_guess_values)
        li_loss = criterion(obj_guesses, targets)
        li_eval_acc[batch], batch_size[batch] = calc_accuracy(obj_guesses, targets)
        li_eval_loss[batch] = li_loss.item() * batch_size[batch]
        li_eval_acc[batch] *= batch_size[batch]  # avg weighted for differing batchsizes

        sp_loss = criterion(word_guesses, language_input)
        sp_eval_loss[batch] = sp_loss.item() * batch_size[batch]
        sp_eval_acc[batch], _ = calc_accuracy(word_guesses, language_input)
        sp_eval_acc[batch] *= batch_size[batch]  # avg weighted for differing batchsizes

        batch += 1
        if batch % printerval == 0:
            print(
                '| epoch {:2d} | batch {:3d}/{:3d} | t {:6.2f} | l.L {:6.4f} | l.A {:5.4f} | s.L {:6.4f} | s.A {:5.4f} |'.format(
                    epoch, batch, n_batches, (time.time() - start_time),
                    np.sum(li_eval_loss[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                    np.sum(li_eval_acc[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                    np.sum(sp_eval_loss[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                    np.sum(sp_eval_acc[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch])))

    avg_li_eval_loss = np.sum(li_eval_loss) / np.sum(batch_size)
    avg_li_eval_acc = np.sum(li_eval_acc) / np.sum(batch_size)
    avg_sp_eval_loss = np.sum(sp_eval_loss) / np.sum(batch_size)
    avg_sp_eval_acc = np.sum(sp_eval_acc) / np.sum(batch_size)

    if split == 'val':
        print('-' * 89)
        print("overall performance on validation set:")
        print('| L.loss {:8.4f} | L.acc. {:8.4f} |'.format(
            avg_li_eval_loss,
            avg_li_eval_acc))
        print('| S.loss {:8.4f} | S.acc. {:8.4f} |'.format(
            avg_sp_eval_loss,
            avg_sp_eval_acc))
        print('-' * 89)
    elif split == 'test':
        print('-' * 89)
        print("overall performance on test set:")
        print('| L.loss {:8.4f} | L.acc. {:8.4f} |'.format(
            avg_li_eval_loss,
            avg_li_eval_acc))
        print('| S.loss {:8.4f} | S.acc. {:8.4f} |'.format(
            avg_sp_eval_loss,
            avg_sp_eval_acc))
        print('-' * 89)
    return avg_li_eval_loss, avg_li_eval_acc, avg_sp_eval_loss, avg_sp_eval_acc


def load_val_batch(_dict_words_boxes, batch, _word_to_ix, _device):
    # Loads the batches for the validation and test splits of the data
    language_input = []
    visual_input = []
    targets = []

    for img in batch:
        vggs = torch.load(current_dir + "/data/ha_bbox_vggs/" + img + ".pt").to(_device)
        for obj in _dict_words_boxes[img]:
            language_input.append(get_word_ix(_word_to_ix, _dict_words_boxes[img][obj]["word"]))

            bbox_indices = []
            n = 0

            for obj_id in _dict_words_boxes[img]:
                bbox_indices.append(ha_vggs_indices[img][obj_id][0])
                if obj_id == obj:
                    targets.append(n)
                    my_targets.append(n)
                n += 1
            visual_input.append(vggs[bbox_indices, :])

    lang_batch = torch.tensor(language_input, dtype=torch.long, device=_device)
    vis_batch = torch.stack(visual_input)
    targets = torch.tensor(targets, dtype=torch.long, device=_device)
    return lang_batch, vis_batch, targets


def load_img(_dict_words_boxes, _ha_vggs_indices, _word_to_ix, img, _device, path_vgg):
    vggs = torch.load(path_vgg + img + ".pt").to(_device)  # Edit path
    # dict met obj ids als keys en een dictionary met words : '', bboxes :
    # n = 0
    bbox_indices = []
    words = []
    for obj in _dict_words_boxes[img]:  # For every object in this image
        words.append(get_word_ix(_word_to_ix, _dict_words_boxes[img][obj]["word"]))
        bbox_indices.append(_ha_vggs_indices[img][obj][0])
    visual_input = vggs[bbox_indices, :]
    language_input = torch.tensor(words, dtype=torch.long, device=_device)
    return language_input, visual_input


def curious_look_at_img(_dict_words_boxes, _ha_vggs_indices, img, _setting, word_to_ix):
    language_input, scene = load_img(_dict_words_boxes, _ha_vggs_indices, word_to_ix, img, device, uc.path_vgg)
    # repeat scene n_objects times as input to listener
    visual_input = scene.expand(scene.size()[0], scene.size()[0], scene.size()[1])
    curiosity_targets = torch.eye(visual_input.size()[0], dtype=torch.float, device=device)
    # targets is simply 0, 1, ...., n because they are in order of appearance
    targets = torch.tensor([i for i in range(len(language_input))], dtype=torch.long, device=device)
    # word guesses by child - use as attention over word embeddings
    word_guesses = speaker(visual_input, curiosity_targets, apply_softmax=False)
    # only keep most likely words
    words = torch.argmax(word_guesses, dim=1)
    # give these as input to listener
    object_guesses = listener(words, visual_input)
    curiosity_values = curiosity(curiosity_targets, object_guesses, _setting)
    max_curious = torch.argmax(curiosity_values)
    return language_input[max_curious], scene, targets[max_curious]


def random_look_at_img(_dict_words_boxes, _ha_vggs_indices, img, _word_to_ix):
    language_input, scene = load_img(_dict_words_boxes, _ha_vggs_indices, _word_to_ix, img, device, uc.path_vgg)
    # targets is simply 0, 1, ...., n because they are in order of appearance
    targets = torch.tensor([i for i in range(len(language_input))], dtype=torch.long, device=device)
    i = np.random.randint(len(targets))
    return language_input[i], scene, targets[i]


def load_select_obj(_dict_words_boxes, _ha_vggs_indices, img, _setting, _word_to_ix):
    if _setting == "random":
        return random_look_at_img(_dict_words_boxes, _ha_vggs_indices, img, _word_to_ix)
    elif (_setting == "curious") | (_setting == "plasticity") | (_setting == "sn"):
        return curious_look_at_img(_dict_words_boxes, _ha_vggs_indices, img, _setting, _word_to_ix)
    else:
        print('setting is not correct. It should be random, curious, plasticity, or sn.')

# Training function

# Train Loop

In [76]:
len(my_targets)

3506

In [63]:
li_val_loss, li_val_acc, sp_val_loss, sp_val_acc = evaluate(0)  # first run evaluate to get random baseline

-----------------------------------------------------------------------------------------
overall performance on validation set:
| L.loss   1.3276 | L.acc.   0.2852 |
| S.loss   8.4475 | S.acc.   0.0000 |
-----------------------------------------------------------------------------------------


## Manual Loop

In [38]:
epoch = 1 # each time increase once

In [39]:
# for epoch in range(1, epochs+1):
epoch_start_time = time.time()  # todo: should we do anything with this time?

In [112]:
# li_train_loss, li_train_acc, sp_train_loss, sp_train_acc = train()

| epoch  1 | batch 100/574 | t   4.64 | l.L 1.0739 | l.A 0.5730 | s.L 4.8854 | s.A 0.2674 |
| epoch  1 | batch 200/574 | t   9.37 | l.L 1.0767 | l.A 0.5611 | s.L 4.5040 | s.A 0.3051 |
| epoch  1 | batch 300/574 | t  14.16 | l.L 0.9999 | l.A 0.5455 | s.L 4.6124 | s.A 0.2802 |
| epoch  1 | batch 400/574 | t  18.74 | l.L 1.0290 | l.A 0.5343 | s.L 4.4171 | s.A 0.2930 |
| epoch  1 | batch 500/574 | t  23.33 | l.L 1.0389 | l.A 0.5255 | s.L 4.4879 | s.A 0.2785 |
-----------------------------------------------------------------------------------------
overall performance on training set:
| L.loss   1.0371 | L.acc.   0.5482 |
| S.loss   4.5580 | S.acc.   0.2846 |
-----------------------------------------------------------------------------------------


In [90]:
speaker.word_logits.weight[0]

tensor([-0.0149,  0.0146, -0.0277,  0.0568,  0.0392,  0.0553,  0.0289,
        -0.1228,  0.0163,  0.0374, -0.0086, -0.0531, -0.0685,  0.0183,
         0.0318,  0.0534, -0.1148, -0.0399, -0.0608,  0.0917,  0.0179,
        -0.0117, -0.1253, -0.1298,  0.0531, -0.0210,  0.0003, -0.0184,
         0.0362, -0.0472,  0.0504,  0.0407, -0.0006, -0.0565, -0.0348,
         0.0049,  0.0605, -0.0929, -0.0684,  0.0351,  0.0173,  0.1019,
        -0.1020, -0.0173, -0.0346, -0.0702,  0.0381, -0.0620,  0.0062,
        -0.0307,  0.0231, -0.0003,  0.0310, -0.0459, -0.0273, -0.0665,
         0.0005,  0.0855, -0.0601,  0.1226, -0.0016,  0.0960,  0.0520,
        -0.0526,  0.0373,  0.0220,  0.0689, -0.0655,  0.0855, -0.0097,
         0.0544,  0.0818,  0.0989,  0.0363,  0.1304,  0.0798,  0.0399,
         0.0616,  0.0122,  0.1029, -0.1038, -0.0640, -0.0269,  0.0698,
        -0.0072, -0.0136, -0.0129, -0.1056,  0.0258,  0.0448,  0.0830,
         0.1223,  0.0193,  0.0912,  0.0698,  0.0533,  0.0040, -0.0360,
      

In [40]:
listener.train()
speaker.train()

Speaker(
  (hidden): Linear(in_features=4096, out_features=256, bias=True)
  (word_logits): Linear(in_features=256, out_features=4238, bias=True)
)

In [91]:
start_time = time.time()

In [92]:
n_batches = len(batches)
li_train_loss = np.empty(n_batches)
li_train_accuracy = np.empty(n_batches)
sp_train_loss = np.empty(n_batches)
sp_train_accuracy = np.empty(n_batches)
batch_size = np.empty(n_batches)

In [93]:
batch = 0

In [127]:
word_to_ix[language_input[1].item()]

KeyError: 10

In [131]:
len(visual_input[0]

2

In [134]:
language_batch[0].item()

100

In [136]:
target

tensor(0, device='cuda:0')

tensor(1.00000e-08 *
       5.9129, device='cuda:0')

In [94]:
# batches shuffled during training
while batch < n_batches:
    language_batch = []  # All word indices in the batch?
    visual_batch = []  # All vgg vectors in the batch?
    target_batch = []  # All target word indices in the batch?

    for img in batches[batch]:
        language_input, visual_input, target = load_select_obj(dict_words_boxes, ha_vggs_indices, img,
                                                               setting, word_to_ix)
        language_batch.append(language_input)
        visual_batch.append(visual_input)
        target_batch.append(target)
    language_input = torch.stack(language_batch)
    visual_input = torch.stack(visual_batch)
    targets = torch.stack(target_batch)

    speaker_optimizer.zero_grad()
    listener_optimizer.zero_grad()

    obj_guesses = listener(language_input, visual_input)

    # Saves the batch length for weighted mean accuracy:
    batch_size[batch] = len(batches[batch])

    loss = criterion(obj_guesses, targets)
    loss.backward()  # backward pass
    listener_optimizer.step()  # adapting the weights

    # Loss/accuracy times batch size for weighted average over epoch:
    li_train_loss[batch] = loss.item() * batch_size[batch]
    li_train_accuracy[batch], _ = calc_accuracy(obj_guesses, targets, average=False)

    obj_guess_values = obj_guesses.detach()

    word_guesses = speaker(visual_input, obj_guess_values)

    speaker_loss = criterion(word_guesses, language_input)
    speaker_loss.backward()
    speaker_optimizer.step()

    # Loss/accuracy times batch size for weighted average over epoch:
    sp_train_loss[batch] = speaker_loss.item() * batch_size[batch]
    sp_train_accuracy[batch], _ = calc_accuracy(word_guesses, language_input, average=False)

    batch += 1
    if batch % printerval == 0:
        print(
            '| epoch {:2d} | batch {:3d}/{:3d} | t {:6.2f} | l.L {:6.4f} | l.A {:5.4f} | s.L {:6.4f} | s.A {:5.4f} |'.format(
                epoch, batch, n_batches, (time.time() - start_time),
                np.sum(li_train_loss[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                np.sum(li_train_accuracy[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                np.sum(sp_train_loss[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch]),
                np.sum(sp_train_accuracy[batch - printerval:batch]) / np.sum(batch_size[batch - printerval:batch])))

avg_li_train_loss = np.sum(li_train_loss) / np.sum(batch_size)
avg_li_train_acc = np.sum(li_train_accuracy) / np.sum(batch_size)
avg_sp_train_loss = np.sum(sp_train_loss) / np.sum(batch_size)
avg_sp_train_acc = np.sum(sp_train_accuracy) / np.sum(batch_size)

print('-' * 89)
print("overall performance on training set:")
print('| L.loss {:8.4f} | L.acc. {:8.4f} |'.format(
    avg_li_train_loss,
    avg_li_train_acc))
print('| S.loss {:8.4f} | S.acc. {:8.4f} |'.format(
    avg_sp_train_loss,
    avg_sp_train_acc))
print('-' * 89)
li_train_loss, li_train_acc, sp_train_loss, sp_train_acc = avg_li_train_loss, avg_li_train_acc, avg_sp_train_loss, avg_sp_train_acc

| epoch  1 | batch 100/574 | t 424.11 | l.L 1.0739 | l.A 0.5730 | s.L 4.8854 | s.A 0.2674 |
| epoch  1 | batch 200/574 | t 428.77 | l.L 1.0767 | l.A 0.5611 | s.L 4.5040 | s.A 0.3051 |
| epoch  1 | batch 300/574 | t 433.45 | l.L 0.9999 | l.A 0.5455 | s.L 4.6124 | s.A 0.2802 |
| epoch  1 | batch 400/574 | t 438.08 | l.L 1.0290 | l.A 0.5343 | s.L 4.4171 | s.A 0.2930 |
| epoch  1 | batch 500/574 | t 442.72 | l.L 1.0389 | l.A 0.5255 | s.L 4.4879 | s.A 0.2785 |
-----------------------------------------------------------------------------------------
overall performance on training set:
| L.loss   1.0371 | L.acc.   0.5482 |
| S.loss   4.5580 | S.acc.   0.2846 |
-----------------------------------------------------------------------------------------


In [None]:
        listener_train_loss[epoch - 1], listener_train_acc[epoch - 1] = li_train_loss, li_train_acc
        speaker_train_loss[epoch - 1], speaker_train_acc[epoch - 1] = sp_train_loss, sp_train_acc

        torch.save({
            'epoch': epoch,
            'setting': setting,
            'seed': seed,
            'model_state_dict': listener.state_dict()
        }, (os.path.join('./outfiles', 'models', setting, 'liModel_{}_{}_{}_ep{}.pth'
                         .format(setting, str(lr), seed, epoch))))

        torch.save({
            'epoch': epoch,
            'setting': setting,
            'seed': seed,
            'model_state_dict': speaker.state_dict()
        }, (os.path.join('./outfiles', 'models', setting, 'spModel_{}_{}_{}_ep{}.pth'
                         .format(setting, str(lr), seed, epoch))))

        li_val_loss, li_val_acc, sp_val_loss, sp_val_acc = evaluate(epoch)
        listener_val_loss[epoch - 1], listener_val_acc[epoch - 1] = li_val_loss, li_val_acc
        speaker_val_loss[epoch - 1], speaker_val_acc[epoch - 1] = sp_val_loss, sp_val_acc

        # li_test_loss, li_test_acc, sp_test_loss, sp_test_acc = evaluate(epoch, 'test')
        # listener_test_loss[epoch - 1], listener_test_acc[epoch - 1] = li_test_loss, li_test_acc
        # speaker_test_loss[epoch - 1], speaker_test_acc[epoch - 1] = sp_test_loss, sp_test_acc

# To enable to hit Ctrl + C and break out of training:
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Saving the loss and accuracy numpy arrays:
np.save('loss_acc/final/li_train_loss_{}_{}_{}'.format(
    str(lr), setting, seed), listener_train_loss)
np.save('loss_acc/final/li_train_acc_{}_{}_{}'.format(
    str(lr), setting, seed), listener_train_acc)
np.save('loss_acc/final/sp_train_loss_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_train_loss)
np.save('loss_acc/final/sp_train_acc_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_train_acc)
np.save('loss_acc/final/li_val_loss_{}_{}_{}'.format(
    str(lr), setting, seed), listener_val_loss)
np.save('loss_acc/final/li_val_acc_{}_{}_{}'.format(
    str(lr), setting, seed), listener_val_acc)
np.save('loss_acc/final/sp_val_loss_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_val_loss)
np.save('loss_acc/final/sp_val_acc_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_val_acc)
# np.save('loss_acc/final/li_test_loss_{}_{}_{}'.format(
#    str(lr), setting, seed), listener_test_loss)
# np.save('loss_acc/final/li_test_acc_{}_{}_{}'.format(
#    str(lr), setting, seed), listener_test_acc)
# np.save('loss_acc/final/sp_test_loss_{}_{}_{}'.format(
#    str(lr), setting, seed), speaker_test_loss)
# np.save('loss_acc/final/sp_test_acc_{}_{}_{}'.format(
#    str(lr), setting, seed), speaker_test_acc)

In [1]:
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()  # todo: should we do anything with this time?
        li_train_loss, li_train_acc, sp_train_loss, sp_train_acc = train()
        listener_train_loss[epoch - 1], listener_train_acc[epoch - 1] = li_train_loss, li_train_acc
        speaker_train_loss[epoch - 1], speaker_train_acc[epoch - 1] = sp_train_loss, sp_train_acc

        torch.save({
            'epoch': epoch,
            'setting': setting,
            'seed': seed,
            'model_state_dict': listener.state_dict()
        }, (os.path.join('./outfiles', 'models', setting, 'liModel_{}_{}_{}_ep{}.pth'
                         .format(setting, str(lr), seed, epoch))))

        torch.save({
            'epoch': epoch,
            'setting': setting,
            'seed': seed,
            'model_state_dict': speaker.state_dict()
        }, (os.path.join('./outfiles', 'models', setting, 'spModel_{}_{}_{}_ep{}.pth'
                         .format(setting, str(lr), seed, epoch))))

        li_val_loss, li_val_acc, sp_val_loss, sp_val_acc = evaluate(epoch)
        listener_val_loss[epoch - 1], listener_val_acc[epoch - 1] = li_val_loss, li_val_acc
        speaker_val_loss[epoch - 1], speaker_val_acc[epoch - 1] = sp_val_loss, sp_val_acc

        # li_test_loss, li_test_acc, sp_test_loss, sp_test_acc = evaluate(epoch, 'test')
        # listener_test_loss[epoch - 1], listener_test_acc[epoch - 1] = li_test_loss, li_test_acc
        # speaker_test_loss[epoch - 1], speaker_test_acc[epoch - 1] = sp_test_loss, sp_test_acc

# To enable to hit Ctrl + C and break out of training:
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

# Saving the loss and accuracy numpy arrays:
np.save('loss_acc/final/li_train_loss_{}_{}_{}'.format(
    str(lr), setting, seed), listener_train_loss)
np.save('loss_acc/final/li_train_acc_{}_{}_{}'.format(
    str(lr), setting, seed), listener_train_acc)
np.save('loss_acc/final/sp_train_loss_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_train_loss)
np.save('loss_acc/final/sp_train_acc_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_train_acc)
np.save('loss_acc/final/li_val_loss_{}_{}_{}'.format(
    str(lr), setting, seed), listener_val_loss)
np.save('loss_acc/final/li_val_acc_{}_{}_{}'.format(
    str(lr), setting, seed), listener_val_acc)
np.save('loss_acc/final/sp_val_loss_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_val_loss)
np.save('loss_acc/final/sp_val_acc_{}_{}_{}'.format(
    str(lr), setting, seed), speaker_val_acc)
# np.save('loss_acc/final/li_test_loss_{}_{}_{}'.format(
#    str(lr), setting, seed), listener_test_loss)
# np.save('loss_acc/final/li_test_acc_{}_{}_{}'.format(
#    str(lr), setting, seed), listener_test_acc)
# np.save('loss_acc/final/sp_test_loss_{}_{}_{}'.format(
#    str(lr), setting, seed), speaker_test_loss)
# np.save('loss_acc/final/sp_test_acc_{}_{}_{}'.format(
#    str(lr), setting, seed), speaker_test_acc)