In [21]:
import torch
import pandas as pd
import os
import numpy as np
import json
import h5py
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import sys
import pprint
from collections import Counter,defaultdict
from itertools import chain

class DialogDataset(Dataset):
    
    def __init__(self, json_data, image_features, img2feat, transform=None):
        
        with open(img2feat, 'r') as f:
            self.img2feat = json.load(f)['IR_imgid2id']
            
        self.img_features = np.asarray(h5py.File(image_features, 'r')['img_features'])
        self.json_data = pd.read_json(json_data, orient='index')
        self.corpus = self.get_words()
        self.vocab = list(set(self.corpus))
        self.w2i = {word : i for i, word in enumerate(self.vocab)}
        
    # collect all the words from dialogs and 
    # captions and use them to create embedding map
    def get_words(self):
        words = []
        for idx in range(len(self)):
            item = self.json_data.iloc[idx]

            # Flatten dialog and add caption into 1d array
            dialog = [word for line in item.dialog for word in line[0].split()]
            dialog.extend(item.caption.split(' '))

            words.append(dialog)
            
        return list(chain.from_iterable(words))
    
    def make_context_vector(self, context):
        idxs = [self.w2i[w] for w in context]
        tensor = torch.LongTensor(idxs)
        return tensor

    def __len__(self):
        return len(self.json_data)

    def getDialogVector (self, item):
        dialog = [word for line in item.dialog for word in line[0].split()]
        dialog.extend(item.caption.split(' '))
        return self.make_context_vector(dialog)
    
    def getImageFeatures (self, item):
        img_features = map(lambda x: self.img_features[self.img2feat[str(x)]], item.img_list)
        return torch.FloatTensor(img_features)
    
    def getTarget (self, item):
        return torch.LongTensor(np.array([target]))
        
    def getBatch (self, key):
        items = self.json_data.iloc[key]
        dialogVectors = map(lambda item, caption: self.getDialogVector(item), items)
        imageFeatures = map(lambda imgIds: self.getImageFeatures(imgIds), items['img_list'])
        targets = map(lambda target: self.getImageFeatures(target), items['target'])
        return torch.cat(dialogVectors), torch.cat(imageFeatures), torch.cat(targets)
        print(dialogVectors)
    
    def __getitem__(self, key):
        if isinstance(key, slice):
            return self.getBatch(key)
            #Get the start, stop, and step from the slice
#             return [self[ii] for ii in range(*key.indices(len(self)))]

#         elif isinstance(key, int):
#             if key < 0 : #Handle negative indices
#                 key += len( self )
#             if key < 0 or key >= len(self) :
#                 raise IndexError("The index ({}) is out of range.".format(key))
            
            item = self.json_data.iloc[key]

            # Flatten dialog and add caption into 1d array
            dialog = [word for line in item.dialog for word in line[0].split()]
            dialog.extend(item.caption.split(' '))
            dialog = self.make_context_vector(dialog)

            img_ids = np.array(item.img_list)
            img_features = [self.img_features[idx] for idx in map(lambda x: self.img2feat[str(x)], img_ids)]
            img_features = np.array(img_features)
            img_features = torch.FloatTensor(img_features)

            target = item.target
            target = torch.LongTensor(np.array([target]))

            if torch.cuda.is_available():
                dialog, img_features, target = dialog.cuda(), img_features.cuda(), target.cuda()
                
            return dialog, img_features, target

In [22]:
SAMPLE_EASY = ['Data', 'sample_easy.json']
TRAIN_EASY = ['Data', 'Easy', 'IR_train_easy.json']
VALID_EASY = ['Data', 'Easy', 'IR_val_easy.json']
IMG_FEATURES = ['Data', 'Features', 'IR_image_features.h5']
INDEX_MAP = ['Data', 'Features', 'IR_img_features2id.json']

IMG_SIZE = 2048
EMBEDDING_DIM = 5

torch.manual_seed(1)
# dialog_data = DialogDataset(os.path.join(*SAMPLE_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))
dialog_data = DialogDataset(os.path.join(*SAMPLE_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))
valid_data = DialogDataset(os.path.join(*VALID_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))

vocab_size = len(dialog_data.vocab)
dialog_data.json_data.head()

Unnamed: 0,caption,dialog,img_list,target,target_img_id
0,a person that is laying next to a dog,"[[is this a child or adult ? adult], [male or ...","[163394, 378466, 216480, 114678, 455824, 53419...",1,378466
1,a black and white photo of a man on a horse by...,"[[what color is horse ? brown, but it's black ...","[432686, 145135, 321610, 108851, 468891, 29405...",8,575029
2,a couple of people and some motor bikes,"[[how many bikes there ? 3], [what color are b...","[432686, 397034, 524131, 33034, 115422, 330357...",9,287140
3,"a kitchen with light colored wood cabinets, a ...","[[what color is the sink ? white], [is the lig...","[308384, 419334, 541164, 548906, 378461, 42138...",4,378461
4,a giraffe takes food from a feeding bin high u...,"[[is this a zoo ? yes], [how many giraffes are...","[35102, 386203, 323213, 379433, 461501, 259316...",8,332243


In [23]:
print(len(dialog_data[0:3])) # can now slice this bitch up


  13
 129
  95
  10
  26
 123
 132
 123
 155
  26
 114
 132
 155
 138
  59
  58
  26
   8
 132
  58
 138
  59
  60
 158
 107
 137
 132
  45
 135
  92
  13
  95
 133
 153
  11
  41
  78
 107
 137
  13
 107
 137
  79
  26
 108
 132
   7
  13
  17
  24
  70
  13
 107
 133
 132
  48
  78
 116
  24
  70
  13
 107
  17
 132
 148
  48
  24
 134
  13
 107
  27
 132
 109
  77
 107
  27
  81
  63
  78
  68
 132
   1
  24
  70
  13
 107
  27
 132
 122
  95
  62
 146
  13
  60
  52
 117
  95
  27
[torch.LongTensor of size 98]

<map object at 0x7f705203a2b0>


TypeError: object of type 'NoneType' has no len()

In [None]:
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CBOW(torch.nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.Sigmoid()
        

    def forward(self, inputs):
        # i believe .view() is useless here because the sum already produces a 1xEMB_DIM vector
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

In [4]:
class MaxEnt(torch.nn.Module):
    
    def __init__(self, text_module, vocab_size, img_size):
        super(MaxEnt, self).__init__()

        self.text_module = text_module
        self.linear = nn.Linear(vocab_size + img_size, 1)
        self.softmax = nn.LogSoftmax()
        
    def prepare (self, dialog, imgFeatures):
        text_features = self.text_module(Variable(dialog))
        text_features = text_features.expand(imgFeatures.size(0), text_features.size(1))
        concat = torch.cat((imgFeatures, text_features.data), 1)
        return concat
    
    def prepareBatch (self, batch):
        inputs = []
        targets = []
        for dialog, imgFeatures, target in batch:
            inputs.append(self.prepare(dialog, imgFeatures))
            targets.append(target)
        inputs = torch.cat(inputs)
        targets = torch.cat(targets)
        return Variable(inputs), Variable(targets)
        
    def forward(self, inp, batchSize=1):
        scores = self.linear(inp)
        scores = self.softmax(scores.transpose(0, 1))
        return scores

In [5]:
cbow_model = CBOW(vocab_size, EMBEDDING_DIM)
model = MaxEnt(cbow_model, vocab_size, IMG_SIZE)

if torch.cuda.is_available():
    print("ya ya")
    cbow_model = cbow_model.cuda()
    model = model.cuda()
    print("cuda ready bitches")
else:
    print("no no")
    
training_errors = []
validation_errors = []
epochsTrained = 0

ya ya
cuda ready bitches


In [6]:
def validate(model, data, loss_func):
    total_loss = 0
    
    for i, (dialog, imgFeatures, target) in enumerate(data):
        inputs = model.prepare(dialog, imgFeatures)
        
        inputs, target = Variable(inputs), Variable(target)
        
        pred = model(inputs)
        
        loss = loss_func(pred, target)
        total_loss += loss.data[0]
    
    return total_loss / len(data)

def predict(model, data):
    correct = 0
    
    for i, (inp, target) in enumerate(data):
        pred = model(inp)
        img, idx = torch.max(pred, 1)
        if idx.data[0] == target:
            correct += 1
        
        if i == 20:
            break
    
    return correct
validate(model, valid_data[:100], nn.NLLLoss())

2.300809106826782

In [8]:


# for epoch in range(1, EPOCHS + 1):
#     total_loss = 0
#     for i, (inp, target) in enumerate(dialog_data):
        
#         pred = max_ent(inp)
        
#         target = Variable(torch.LongTensor(np.array([target])))

#         loss = loss_func(pred, target)
#         total_loss += loss.data[0]
            
#         max_ent.zero_grad()
        
#         loss.backward()
#         optimizer.step()
        
#         if i == 500:
#             break
    
#     total_loss = total_loss / 500
#     print("Epoch {}: {}".format(epoch, total_loss))
#     print("Predicted {}/20 samples correctly".format(predict(max_ent, valid_data)))
    
#     val = validate(max_ent, valid_data, loss_func)
#     validation_errors.append(val.data[0])
        
from timeit import default_timer as timer

batchSize = 30
numEpochs = 3
learningRate = 1e-4
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learningRate)

startTime = timer()
lastPrintTime = startTime

continueFromEpc = 0
continueFromI = 0
sampleCount = len(dialog_data)
batchCountPerEpc = int(sampleCount/batchSize)-1
remainderCount = sampleCount - batchCountPerEpc * batchSize
print("we have: {} dialogs, batch size of {} with {} as remainder to offset batches each epoch".format(sampleCount, batchSize, remainderCount))
offset = 0

for t in range(numEpochs):
    lastPrintTime = timer()
    epochStartTime = timer()
    
    if t == 0 and continueFromI > 0:
        # continue where I crashed
        print("continuing")
        model.load_state_dict(torch.load('maxent_{}epc_{}iter.pt'.format(continueFromEpc, continueFromI+1)))
    
    for i in range(continueFromI, batchCountPerEpc):
        # In case of RNN, clear hidden state
        #model.hidden = steerNet.init_hidden(batchSize)
        
        batchBegin = offset + i * batchSize
        batchEnd = batchBegin + batchSize
        
        thing1 = timer()
        batch = dialog_data[batchBegin:batchEnd]
        thing2 = timer()
        inputs, targets = model.prepareBatch(batch)
        thing3 = timer()
        print("gather bach:", thing2 - thing1, " prep batch:", thing3 - thing2)
        
        predictions = model(inputs, batchSize)
        
        loss = criterion(predictions.view(batchSize, -1), targets)
        training_errors.append(loss.data[0])
        
        model.zero_grad()
        loss.backward()
        optimizer.step()
        
        # report and save progress sometimes
        if (timer() - lastPrintTime > 10):
            lastPrintTime = timer()
            avgProcessingSpeed = (i*batchSize) / (timer() - epochStartTime)
            percentOfEpc = (i / batchCountPerEpc) * 100
            print("{:.0f}s:\t epoch: {}\t batch:{} ({:.1f}%) \t training error: {:.6f}\t speed: {:.1f} dialogs/s".format(timer() - startTime, epochsTrained, i, percentOfEpc, np.mean(training_errors[-100:]), avgProcessingSpeed))
#             if (i % 10000 == 0):
#                 torch.save(model.state_dict(),"maxent_{}epc_{}iter.pt".format(t, i+1))
#                 print("saved at {}".format(i))
            
    epochsTrained += 1
    offset = (offset + 1) % remainderCount
    print("{:.1f}s:\t Finished epoch. Calculating test error..".format(timer() - startTime))
    testError = validate(model, valid_data, nn.NLLLoss())
    print("{:.1f}s:\t test error: {:.6f}".format(timer() - startTime, testError))
    continueFromI = 0
    fileName = "maxent_{}batch_{}epc.pt".format(batchSize, epochsTrained)
    torch.save(model.state_dict(), fileName)
    print("saved\t", fileName)


we have: 40000 dialogs, batch size of 30 with 40 as remainder to offset batches each epoch
gather bach: 0.010008557999753975  prep batch: 0.0630851639998582
gather bach: 0.009275548000005074  prep batch: 0.06124467300014658
gather bach: 0.009093151999877591  prep batch: 0.06293102099971293
gather bach: 0.010162530000343395  prep batch: 0.05977802100005647
gather bach: 0.009056885999598308  prep batch: 0.06039885799964395
gather bach: 0.009477229000367515  prep batch: 0.05861318000006577
gather bach: 0.009559009999975387  prep batch: 0.06176581300042017
gather bach: 0.009228593000443652  prep batch: 0.06478791099925729
gather bach: 0.009753513999385177  prep batch: 0.06419819199982157
gather bach: 0.010246704000564932  prep batch: 0.06360922400017444
gather bach: 0.009939747999851534  prep batch: 0.06236941399947682
gather bach: 0.01041308699950605  prep batch: 0.062487822000548476
gather bach: 0.01033392899989849  prep batch: 0.06449611500011088
gather bach: 0.00963877499998489  prep b

KeyboardInterrupt: 