In [119]:
import torch
import pandas as pd
import os
import numpy as np
from torch.utils.data import Dataset, DataLoader

import pprint
from collections import Counter,defaultdict
from itertools import chain

SAMPLE_EASY = ['Data', 'sample_easy.json']
TRAIN_EASY = ['Data', 'Easy', 'IR_train_easy.json']

class DialogDataset(Dataset):
    def __init__(self, json_data, transform=None):
        self.json_data = pd.read_json(json_data, orient='index')

    def __len__(self):
        return len(self.json_data)

    def __getitem__(self, idx):
        item = self.json_data.iloc[idx]
        print(item.dialog[0])

        # Flatten dialog and add caption into 1d array
        dialog = [word for line in item.dialog for word in line[0].split()]
        dialog.extend(item.caption.split(' '))
        #words = np.array(dialog)

        img_ids = np.array(item.img_list)
        target = np.array([item.target, item.target_img_id])

        return {'dialog':dialog, 'img_ids':item.img_list, 'target':item.target_img_id}

def show_batch(sample_batched):
    print(sample_batched)


def createEmbeddings (words, threshold):
    w2i = defaultdict(lambda: len(w2i))
    i2w = dict()
    wordCounts = Counter()

    # count all the words in lower case
    for word in words:
        wordCounts[word.lower()] += 1

    # index all words that occured at least n times
    for word, count in wordCounts.most_common():
        if count >= threshold:
            i2w[w2i[word]] = word
        else:
            break

    return w2i, i2w

# done: collect all the words from dialogs and 
# captions and use them to create embedding map
def getWords(dataset):
    words = [dataset[i]['dialog'] for i in range(len(dataset))]
    return list(chain.from_iterable(words))

    
dd = DialogDataset(os.path.join(*SAMPLE_EASY))

words = getWords(dd)
w2i, i2w = createEmbeddings(words, 3)

loader = DataLoader(dd, batch_size=4, shuffle=True, num_workers=4)

for batch_num, sample in enumerate(loader):
    show_batch(sample)
    if batch_num == 3:
        break

['is this a child or adult ? adult']
["what color is horse ? brown, but it's black and white photo"]
['how many bikes there ? 3']
['what color is the sink ? white']
['is this a zoo ? yes']
['is this a zoo ? yes']
['is this a child or adult ? adult']
["what color is horse ? brown, but it's black and white photo"]
['what color is the sink ? white']
['how many bikes there ? 3']
{'dialog': [('is', 'is', 'what', 'what'), ('this', 'this', 'color', 'color'), ('a', 'a', 'is', 'is'), ('zoo', 'child', 'horse', 'the'), ('?', 'or', '?', 'sink'), ('yes', 'adult', 'brown,', '?'), ('how', '?', 'but', 'white'), ('many', 'adult', "it's", 'is'), ('giraffes', 'male', 'black', 'the'), ('are', 'or', 'and', 'light'), ('there', 'female', 'white', 'on'), ('?', '?', 'photo', '?'), ('1', 'male', 'is', 'yes'), ('how', 'are', 'this', 'any'), ('many', 'they', 'outdoors', 'people'), ('zebras', 'inside', '?', '?'), ('?', 'or', 'yes', 'no'), ('1', 'outside', 'do', 'how'), ('are', '?', 'you', 'many'), ('people', 'insi

In [126]:
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

#### Testing 
print("Testing Embedding neural net class:")
embeds = nn.Embedding(len(w2i), 5)
lookup_tensor = torch.LongTensor([w2i['bikes']])
result = embeds(autograd.Variable(lookup_tensor))
print(result)


context_size = 2
data = []
#Find two words before, and two words after given word.
for i in range(2, len(words) - 2):
    context = [words[i - 2], words[i - 1],
               words[i + 1], words[i + 2]]
    target = words[i]
    data.append((context, target))


class CBOW(torch.nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.LogSoftmax()
        

    def forward(self, inputs):
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

    def get_word_emdedding(self, word):
        word = Variable(torch.LongTensor([w2i[word]]))
        return self.embeddings(word).view(1,-1)


def make_context_vector(context, w2i):
    idxs = [w2i[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

def train_model():
    model = CBOW(vocab_size, EMDEDDING_DIM)

    loss_function = nn.NLLLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    for epoch in range(50):
        total_loss = 0
        print(epoch)
        count = 0
        for context, target in data:
            context_vector = make_context_vector(context, w2i)
            model.zero_grad()
            
            try:
                log_probs = model(context_vector)
            except:
                print("Iteration:", count, "Target word:", target, "\nContext:", context_vector)
            
            loss = loss_function(log_probs, Variable(torch.LongTensor([w2i[target]])))
            loss.backward(retain_graph=True)
            optimizer.step()
            
            total_loss += loss.data
            count += 1

# Uncomment to train:
# train()

sample = make_context_vector(data[11][0], w2i)
print("Found context sample: ", sample)
a = model(sample).data.numpy()

print("Lengths of w2i and i2w:")
print(len(w2i))
print(len(i2w))
# Something is wrong with i2w, it's not of the same length as w2i!
print("Prediction:", np.argmax(a[0]))

Testing Embedding neural net class:
Variable containing:
-1.0952 -1.0703  0.6404  1.6199  0.5258
[torch.FloatTensor of size 1x5]

Found context sample:  Variable containing:
  0
 38
  7
 39
[torch.LongTensor of size 4]

Lengths of w2i and i2w:
40
38
163
Prediction: 141
