In [6]:
# source: https://www.daniweb.com/programming/software-development/code/216839/number-to-word-converter-python
def int2word(n):
    """
    convert an integer number n into a string of english words
    """
    
    # Return any string that is not all digits
    if not all([char.isdigit() for char in n]):
        return n
    
    # break the number into groups of 3 digits using slicing
    # each group representing hundred, thousand, million, billion, ...
    n3 = []
    r1 = ""
    # create numeric string
    ns = str(n)
    for k in range(3, 33, 3):
        r = ns[-k:]
        q = len(ns) - k
        # break if end of ns has been reached
        if q < -2:
            break
        else:
            if  q >= 0:
                n3.append(int(r[:3]))
            elif q >= -1:
                n3.append(int(r[:2]))
            elif q >= -2:
                n3.append(int(r[:1]))
        r1 = r
    
    #print n3  # test
    
    # break each group of 3 digits into
    # ones, tens/twenties, hundreds
    # and form a string
    nw = ""
    for i, x in enumerate(n3):
        b1 = x % 10
        b2 = (x % 100)//10
        b3 = (x % 1000)//100
        #print b1, b2, b3  # test
        if x == 0:
            continue  # skip
        else:
            t = thousands[i]
        if b2 == 0:
            nw = ones[b1] + t + nw
        elif b2 == 1:
            nw = tens[b1] + t + nw
        elif b2 > 1:
            nw = twenties[b2] + ones[b1] + t + nw
        if b3 > 0:
            nw = ones[b3] + "hundred " + nw
    return nw.strip().split()

############# globals ################
ones = ["", "one ","two ","three ","four ", "five ",
    "six ","seven ","eight ","nine "]
tens = ["ten ","eleven ","twelve ","thirteen ", "fourteen ",
    "fifteen ","sixteen ","seventeen ","eighteen ","nineteen "]
twenties = ["","","twenty ","thirty ","forty ",
    "fifty ","sixty ","seventy ","eighty ","ninety "]
thousands = ["","thousand ","million ", "billion ", "trillion ",
    "quadrillion ", "quintillion ", "sextillion ", "septillion ","octillion ",
    "nonillion ", "decillion ", "undecillion ", "duodecillion ", "tredecillion ",
    "quattuordecillion ", "quindecillion", "sexdecillion ", "septendecillion ", 
    "octodecillion ", "novemdecillion ", "vigintillion "]

def digits_to_text(document):
    digits_to_text = []
    for token in document:
        temp = int2word(token)
        if type(temp) is list:
            digits_to_text.extend(temp)
        else:
            digits_to_text.append(temp)

    return digits_to_text

In [91]:
import torch
import pandas as pd
import os
import numpy as np
import json
import h5py
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import time
import sys
import pprint
from collections import Counter,defaultdict
from itertools import chain

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# TODO: find scientific reference that also claims Snowball is better
# alternatively: http://www.nltk.org/howto/stem.html claims this already.
from nltk.stem import SnowballStemmer, PorterStemmer

# check if stopword corpus is available on your system
try:
    _ = stopwords.words('english')
except:
    nltk.download('stopwords')
    
try:
    _ = WordNetLemmatizer().lemmatize('test')
except:
    nltk.download('wordnet')
    
# Embeddings don't work well for words that occur < 5 times
THRESHOLD = 5
UNK = "<unk>"

def hide_infrequent_words(document, threshold):
    counter = Counter(document)
    new_document = []
    
    for word in document:
        if counter[word] > threshold:
            new_document.append(word)
    
    return new_document

def filter_document(document):
    """Filter list of words based on some conventional methods, like removing stopwords and
    lemmatization"""

    # Remove stop words
    stop_words = stopwords.words('english')
    document = list(filter(lambda x: x not in stop_words, document))

    # [I, am, 34] -> [I, am, thirty, four]
    document = digits_to_text(document)

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    document = list(map(lemmatizer.lemmatize, document))

    return document

def make_context_vector(context, w2i):
    idxs = [w2i[w] for w in context]
    tensor = torch.LongTensor(idxs)
    return autograd.Variable(tensor)

class DialogDataset(Dataset):
    
    def __init__(self, json_data, image_features, img2feat, transform=None):
        
        with open(img2feat, 'r') as f:
            self.img2feat = json.load(f)['IR_imgid2id']
            
        self.img_features = np.asarray(h5py.File(image_features, 'r')['img_features'])
        self.json_data = pd.read_json(json_data, orient='index')
        self.corpus = filter_document(self.get_words())
        self.corpus = hide_infrequent_words(self.corpus)
        self.vocab = list(set(self.corpus))
        
        # Add the UNK token to the vocab
        self.vocab.append(UNK)
        
        # Make w2i return idx of UNK by default
        self.w2i = {word : i for i, word in enumerate(self.vocab)}
        self.w2i = defaultdict(lambda: self.w2i[UNK], self.w2i)
        
    # collect all the words from dialogs and 
    # captions and use them to create embedding map
    def get_words(self):
        words = []
        for idx in range(len(self)):
            item = self.json_data.iloc[idx]

            # Flatten dialog and add caption into 1d array
            dialog = [word for line in item.dialog for word in line[0].split()]
            dialog.extend(item.caption.split(' '))

            words.append(dialog)

        return list(chain.from_iterable(words))

    def __len__(self):
        return len(self.json_data)

    def __getitem__(self, idx):
        item = self.json_data.iloc[idx]

        # Flatten dialog and add caption into 1d array
        dialog = [word for line in item.dialog for word in line[0].split()]
        dialog.extend(item.caption.split(' '))
        dialog = filter_document(dialog)
        dialog = make_context_vector(dialog, self.w2i)

        img_ids = np.array(item.img_list)
        img_features = [self.img_features[idx] for idx in map(lambda x: self.img2feat[str(x)], img_ids)]
        img_features = np.array(img_features)
        img_features = Variable(torch.FloatTensor(img_features))
        
        target_idx = item.target
        
        return {'dialog' : dialog, 'img_features': img_features}, target_idx

In [57]:
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class CBOW(torch.nn.Module):

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        #out: 1 x emdedding_dim
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(embedding_dim, 128)
        self.activation_function1 = nn.ReLU()
        
        #out: 1 x vocab_size
        self.linear2 = nn.Linear(128, vocab_size)
        self.activation_function2 = nn.Sigmoid()
        

    def forward(self, inputs):
        # i believe .view() is useless here because the sum already produces a 1xEMB_DIM vector
        embeds = sum(self.embeddings(inputs)).view(1,-1)
        out = self.linear1(embeds)
        out = self.activation_function1(out)
        out = self.linear2(out)
        out = self.activation_function2(out)
        return out

In [58]:
class MaxEnt(torch.nn.Module):
    
    def __init__(self, text_module, vocab_size, img_size):
        super(MaxEnt, self).__init__()

        self.text_module = text_module
        self.linear = nn.Linear(vocab_size + img_size, 1)
        
    def forward(self, inp):
        dialog = inp['dialog']
        all_img_features = inp['img_features']
        
        text_features = self.text_module(dialog)
        text_features = text_features.expand((all_img_features.size(0), text_features.size(1)))
    
        concat = torch.cat((all_img_features, text_features), 1)
        
        scores = self.linear(concat)
        scores = F.log_softmax(scores.transpose(0, 1))
        
        return scores
    


In [90]:
SAMPLE_EASY = ['Data', 'sample_easy.json']
TRAIN_EASY = ['Data', 'Easy', 'IR_train_easy.json']
VALID_EASY = ['Data', 'Easy', 'IR_val_easy.json']
IMG_FEATURES = ['Data', 'Features', 'IR_image_features.h5']
INDEX_MAP = ['Data', 'Features', 'IR_img_features2id.json']

IMG_SIZE = 2048
EMBEDDING_DIM = 5

torch.manual_seed(1)
# dialog_data = DialogDataset(os.path.join(*SAMPLE_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))
%time dialog_data = DialogDataset(os.path.join(*TRAIN_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))
valid_data = DialogDataset(os.path.join(*VALID_EASY), os.path.join(*IMG_FEATURES), os.path.join(*INDEX_MAP))

vocab_size = len(dialog_data.vocab)

CPU times: user 1min 9s, sys: 4.85 s, total: 1min 14s
Wall time: 1min 27s


In [120]:
CONTEXT_SIZE = 4

def make_window(words, context_size, batch_size=1):
    #Find N words before, and two words after given word.
    for i in range(context_size, len(words) - context_size):
        for j in range(batch_size):
            context = words[i - context_size:i - 1] + words[i + 1:i+context_size]
            target = words[i]
            yield context, target

def learn_embeddings(data):
        
    model = CBOW(len(data.vocab), EMBEDDING_DIM)
    loss_func = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-02)
    
    for epoch in range(1, 6):
        
        total_loss = 0
        
        for i, (context, target) in enumerate(make_window(data.corpus, CONTEXT_SIZE)):
            
            inp = make_context_vector(context, data.w2i)
            target = Variable(torch.LongTensor([data.w2i[target]]))

            pred = model(inp)

            loss = loss_func(pred, target)
            total_loss += loss.data[0]

            model.zero_grad()

            loss.backward()
            optimizer.step()
            
            if i % 500 == 0:
                print('{} / 5000'.format(i))

            if i == 100: 
                break
            
        print("Epoch {}: {}".format(epoch, total_loss / 100))
    
    return model

model = learn_embeddings(dialog_data)
# to_path = ['saved_models', 'embeddings', 'cbow_100_embedding_5_epochs_5000_datapoints_1e-02_LR.h5']
# torch.save(model.state_dict(), os.path.join(*to_path))

giraffe = make_context_vector('giraffe', dialog_data.w2i)
zebra = make_context_vector('zebra', dialog_data.w2i)
see = make_context_vector('see', dialog_data.w2i)


0 / 5000
Epoch 1: 9.936207246780395
0 / 5000
Epoch 2: 9.565549840927124
0 / 5000
Epoch 3: 9.252888078689574
0 / 5000
Epoch 4: 9.162663965225219
0 / 5000
Epoch 5: 9.142481765747071


In [121]:
pred = F.log_softmax(model(giraffe))
print(torch.sum(torch.exp(pred), 1))

Variable containing:
 1.0000
[torch.FloatTensor of size 1]



In [123]:
from_path = ['saved_models', 'embeddings', 'cbow_100_embedding_5_epochs_5000_datapoints_1e-02_LR.h5']
cbow_model = model
cbow_model.load_state_dict(torch.load(os.path.join(*from_path)))
max_ent = MaxEnt(cbow_model, vocab_size, IMG_SIZE)
max_ent(dialog_data[0][0])
max_ent(valid_data[0][0])

Variable containing:
-2.3798 -2.2154 -2.3290 -2.2380 -2.2970 -2.3537 -2.3421 -2.3183 -2.3748 -2.1973
[torch.FloatTensor of size 1x10]

In [125]:
def validate(model, data, loss_func):
    total_loss = 0
    
    for i, (inp, target) in enumerate(data):
        pred = model(inp)
        target = Variable(torch.LongTensor(np.array([target])))
        
        loss = loss_func(pred, target)
        total_loss += loss
        
        if i == 20:
            break
            
    return total_loss / 20

def predict(model, data):
    correct = 0
    
    for i, (inp, target) in enumerate(data):
        pred = model(inp)
        img, idx = torch.max(pred, 1)
        if idx.data[0] == target:
            correct += 1
        
        if i == 20:
            break
    
    return correct

In [127]:
EPOCHS = 10

cbow_model = CBOW(vocab_size, EMBEDDING_DIM)
max_ent = MaxEnt(cbow_model, vocab_size, IMG_SIZE)
loss_func = nn.NLLLoss()
optimizer = optim.Adam(max_ent.parameters(), lr=1e-05)

validation_errors = []
lines_for_csv = []

for epoch in range(1, EPOCHS + 1):
    total_loss = 0
    for i, (inp, target) in enumerate(dialog_data):
        
        # Make prediction
        pred = max_ent(inp)
        target = Variable(torch.LongTensor(np.array([target])))

        # Calculate loss
        loss = loss_func(pred, target)
        total_loss += loss.data[0]
        
        # Zero out gradients
        max_ent.zero_grad()
        
        # Backpropagate and update parameters
        loss.backward()
        optimizer.step()
        
        if i == 100:
            break
        
            
    total_loss = total_loss / 100
    print("Epoch {}: {}".format(epoch, total_loss))
    print("Correctly predicted {} / 20".format(predict(max_ent, valid_data)))
    
    val = validate(max_ent, valid_data, loss_func)
    print(val.data[0])
    validation_errors.append(val.data[0])
    
    lines_for_csv.append("{},{},{}\n".format(epoch, total_loss, val.data[0]))
    
# with open('cbow_preprocessed_10_epochs_1e-05_embed_100_easy.csv', 'w') as f:
#     f.writelines(lines_for_csv)
    
print(validation_errors)

Epoch 1: 2.312025718688965
2.3952348232269287
Epoch 2: 2.299572243690491
2.388327121734619
Epoch 3: 2.2882035613059997
2.381679058074951
Epoch 4: 2.277162404060364
2.3752832412719727
Epoch 5: 2.2664368081092836
2.3691282272338867
Epoch 6: 2.2560134434700014
2.3632020950317383
Epoch 7: 2.2458789324760438
2.3574938774108887
Epoch 8: 2.2360202717781066
2.351992130279541
Epoch 9: 2.2264252245426177
2.3466873168945312
Epoch 10: 2.2170820844173433
2.3415701389312744
[2.3952348232269287, 2.388327121734619, 2.381679058074951, 2.3752832412719727, 2.3691282272338867, 2.3632020950317383, 2.3574938774108887, 2.351992130279541, 2.3466873168945312, 2.3415701389312744]
